diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21498 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9952904238618525, + "eval_steps": 500, + "global_step": 1431, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020931449502878076, + "grad_norm": 8.275816917419434, + "learning_rate": 0.0, + "logits/chosen": 3.5, + "logits/rejected": 3.40625, + "logps/chosen": -356.0, + "logps/rejected": -272.0, + "loss": 0.6944, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.00811767578125, + "rewards/margins": -0.0093994140625, + "rewards/rejected": 0.001251220703125, + "step": 1 + }, + { + "epoch": 0.004186289900575615, + "grad_norm": 8.14901351928711, + "learning_rate": 8.859191006777895e-08, + "logits/chosen": 3.6875, + "logits/rejected": 4.1875, + "logps/chosen": -472.0, + "logps/rejected": -290.0, + "loss": 0.6949, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.010009765625, + "rewards/margins": 0.0050048828125, + "rewards/rejected": 0.0050048828125, + "step": 2 + }, + { + "epoch": 0.006279434850863423, + "grad_norm": 7.643013000488281, + "learning_rate": 1.404148553246907e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.8125, + "logps/chosen": -342.0, + "logps/rejected": -422.0, + "loss": 0.6919, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.003753662109375, + "rewards/margins": 0.0150146484375, + "rewards/rejected": -0.01123046875, + "step": 3 + }, + { + "epoch": 0.00837257980115123, + "grad_norm": 9.3782320022583, + "learning_rate": 1.771838201355579e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.625, + "logps/chosen": -378.0, + "logps/rejected": -456.0, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0238037109375, + "rewards/margins": 0.001251220703125, + "rewards/rejected": -0.0250244140625, + "step": 4 + }, + { + "epoch": 0.010465724751439037, + "grad_norm": 13.61230182647705, + "learning_rate": 2.057040449661105e-07, + "logits/chosen": 3.453125, + "logits/rejected": 3.890625, + "logps/chosen": -262.0, + "logps/rejected": -177.0, + "loss": 0.6943, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00750732421875, + "rewards/margins": 0.0087890625, + "rewards/rejected": -0.0162353515625, + "step": 5 + }, + { + "epoch": 0.012558869701726845, + "grad_norm": 8.330711364746094, + "learning_rate": 2.2900676539246965e-07, + "logits/chosen": 3.765625, + "logits/rejected": 4.34375, + "logps/chosen": -466.0, + "logps/rejected": -286.0, + "loss": 0.6939, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00250244140625, + "rewards/margins": 0.01373291015625, + "rewards/rejected": -0.01123046875, + "step": 6 + }, + { + "epoch": 0.014652014652014652, + "grad_norm": 7.784880638122559, + "learning_rate": 2.4870893478326387e-07, + "logits/chosen": 4.15625, + "logits/rejected": 3.9375, + "logps/chosen": -398.0, + "logps/rejected": -290.0, + "loss": 0.6938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.00750732421875, + "rewards/margins": 0.0250244140625, + "rewards/rejected": -0.017578125, + "step": 7 + }, + { + "epoch": 0.01674515960230246, + "grad_norm": 8.997271537780762, + "learning_rate": 2.6577573020333683e-07, + "logits/chosen": 3.71875, + "logits/rejected": 4.125, + "logps/chosen": -234.0, + "logps/rejected": -211.0, + "loss": 0.6937, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0137939453125, + "rewards/margins": 0.0162353515625, + "rewards/rejected": -0.00250244140625, + "step": 8 + }, + { + "epoch": 0.018838304552590265, + "grad_norm": 7.737761974334717, + "learning_rate": 2.808297106493814e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.453125, + "logps/chosen": -300.0, + "logps/rejected": -316.0, + "loss": 0.6935, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01123046875, + "rewards/margins": 0.00250244140625, + "rewards/rejected": 0.0087890625, + "step": 9 + }, + { + "epoch": 0.020931449502878074, + "grad_norm": 7.448185443878174, + "learning_rate": 2.942959550338895e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.71875, + "logps/chosen": -284.0, + "logps/rejected": -231.0, + "loss": 0.6923, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0050048828125, + "rewards/margins": -0.00439453125, + "rewards/rejected": -0.0006256103515625, + "step": 10 + }, + { + "epoch": 0.023024594453165882, + "grad_norm": 8.635788917541504, + "learning_rate": 3.0647765484394645e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.78125, + "logps/chosen": -804.0, + "logps/rejected": -274.0, + "loss": 0.6937, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0150146484375, + "rewards/margins": 0.00506591796875, + "rewards/rejected": 0.010009765625, + "step": 11 + }, + { + "epoch": 0.02511773940345369, + "grad_norm": 8.279386520385742, + "learning_rate": 3.175986754602486e-07, + "logits/chosen": 3.46875, + "logits/rejected": 3.0, + "logps/chosen": -536.0, + "logps/rejected": -692.0, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01251220703125, + "rewards/margins": 0.01190185546875, + "rewards/rejected": 0.0006256103515625, + "step": 12 + }, + { + "epoch": 0.027210884353741496, + "grad_norm": 7.699203014373779, + "learning_rate": 3.2782902272079295e-07, + "logits/chosen": 3.421875, + "logits/rejected": 3.90625, + "logps/chosen": -252.0, + "logps/rejected": -118.0, + "loss": 0.6937, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.02880859375, + "rewards/margins": -0.0306396484375, + "rewards/rejected": 0.0018768310546875, + "step": 13 + }, + { + "epoch": 0.029304029304029304, + "grad_norm": 7.581473350524902, + "learning_rate": 3.373008448510428e-07, + "logits/chosen": 2.28125, + "logits/rejected": 2.625, + "logps/chosen": -364.0, + "logps/rejected": -151.0, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0050048828125, + "rewards/margins": 0.002532958984375, + "rewards/rejected": -0.00750732421875, + "step": 14 + }, + { + "epoch": 0.03139717425431711, + "grad_norm": 8.212568283081055, + "learning_rate": 3.461189002908012e-07, + "logits/chosen": 3.546875, + "logits/rejected": 4.0625, + "logps/chosen": -416.0, + "logps/rejected": -172.0, + "loss": 0.6929, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.027587890625, + "rewards/margins": 0.018798828125, + "rewards/rejected": 0.0087890625, + "step": 15 + }, + { + "epoch": 0.03349031920460492, + "grad_norm": 8.286090850830078, + "learning_rate": 3.543676402711158e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.765625, + "logps/chosen": -688.0, + "logps/rejected": -488.0, + "loss": 0.6866, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01123046875, + "rewards/margins": 0.036376953125, + "rewards/rejected": -0.0250244140625, + "step": 16 + }, + { + "epoch": 0.035583464154892726, + "grad_norm": 7.6797285079956055, + "learning_rate": 3.621161404374383e-07, + "logits/chosen": 3.40625, + "logits/rejected": 3.4375, + "logps/chosen": -268.0, + "logps/rejected": -190.0, + "loss": 0.6897, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.01123046875, + "rewards/margins": 0.0137939453125, + "rewards/rejected": -0.00250244140625, + "step": 17 + }, + { + "epoch": 0.03767660910518053, + "grad_norm": 8.51090145111084, + "learning_rate": 3.6942162071716033e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.796875, + "logps/chosen": -548.0, + "logps/rejected": -338.0, + "loss": 0.6935, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.010009765625, + "rewards/margins": 0.0162353515625, + "rewards/rejected": -0.006256103515625, + "step": 18 + }, + { + "epoch": 0.03976975405546834, + "grad_norm": 7.66601037979126, + "learning_rate": 3.76332012245438e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.765625, + "logps/chosen": -140.0, + "logps/rejected": -316.0, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0006256103515625, + "rewards/margins": 0.003173828125, + "rewards/rejected": -0.003753662109375, + "step": 19 + }, + { + "epoch": 0.04186289900575615, + "grad_norm": 8.81503963470459, + "learning_rate": 3.828878651016684e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.34375, + "logps/chosen": -454.0, + "logps/rejected": -324.0, + "loss": 0.6945, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00970458984375, + "rewards/margins": 0.0203857421875, + "rewards/rejected": -0.0106201171875, + "step": 20 + }, + { + "epoch": 0.04395604395604396, + "grad_norm": 7.937436103820801, + "learning_rate": 3.891237901079545e-07, + "logits/chosen": 4.1875, + "logits/rejected": 3.625, + "logps/chosen": -268.0, + "logps/rejected": -392.0, + "loss": 0.6934, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0306396484375, + "rewards/margins": 0.03564453125, + "rewards/rejected": -0.0050048828125, + "step": 21 + }, + { + "epoch": 0.046049188906331764, + "grad_norm": 7.528475284576416, + "learning_rate": 3.9506956491172536e-07, + "logits/chosen": 3.625, + "logits/rejected": 4.0625, + "logps/chosen": -508.0, + "logps/rejected": -374.0, + "loss": 0.692, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0150146484375, + "rewards/margins": 0.0, + "rewards/rejected": 0.0150146484375, + "step": 22 + }, + { + "epoch": 0.04814233385661957, + "grad_norm": 7.722219467163086, + "learning_rate": 4.007509939970292e-07, + "logits/chosen": 3.46875, + "logits/rejected": 3.640625, + "logps/chosen": -376.0, + "logps/rejected": -296.0, + "loss": 0.6883, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0006256103515625, + "rewards/margins": -0.0006256103515625, + "rewards/rejected": 0.0, + "step": 23 + }, + { + "epoch": 0.05023547880690738, + "grad_norm": 8.647167205810547, + "learning_rate": 4.061905855280276e-07, + "logits/chosen": 3.6875, + "logits/rejected": 3.71875, + "logps/chosen": -117.0, + "logps/rejected": -167.0, + "loss": 0.6911, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00250244140625, + "rewards/margins": -0.014404296875, + "rewards/rejected": 0.01190185546875, + "step": 24 + }, + { + "epoch": 0.052328623757195186, + "grad_norm": 7.382556438446045, + "learning_rate": 4.11408089932221e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -398.0, + "logps/rejected": -548.0, + "loss": 0.6914, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01251220703125, + "rewards/margins": -0.0224609375, + "rewards/rejected": 0.010009765625, + "step": 25 + }, + { + "epoch": 0.05442176870748299, + "grad_norm": 8.106797218322754, + "learning_rate": 4.1642093278857186e-07, + "logits/chosen": 2.390625, + "logits/rejected": 3.5625, + "logps/chosen": -680.0, + "logps/rejected": -316.0, + "loss": 0.6872, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021240234375, + "rewards/margins": 0.032470703125, + "rewards/rejected": -0.01123046875, + "step": 26 + }, + { + "epoch": 0.0565149136577708, + "grad_norm": 8.886155128479004, + "learning_rate": 4.212445659740721e-07, + "logits/chosen": 3.578125, + "logits/rejected": 4.0, + "logps/chosen": -490.0, + "logps/rejected": -238.0, + "loss": 0.6898, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00750732421875, + "rewards/margins": -0.01251220703125, + "rewards/rejected": 0.0050048828125, + "step": 27 + }, + { + "epoch": 0.05860805860805861, + "grad_norm": 8.035849571228027, + "learning_rate": 4.2589275491882174e-07, + "logits/chosen": 3.21875, + "logits/rejected": 2.875, + "logps/chosen": -212.0, + "logps/rejected": -136.0, + "loss": 0.6891, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00250244140625, + "rewards/margins": 0.014404296875, + "rewards/rejected": -0.01190185546875, + "step": 28 + }, + { + "epoch": 0.06070120355834641, + "grad_norm": 8.825554847717285, + "learning_rate": 4.303778154313212e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.46875, + "logps/chosen": -396.0, + "logps/rejected": -324.0, + "loss": 0.6879, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016845703125, + "rewards/margins": 0.03125, + "rewards/rejected": -0.014404296875, + "step": 29 + }, + { + "epoch": 0.06279434850863422, + "grad_norm": 7.398782253265381, + "learning_rate": 4.347108103585802e-07, + "logits/chosen": 3.78125, + "logits/rejected": 4.4375, + "logps/chosen": -438.0, + "logps/rejected": -374.0, + "loss": 0.6926, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0050048828125, + "rewards/margins": 0.001251220703125, + "rewards/rejected": -0.006256103515625, + "step": 30 + }, + { + "epoch": 0.06488749345892203, + "grad_norm": 7.316285133361816, + "learning_rate": 4.3890171398791635e-07, + "logits/chosen": 3.125, + "logits/rejected": 2.796875, + "logps/chosen": -110.0, + "logps/rejected": -161.0, + "loss": 0.6901, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0087890625, + "rewards/margins": -0.0050048828125, + "rewards/rejected": -0.003753662109375, + "step": 31 + }, + { + "epoch": 0.06698063840920984, + "grad_norm": 8.234468460083008, + "learning_rate": 4.4295955033889476e-07, + "logits/chosen": 3.6875, + "logits/rejected": 4.21875, + "logps/chosen": -584.0, + "logps/rejected": -354.0, + "loss": 0.6857, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.032470703125, + "rewards/margins": 0.0576171875, + "rewards/rejected": -0.0250244140625, + "step": 32 + }, + { + "epoch": 0.06907378335949764, + "grad_norm": 8.064417839050293, + "learning_rate": 4.468925101686371e-07, + "logits/chosen": 3.625, + "logits/rejected": 3.40625, + "logps/chosen": -253.0, + "logps/rejected": -270.0, + "loss": 0.6894, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0050048828125, + "rewards/margins": -0.00250244140625, + "rewards/rejected": 0.00750732421875, + "step": 33 + }, + { + "epoch": 0.07116692830978545, + "grad_norm": 9.39603042602539, + "learning_rate": 4.5070805050521726e-07, + "logits/chosen": 3.40625, + "logits/rejected": 3.34375, + "logps/chosen": -556.0, + "logps/rejected": -552.0, + "loss": 0.6874, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.016845703125, + "rewards/margins": -0.0194091796875, + "rewards/rejected": 0.00250244140625, + "step": 34 + }, + { + "epoch": 0.07326007326007326, + "grad_norm": 7.400561809539795, + "learning_rate": 4.5441297974937435e-07, + "logits/chosen": 2.875, + "logits/rejected": 3.09375, + "logps/chosen": -308.0, + "logps/rejected": -560.0, + "loss": 0.6936, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01312255859375, + "rewards/margins": -0.0306396484375, + "rewards/rejected": 0.017578125, + "step": 35 + }, + { + "epoch": 0.07535321821036106, + "grad_norm": 8.39317798614502, + "learning_rate": 4.580135307849393e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.703125, + "logps/chosen": -444.0, + "logps/rejected": -464.0, + "loss": 0.6848, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.010009765625, + "rewards/margins": 0.0113525390625, + "rewards/rejected": -0.001251220703125, + "step": 36 + }, + { + "epoch": 0.07744636316064887, + "grad_norm": 7.993171215057373, + "learning_rate": 4.615154240700883e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.140625, + "logps/chosen": -384.0, + "logps/rejected": -426.0, + "loss": 0.6911, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02001953125, + "rewards/margins": 0.030029296875, + "rewards/rejected": -0.010009765625, + "step": 37 + }, + { + "epoch": 0.07953950811093669, + "grad_norm": 7.946194171905518, + "learning_rate": 4.649239223132169e-07, + "logits/chosen": 4.6875, + "logits/rejected": 4.40625, + "logps/chosen": -316.0, + "logps/rejected": -506.0, + "loss": 0.6868, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01251220703125, + "rewards/margins": 0.04248046875, + "rewards/rejected": -0.030029296875, + "step": 38 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 8.186020851135254, + "learning_rate": 4.6824387804548366e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.3125, + "logps/chosen": -296.0, + "logps/rejected": -332.0, + "loss": 0.687, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01373291015625, + "rewards/margins": 0.0294189453125, + "rewards/rejected": -0.015625, + "step": 39 + }, + { + "epoch": 0.0837257980115123, + "grad_norm": 7.499300003051758, + "learning_rate": 4.7147977516944737e-07, + "logits/chosen": 3.5, + "logits/rejected": 3.375, + "logps/chosen": -78.5, + "logps/rejected": -73.5, + "loss": 0.6913, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0106201171875, + "rewards/margins": -0.0118408203125, + "rewards/rejected": 0.001251220703125, + "step": 40 + }, + { + "epoch": 0.08581894296180011, + "grad_norm": 7.722729206085205, + "learning_rate": 4.7463576537657413e-07, + "logits/chosen": 3.03125, + "logits/rejected": 2.9375, + "logps/chosen": -310.0, + "logps/rejected": -394.0, + "loss": 0.6912, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.006866455078125, + "rewards/margins": 0.0181884765625, + "rewards/rejected": -0.0250244140625, + "step": 41 + }, + { + "epoch": 0.08791208791208792, + "grad_norm": 7.086386680603027, + "learning_rate": 4.777157001757335e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.828125, + "logps/chosen": -458.0, + "logps/rejected": -444.0, + "loss": 0.6914, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0087890625, + "rewards/margins": 0.07666015625, + "rewards/rejected": -0.06787109375, + "step": 42 + }, + { + "epoch": 0.09000523286237572, + "grad_norm": 8.327207565307617, + "learning_rate": 4.807231591525269e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.4375, + "logps/chosen": -202.0, + "logps/rejected": -176.0, + "loss": 0.6819, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02880859375, + "rewards/margins": -0.02880859375, + "rewards/rejected": 0.0, + "step": 43 + }, + { + "epoch": 0.09209837781266353, + "grad_norm": 8.074577331542969, + "learning_rate": 4.836614749795043e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.984375, + "logps/chosen": -348.0, + "logps/rejected": -218.0, + "loss": 0.6867, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0087890625, + "rewards/margins": 0.027587890625, + "rewards/rejected": -0.018798828125, + "step": 44 + }, + { + "epoch": 0.09419152276295134, + "grad_norm": 8.236300468444824, + "learning_rate": 4.865337556154919e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.0625, + "logps/chosen": -276.0, + "logps/rejected": -294.0, + "loss": 0.6873, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0087890625, + "rewards/margins": 0.0181884765625, + "rewards/rejected": -0.0093994140625, + "step": 45 + }, + { + "epoch": 0.09628466771323914, + "grad_norm": 8.110278129577637, + "learning_rate": 4.893429040648081e-07, + "logits/chosen": 2.640625, + "logits/rejected": 2.96875, + "logps/chosen": -576.0, + "logps/rejected": -544.0, + "loss": 0.6844, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.052490234375, + "rewards/margins": 0.0150146484375, + "rewards/rejected": 0.03759765625, + "step": 46 + }, + { + "epoch": 0.09837781266352695, + "grad_norm": 7.28758430480957, + "learning_rate": 4.920916360113128e-07, + "logits/chosen": 4.15625, + "logits/rejected": 3.609375, + "logps/chosen": -356.0, + "logps/rejected": -528.0, + "loss": 0.6908, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018798828125, + "rewards/margins": 0.03369140625, + "rewards/rejected": -0.0150146484375, + "step": 47 + }, + { + "epoch": 0.10047095761381476, + "grad_norm": 7.295600891113281, + "learning_rate": 4.947824955958065e-07, + "logits/chosen": 3.5625, + "logits/rejected": 4.15625, + "logps/chosen": -326.0, + "logps/rejected": -320.0, + "loss": 0.6847, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.010009765625, + "rewards/margins": -0.00250244140625, + "rewards/rejected": -0.00750732421875, + "step": 48 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 7.849913597106934, + "learning_rate": 4.974178695665277e-07, + "logits/chosen": 4.21875, + "logits/rejected": 3.921875, + "logps/chosen": -266.0, + "logps/rejected": -348.0, + "loss": 0.6857, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0150146484375, + "rewards/margins": 0.05517578125, + "rewards/rejected": -0.0400390625, + "step": 49 + }, + { + "epoch": 0.10465724751439037, + "grad_norm": 8.052017211914062, + "learning_rate": 5e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.09375, + "logps/chosen": -178.0, + "logps/rejected": -358.0, + "loss": 0.69, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0018768310546875, + "rewards/margins": 0.0281982421875, + "rewards/rejected": -0.030029296875, + "step": 50 + }, + { + "epoch": 0.10675039246467818, + "grad_norm": 7.91984748840332, + "learning_rate": 4.99999353186937e-07, + "logits/chosen": 3.21875, + "logits/rejected": 4.3125, + "logps/chosen": -540.0, + "logps/rejected": -280.0, + "loss": 0.6877, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.052490234375, + "rewards/rejected": -0.027587890625, + "step": 51 + }, + { + "epoch": 0.10884353741496598, + "grad_norm": 7.94594144821167, + "learning_rate": 4.999974127510951e-07, + "logits/chosen": 3.71875, + "logits/rejected": 3.96875, + "logps/chosen": -241.0, + "logps/rejected": -238.0, + "loss": 0.6801, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0400390625, + "rewards/margins": 0.0150146484375, + "rewards/rejected": -0.05517578125, + "step": 52 + }, + { + "epoch": 0.1109366823652538, + "grad_norm": 51.00778579711914, + "learning_rate": 4.999941787025163e-07, + "logits/chosen": 3.5625, + "logits/rejected": 3.78125, + "logps/chosen": -540.0, + "logps/rejected": -430.0, + "loss": 0.685, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0751953125, + "rewards/margins": 0.115234375, + "rewards/rejected": -0.0400390625, + "step": 53 + }, + { + "epoch": 0.1130298273155416, + "grad_norm": 7.948894500732422, + "learning_rate": 4.999896510579369e-07, + "logits/chosen": 3.453125, + "logits/rejected": 3.84375, + "logps/chosen": -528.0, + "logps/rejected": -284.0, + "loss": 0.6866, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01123046875, + "rewards/margins": 0.02880859375, + "rewards/rejected": -0.017578125, + "step": 54 + }, + { + "epoch": 0.1151229722658294, + "grad_norm": 6.681576251983643, + "learning_rate": 4.999838298407872e-07, + "logits/chosen": 3.4375, + "logits/rejected": 3.5625, + "logps/chosen": -245.0, + "logps/rejected": -208.0, + "loss": 0.6892, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.031982421875, + "rewards/margins": -0.014404296875, + "rewards/rejected": -0.017578125, + "step": 55 + }, + { + "epoch": 0.11721611721611722, + "grad_norm": 7.2991156578063965, + "learning_rate": 4.999767150811926e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.515625, + "logps/chosen": -204.0, + "logps/rejected": -124.0, + "loss": 0.6845, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03125, + "rewards/margins": -0.01507568359375, + "rewards/rejected": -0.0162353515625, + "step": 56 + }, + { + "epoch": 0.11930926216640503, + "grad_norm": 7.868374824523926, + "learning_rate": 4.999683068159718e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.359375, + "logps/chosen": -418.0, + "logps/rejected": -344.0, + "loss": 0.6786, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.016845703125, + "rewards/margins": -0.0024871826171875, + "rewards/rejected": -0.014404296875, + "step": 57 + }, + { + "epoch": 0.12140240711669283, + "grad_norm": 7.583108901977539, + "learning_rate": 4.999586050886378e-07, + "logits/chosen": 3.875, + "logits/rejected": 4.5625, + "logps/chosen": -490.0, + "logps/rejected": -264.0, + "loss": 0.6882, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0225830078125, + "rewards/margins": 0.02880859375, + "rewards/rejected": -0.006256103515625, + "step": 58 + }, + { + "epoch": 0.12349555206698064, + "grad_norm": 7.541992664337158, + "learning_rate": 4.999476099493974e-07, + "logits/chosen": 2.671875, + "logits/rejected": 2.625, + "logps/chosen": -234.0, + "logps/rejected": -214.0, + "loss": 0.6893, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02001953125, + "rewards/margins": 0.0093994140625, + "rewards/rejected": -0.0294189453125, + "step": 59 + }, + { + "epoch": 0.12558869701726844, + "grad_norm": 7.136630058288574, + "learning_rate": 4.999353214551507e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.40625, + "logps/chosen": -362.0, + "logps/rejected": -230.0, + "loss": 0.6844, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0849609375, + "rewards/margins": -0.05517578125, + "rewards/rejected": -0.030029296875, + "step": 60 + }, + { + "epoch": 0.12768184196755625, + "grad_norm": 7.714498043060303, + "learning_rate": 4.999217396694907e-07, + "logits/chosen": 4.21875, + "logits/rejected": 3.890625, + "logps/chosen": -388.0, + "logps/rejected": -596.0, + "loss": 0.6866, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.026611328125, + "rewards/margins": -0.03271484375, + "rewards/rejected": 0.006256103515625, + "step": 61 + }, + { + "epoch": 0.12977498691784406, + "grad_norm": 7.706767559051514, + "learning_rate": 4.999068646627036e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.09375, + "logps/chosen": -348.0, + "logps/rejected": -508.0, + "loss": 0.6833, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02490234375, + "rewards/margins": 0.04248046875, + "rewards/rejected": -0.0673828125, + "step": 62 + }, + { + "epoch": 0.13186813186813187, + "grad_norm": 7.379638195037842, + "learning_rate": 4.998906965117679e-07, + "logits/chosen": 3.78125, + "logits/rejected": 4.0, + "logps/chosen": -540.0, + "logps/rejected": -364.0, + "loss": 0.6871, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0150146484375, + "rewards/margins": 0.050048828125, + "rewards/rejected": -0.03515625, + "step": 63 + }, + { + "epoch": 0.13396127681841968, + "grad_norm": 7.627968788146973, + "learning_rate": 4.99873235300354e-07, + "logits/chosen": 3.671875, + "logits/rejected": 4.1875, + "logps/chosen": -672.0, + "logps/rejected": -360.0, + "loss": 0.6811, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0712890625, + "rewards/margins": 0.10498046875, + "rewards/rejected": -0.03369140625, + "step": 64 + }, + { + "epoch": 0.1360544217687075, + "grad_norm": 7.326155662536621, + "learning_rate": 4.998544811188243e-07, + "logits/chosen": 3.75, + "logits/rejected": 3.65625, + "logps/chosen": -177.0, + "logps/rejected": -169.0, + "loss": 0.6796, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03759765625, + "rewards/margins": 0.05712890625, + "rewards/rejected": -0.09423828125, + "step": 65 + }, + { + "epoch": 0.13814756671899528, + "grad_norm": 7.775496006011963, + "learning_rate": 4.998344340642319e-07, + "logits/chosen": 4.03125, + "logits/rejected": 3.984375, + "logps/chosen": -220.0, + "logps/rejected": -262.0, + "loss": 0.6869, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0262451171875, + "rewards/margins": 0.013671875, + "rewards/rejected": -0.0400390625, + "step": 66 + }, + { + "epoch": 0.1402407116692831, + "grad_norm": 7.598969459533691, + "learning_rate": 4.998130942403208e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.625, + "logps/chosen": -207.0, + "logps/rejected": -243.0, + "loss": 0.6896, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06005859375, + "rewards/margins": 0.03564453125, + "rewards/rejected": -0.095703125, + "step": 67 + }, + { + "epoch": 0.1423338566195709, + "grad_norm": 7.5228166580200195, + "learning_rate": 4.99790461757525e-07, + "logits/chosen": 3.828125, + "logits/rejected": 4.1875, + "logps/chosen": -388.0, + "logps/rejected": -156.0, + "loss": 0.6775, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03515625, + "rewards/margins": 0.134765625, + "rewards/rejected": -0.10009765625, + "step": 68 + }, + { + "epoch": 0.14442700156985872, + "grad_norm": 7.170711517333984, + "learning_rate": 4.997665367329683e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.96875, + "logps/chosen": -648.0, + "logps/rejected": -498.0, + "loss": 0.6784, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07763671875, + "rewards/margins": 0.10546875, + "rewards/rejected": -0.0274658203125, + "step": 69 + }, + { + "epoch": 0.14652014652014653, + "grad_norm": 7.3945536613464355, + "learning_rate": 4.99741319290463e-07, + "logits/chosen": 4.40625, + "logits/rejected": 3.5625, + "logps/chosen": -188.0, + "logps/rejected": -568.0, + "loss": 0.676, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0712890625, + "rewards/margins": -0.03125, + "rewards/rejected": -0.0400390625, + "step": 70 + }, + { + "epoch": 0.14861329147043434, + "grad_norm": 7.3103556632995605, + "learning_rate": 4.9971480956051e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.359375, + "logps/chosen": -284.0, + "logps/rejected": -248.0, + "loss": 0.6843, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09765625, + "rewards/margins": -0.06494140625, + "rewards/rejected": -0.032470703125, + "step": 71 + }, + { + "epoch": 0.15070643642072212, + "grad_norm": 7.618526935577393, + "learning_rate": 4.996870076802977e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.1875, + "logps/chosen": -238.0, + "logps/rejected": -218.0, + "loss": 0.6871, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.061279296875, + "rewards/margins": -0.006591796875, + "rewards/rejected": -0.0546875, + "step": 72 + }, + { + "epoch": 0.15279958137100993, + "grad_norm": 6.8290791511535645, + "learning_rate": 4.996579137937015e-07, + "logits/chosen": 3.46875, + "logits/rejected": 3.625, + "logps/chosen": -356.0, + "logps/rejected": -474.0, + "loss": 0.6848, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1103515625, + "rewards/margins": -0.03564453125, + "rewards/rejected": -0.07421875, + "step": 73 + }, + { + "epoch": 0.15489272632129775, + "grad_norm": 8.002132415771484, + "learning_rate": 4.99627528051283e-07, + "logits/chosen": 4.03125, + "logits/rejected": 3.625, + "logps/chosen": -498.0, + "logps/rejected": -440.0, + "loss": 0.6832, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00750732421875, + "rewards/margins": 0.09765625, + "rewards/rejected": -0.10498046875, + "step": 74 + }, + { + "epoch": 0.15698587127158556, + "grad_norm": 7.541519641876221, + "learning_rate": 4.99595850610289e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.765625, + "logps/chosen": -412.0, + "logps/rejected": -154.0, + "loss": 0.6793, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.023681640625, + "rewards/margins": 0.026123046875, + "rewards/rejected": -0.050048828125, + "step": 75 + }, + { + "epoch": 0.15907901622187337, + "grad_norm": 7.712185859680176, + "learning_rate": 4.995628816346507e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.3125, + "logps/chosen": -300.0, + "logps/rejected": -292.0, + "loss": 0.6776, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0250244140625, + "rewards/margins": 0.030029296875, + "rewards/rejected": -0.054931640625, + "step": 76 + }, + { + "epoch": 0.16117216117216118, + "grad_norm": 7.439749240875244, + "learning_rate": 4.995286212949837e-07, + "logits/chosen": 3.703125, + "logits/rejected": 4.03125, + "logps/chosen": -382.0, + "logps/rejected": -210.0, + "loss": 0.6769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0030517578125, + "rewards/margins": 0.11474609375, + "rewards/rejected": -0.1171875, + "step": 77 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 6.7779693603515625, + "learning_rate": 4.994930697685857e-07, + "logits/chosen": 3.84375, + "logits/rejected": 4.0, + "logps/chosen": -174.0, + "logps/rejected": -186.0, + "loss": 0.674, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.025634765625, + "rewards/margins": 0.0106201171875, + "rewards/rejected": -0.036376953125, + "step": 78 + }, + { + "epoch": 0.16535845107273678, + "grad_norm": 7.856261253356934, + "learning_rate": 4.994562272394368e-07, + "logits/chosen": 3.703125, + "logits/rejected": 4.09375, + "logps/chosen": -358.0, + "logps/rejected": -374.0, + "loss": 0.6768, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.1396484375, + "rewards/margins": -0.0849609375, + "rewards/rejected": -0.05517578125, + "step": 79 + }, + { + "epoch": 0.1674515960230246, + "grad_norm": 7.679439544677734, + "learning_rate": 4.994180938981979e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.96875, + "logps/chosen": -384.0, + "logps/rejected": -384.0, + "loss": 0.6809, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00628662109375, + "rewards/margins": 0.05322265625, + "rewards/rejected": -0.046875, + "step": 80 + }, + { + "epoch": 0.1695447409733124, + "grad_norm": 7.106894493103027, + "learning_rate": 4.993786699422098e-07, + "logits/chosen": 2.703125, + "logits/rejected": 3.34375, + "logps/chosen": -394.0, + "logps/rejected": -274.0, + "loss": 0.6748, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.015625, + "rewards/margins": 0.08056640625, + "rewards/rejected": -0.09619140625, + "step": 81 + }, + { + "epoch": 0.17163788592360021, + "grad_norm": 7.259335517883301, + "learning_rate": 4.993379555754923e-07, + "logits/chosen": 2.875, + "logits/rejected": 3.328125, + "logps/chosen": -320.0, + "logps/rejected": -348.0, + "loss": 0.6752, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02001953125, + "rewards/margins": 0.05859375, + "rewards/rejected": -0.07861328125, + "step": 82 + }, + { + "epoch": 0.17373103087388803, + "grad_norm": 8.156006813049316, + "learning_rate": 4.992959510087432e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.84375, + "logps/chosen": -564.0, + "logps/rejected": -620.0, + "loss": 0.6873, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.125, + "rewards/margins": -0.0098876953125, + "rewards/rejected": -0.115234375, + "step": 83 + }, + { + "epoch": 0.17582417582417584, + "grad_norm": 8.099892616271973, + "learning_rate": 4.992526564593371e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.21875, + "logps/chosen": -334.0, + "logps/rejected": -276.0, + "loss": 0.6788, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05517578125, + "rewards/margins": 0.02490234375, + "rewards/rejected": -0.080078125, + "step": 84 + }, + { + "epoch": 0.17791732077446362, + "grad_norm": 7.183663368225098, + "learning_rate": 4.992080721513243e-07, + "logits/chosen": 3.40625, + "logits/rejected": 3.6875, + "logps/chosen": -316.0, + "logps/rejected": -284.0, + "loss": 0.6828, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.054931640625, + "rewards/margins": 0.0751953125, + "rewards/rejected": -0.1298828125, + "step": 85 + }, + { + "epoch": 0.18001046572475143, + "grad_norm": 7.948578357696533, + "learning_rate": 4.991621983154294e-07, + "logits/chosen": 2.875, + "logits/rejected": 3.0625, + "logps/chosen": -656.0, + "logps/rejected": -460.0, + "loss": 0.6808, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0224609375, + "rewards/margins": 0.080078125, + "rewards/rejected": -0.1025390625, + "step": 86 + }, + { + "epoch": 0.18210361067503925, + "grad_norm": 7.284086227416992, + "learning_rate": 4.991150351890505e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.28125, + "logps/chosen": -256.0, + "logps/rejected": -249.0, + "loss": 0.685, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.10498046875, + "rewards/margins": -0.01507568359375, + "rewards/rejected": -0.08984375, + "step": 87 + }, + { + "epoch": 0.18419675562532706, + "grad_norm": 7.705651760101318, + "learning_rate": 4.990665830162581e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.328125, + "logps/chosen": -296.0, + "logps/rejected": -223.0, + "loss": 0.6761, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0908203125, + "rewards/margins": 0.0093994140625, + "rewards/rejected": -0.10009765625, + "step": 88 + }, + { + "epoch": 0.18628990057561487, + "grad_norm": 7.44476318359375, + "learning_rate": 4.99016842047793e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.96875, + "logps/chosen": -151.0, + "logps/rejected": -123.0, + "loss": 0.6743, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.027587890625, + "rewards/margins": -0.002349853515625, + "rewards/rejected": -0.0252685546875, + "step": 89 + }, + { + "epoch": 0.18838304552590268, + "grad_norm": 7.6581950187683105, + "learning_rate": 4.989658125410658e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.09375, + "logps/chosen": -312.0, + "logps/rejected": -338.0, + "loss": 0.678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.054931640625, + "rewards/margins": 0.05322265625, + "rewards/rejected": -0.10791015625, + "step": 90 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 7.140282154083252, + "learning_rate": 4.989134947601555e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.875, + "logps/chosen": -288.0, + "logps/rejected": -364.0, + "loss": 0.6822, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.083984375, + "rewards/margins": 0.017822265625, + "rewards/rejected": -0.10205078125, + "step": 91 + }, + { + "epoch": 0.19256933542647828, + "grad_norm": 7.376110553741455, + "learning_rate": 4.988598889758077e-07, + "logits/chosen": 3.765625, + "logits/rejected": 4.34375, + "logps/chosen": -708.0, + "logps/rejected": -520.0, + "loss": 0.6778, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0400390625, + "rewards/margins": 0.0247802734375, + "rewards/rejected": -0.06494140625, + "step": 92 + }, + { + "epoch": 0.1946624803767661, + "grad_norm": 7.1125664710998535, + "learning_rate": 4.988049954654334e-07, + "logits/chosen": 3.5, + "logits/rejected": 3.59375, + "logps/chosen": -318.0, + "logps/rejected": -422.0, + "loss": 0.6808, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10498046875, + "rewards/margins": -0.017578125, + "rewards/rejected": -0.08740234375, + "step": 93 + }, + { + "epoch": 0.1967556253270539, + "grad_norm": 8.067298889160156, + "learning_rate": 4.987488145131078e-07, + "logits/chosen": 3.90625, + "logits/rejected": 4.1875, + "logps/chosen": -480.0, + "logps/rejected": -302.0, + "loss": 0.6856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0498046875, + "rewards/margins": 0.1279296875, + "rewards/rejected": -0.177734375, + "step": 94 + }, + { + "epoch": 0.1988487702773417, + "grad_norm": 7.559632301330566, + "learning_rate": 4.986913464095686e-07, + "logits/chosen": 3.109375, + "logits/rejected": 2.84375, + "logps/chosen": -426.0, + "logps/rejected": -446.0, + "loss": 0.6753, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0137939453125, + "rewards/margins": 0.2041015625, + "rewards/rejected": -0.189453125, + "step": 95 + }, + { + "epoch": 0.20094191522762953, + "grad_norm": 8.821226119995117, + "learning_rate": 4.986325914522145e-07, + "logits/chosen": 3.78125, + "logits/rejected": 4.125, + "logps/chosen": -512.0, + "logps/rejected": -412.0, + "loss": 0.678, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.125, + "rewards/margins": -0.080078125, + "rewards/rejected": -0.044921875, + "step": 96 + }, + { + "epoch": 0.2030350601779173, + "grad_norm": 7.98090124130249, + "learning_rate": 4.985725499451036e-07, + "logits/chosen": 3.796875, + "logits/rejected": 4.125, + "logps/chosen": -540.0, + "logps/rejected": -412.0, + "loss": 0.6736, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.042236328125, + "rewards/margins": 0.125, + "rewards/rejected": -0.1669921875, + "step": 97 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 7.935008525848389, + "learning_rate": 4.985112221989522e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.328125, + "logps/chosen": -342.0, + "logps/rejected": -294.0, + "loss": 0.6784, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04638671875, + "rewards/margins": 0.0162353515625, + "rewards/rejected": -0.0625, + "step": 98 + }, + { + "epoch": 0.20722135007849293, + "grad_norm": 7.756891250610352, + "learning_rate": 4.984486085311325e-07, + "logits/chosen": 3.71875, + "logits/rejected": 3.390625, + "logps/chosen": -668.0, + "logps/rejected": -624.0, + "loss": 0.6766, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06494140625, + "rewards/margins": 0.279296875, + "rewards/rejected": -0.34375, + "step": 99 + }, + { + "epoch": 0.20931449502878074, + "grad_norm": 7.824303150177002, + "learning_rate": 4.983847092656719e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.625, + "logps/chosen": -436.0, + "logps/rejected": -266.0, + "loss": 0.678, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1064453125, + "rewards/margins": -0.015380859375, + "rewards/rejected": -0.09130859375, + "step": 100 + }, + { + "epoch": 0.21140763997906856, + "grad_norm": 7.823368549346924, + "learning_rate": 4.983195247332502e-07, + "logits/chosen": 2.703125, + "logits/rejected": 2.859375, + "logps/chosen": -286.0, + "logps/rejected": -116.5, + "loss": 0.6781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09814453125, + "rewards/margins": 0.027587890625, + "rewards/rejected": -0.1259765625, + "step": 101 + }, + { + "epoch": 0.21350078492935637, + "grad_norm": 7.150810718536377, + "learning_rate": 4.982530552711989e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.4375, + "logps/chosen": -278.0, + "logps/rejected": -366.0, + "loss": 0.6755, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05859375, + "rewards/margins": 0.0986328125, + "rewards/rejected": -0.1572265625, + "step": 102 + }, + { + "epoch": 0.21559392987964415, + "grad_norm": 8.53307819366455, + "learning_rate": 4.981853012234991e-07, + "logits/chosen": 3.71875, + "logits/rejected": 3.5625, + "logps/chosen": -480.0, + "logps/rejected": -672.0, + "loss": 0.6811, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06787109375, + "rewards/margins": 0.01214599609375, + "rewards/rejected": -0.080078125, + "step": 103 + }, + { + "epoch": 0.21768707482993196, + "grad_norm": 7.274529933929443, + "learning_rate": 4.981162629407793e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.5, + "logps/chosen": -820.0, + "logps/rejected": -516.0, + "loss": 0.6868, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.050048828125, + "rewards/margins": 0.134765625, + "rewards/rejected": -0.185546875, + "step": 104 + }, + { + "epoch": 0.21978021978021978, + "grad_norm": 8.474202156066895, + "learning_rate": 4.980459407803141e-07, + "logits/chosen": 2.671875, + "logits/rejected": 2.890625, + "logps/chosen": -246.0, + "logps/rejected": -230.0, + "loss": 0.6707, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0262451171875, + "rewards/margins": 0.12890625, + "rewards/rejected": -0.1552734375, + "step": 105 + }, + { + "epoch": 0.2218733647305076, + "grad_norm": 8.309959411621094, + "learning_rate": 4.979743351060225e-07, + "logits/chosen": 3.390625, + "logits/rejected": 4.1875, + "logps/chosen": -438.0, + "logps/rejected": -432.0, + "loss": 0.687, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1201171875, + "rewards/margins": 0.0264892578125, + "rewards/rejected": -0.146484375, + "step": 106 + }, + { + "epoch": 0.2239665096807954, + "grad_norm": 8.156225204467773, + "learning_rate": 4.97901446288465e-07, + "logits/chosen": 3.5625, + "logits/rejected": 3.859375, + "logps/chosen": -668.0, + "logps/rejected": -552.0, + "loss": 0.6714, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10498046875, + "rewards/margins": 0.057373046875, + "rewards/rejected": -0.162109375, + "step": 107 + }, + { + "epoch": 0.2260596546310832, + "grad_norm": 7.299003601074219, + "learning_rate": 4.978272747048432e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.703125, + "logps/chosen": -382.0, + "logps/rejected": -191.0, + "loss": 0.6648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04248046875, + "rewards/margins": 0.12353515625, + "rewards/rejected": -0.166015625, + "step": 108 + }, + { + "epoch": 0.228152799581371, + "grad_norm": 8.61748218536377, + "learning_rate": 4.977518207389965e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.40625, + "logps/chosen": -242.0, + "logps/rejected": -181.0, + "loss": 0.6707, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.162109375, + "rewards/margins": -0.0771484375, + "rewards/rejected": -0.08544921875, + "step": 109 + }, + { + "epoch": 0.2302459445316588, + "grad_norm": 6.710155010223389, + "learning_rate": 4.97675084781401e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.6875, + "logps/chosen": -500.0, + "logps/rejected": -211.0, + "loss": 0.6597, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06494140625, + "rewards/margins": 0.031494140625, + "rewards/rejected": -0.09619140625, + "step": 110 + }, + { + "epoch": 0.23233908948194662, + "grad_norm": 8.158218383789062, + "learning_rate": 4.975970672291667e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.21875, + "logps/chosen": -368.0, + "logps/rejected": -268.0, + "loss": 0.6567, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04248046875, + "rewards/margins": 0.10595703125, + "rewards/rejected": -0.1484375, + "step": 111 + }, + { + "epoch": 0.23443223443223443, + "grad_norm": 7.508507251739502, + "learning_rate": 4.975177684860365e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.734375, + "logps/chosen": -366.0, + "logps/rejected": -384.0, + "loss": 0.6853, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1328125, + "rewards/margins": -0.060302734375, + "rewards/rejected": -0.072265625, + "step": 112 + }, + { + "epoch": 0.23652537938252224, + "grad_norm": 7.649636268615723, + "learning_rate": 4.974371889623828e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.34375, + "logps/chosen": -394.0, + "logps/rejected": -272.0, + "loss": 0.6573, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0037841796875, + "rewards/margins": 0.0927734375, + "rewards/rejected": -0.09619140625, + "step": 113 + }, + { + "epoch": 0.23861852433281006, + "grad_norm": 7.413567543029785, + "learning_rate": 4.973553290752066e-07, + "logits/chosen": 2.671875, + "logits/rejected": 2.6875, + "logps/chosen": -83.5, + "logps/rejected": -131.0, + "loss": 0.6798, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11962890625, + "rewards/margins": 0.01434326171875, + "rewards/rejected": -0.1337890625, + "step": 114 + }, + { + "epoch": 0.24071166928309787, + "grad_norm": 8.337164878845215, + "learning_rate": 4.972721892481346e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.0625, + "logps/chosen": -282.0, + "logps/rejected": -318.0, + "loss": 0.6923, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10546875, + "rewards/margins": 0.0791015625, + "rewards/rejected": -0.1845703125, + "step": 115 + }, + { + "epoch": 0.24280481423338565, + "grad_norm": 7.646633148193359, + "learning_rate": 4.971877699114173e-07, + "logits/chosen": 2.859375, + "logits/rejected": 3.234375, + "logps/chosen": -390.0, + "logps/rejected": -256.0, + "loss": 0.6719, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.044921875, + "rewards/margins": 0.08203125, + "rewards/rejected": -0.126953125, + "step": 116 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 7.244144439697266, + "learning_rate": 4.971020715019264e-07, + "logits/chosen": 3.640625, + "logits/rejected": 3.34375, + "logps/chosen": -199.0, + "logps/rejected": -396.0, + "loss": 0.6749, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.076171875, + "rewards/margins": 0.07763671875, + "rewards/rejected": -0.1533203125, + "step": 117 + }, + { + "epoch": 0.24699110413396128, + "grad_norm": 7.660480976104736, + "learning_rate": 4.970150944631533e-07, + "logits/chosen": 3.859375, + "logits/rejected": 4.09375, + "logps/chosen": -380.0, + "logps/rejected": -378.0, + "loss": 0.6678, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.103515625, + "rewards/margins": 0.1240234375, + "rewards/rejected": -0.2275390625, + "step": 118 + }, + { + "epoch": 0.2490842490842491, + "grad_norm": 8.113422393798828, + "learning_rate": 4.96926839245206e-07, + "logits/chosen": 3.515625, + "logits/rejected": 3.34375, + "logps/chosen": -504.0, + "logps/rejected": -740.0, + "loss": 0.6703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26171875, + "rewards/margins": -0.002685546875, + "rewards/rejected": -0.259765625, + "step": 119 + }, + { + "epoch": 0.25117739403453687, + "grad_norm": 8.210384368896484, + "learning_rate": 4.96837306304807e-07, + "logits/chosen": 3.71875, + "logits/rejected": 4.53125, + "logps/chosen": -640.0, + "logps/rejected": -250.0, + "loss": 0.6781, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1455078125, + "rewards/margins": -0.0225830078125, + "rewards/rejected": -0.12255859375, + "step": 120 + }, + { + "epoch": 0.2532705389848247, + "grad_norm": 6.795380592346191, + "learning_rate": 4.967464961052915e-07, + "logits/chosen": 4.375, + "logits/rejected": 3.4375, + "logps/chosen": -278.0, + "logps/rejected": -298.0, + "loss": 0.6745, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.158203125, + "rewards/margins": 0.02880859375, + "rewards/rejected": -0.1875, + "step": 121 + }, + { + "epoch": 0.2553636839351125, + "grad_norm": 7.234960556030273, + "learning_rate": 4.966544091166043e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.546875, + "logps/chosen": -448.0, + "logps/rejected": -460.0, + "loss": 0.6714, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.030029296875, + "rewards/margins": 0.0751953125, + "rewards/rejected": -0.10546875, + "step": 122 + }, + { + "epoch": 0.25745682888540034, + "grad_norm": 8.24029541015625, + "learning_rate": 4.965610458152973e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.09375, + "logps/chosen": -656.0, + "logps/rejected": -506.0, + "loss": 0.6613, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10498046875, + "rewards/margins": 0.1796875, + "rewards/rejected": -0.28515625, + "step": 123 + }, + { + "epoch": 0.2595499738356881, + "grad_norm": 7.639450550079346, + "learning_rate": 4.96466406684528e-07, + "logits/chosen": 3.515625, + "logits/rejected": 4.28125, + "logps/chosen": -784.0, + "logps/rejected": -400.0, + "loss": 0.6781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.080078125, + "rewards/margins": 0.09521484375, + "rewards/rejected": -0.1748046875, + "step": 124 + }, + { + "epoch": 0.2616431187859759, + "grad_norm": 8.00741195678711, + "learning_rate": 4.963704922140558e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.59375, + "logps/chosen": -440.0, + "logps/rejected": -370.0, + "loss": 0.6794, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.216796875, + "rewards/margins": -0.09228515625, + "rewards/rejected": -0.1240234375, + "step": 125 + }, + { + "epoch": 0.26373626373626374, + "grad_norm": 8.1722993850708, + "learning_rate": 4.962733029002401e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.78125, + "logps/chosen": -436.0, + "logps/rejected": -396.0, + "loss": 0.6697, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16796875, + "rewards/margins": 0.0634765625, + "rewards/rejected": -0.232421875, + "step": 126 + }, + { + "epoch": 0.2658294086865515, + "grad_norm": 7.704063415527344, + "learning_rate": 4.961748392460379e-07, + "logits/chosen": 3.71875, + "logits/rejected": 3.59375, + "logps/chosen": -235.0, + "logps/rejected": -346.0, + "loss": 0.6627, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1318359375, + "rewards/margins": 0.0029296875, + "rewards/rejected": -0.134765625, + "step": 127 + }, + { + "epoch": 0.26792255363683937, + "grad_norm": 8.196044921875, + "learning_rate": 4.960751017610008e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.21875, + "logps/chosen": -284.0, + "logps/rejected": -416.0, + "loss": 0.6714, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10498046875, + "rewards/margins": 0.1337890625, + "rewards/rejected": -0.23828125, + "step": 128 + }, + { + "epoch": 0.27001569858712715, + "grad_norm": 7.8072614669799805, + "learning_rate": 4.959740909612723e-07, + "logits/chosen": 3.546875, + "logits/rejected": 3.78125, + "logps/chosen": -308.0, + "logps/rejected": -276.0, + "loss": 0.6706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0255126953125, + "rewards/margins": 0.11328125, + "rewards/rejected": -0.138671875, + "step": 129 + }, + { + "epoch": 0.272108843537415, + "grad_norm": 8.080399513244629, + "learning_rate": 4.958718073695857e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.625, + "logps/chosen": -332.0, + "logps/rejected": -482.0, + "loss": 0.6673, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12255859375, + "rewards/margins": -0.0047607421875, + "rewards/rejected": -0.11767578125, + "step": 130 + }, + { + "epoch": 0.2742019884877028, + "grad_norm": 8.27035903930664, + "learning_rate": 4.957682515152607e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.125, + "logps/chosen": -480.0, + "logps/rejected": -516.0, + "loss": 0.6691, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10498046875, + "rewards/margins": 0.057861328125, + "rewards/rejected": -0.162109375, + "step": 131 + }, + { + "epoch": 0.27629513343799056, + "grad_norm": 8.402571678161621, + "learning_rate": 4.956634239342012e-07, + "logits/chosen": 4.125, + "logits/rejected": 3.859375, + "logps/chosen": -338.0, + "logps/rejected": -464.0, + "loss": 0.6564, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.255859375, + "rewards/margins": -0.0380859375, + "rewards/rejected": -0.216796875, + "step": 132 + }, + { + "epoch": 0.2783882783882784, + "grad_norm": 7.434090614318848, + "learning_rate": 4.955573251688922e-07, + "logits/chosen": 3.359375, + "logits/rejected": 2.859375, + "logps/chosen": -278.0, + "logps/rejected": -310.0, + "loss": 0.6647, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1630859375, + "rewards/margins": 0.0771484375, + "rewards/rejected": -0.240234375, + "step": 133 + }, + { + "epoch": 0.2804814233385662, + "grad_norm": 8.35785961151123, + "learning_rate": 4.954499557683971e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.28125, + "logps/chosen": -588.0, + "logps/rejected": -448.0, + "loss": 0.6737, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.146484375, + "rewards/margins": 0.1171875, + "rewards/rejected": -0.263671875, + "step": 134 + }, + { + "epoch": 0.282574568288854, + "grad_norm": 8.191429138183594, + "learning_rate": 4.95341316288355e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.40625, + "logps/chosen": -238.0, + "logps/rejected": -268.0, + "loss": 0.6541, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.177734375, + "rewards/margins": -0.0322265625, + "rewards/rejected": -0.146484375, + "step": 135 + }, + { + "epoch": 0.2846677132391418, + "grad_norm": 7.505073547363281, + "learning_rate": 4.952314072909776e-07, + "logits/chosen": 3.171875, + "logits/rejected": 2.90625, + "logps/chosen": -199.0, + "logps/rejected": -390.0, + "loss": 0.6669, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1845703125, + "rewards/margins": 0.07275390625, + "rewards/rejected": -0.2578125, + "step": 136 + }, + { + "epoch": 0.2867608581894296, + "grad_norm": 9.103921890258789, + "learning_rate": 4.951202293450464e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.5625, + "logps/chosen": -884.0, + "logps/rejected": -284.0, + "loss": 0.676, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3515625, + "rewards/margins": -0.1767578125, + "rewards/rejected": -0.17578125, + "step": 137 + }, + { + "epoch": 0.28885400313971743, + "grad_norm": 7.8134307861328125, + "learning_rate": 4.950077830259097e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.28125, + "logps/chosen": -524.0, + "logps/rejected": -520.0, + "loss": 0.666, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1669921875, + "rewards/margins": -0.0166015625, + "rewards/rejected": -0.150390625, + "step": 138 + }, + { + "epoch": 0.2909471480900052, + "grad_norm": 7.591533184051514, + "learning_rate": 4.948940689154794e-07, + "logits/chosen": 4.03125, + "logits/rejected": 3.78125, + "logps/chosen": -302.0, + "logps/rejected": -354.0, + "loss": 0.6713, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1748046875, + "rewards/margins": 0.0771484375, + "rewards/rejected": -0.251953125, + "step": 139 + }, + { + "epoch": 0.29304029304029305, + "grad_norm": 8.549483299255371, + "learning_rate": 4.94779087602229e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.625, + "logps/chosen": -472.0, + "logps/rejected": -448.0, + "loss": 0.669, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.30859375, + "rewards/margins": -0.1396484375, + "rewards/rejected": -0.169921875, + "step": 140 + }, + { + "epoch": 0.29513343799058084, + "grad_norm": 8.168773651123047, + "learning_rate": 4.94662839681189e-07, + "logits/chosen": 3.5625, + "logits/rejected": 3.3125, + "logps/chosen": -406.0, + "logps/rejected": -330.0, + "loss": 0.658, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.203125, + "rewards/margins": 0.01806640625, + "rewards/rejected": -0.220703125, + "step": 141 + }, + { + "epoch": 0.2972265829408687, + "grad_norm": 7.819066047668457, + "learning_rate": 4.945453257539451e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.078125, + "logps/chosen": -512.0, + "logps/rejected": -388.0, + "loss": 0.6628, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.054931640625, + "rewards/margins": 0.1259765625, + "rewards/rejected": -0.1806640625, + "step": 142 + }, + { + "epoch": 0.29931972789115646, + "grad_norm": 8.835631370544434, + "learning_rate": 4.944265464286343e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.890625, + "logps/chosen": -466.0, + "logps/rejected": -256.0, + "loss": 0.6803, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.228515625, + "rewards/margins": -0.049560546875, + "rewards/rejected": -0.1787109375, + "step": 143 + }, + { + "epoch": 0.30141287284144425, + "grad_norm": 7.867812633514404, + "learning_rate": 4.943065023199424e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.40625, + "logps/chosen": -470.0, + "logps/rejected": -338.0, + "loss": 0.6677, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03955078125, + "rewards/margins": 0.0673828125, + "rewards/rejected": -0.1064453125, + "step": 144 + }, + { + "epoch": 0.3035060177917321, + "grad_norm": 7.702564239501953, + "learning_rate": 4.941851940491002e-07, + "logits/chosen": 2.546875, + "logits/rejected": 3.125, + "logps/chosen": -540.0, + "logps/rejected": -428.0, + "loss": 0.6558, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.322265625, + "rewards/margins": 0.00537109375, + "rewards/rejected": -0.328125, + "step": 145 + }, + { + "epoch": 0.30559916274201987, + "grad_norm": 9.008699417114258, + "learning_rate": 4.940626222438808e-07, + "logits/chosen": 2.515625, + "logits/rejected": 2.703125, + "logps/chosen": -164.0, + "logps/rejected": -288.0, + "loss": 0.6747, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2177734375, + "rewards/margins": 0.0234375, + "rewards/rejected": -0.2412109375, + "step": 146 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 7.8209662437438965, + "learning_rate": 4.939387875385958e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.53125, + "logps/chosen": -209.0, + "logps/rejected": -276.0, + "loss": 0.6712, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.33203125, + "rewards/margins": -0.1533203125, + "rewards/rejected": -0.1796875, + "step": 147 + }, + { + "epoch": 0.3097854526425955, + "grad_norm": 7.214803218841553, + "learning_rate": 4.938136905740926e-07, + "logits/chosen": 3.609375, + "logits/rejected": 3.734375, + "logps/chosen": -880.0, + "logps/rejected": -740.0, + "loss": 0.6711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1103515625, + "rewards/margins": 0.41015625, + "rewards/rejected": -0.51953125, + "step": 148 + }, + { + "epoch": 0.31187859759288333, + "grad_norm": 8.265995979309082, + "learning_rate": 4.936873319977508e-07, + "logits/chosen": 3.203125, + "logits/rejected": 2.953125, + "logps/chosen": -756.0, + "logps/rejected": -632.0, + "loss": 0.6612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.123046875, + "rewards/margins": 0.2470703125, + "rewards/rejected": -0.37109375, + "step": 149 + }, + { + "epoch": 0.3139717425431711, + "grad_norm": 7.911357402801514, + "learning_rate": 4.935597124634788e-07, + "logits/chosen": 3.140625, + "logits/rejected": 2.984375, + "logps/chosen": -318.0, + "logps/rejected": -332.0, + "loss": 0.6705, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.32421875, + "rewards/margins": -0.2333984375, + "rewards/rejected": -0.091796875, + "step": 150 + }, + { + "epoch": 0.3160648874934589, + "grad_norm": 7.947584629058838, + "learning_rate": 4.934308326317104e-07, + "logits/chosen": 2.859375, + "logits/rejected": 3.296875, + "logps/chosen": -362.0, + "logps/rejected": -368.0, + "loss": 0.6506, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.197265625, + "rewards/margins": 0.0152587890625, + "rewards/rejected": -0.212890625, + "step": 151 + }, + { + "epoch": 0.31815803244374674, + "grad_norm": 8.037640571594238, + "learning_rate": 4.933006931694018e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -370.0, + "logps/rejected": -364.0, + "loss": 0.6649, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.140625, + "rewards/margins": 0.232421875, + "rewards/rejected": -0.373046875, + "step": 152 + }, + { + "epoch": 0.3202511773940345, + "grad_norm": 8.108712196350098, + "learning_rate": 4.931692947500272e-07, + "logits/chosen": 3.734375, + "logits/rejected": 3.84375, + "logps/chosen": -432.0, + "logps/rejected": -428.0, + "loss": 0.6289, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.185546875, + "rewards/margins": 0.171875, + "rewards/rejected": -0.357421875, + "step": 153 + }, + { + "epoch": 0.32234432234432236, + "grad_norm": 8.41116714477539, + "learning_rate": 4.930366380535766e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.296875, + "logps/chosen": -264.0, + "logps/rejected": -276.0, + "loss": 0.6543, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.361328125, + "rewards/margins": 0.1689453125, + "rewards/rejected": -0.53125, + "step": 154 + }, + { + "epoch": 0.32443746729461015, + "grad_norm": 8.361553192138672, + "learning_rate": 4.929027237665514e-07, + "logits/chosen": 2.640625, + "logits/rejected": 2.828125, + "logps/chosen": -548.0, + "logps/rejected": -370.0, + "loss": 0.6697, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14453125, + "rewards/margins": 0.12890625, + "rewards/rejected": -0.2734375, + "step": 155 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 7.492923736572266, + "learning_rate": 4.927675525819608e-07, + "logits/chosen": 2.640625, + "logits/rejected": 2.875, + "logps/chosen": -165.0, + "logps/rejected": -156.0, + "loss": 0.6464, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.263671875, + "rewards/margins": -0.01953125, + "rewards/rejected": -0.244140625, + "step": 156 + }, + { + "epoch": 0.3286237571951858, + "grad_norm": 7.923641681671143, + "learning_rate": 4.926311251993185e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.734375, + "logps/chosen": -394.0, + "logps/rejected": -255.0, + "loss": 0.6645, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.25390625, + "rewards/margins": 0.126953125, + "rewards/rejected": -0.380859375, + "step": 157 + }, + { + "epoch": 0.33071690214547356, + "grad_norm": 9.376420974731445, + "learning_rate": 4.924934423246395e-07, + "logits/chosen": 2.671875, + "logits/rejected": 3.234375, + "logps/chosen": -158.0, + "logps/rejected": -109.5, + "loss": 0.659, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2021484375, + "rewards/margins": -0.001953125, + "rewards/rejected": -0.2001953125, + "step": 158 + }, + { + "epoch": 0.3328100470957614, + "grad_norm": 8.906984329223633, + "learning_rate": 4.923545046704356e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.4375, + "logps/chosen": -418.0, + "logps/rejected": -430.0, + "loss": 0.7016, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2373046875, + "rewards/margins": -0.0322265625, + "rewards/rejected": -0.205078125, + "step": 159 + }, + { + "epoch": 0.3349031920460492, + "grad_norm": 8.105554580688477, + "learning_rate": 4.922143129557123e-07, + "logits/chosen": 3.71875, + "logits/rejected": 4.28125, + "logps/chosen": -446.0, + "logps/rejected": -216.0, + "loss": 0.6532, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1259765625, + "rewards/margins": 0.1328125, + "rewards/rejected": -0.2578125, + "step": 160 + }, + { + "epoch": 0.336996336996337, + "grad_norm": 9.821364402770996, + "learning_rate": 4.920728679059647e-07, + "logits/chosen": 2.78125, + "logits/rejected": 3.0625, + "logps/chosen": -356.0, + "logps/rejected": -276.0, + "loss": 0.6464, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.234375, + "rewards/margins": -0.000732421875, + "rewards/rejected": -0.2333984375, + "step": 161 + }, + { + "epoch": 0.3390894819466248, + "grad_norm": 7.859842777252197, + "learning_rate": 4.91930170253174e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.59375, + "logps/chosen": -416.0, + "logps/rejected": -332.0, + "loss": 0.6639, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.18359375, + "rewards/margins": -0.037353515625, + "rewards/rejected": -0.146484375, + "step": 162 + }, + { + "epoch": 0.3411826268969126, + "grad_norm": 8.701375007629395, + "learning_rate": 4.917862207358038e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.9375, + "logps/chosen": -608.0, + "logps/rejected": -484.0, + "loss": 0.6665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2470703125, + "rewards/margins": 0.205078125, + "rewards/rejected": -0.453125, + "step": 163 + }, + { + "epoch": 0.34327577184720043, + "grad_norm": 7.780459403991699, + "learning_rate": 4.91641020098796e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.53125, + "logps/chosen": -258.0, + "logps/rejected": -280.0, + "loss": 0.6705, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.375, + "rewards/margins": -0.030517578125, + "rewards/rejected": -0.34375, + "step": 164 + }, + { + "epoch": 0.3453689167974882, + "grad_norm": 8.421460151672363, + "learning_rate": 4.914945690935671e-07, + "logits/chosen": 3.4375, + "logits/rejected": 3.625, + "logps/chosen": -532.0, + "logps/rejected": -348.0, + "loss": 0.6714, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.359375, + "rewards/margins": 0.00341796875, + "rewards/rejected": -0.36328125, + "step": 165 + }, + { + "epoch": 0.34746206174777605, + "grad_norm": 7.451173782348633, + "learning_rate": 4.913468684780043e-07, + "logits/chosen": 3.953125, + "logits/rejected": 3.84375, + "logps/chosen": -334.0, + "logps/rejected": -356.0, + "loss": 0.6533, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1162109375, + "rewards/margins": 0.1201171875, + "rewards/rejected": -0.236328125, + "step": 166 + }, + { + "epoch": 0.34955520669806384, + "grad_norm": 8.066396713256836, + "learning_rate": 4.911979190164615e-07, + "logits/chosen": 2.953125, + "logits/rejected": 3.15625, + "logps/chosen": -334.0, + "logps/rejected": -294.0, + "loss": 0.6521, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.287109375, + "rewards/margins": 0.1494140625, + "rewards/rejected": -0.435546875, + "step": 167 + }, + { + "epoch": 0.3516483516483517, + "grad_norm": 8.994278907775879, + "learning_rate": 4.910477214797554e-07, + "logits/chosen": 2.296875, + "logits/rejected": 2.84375, + "logps/chosen": -438.0, + "logps/rejected": -334.0, + "loss": 0.6584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.140625, + "rewards/margins": 0.173828125, + "rewards/rejected": -0.314453125, + "step": 168 + }, + { + "epoch": 0.35374149659863946, + "grad_norm": 7.937893390655518, + "learning_rate": 4.908962766451616e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.546875, + "logps/chosen": -432.0, + "logps/rejected": -356.0, + "loss": 0.6485, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.421875, + "rewards/margins": 0.04833984375, + "rewards/rejected": -0.470703125, + "step": 169 + }, + { + "epoch": 0.35583464154892724, + "grad_norm": 8.24114990234375, + "learning_rate": 4.907435852964103e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.4375, + "logps/chosen": -410.0, + "logps/rejected": -190.0, + "loss": 0.641, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.40625, + "rewards/margins": -0.1328125, + "rewards/rejected": -0.2734375, + "step": 170 + }, + { + "epoch": 0.3579277864992151, + "grad_norm": 8.00881576538086, + "learning_rate": 4.905896482236829e-07, + "logits/chosen": 2.453125, + "logits/rejected": 2.609375, + "logps/chosen": -219.0, + "logps/rejected": -214.0, + "loss": 0.647, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1962890625, + "rewards/margins": 0.134765625, + "rewards/rejected": -0.33203125, + "step": 171 + }, + { + "epoch": 0.36002093144950287, + "grad_norm": 8.792490005493164, + "learning_rate": 4.904344662236069e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.0625, + "logps/chosen": -280.0, + "logps/rejected": -400.0, + "loss": 0.671, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.259765625, + "rewards/margins": 0.32421875, + "rewards/rejected": -0.5859375, + "step": 172 + }, + { + "epoch": 0.3621140763997907, + "grad_norm": 8.430554389953613, + "learning_rate": 4.902780400992526e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.515625, + "logps/chosen": -312.0, + "logps/rejected": -366.0, + "loss": 0.6734, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.330078125, + "rewards/margins": 0.009765625, + "rewards/rejected": -0.33984375, + "step": 173 + }, + { + "epoch": 0.3642072213500785, + "grad_norm": 8.057472229003906, + "learning_rate": 4.901203706601288e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.15625, + "logps/chosen": -780.0, + "logps/rejected": -428.0, + "loss": 0.6611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2431640625, + "rewards/margins": 0.314453125, + "rewards/rejected": -0.55859375, + "step": 174 + }, + { + "epoch": 0.3663003663003663, + "grad_norm": 8.198448181152344, + "learning_rate": 4.899614587221782e-07, + "logits/chosen": 2.234375, + "logits/rejected": 3.09375, + "logps/chosen": -506.0, + "logps/rejected": -262.0, + "loss": 0.6604, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2158203125, + "rewards/margins": 0.11767578125, + "rewards/rejected": -0.33203125, + "step": 175 + }, + { + "epoch": 0.3683935112506541, + "grad_norm": 8.351573944091797, + "learning_rate": 4.898013051077735e-07, + "logits/chosen": 3.375, + "logits/rejected": 2.921875, + "logps/chosen": -165.0, + "logps/rejected": -286.0, + "loss": 0.692, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21484375, + "rewards/margins": 0.0262451171875, + "rewards/rejected": -0.2412109375, + "step": 176 + }, + { + "epoch": 0.3704866562009419, + "grad_norm": 8.730794906616211, + "learning_rate": 4.896399106457132e-07, + "logits/chosen": 3.46875, + "logits/rejected": 3.984375, + "logps/chosen": -348.0, + "logps/rejected": -330.0, + "loss": 0.6684, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.248046875, + "rewards/margins": 0.0380859375, + "rewards/rejected": -0.287109375, + "step": 177 + }, + { + "epoch": 0.37257980115122974, + "grad_norm": 8.036724090576172, + "learning_rate": 4.894772761712174e-07, + "logits/chosen": 2.921875, + "logits/rejected": 3.375, + "logps/chosen": -340.0, + "logps/rejected": -193.0, + "loss": 0.6449, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.236328125, + "rewards/margins": 0.021484375, + "rewards/rejected": -0.2578125, + "step": 178 + }, + { + "epoch": 0.3746729461015175, + "grad_norm": 8.092848777770996, + "learning_rate": 4.893134025259228e-07, + "logits/chosen": 3.296875, + "logits/rejected": 4.0625, + "logps/chosen": -544.0, + "logps/rejected": -432.0, + "loss": 0.6621, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.484375, + "rewards/margins": -0.0849609375, + "rewards/rejected": -0.400390625, + "step": 179 + }, + { + "epoch": 0.37676609105180536, + "grad_norm": 7.546232223510742, + "learning_rate": 4.891482905578792e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.59375, + "logps/chosen": -320.0, + "logps/rejected": -364.0, + "loss": 0.6624, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2890625, + "rewards/margins": 0.005859375, + "rewards/rejected": -0.294921875, + "step": 180 + }, + { + "epoch": 0.37885923600209315, + "grad_norm": 8.636161804199219, + "learning_rate": 4.889819411215448e-07, + "logits/chosen": 2.25, + "logits/rejected": 2.09375, + "logps/chosen": -106.5, + "logps/rejected": -172.0, + "loss": 0.6579, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.310546875, + "rewards/margins": -0.08544921875, + "rewards/rejected": -0.224609375, + "step": 181 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 8.529451370239258, + "learning_rate": 4.888143550777814e-07, + "logits/chosen": 3.515625, + "logits/rejected": 3.703125, + "logps/chosen": -340.0, + "logps/rejected": -264.0, + "loss": 0.6399, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.337890625, + "rewards/margins": -0.083984375, + "rewards/rejected": -0.25390625, + "step": 182 + }, + { + "epoch": 0.38304552590266877, + "grad_norm": 8.135290145874023, + "learning_rate": 4.886455332938507e-07, + "logits/chosen": 1.6953125, + "logits/rejected": 2.078125, + "logps/chosen": -306.0, + "logps/rejected": -298.0, + "loss": 0.667, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.341796875, + "rewards/margins": -0.1474609375, + "rewards/rejected": -0.193359375, + "step": 183 + }, + { + "epoch": 0.38513867085295656, + "grad_norm": 8.299905776977539, + "learning_rate": 4.88475476643409e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.953125, + "logps/chosen": -464.0, + "logps/rejected": -444.0, + "loss": 0.6306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.404296875, + "rewards/margins": 0.271484375, + "rewards/rejected": -0.67578125, + "step": 184 + }, + { + "epoch": 0.3872318158032444, + "grad_norm": 8.095890998840332, + "learning_rate": 4.883041860065032e-07, + "logits/chosen": 3.03125, + "logits/rejected": 4.0625, + "logps/chosen": -664.0, + "logps/rejected": -470.0, + "loss": 0.6491, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.310546875, + "rewards/margins": 0.083984375, + "rewards/rejected": -0.39453125, + "step": 185 + }, + { + "epoch": 0.3893249607535322, + "grad_norm": 8.727668762207031, + "learning_rate": 4.881316622695661e-07, + "logits/chosen": 3.234375, + "logits/rejected": 2.71875, + "logps/chosen": -161.0, + "logps/rejected": -274.0, + "loss": 0.643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.216796875, + "rewards/margins": 0.212890625, + "rewards/rejected": -0.4296875, + "step": 186 + }, + { + "epoch": 0.39141810570381996, + "grad_norm": 8.658443450927734, + "learning_rate": 4.87957906325412e-07, + "logits/chosen": 2.640625, + "logits/rejected": 3.046875, + "logps/chosen": -382.0, + "logps/rejected": -304.0, + "loss": 0.6738, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34375, + "rewards/margins": 0.15234375, + "rewards/rejected": -0.49609375, + "step": 187 + }, + { + "epoch": 0.3935112506541078, + "grad_norm": 8.59584903717041, + "learning_rate": 4.877829190732315e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.375, + "logps/chosen": -344.0, + "logps/rejected": -230.0, + "loss": 0.6756, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2275390625, + "rewards/margins": 0.2392578125, + "rewards/rejected": -0.466796875, + "step": 188 + }, + { + "epoch": 0.3956043956043956, + "grad_norm": 8.687397003173828, + "learning_rate": 4.876067014185876e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.21875, + "logps/chosen": -330.0, + "logps/rejected": -384.0, + "loss": 0.6226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.125, + "rewards/margins": 0.3515625, + "rewards/rejected": -0.4765625, + "step": 189 + }, + { + "epoch": 0.3976975405546834, + "grad_norm": 7.772812366485596, + "learning_rate": 4.874292542734106e-07, + "logits/chosen": 2.84375, + "logits/rejected": 3.015625, + "logps/chosen": -366.0, + "logps/rejected": -282.0, + "loss": 0.645, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.515625, + "rewards/margins": -0.0244140625, + "rewards/rejected": -0.490234375, + "step": 190 + }, + { + "epoch": 0.3997906855049712, + "grad_norm": 8.127242088317871, + "learning_rate": 4.872505785559932e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.96875, + "logps/chosen": -208.0, + "logps/rejected": -158.0, + "loss": 0.647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21484375, + "rewards/margins": 0.166015625, + "rewards/rejected": -0.3828125, + "step": 191 + }, + { + "epoch": 0.40188383045525905, + "grad_norm": 8.73538589477539, + "learning_rate": 4.870706751909864e-07, + "logits/chosen": 3.21875, + "logits/rejected": 2.96875, + "logps/chosen": -177.0, + "logps/rejected": -328.0, + "loss": 0.6665, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3671875, + "rewards/margins": 0.0068359375, + "rewards/rejected": -0.375, + "step": 192 + }, + { + "epoch": 0.40397697540554683, + "grad_norm": 9.213397026062012, + "learning_rate": 4.868895451093939e-07, + "logits/chosen": 2.34375, + "logits/rejected": 2.796875, + "logps/chosen": -350.0, + "logps/rejected": -158.0, + "loss": 0.6662, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.330078125, + "rewards/margins": 0.0888671875, + "rewards/rejected": -0.41796875, + "step": 193 + }, + { + "epoch": 0.4060701203558346, + "grad_norm": 8.662314414978027, + "learning_rate": 4.867071892485679e-07, + "logits/chosen": 3.3125, + "logits/rejected": 4.0, + "logps/chosen": -474.0, + "logps/rejected": -384.0, + "loss": 0.6729, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.458984375, + "rewards/margins": -0.0498046875, + "rewards/rejected": -0.408203125, + "step": 194 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 8.317453384399414, + "learning_rate": 4.865236085522042e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.09375, + "logps/chosen": -544.0, + "logps/rejected": -592.0, + "loss": 0.6539, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.287109375, + "rewards/margins": -0.002685546875, + "rewards/rejected": -0.28515625, + "step": 195 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 7.9303789138793945, + "learning_rate": 4.863388039703365e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.703125, + "logps/chosen": -183.0, + "logps/rejected": -296.0, + "loss": 0.6524, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.439453125, + "rewards/margins": 0.015625, + "rewards/rejected": -0.455078125, + "step": 196 + }, + { + "epoch": 0.4123495552066981, + "grad_norm": 8.1914644241333, + "learning_rate": 4.861527764593328e-07, + "logits/chosen": 2.4375, + "logits/rejected": 2.921875, + "logps/chosen": -432.0, + "logps/rejected": -208.0, + "loss": 0.6423, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5703125, + "rewards/margins": -0.08642578125, + "rewards/rejected": -0.484375, + "step": 197 + }, + { + "epoch": 0.41444270015698587, + "grad_norm": 8.490318298339844, + "learning_rate": 4.859655269818898e-07, + "logits/chosen": 3.5625, + "logits/rejected": 3.9375, + "logps/chosen": -772.0, + "logps/rejected": -1056.0, + "loss": 0.6417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.224609375, + "rewards/margins": 0.66796875, + "rewards/rejected": -0.89453125, + "step": 198 + }, + { + "epoch": 0.4165358451072737, + "grad_norm": 7.858203887939453, + "learning_rate": 4.857770565070274e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.234375, + "logps/chosen": -338.0, + "logps/rejected": -402.0, + "loss": 0.6594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.294921875, + "rewards/margins": 0.34375, + "rewards/rejected": -0.63671875, + "step": 199 + }, + { + "epoch": 0.4186289900575615, + "grad_norm": 9.21390438079834, + "learning_rate": 4.855873660100845e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.75, + "logps/chosen": -636.0, + "logps/rejected": -516.0, + "loss": 0.6812, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41015625, + "rewards/margins": 0.06982421875, + "rewards/rejected": -0.478515625, + "step": 200 + }, + { + "epoch": 0.4207221350078493, + "grad_norm": 8.616729736328125, + "learning_rate": 4.853964564727136e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.71875, + "logps/chosen": -308.0, + "logps/rejected": -388.0, + "loss": 0.6656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.52734375, + "rewards/margins": 0.0400390625, + "rewards/rejected": -0.5703125, + "step": 201 + }, + { + "epoch": 0.4228152799581371, + "grad_norm": 8.263435363769531, + "learning_rate": 4.852043288828757e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -572.0, + "logps/rejected": -452.0, + "loss": 0.6563, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.609375, + "rewards/margins": 0.16796875, + "rewards/rejected": -0.77734375, + "step": 202 + }, + { + "epoch": 0.4249084249084249, + "grad_norm": 8.495272636413574, + "learning_rate": 4.850109842348355e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.4375, + "logps/chosen": -266.0, + "logps/rejected": -210.0, + "loss": 0.6447, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.478515625, + "rewards/margins": -0.12890625, + "rewards/rejected": -0.349609375, + "step": 203 + }, + { + "epoch": 0.42700156985871274, + "grad_norm": 8.544230461120605, + "learning_rate": 4.848164235291556e-07, + "logits/chosen": 2.59375, + "logits/rejected": 2.734375, + "logps/chosen": -432.0, + "logps/rejected": -314.0, + "loss": 0.6496, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.287109375, + "rewards/margins": 0.26171875, + "rewards/rejected": -0.546875, + "step": 204 + }, + { + "epoch": 0.4290947148090005, + "grad_norm": 8.632508277893066, + "learning_rate": 4.846206477726922e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.578125, + "logps/chosen": -480.0, + "logps/rejected": -450.0, + "loss": 0.6327, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.478515625, + "rewards/margins": 0.0908203125, + "rewards/rejected": -0.5703125, + "step": 205 + }, + { + "epoch": 0.4311878597592883, + "grad_norm": 9.020018577575684, + "learning_rate": 4.844236579785887e-07, + "logits/chosen": 3.53125, + "logits/rejected": 4.0, + "logps/chosen": -472.0, + "logps/rejected": -932.0, + "loss": 0.6466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2177734375, + "rewards/margins": 0.515625, + "rewards/rejected": -0.734375, + "step": 206 + }, + { + "epoch": 0.43328100470957615, + "grad_norm": 8.227005004882812, + "learning_rate": 4.84225455166272e-07, + "logits/chosen": 2.046875, + "logits/rejected": 2.71875, + "logps/chosen": -396.0, + "logps/rejected": -360.0, + "loss": 0.6483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2021484375, + "rewards/margins": 0.1884765625, + "rewards/rejected": -0.390625, + "step": 207 + }, + { + "epoch": 0.43537414965986393, + "grad_norm": 8.853082656860352, + "learning_rate": 4.840260403614459e-07, + "logits/chosen": 3.4375, + "logits/rejected": 3.703125, + "logps/chosen": -708.0, + "logps/rejected": -502.0, + "loss": 0.6511, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.640625, + "rewards/margins": 0.00634765625, + "rewards/rejected": -0.6484375, + "step": 208 + }, + { + "epoch": 0.43746729461015177, + "grad_norm": 9.090826988220215, + "learning_rate": 4.838254145960864e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.421875, + "logps/chosen": -576.0, + "logps/rejected": -466.0, + "loss": 0.6669, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5546875, + "rewards/margins": 0.0458984375, + "rewards/rejected": -0.6015625, + "step": 209 + }, + { + "epoch": 0.43956043956043955, + "grad_norm": 8.810733795166016, + "learning_rate": 4.836235789084363e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.515625, + "logps/chosen": -338.0, + "logps/rejected": -336.0, + "loss": 0.6378, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.265625, + "rewards/margins": 0.1591796875, + "rewards/rejected": -0.42578125, + "step": 210 + }, + { + "epoch": 0.4416535845107274, + "grad_norm": 8.438765525817871, + "learning_rate": 4.834205343429996e-07, + "logits/chosen": 2.9375, + "logits/rejected": 3.25, + "logps/chosen": -250.0, + "logps/rejected": -232.0, + "loss": 0.6315, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.369140625, + "rewards/margins": -0.00048828125, + "rewards/rejected": -0.369140625, + "step": 211 + }, + { + "epoch": 0.4437467294610152, + "grad_norm": 8.868012428283691, + "learning_rate": 4.832162819505364e-07, + "logits/chosen": 1.984375, + "logits/rejected": 2.1875, + "logps/chosen": -228.0, + "logps/rejected": -185.0, + "loss": 0.6631, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.341796875, + "rewards/margins": 0.1728515625, + "rewards/rejected": -0.515625, + "step": 212 + }, + { + "epoch": 0.44583987441130296, + "grad_norm": 9.201122283935547, + "learning_rate": 4.830108227880576e-07, + "logits/chosen": 2.3125, + "logits/rejected": 2.734375, + "logps/chosen": -552.0, + "logps/rejected": -462.0, + "loss": 0.6587, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.376953125, + "rewards/margins": 0.326171875, + "rewards/rejected": -0.703125, + "step": 213 + }, + { + "epoch": 0.4479330193615908, + "grad_norm": 9.223275184631348, + "learning_rate": 4.828041579188185e-07, + "logits/chosen": 2.578125, + "logits/rejected": 2.6875, + "logps/chosen": -272.0, + "logps/rejected": -616.0, + "loss": 0.6646, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.46875, + "rewards/margins": 0.177734375, + "rewards/rejected": -0.6484375, + "step": 214 + }, + { + "epoch": 0.4500261643118786, + "grad_norm": 10.037179946899414, + "learning_rate": 4.825962884123146e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.34375, + "logps/chosen": -360.0, + "logps/rejected": -354.0, + "loss": 0.692, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.546875, + "rewards/margins": 0.146484375, + "rewards/rejected": -0.69140625, + "step": 215 + }, + { + "epoch": 0.4521193092621664, + "grad_norm": 9.276552200317383, + "learning_rate": 4.823872153442752e-07, + "logits/chosen": 2.0, + "logits/rejected": 2.21875, + "logps/chosen": -183.0, + "logps/rejected": -400.0, + "loss": 0.66, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47265625, + "rewards/margins": 0.302734375, + "rewards/rejected": -0.77734375, + "step": 216 + }, + { + "epoch": 0.4542124542124542, + "grad_norm": 8.489870071411133, + "learning_rate": 4.821769397966578e-07, + "logits/chosen": 2.546875, + "logits/rejected": 2.578125, + "logps/chosen": -328.0, + "logps/rejected": -556.0, + "loss": 0.6381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40234375, + "rewards/margins": 0.369140625, + "rewards/rejected": -0.7734375, + "step": 217 + }, + { + "epoch": 0.456305599162742, + "grad_norm": 9.456313133239746, + "learning_rate": 4.819654628576432e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.28125, + "logps/chosen": -532.0, + "logps/rejected": -498.0, + "loss": 0.6358, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.384765625, + "rewards/margins": 0.240234375, + "rewards/rejected": -0.625, + "step": 218 + }, + { + "epoch": 0.45839874411302983, + "grad_norm": 8.657371520996094, + "learning_rate": 4.81752785621629e-07, + "logits/chosen": 3.5, + "logits/rejected": 3.75, + "logps/chosen": -448.0, + "logps/rejected": -548.0, + "loss": 0.6388, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.71875, + "rewards/margins": -0.1513671875, + "rewards/rejected": -0.56640625, + "step": 219 + }, + { + "epoch": 0.4604918890633176, + "grad_norm": 8.963862419128418, + "learning_rate": 4.815389091892249e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.234375, + "logps/chosen": -408.0, + "logps/rejected": -378.0, + "loss": 0.6558, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.462890625, + "rewards/margins": 0.423828125, + "rewards/rejected": -0.88671875, + "step": 220 + }, + { + "epoch": 0.46258503401360546, + "grad_norm": 9.423280715942383, + "learning_rate": 4.813238346672459e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.953125, + "logps/chosen": -334.0, + "logps/rejected": -306.0, + "loss": 0.6548, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.458984375, + "rewards/margins": 0.2060546875, + "rewards/rejected": -0.6640625, + "step": 221 + }, + { + "epoch": 0.46467817896389324, + "grad_norm": 9.261499404907227, + "learning_rate": 4.811075631687073e-07, + "logits/chosen": 3.171875, + "logits/rejected": 4.0625, + "logps/chosen": -486.0, + "logps/rejected": -336.0, + "loss": 0.672, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.45703125, + "rewards/margins": -0.0068359375, + "rewards/rejected": -0.451171875, + "step": 222 + }, + { + "epoch": 0.4667713239141811, + "grad_norm": 8.188753128051758, + "learning_rate": 4.80890095812819e-07, + "logits/chosen": 2.328125, + "logits/rejected": 2.796875, + "logps/chosen": -200.0, + "logps/rejected": -207.0, + "loss": 0.6451, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.400390625, + "rewards/margins": 0.2060546875, + "rewards/rejected": -0.60546875, + "step": 223 + }, + { + "epoch": 0.46886446886446886, + "grad_norm": 7.763673305511475, + "learning_rate": 4.806714337249796e-07, + "logits/chosen": 2.90625, + "logits/rejected": 3.453125, + "logps/chosen": -804.0, + "logps/rejected": -330.0, + "loss": 0.6344, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6953125, + "rewards/margins": -0.0556640625, + "rewards/rejected": -0.640625, + "step": 224 + }, + { + "epoch": 0.47095761381475665, + "grad_norm": 9.266056060791016, + "learning_rate": 4.804515780367698e-07, + "logits/chosen": 2.5, + "logits/rejected": 3.375, + "logps/chosen": -608.0, + "logps/rejected": -498.0, + "loss": 0.6601, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.50390625, + "rewards/margins": 0.5078125, + "rewards/rejected": -1.015625, + "step": 225 + }, + { + "epoch": 0.4730507587650445, + "grad_norm": 8.741589546203613, + "learning_rate": 4.802305298859477e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.203125, + "logps/chosen": -436.0, + "logps/rejected": -552.0, + "loss": 0.6669, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.375, + "rewards/margins": 0.298828125, + "rewards/rejected": -0.67578125, + "step": 226 + }, + { + "epoch": 0.47514390371533227, + "grad_norm": 8.920707702636719, + "learning_rate": 4.800082904164425e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.375, + "logps/chosen": -178.0, + "logps/rejected": -197.0, + "loss": 0.6363, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30078125, + "rewards/margins": 0.1923828125, + "rewards/rejected": -0.4921875, + "step": 227 + }, + { + "epoch": 0.4772370486656201, + "grad_norm": 9.029594421386719, + "learning_rate": 4.797848607783484e-07, + "logits/chosen": 2.84375, + "logits/rejected": 3.21875, + "logps/chosen": -388.0, + "logps/rejected": -348.0, + "loss": 0.6363, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6640625, + "rewards/margins": -0.12451171875, + "rewards/rejected": -0.5390625, + "step": 228 + }, + { + "epoch": 0.4793301936159079, + "grad_norm": 9.25216293334961, + "learning_rate": 4.795602421279185e-07, + "logits/chosen": 2.859375, + "logits/rejected": 3.109375, + "logps/chosen": -756.0, + "logps/rejected": -524.0, + "loss": 0.6263, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.470703125, + "rewards/margins": 0.39453125, + "rewards/rejected": -0.86328125, + "step": 229 + }, + { + "epoch": 0.48142333856619574, + "grad_norm": 8.571414947509766, + "learning_rate": 4.793344356275594e-07, + "logits/chosen": 2.078125, + "logits/rejected": 2.1875, + "logps/chosen": -336.0, + "logps/rejected": -462.0, + "loss": 0.6558, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.337890625, + "rewards/margins": 0.162109375, + "rewards/rejected": -0.5, + "step": 230 + }, + { + "epoch": 0.4835164835164835, + "grad_norm": 9.288797378540039, + "learning_rate": 4.791074424458246e-07, + "logits/chosen": 2.625, + "logits/rejected": 2.828125, + "logps/chosen": -434.0, + "logps/rejected": -432.0, + "loss": 0.6546, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33984375, + "rewards/margins": 0.1484375, + "rewards/rejected": -0.48828125, + "step": 231 + }, + { + "epoch": 0.4856096284667713, + "grad_norm": 8.921971321105957, + "learning_rate": 4.788792637574087e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.8125, + "logps/chosen": -420.0, + "logps/rejected": -176.0, + "loss": 0.6511, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5234375, + "rewards/margins": 0.03076171875, + "rewards/rejected": -0.55078125, + "step": 232 + }, + { + "epoch": 0.48770277341705914, + "grad_norm": 8.216841697692871, + "learning_rate": 4.786499007431418e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.40625, + "logps/chosen": -250.0, + "logps/rejected": -213.0, + "loss": 0.6406, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.61328125, + "rewards/margins": -0.0615234375, + "rewards/rejected": -0.55078125, + "step": 233 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 9.005748748779297, + "learning_rate": 4.784193545899823e-07, + "logits/chosen": 2.359375, + "logits/rejected": 3.0, + "logps/chosen": -400.0, + "logps/rejected": -346.0, + "loss": 0.633, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.734375, + "rewards/margins": -0.03515625, + "rewards/rejected": -0.69921875, + "step": 234 + }, + { + "epoch": 0.49188906331763477, + "grad_norm": 9.359798431396484, + "learning_rate": 4.781876264910116e-07, + "logits/chosen": 2.5625, + "logits/rejected": 3.375, + "logps/chosen": -378.0, + "logps/rejected": -262.0, + "loss": 0.6571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34375, + "rewards/margins": 0.345703125, + "rewards/rejected": -0.6875, + "step": 235 + }, + { + "epoch": 0.49398220826792255, + "grad_norm": 8.288033485412598, + "learning_rate": 4.779547176454278e-07, + "logits/chosen": 1.5859375, + "logits/rejected": 1.484375, + "logps/chosen": -175.0, + "logps/rejected": -183.0, + "loss": 0.6334, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.625, + "rewards/margins": -0.007080078125, + "rewards/rejected": -0.6171875, + "step": 236 + }, + { + "epoch": 0.49607535321821034, + "grad_norm": 9.106061935424805, + "learning_rate": 4.777206292585393e-07, + "logits/chosen": 3.125, + "logits/rejected": 2.984375, + "logps/chosen": -612.0, + "logps/rejected": -784.0, + "loss": 0.6595, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8125, + "rewards/margins": 0.05908203125, + "rewards/rejected": -0.87109375, + "step": 237 + }, + { + "epoch": 0.4981684981684982, + "grad_norm": 9.198912620544434, + "learning_rate": 4.774853625417585e-07, + "logits/chosen": 3.4375, + "logits/rejected": 3.703125, + "logps/chosen": -520.0, + "logps/rejected": -356.0, + "loss": 0.641, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37109375, + "rewards/margins": 0.296875, + "rewards/rejected": -0.66796875, + "step": 238 + }, + { + "epoch": 0.500261643118786, + "grad_norm": 9.001884460449219, + "learning_rate": 4.772489187125961e-07, + "logits/chosen": 2.765625, + "logits/rejected": 3.40625, + "logps/chosen": -258.0, + "logps/rejected": -358.0, + "loss": 0.6473, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.61328125, + "rewards/margins": 0.1689453125, + "rewards/rejected": -0.78125, + "step": 239 + }, + { + "epoch": 0.5023547880690737, + "grad_norm": 8.567695617675781, + "learning_rate": 4.770112989946538e-07, + "logits/chosen": 1.3984375, + "logits/rejected": 1.4296875, + "logps/chosen": -282.0, + "logps/rejected": -202.0, + "loss": 0.6417, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.392578125, + "rewards/margins": 0.02685546875, + "rewards/rejected": -0.41796875, + "step": 240 + }, + { + "epoch": 0.5044479330193616, + "grad_norm": 9.044564247131348, + "learning_rate": 4.767725046176192e-07, + "logits/chosen": 2.546875, + "logits/rejected": 2.671875, + "logps/chosen": -278.0, + "logps/rejected": -344.0, + "loss": 0.6656, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.62109375, + "rewards/margins": -0.11181640625, + "rewards/rejected": -0.5078125, + "step": 241 + }, + { + "epoch": 0.5065410779696494, + "grad_norm": 8.735559463500977, + "learning_rate": 4.765325368172582e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.265625, + "logps/chosen": -636.0, + "logps/rejected": -512.0, + "loss": 0.6522, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.52734375, + "rewards/margins": 0.232421875, + "rewards/rejected": -0.76171875, + "step": 242 + }, + { + "epoch": 0.5086342229199372, + "grad_norm": 9.265771865844727, + "learning_rate": 4.7629139683540966e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.6875, + "logps/chosen": -396.0, + "logps/rejected": -512.0, + "loss": 0.6323, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.62109375, + "rewards/margins": -0.1181640625, + "rewards/rejected": -0.5, + "step": 243 + }, + { + "epoch": 0.510727367870225, + "grad_norm": 9.315106391906738, + "learning_rate": 4.760490859199781e-07, + "logits/chosen": 2.609375, + "logits/rejected": 3.4375, + "logps/chosen": -660.0, + "logps/rejected": -444.0, + "loss": 0.6428, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7421875, + "rewards/margins": -0.080078125, + "rewards/rejected": -0.66015625, + "step": 244 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 8.939321517944336, + "learning_rate": 4.75805605324928e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.5, + "logps/chosen": -372.0, + "logps/rejected": -414.0, + "loss": 0.6199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.419921875, + "rewards/margins": 0.26171875, + "rewards/rejected": -0.6796875, + "step": 245 + }, + { + "epoch": 0.5149136577708007, + "grad_norm": 8.98937702178955, + "learning_rate": 4.7556095631027667e-07, + "logits/chosen": 2.453125, + "logits/rejected": 2.59375, + "logps/chosen": -324.0, + "logps/rejected": -226.0, + "loss": 0.6713, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4453125, + "rewards/margins": 0.1171875, + "rewards/rejected": -0.5625, + "step": 246 + }, + { + "epoch": 0.5170068027210885, + "grad_norm": 8.769051551818848, + "learning_rate": 4.7531514014208813e-07, + "logits/chosen": 2.59375, + "logits/rejected": 2.609375, + "logps/chosen": -418.0, + "logps/rejected": -324.0, + "loss": 0.6452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4609375, + "rewards/margins": 0.21875, + "rewards/rejected": -0.6796875, + "step": 247 + }, + { + "epoch": 0.5190999476713762, + "grad_norm": 8.939539909362793, + "learning_rate": 4.7506815809246653e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.5, + "logps/chosen": -512.0, + "logps/rejected": -632.0, + "loss": 0.6657, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.265625, + "rewards/margins": 0.515625, + "rewards/rejected": -0.78125, + "step": 248 + }, + { + "epoch": 0.521193092621664, + "grad_norm": 9.614801406860352, + "learning_rate": 4.7482001143954943e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.34375, + "logps/chosen": -360.0, + "logps/rejected": -454.0, + "loss": 0.6703, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6796875, + "rewards/margins": -0.01416015625, + "rewards/rejected": -0.6640625, + "step": 249 + }, + { + "epoch": 0.5232862375719518, + "grad_norm": 8.643899917602539, + "learning_rate": 4.745707014675012e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.984375, + "logps/chosen": -488.0, + "logps/rejected": -520.0, + "loss": 0.6558, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.515625, + "rewards/margins": 0.2470703125, + "rewards/rejected": -0.765625, + "step": 250 + }, + { + "epoch": 0.5253793825222397, + "grad_norm": 8.415505409240723, + "learning_rate": 4.743202294665065e-07, + "logits/chosen": 2.109375, + "logits/rejected": 3.59375, + "logps/chosen": -442.0, + "logps/rejected": -262.0, + "loss": 0.6401, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.490234375, + "rewards/margins": -0.04345703125, + "rewards/rejected": -0.4453125, + "step": 251 + }, + { + "epoch": 0.5274725274725275, + "grad_norm": 8.936569213867188, + "learning_rate": 4.7406859673276333e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.84375, + "logps/chosen": -294.0, + "logps/rejected": -496.0, + "loss": 0.6242, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.58984375, + "rewards/margins": 0.248046875, + "rewards/rejected": -0.8359375, + "step": 252 + }, + { + "epoch": 0.5295656724228153, + "grad_norm": 8.643172264099121, + "learning_rate": 4.738158045684766e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.546875, + "logps/chosen": -620.0, + "logps/rejected": -584.0, + "loss": 0.6572, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.474609375, + "rewards/margins": 0.70703125, + "rewards/rejected": -1.1796875, + "step": 253 + }, + { + "epoch": 0.531658817373103, + "grad_norm": 8.712985038757324, + "learning_rate": 4.7356185428185145e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.5, + "logps/chosen": -512.0, + "logps/rejected": -620.0, + "loss": 0.652, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5078125, + "rewards/margins": 0.138671875, + "rewards/rejected": -0.6484375, + "step": 254 + }, + { + "epoch": 0.533751962323391, + "grad_norm": 8.554315567016602, + "learning_rate": 4.733067471870862e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.8125, + "logps/chosen": -436.0, + "logps/rejected": -494.0, + "loss": 0.6377, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30859375, + "rewards/margins": 0.11376953125, + "rewards/rejected": -0.421875, + "step": 255 + }, + { + "epoch": 0.5358451072736787, + "grad_norm": 10.03768253326416, + "learning_rate": 4.7305048460436555e-07, + "logits/chosen": 3.53125, + "logits/rejected": 3.15625, + "logps/chosen": -688.0, + "logps/rejected": -612.0, + "loss": 0.6636, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4609375, + "rewards/margins": 0.37109375, + "rewards/rejected": -0.83203125, + "step": 256 + }, + { + "epoch": 0.5379382522239665, + "grad_norm": 8.926187515258789, + "learning_rate": 4.727930678598541e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.703125, + "logps/chosen": -258.0, + "logps/rejected": -358.0, + "loss": 0.6494, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41015625, + "rewards/margins": 0.1064453125, + "rewards/rejected": -0.515625, + "step": 257 + }, + { + "epoch": 0.5400313971742543, + "grad_norm": 9.17234992980957, + "learning_rate": 4.725344982856891e-07, + "logits/chosen": 2.078125, + "logits/rejected": 2.125, + "logps/chosen": -360.0, + "logps/rejected": -243.0, + "loss": 0.6528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.412109375, + "rewards/margins": 0.48046875, + "rewards/rejected": -0.89453125, + "step": 258 + }, + { + "epoch": 0.5421245421245421, + "grad_norm": 9.169678688049316, + "learning_rate": 4.7227477721997387e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.75, + "logps/chosen": -592.0, + "logps/rejected": -402.0, + "loss": 0.652, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.384765625, + "rewards/margins": 0.296875, + "rewards/rejected": -0.6796875, + "step": 259 + }, + { + "epoch": 0.54421768707483, + "grad_norm": 9.725273132324219, + "learning_rate": 4.720139060067706e-07, + "logits/chosen": 3.125, + "logits/rejected": 4.0625, + "logps/chosen": -572.0, + "logps/rejected": -330.0, + "loss": 0.6891, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6015625, + "rewards/margins": 0.08447265625, + "rewards/rejected": -0.68359375, + "step": 260 + }, + { + "epoch": 0.5463108320251178, + "grad_norm": 8.998029708862305, + "learning_rate": 4.7175188599609363e-07, + "logits/chosen": 2.109375, + "logits/rejected": 2.578125, + "logps/chosen": -300.0, + "logps/rejected": -308.0, + "loss": 0.6564, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.46484375, + "rewards/margins": -0.03173828125, + "rewards/rejected": -0.43359375, + "step": 261 + }, + { + "epoch": 0.5484039769754055, + "grad_norm": 9.795524597167969, + "learning_rate": 4.7148871854390204e-07, + "logits/chosen": 2.0, + "logits/rejected": 1.9375, + "logps/chosen": -440.0, + "logps/rejected": -536.0, + "loss": 0.6377, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.78515625, + "rewards/margins": 0.30078125, + "rewards/rejected": -1.0859375, + "step": 262 + }, + { + "epoch": 0.5504971219256933, + "grad_norm": 8.896550178527832, + "learning_rate": 4.7122440501209356e-07, + "logits/chosen": 2.4375, + "logits/rejected": 2.234375, + "logps/chosen": -276.0, + "logps/rejected": -320.0, + "loss": 0.6503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48828125, + "rewards/margins": 0.26953125, + "rewards/rejected": -0.7578125, + "step": 263 + }, + { + "epoch": 0.5525902668759811, + "grad_norm": 8.59717082977295, + "learning_rate": 4.709589467684962e-07, + "logits/chosen": 2.03125, + "logits/rejected": 1.7265625, + "logps/chosen": -158.0, + "logps/rejected": -204.0, + "loss": 0.6415, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.482421875, + "rewards/margins": 0.1396484375, + "rewards/rejected": -0.62109375, + "step": 264 + }, + { + "epoch": 0.554683411826269, + "grad_norm": 9.051630973815918, + "learning_rate": 4.7069234518686243e-07, + "logits/chosen": 2.40625, + "logits/rejected": 2.515625, + "logps/chosen": -210.0, + "logps/rejected": -396.0, + "loss": 0.6432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.62109375, + "rewards/margins": 0.578125, + "rewards/rejected": -1.1953125, + "step": 265 + }, + { + "epoch": 0.5567765567765568, + "grad_norm": 8.217545509338379, + "learning_rate": 4.7042460164686113e-07, + "logits/chosen": 2.1875, + "logits/rejected": 2.09375, + "logps/chosen": -536.0, + "logps/rejected": -716.0, + "loss": 0.6057, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49609375, + "rewards/margins": 0.1865234375, + "rewards/rejected": -0.68359375, + "step": 266 + }, + { + "epoch": 0.5588697017268446, + "grad_norm": 9.651106834411621, + "learning_rate": 4.701557175340711e-07, + "logits/chosen": 2.546875, + "logits/rejected": 2.609375, + "logps/chosen": -191.0, + "logps/rejected": -328.0, + "loss": 0.663, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.56640625, + "rewards/margins": 0.03466796875, + "rewards/rejected": -0.6015625, + "step": 267 + }, + { + "epoch": 0.5609628466771324, + "grad_norm": 8.923408508300781, + "learning_rate": 4.6988569423997357e-07, + "logits/chosen": 3.25, + "logits/rejected": 2.65625, + "logps/chosen": -266.0, + "logps/rejected": -696.0, + "loss": 0.6271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.53125, + "rewards/margins": 0.470703125, + "rewards/rejected": -1.0, + "step": 268 + }, + { + "epoch": 0.5630559916274201, + "grad_norm": 8.7314453125, + "learning_rate": 4.69614533161945e-07, + "logits/chosen": 3.265625, + "logits/rejected": 2.828125, + "logps/chosen": -464.0, + "logps/rejected": -512.0, + "loss": 0.6557, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.765625, + "rewards/margins": 0.0390625, + "rewards/rejected": -0.8046875, + "step": 269 + }, + { + "epoch": 0.565149136577708, + "grad_norm": 9.67919921875, + "learning_rate": 4.6934223570325e-07, + "logits/chosen": 2.53125, + "logits/rejected": 2.53125, + "logps/chosen": -504.0, + "logps/rejected": -520.0, + "loss": 0.6701, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.15625, + "rewards/margins": -0.263671875, + "rewards/rejected": -0.8984375, + "step": 270 + }, + { + "epoch": 0.5672422815279958, + "grad_norm": 8.798002243041992, + "learning_rate": 4.6906880327303377e-07, + "logits/chosen": 2.546875, + "logits/rejected": 2.28125, + "logps/chosen": -724.0, + "logps/rejected": -736.0, + "loss": 0.641, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41796875, + "rewards/margins": 0.16015625, + "rewards/rejected": -0.578125, + "step": 271 + }, + { + "epoch": 0.5693354264782836, + "grad_norm": 9.1842622756958, + "learning_rate": 4.6879423728631526e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.578125, + "logps/chosen": -608.0, + "logps/rejected": -376.0, + "loss": 0.6578, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5546875, + "rewards/margins": 0.12890625, + "rewards/rejected": -0.68359375, + "step": 272 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 9.7493314743042, + "learning_rate": 4.685185391639795e-07, + "logits/chosen": 2.640625, + "logits/rejected": 2.828125, + "logps/chosen": -508.0, + "logps/rejected": -544.0, + "loss": 0.6402, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.76953125, + "rewards/margins": 0.4453125, + "rewards/rejected": -1.21875, + "step": 273 + }, + { + "epoch": 0.5735217163788592, + "grad_norm": 9.176734924316406, + "learning_rate": 4.6824171033277026e-07, + "logits/chosen": 2.40625, + "logits/rejected": 3.15625, + "logps/chosen": -434.0, + "logps/rejected": -270.0, + "loss": 0.6278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.56640625, + "rewards/margins": 0.47265625, + "rewards/rejected": -1.0390625, + "step": 274 + }, + { + "epoch": 0.5756148613291471, + "grad_norm": 8.987980842590332, + "learning_rate": 4.679637522252829e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.4375, + "logps/chosen": -536.0, + "logps/rejected": -402.0, + "loss": 0.6304, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43359375, + "rewards/margins": 0.2001953125, + "rewards/rejected": -0.6328125, + "step": 275 + }, + { + "epoch": 0.5777080062794349, + "grad_norm": 9.4086332321167, + "learning_rate": 4.676846662799566e-07, + "logits/chosen": 3.484375, + "logits/rejected": 4.21875, + "logps/chosen": -544.0, + "logps/rejected": -416.0, + "loss": 0.6252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41015625, + "rewards/margins": 0.37890625, + "rewards/rejected": -0.7890625, + "step": 276 + }, + { + "epoch": 0.5798011512297226, + "grad_norm": 9.234297752380371, + "learning_rate": 4.6740445394106755e-07, + "logits/chosen": 2.390625, + "logits/rejected": 2.359375, + "logps/chosen": -262.0, + "logps/rejected": -274.0, + "loss": 0.6749, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7109375, + "rewards/margins": -0.02099609375, + "rewards/rejected": -0.69140625, + "step": 277 + }, + { + "epoch": 0.5818942961800104, + "grad_norm": 10.775644302368164, + "learning_rate": 4.6712311665872057e-07, + "logits/chosen": 1.6875, + "logits/rejected": 1.7578125, + "logps/chosen": -468.0, + "logps/rejected": -532.0, + "loss": 0.6863, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.70703125, + "rewards/margins": -0.0751953125, + "rewards/rejected": -0.6328125, + "step": 278 + }, + { + "epoch": 0.5839874411302983, + "grad_norm": 9.250503540039062, + "learning_rate": 4.6684065588884224e-07, + "logits/chosen": 2.265625, + "logits/rejected": 2.59375, + "logps/chosen": -498.0, + "logps/rejected": -478.0, + "loss": 0.6085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.498046875, + "rewards/margins": 0.2333984375, + "rewards/rejected": -0.73046875, + "step": 279 + }, + { + "epoch": 0.5860805860805861, + "grad_norm": 9.45741081237793, + "learning_rate": 4.6655707309317345e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.53125, + "logps/chosen": -600.0, + "logps/rejected": -444.0, + "loss": 0.6351, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6640625, + "rewards/margins": 0.1962890625, + "rewards/rejected": -0.86328125, + "step": 280 + }, + { + "epoch": 0.5881737310308739, + "grad_norm": 9.61277961730957, + "learning_rate": 4.6627236973926126e-07, + "logits/chosen": 2.5625, + "logits/rejected": 2.46875, + "logps/chosen": -376.0, + "logps/rejected": -318.0, + "loss": 0.6356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6328125, + "rewards/margins": 0.14453125, + "rewards/rejected": -0.77734375, + "step": 281 + }, + { + "epoch": 0.5902668759811617, + "grad_norm": 9.949418067932129, + "learning_rate": 4.6598654730045177e-07, + "logits/chosen": 2.28125, + "logits/rejected": 2.15625, + "logps/chosen": -253.0, + "logps/rejected": -336.0, + "loss": 0.6626, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5703125, + "rewards/margins": 0.43359375, + "rewards/rejected": -1.0, + "step": 282 + }, + { + "epoch": 0.5923600209314495, + "grad_norm": 8.826484680175781, + "learning_rate": 4.6569960725588256e-07, + "logits/chosen": 3.609375, + "logits/rejected": 2.734375, + "logps/chosen": -366.0, + "logps/rejected": -584.0, + "loss": 0.627, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.78125, + "rewards/margins": 0.37890625, + "rewards/rejected": -1.15625, + "step": 283 + }, + { + "epoch": 0.5944531658817374, + "grad_norm": 8.73790454864502, + "learning_rate": 4.654115510904746e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.859375, + "logps/chosen": -296.0, + "logps/rejected": -232.0, + "loss": 0.6401, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.55859375, + "rewards/margins": -0.0615234375, + "rewards/rejected": -0.49609375, + "step": 284 + }, + { + "epoch": 0.5965463108320251, + "grad_norm": 10.235679626464844, + "learning_rate": 4.651223802949247e-07, + "logits/chosen": 2.734375, + "logits/rejected": 2.875, + "logps/chosen": -498.0, + "logps/rejected": -376.0, + "loss": 0.682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.53515625, + "rewards/margins": 0.34375, + "rewards/rejected": -0.87890625, + "step": 285 + }, + { + "epoch": 0.5986394557823129, + "grad_norm": 9.037766456604004, + "learning_rate": 4.6483209636569837e-07, + "logits/chosen": 2.59375, + "logits/rejected": 2.46875, + "logps/chosen": -652.0, + "logps/rejected": -660.0, + "loss": 0.6442, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6953125, + "rewards/margins": 0.044921875, + "rewards/rejected": -0.7421875, + "step": 286 + }, + { + "epoch": 0.6007326007326007, + "grad_norm": 8.641436576843262, + "learning_rate": 4.645407008050212e-07, + "logits/chosen": 2.609375, + "logits/rejected": 2.640625, + "logps/chosen": -324.0, + "logps/rejected": -340.0, + "loss": 0.6445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.52734375, + "rewards/margins": 0.3125, + "rewards/rejected": -0.83984375, + "step": 287 + }, + { + "epoch": 0.6028257456828885, + "grad_norm": 9.133179664611816, + "learning_rate": 4.6424819512087166e-07, + "logits/chosen": 2.625, + "logits/rejected": 2.265625, + "logps/chosen": -161.0, + "logps/rejected": -320.0, + "loss": 0.6419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4765625, + "rewards/margins": 0.32421875, + "rewards/rejected": -0.80078125, + "step": 288 + }, + { + "epoch": 0.6049188906331764, + "grad_norm": 10.273738861083984, + "learning_rate": 4.639545808269731e-07, + "logits/chosen": 3.296875, + "logits/rejected": 2.890625, + "logps/chosen": -604.0, + "logps/rejected": -668.0, + "loss": 0.6861, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.71875, + "rewards/margins": -0.162109375, + "rewards/rejected": -0.55859375, + "step": 289 + }, + { + "epoch": 0.6070120355834642, + "grad_norm": 9.593812942504883, + "learning_rate": 4.636598594427858e-07, + "logits/chosen": 2.671875, + "logits/rejected": 2.9375, + "logps/chosen": -672.0, + "logps/rejected": -600.0, + "loss": 0.6522, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.578125, + "rewards/margins": 0.515625, + "rewards/rejected": -1.09375, + "step": 290 + }, + { + "epoch": 0.609105180533752, + "grad_norm": 10.181145668029785, + "learning_rate": 4.6336403249349966e-07, + "logits/chosen": 2.375, + "logits/rejected": 2.796875, + "logps/chosen": -458.0, + "logps/rejected": -410.0, + "loss": 0.6496, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.56640625, + "rewards/margins": 0.43359375, + "rewards/rejected": -1.0, + "step": 291 + }, + { + "epoch": 0.6111983254840397, + "grad_norm": 9.083106994628906, + "learning_rate": 4.630671015100255e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.875, + "logps/chosen": -272.0, + "logps/rejected": -370.0, + "loss": 0.6419, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.734375, + "rewards/margins": 0.16796875, + "rewards/rejected": -0.90234375, + "step": 292 + }, + { + "epoch": 0.6132914704343275, + "grad_norm": 8.897668838500977, + "learning_rate": 4.6276906802898776e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.96875, + "logps/chosen": -304.0, + "logps/rejected": -270.0, + "loss": 0.6371, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.76953125, + "rewards/margins": 0.107421875, + "rewards/rejected": -0.875, + "step": 293 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 8.895344734191895, + "learning_rate": 4.624699335927162e-07, + "logits/chosen": 1.90625, + "logits/rejected": 2.296875, + "logps/chosen": -306.0, + "logps/rejected": -252.0, + "loss": 0.6739, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.546875, + "rewards/margins": 0.1904296875, + "rewards/rejected": -0.73828125, + "step": 294 + }, + { + "epoch": 0.6174777603349032, + "grad_norm": 9.05825138092041, + "learning_rate": 4.6216969974923816e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.71875, + "logps/chosen": -474.0, + "logps/rejected": -376.0, + "loss": 0.6607, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6875, + "rewards/margins": -0.0556640625, + "rewards/rejected": -0.6328125, + "step": 295 + }, + { + "epoch": 0.619570905285191, + "grad_norm": 8.959853172302246, + "learning_rate": 4.618683680522703e-07, + "logits/chosen": 1.3046875, + "logits/rejected": 1.3125, + "logps/chosen": -264.0, + "logps/rejected": -201.0, + "loss": 0.6305, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.490234375, + "rewards/margins": 0.15625, + "rewards/rejected": -0.6484375, + "step": 296 + }, + { + "epoch": 0.6216640502354788, + "grad_norm": 9.268010139465332, + "learning_rate": 4.6156594006121095e-07, + "logits/chosen": 2.15625, + "logits/rejected": 2.109375, + "logps/chosen": -410.0, + "logps/rejected": -600.0, + "loss": 0.6418, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6640625, + "rewards/margins": 0.6875, + "rewards/rejected": -1.3515625, + "step": 297 + }, + { + "epoch": 0.6237571951857667, + "grad_norm": 9.329010963439941, + "learning_rate": 4.612624173411315e-07, + "logits/chosen": 2.578125, + "logits/rejected": 2.5625, + "logps/chosen": -494.0, + "logps/rejected": -548.0, + "loss": 0.6407, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.55859375, + "rewards/margins": 0.30859375, + "rewards/rejected": -0.8671875, + "step": 298 + }, + { + "epoch": 0.6258503401360545, + "grad_norm": 9.748213768005371, + "learning_rate": 4.609578014627687e-07, + "logits/chosen": 2.828125, + "logits/rejected": 3.25, + "logps/chosen": -740.0, + "logps/rejected": -464.0, + "loss": 0.6651, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.63671875, + "rewards/margins": 0.119140625, + "rewards/rejected": -0.7578125, + "step": 299 + }, + { + "epoch": 0.6279434850863422, + "grad_norm": 9.283663749694824, + "learning_rate": 4.6065209400251655e-07, + "logits/chosen": 2.5625, + "logits/rejected": 2.078125, + "logps/chosen": -207.0, + "logps/rejected": -326.0, + "loss": 0.6435, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6328125, + "rewards/margins": 0.201171875, + "rewards/rejected": -0.83203125, + "step": 300 + }, + { + "epoch": 0.63003663003663, + "grad_norm": 10.709654808044434, + "learning_rate": 4.6034529654241766e-07, + "logits/chosen": 3.28125, + "logits/rejected": 2.671875, + "logps/chosen": -292.0, + "logps/rejected": -332.0, + "loss": 0.6894, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.58203125, + "rewards/margins": 0.224609375, + "rewards/rejected": -0.80859375, + "step": 301 + }, + { + "epoch": 0.6321297749869178, + "grad_norm": 9.497162818908691, + "learning_rate": 4.600374106701558e-07, + "logits/chosen": 2.9375, + "logits/rejected": 3.375, + "logps/chosen": -656.0, + "logps/rejected": -458.0, + "loss": 0.6371, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.640625, + "rewards/margins": 0.189453125, + "rewards/rejected": -0.828125, + "step": 302 + }, + { + "epoch": 0.6342229199372057, + "grad_norm": 10.339771270751953, + "learning_rate": 4.597284379790471e-07, + "logits/chosen": 3.203125, + "logits/rejected": 2.84375, + "logps/chosen": -398.0, + "logps/rejected": -490.0, + "loss": 0.6441, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4375, + "rewards/margins": 0.154296875, + "rewards/rejected": -0.58984375, + "step": 303 + }, + { + "epoch": 0.6363160648874935, + "grad_norm": 8.851433753967285, + "learning_rate": 4.5941838006803196e-07, + "logits/chosen": 2.359375, + "logits/rejected": 3.09375, + "logps/chosen": -502.0, + "logps/rejected": -356.0, + "loss": 0.6332, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40234375, + "rewards/margins": 0.271484375, + "rewards/rejected": -0.67578125, + "step": 304 + }, + { + "epoch": 0.6384092098377813, + "grad_norm": 8.970887184143066, + "learning_rate": 4.591072385416671e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.09375, + "logps/chosen": -290.0, + "logps/rejected": -364.0, + "loss": 0.5897, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.578125, + "rewards/margins": 0.083984375, + "rewards/rejected": -0.6640625, + "step": 305 + }, + { + "epoch": 0.640502354788069, + "grad_norm": 9.5183744430542, + "learning_rate": 4.5879501501011657e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.96875, + "logps/chosen": -492.0, + "logps/rejected": -532.0, + "loss": 0.6454, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.828125, + "rewards/margins": 0.0849609375, + "rewards/rejected": -0.9140625, + "step": 306 + }, + { + "epoch": 0.6425954997383568, + "grad_norm": 10.027036666870117, + "learning_rate": 4.5848171108914405e-07, + "logits/chosen": 2.78125, + "logits/rejected": 3.984375, + "logps/chosen": -752.0, + "logps/rejected": -560.0, + "loss": 0.6652, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.68359375, + "rewards/margins": 0.419921875, + "rewards/rejected": -1.1015625, + "step": 307 + }, + { + "epoch": 0.6446886446886447, + "grad_norm": 9.015626907348633, + "learning_rate": 4.581673284001044e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.96875, + "logps/chosen": -378.0, + "logps/rejected": -356.0, + "loss": 0.6544, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.671875, + "rewards/margins": 0.40625, + "rewards/rejected": -1.078125, + "step": 308 + }, + { + "epoch": 0.6467817896389325, + "grad_norm": 8.90971565246582, + "learning_rate": 4.578518685699347e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.734375, + "logps/chosen": -712.0, + "logps/rejected": -580.0, + "loss": 0.6208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.431640625, + "rewards/margins": 0.0654296875, + "rewards/rejected": -0.498046875, + "step": 309 + }, + { + "epoch": 0.6488749345892203, + "grad_norm": 9.74847412109375, + "learning_rate": 4.575353332311466e-07, + "logits/chosen": 2.609375, + "logits/rejected": 2.515625, + "logps/chosen": -278.0, + "logps/rejected": -354.0, + "loss": 0.6351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44921875, + "rewards/margins": 0.330078125, + "rewards/rejected": -0.78125, + "step": 310 + }, + { + "epoch": 0.6509680795395081, + "grad_norm": 9.1494722366333, + "learning_rate": 4.572177240218175e-07, + "logits/chosen": 2.8125, + "logits/rejected": 3.203125, + "logps/chosen": -512.0, + "logps/rejected": -336.0, + "loss": 0.6362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6015625, + "rewards/margins": 0.1484375, + "rewards/rejected": -0.75, + "step": 311 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 8.894120216369629, + "learning_rate": 4.5689904258558203e-07, + "logits/chosen": 2.59375, + "logits/rejected": 2.5625, + "logps/chosen": -183.0, + "logps/rejected": -280.0, + "loss": 0.6426, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.57421875, + "rewards/margins": 0.380859375, + "rewards/rejected": -0.95703125, + "step": 312 + }, + { + "epoch": 0.6551543694400838, + "grad_norm": 9.807157516479492, + "learning_rate": 4.565792905716236e-07, + "logits/chosen": 2.21875, + "logits/rejected": 2.96875, + "logps/chosen": -456.0, + "logps/rejected": -556.0, + "loss": 0.6645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.76171875, + "rewards/margins": 0.55859375, + "rewards/rejected": -1.3203125, + "step": 313 + }, + { + "epoch": 0.6572475143903715, + "grad_norm": 9.278183937072754, + "learning_rate": 4.562584696346659e-07, + "logits/chosen": 2.1875, + "logits/rejected": 2.28125, + "logps/chosen": -245.0, + "logps/rejected": -300.0, + "loss": 0.6436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.578125, + "rewards/margins": 0.08544921875, + "rewards/rejected": -0.6640625, + "step": 314 + }, + { + "epoch": 0.6593406593406593, + "grad_norm": 8.839766502380371, + "learning_rate": 4.5593658143496447e-07, + "logits/chosen": 3.234375, + "logits/rejected": 4.0, + "logps/chosen": -624.0, + "logps/rejected": -404.0, + "loss": 0.6102, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.75390625, + "rewards/margins": 0.2470703125, + "rewards/rejected": -1.0, + "step": 315 + }, + { + "epoch": 0.6614338042909471, + "grad_norm": 10.181482315063477, + "learning_rate": 4.5561362763829763e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.734375, + "logps/chosen": -540.0, + "logps/rejected": -310.0, + "loss": 0.6332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36328125, + "rewards/margins": 0.33984375, + "rewards/rejected": -0.703125, + "step": 316 + }, + { + "epoch": 0.663526949241235, + "grad_norm": 8.602537155151367, + "learning_rate": 4.5528960991595857e-07, + "logits/chosen": 2.671875, + "logits/rejected": 1.984375, + "logps/chosen": -260.0, + "logps/rejected": -229.0, + "loss": 0.6315, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.65234375, + "rewards/margins": -0.12890625, + "rewards/rejected": -0.5234375, + "step": 317 + }, + { + "epoch": 0.6656200941915228, + "grad_norm": 8.840538024902344, + "learning_rate": 4.549645299447461e-07, + "logits/chosen": 1.8046875, + "logits/rejected": 2.46875, + "logps/chosen": -352.0, + "logps/rejected": -524.0, + "loss": 0.6126, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6953125, + "rewards/margins": 0.4296875, + "rewards/rejected": -1.125, + "step": 318 + }, + { + "epoch": 0.6677132391418106, + "grad_norm": 8.675968170166016, + "learning_rate": 4.546383894069561e-07, + "logits/chosen": 2.640625, + "logits/rejected": 3.234375, + "logps/chosen": -692.0, + "logps/rejected": -616.0, + "loss": 0.6181, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.79296875, + "rewards/margins": 0.44140625, + "rewards/rejected": -1.234375, + "step": 319 + }, + { + "epoch": 0.6698063840920984, + "grad_norm": 9.338340759277344, + "learning_rate": 4.54311189990373e-07, + "logits/chosen": 2.265625, + "logits/rejected": 1.859375, + "logps/chosen": -247.0, + "logps/rejected": -402.0, + "loss": 0.6366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4453125, + "rewards/margins": 0.30078125, + "rewards/rejected": -0.74609375, + "step": 320 + }, + { + "epoch": 0.6718995290423861, + "grad_norm": 9.707039833068848, + "learning_rate": 4.5398293338826126e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.859375, + "logps/chosen": -544.0, + "logps/rejected": -442.0, + "loss": 0.6643, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.59375, + "rewards/margins": 0.10546875, + "rewards/rejected": -0.69921875, + "step": 321 + }, + { + "epoch": 0.673992673992674, + "grad_norm": 9.66869068145752, + "learning_rate": 4.5365362129935584e-07, + "logits/chosen": 1.78125, + "logits/rejected": 1.9140625, + "logps/chosen": -392.0, + "logps/rejected": -251.0, + "loss": 0.6646, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.921875, + "rewards/margins": -0.271484375, + "rewards/rejected": -0.65234375, + "step": 322 + }, + { + "epoch": 0.6760858189429618, + "grad_norm": 9.099617958068848, + "learning_rate": 4.5332325542785406e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.59375, + "logps/chosen": -468.0, + "logps/rejected": -474.0, + "loss": 0.6547, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.55078125, + "rewards/margins": 0.6875, + "rewards/rejected": -1.234375, + "step": 323 + }, + { + "epoch": 0.6781789638932496, + "grad_norm": 8.761299133300781, + "learning_rate": 4.5299183748340655e-07, + "logits/chosen": 2.34375, + "logits/rejected": 2.75, + "logps/chosen": -286.0, + "logps/rejected": -212.0, + "loss": 0.6341, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.80859375, + "rewards/margins": 0.0458984375, + "rewards/rejected": -0.85546875, + "step": 324 + }, + { + "epoch": 0.6802721088435374, + "grad_norm": 8.962592124938965, + "learning_rate": 4.526593691811084e-07, + "logits/chosen": 2.140625, + "logits/rejected": 2.5, + "logps/chosen": -472.0, + "logps/rejected": -406.0, + "loss": 0.6351, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.56640625, + "rewards/margins": 0.486328125, + "rewards/rejected": -1.046875, + "step": 325 + }, + { + "epoch": 0.6823652537938252, + "grad_norm": 9.102997779846191, + "learning_rate": 4.5232585224149054e-07, + "logits/chosen": 2.0625, + "logits/rejected": 1.8515625, + "logps/chosen": -334.0, + "logps/rejected": -366.0, + "loss": 0.6107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.609375, + "rewards/margins": 0.365234375, + "rewards/rejected": -0.97265625, + "step": 326 + }, + { + "epoch": 0.6844583987441131, + "grad_norm": 9.190810203552246, + "learning_rate": 4.519912883905105e-07, + "logits/chosen": 2.4375, + "logits/rejected": 1.5546875, + "logps/chosen": -352.0, + "logps/rejected": -442.0, + "loss": 0.6275, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8359375, + "rewards/margins": 0.0498046875, + "rewards/rejected": -0.88671875, + "step": 327 + }, + { + "epoch": 0.6865515436944009, + "grad_norm": 9.286701202392578, + "learning_rate": 4.516556793595433e-07, + "logits/chosen": 2.28125, + "logits/rejected": 2.265625, + "logps/chosen": -372.0, + "logps/rejected": -620.0, + "loss": 0.6347, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7421875, + "rewards/margins": 0.09375, + "rewards/rejected": -0.8359375, + "step": 328 + }, + { + "epoch": 0.6886446886446886, + "grad_norm": 9.687287330627441, + "learning_rate": 4.5131902688537337e-07, + "logits/chosen": 2.078125, + "logits/rejected": 2.4375, + "logps/chosen": -412.0, + "logps/rejected": -334.0, + "loss": 0.6518, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7109375, + "rewards/margins": 0.044921875, + "rewards/rejected": -0.7578125, + "step": 329 + }, + { + "epoch": 0.6907378335949764, + "grad_norm": 9.833063125610352, + "learning_rate": 4.509813327101845e-07, + "logits/chosen": 2.5625, + "logits/rejected": 3.140625, + "logps/chosen": -346.0, + "logps/rejected": -422.0, + "loss": 0.6264, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.84375, + "rewards/margins": 0.2470703125, + "rewards/rejected": -1.0859375, + "step": 330 + }, + { + "epoch": 0.6928309785452642, + "grad_norm": 10.08375358581543, + "learning_rate": 4.5064259858155156e-07, + "logits/chosen": 2.015625, + "logits/rejected": 1.921875, + "logps/chosen": -390.0, + "logps/rejected": -312.0, + "loss": 0.65, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8828125, + "rewards/margins": 0.15234375, + "rewards/rejected": -1.03125, + "step": 331 + }, + { + "epoch": 0.6949241234955521, + "grad_norm": 9.232449531555176, + "learning_rate": 4.503028262524311e-07, + "logits/chosen": 1.984375, + "logits/rejected": 2.640625, + "logps/chosen": -498.0, + "logps/rejected": -144.0, + "loss": 0.6328, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.494140625, + "rewards/margins": 0.07666015625, + "rewards/rejected": -0.5703125, + "step": 332 + }, + { + "epoch": 0.6970172684458399, + "grad_norm": 10.257896423339844, + "learning_rate": 4.4996201748115235e-07, + "logits/chosen": 1.6484375, + "logits/rejected": 1.28125, + "logps/chosen": -115.0, + "logps/rejected": -264.0, + "loss": 0.6495, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.578125, + "rewards/margins": 0.322265625, + "rewards/rejected": -0.8984375, + "step": 333 + }, + { + "epoch": 0.6991104133961277, + "grad_norm": 9.689282417297363, + "learning_rate": 4.4962017403140816e-07, + "logits/chosen": 1.4921875, + "logits/rejected": 1.8046875, + "logps/chosen": -386.0, + "logps/rejected": -376.0, + "loss": 0.6146, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.82421875, + "rewards/margins": 0.14453125, + "rewards/rejected": -0.96875, + "step": 334 + }, + { + "epoch": 0.7012035583464155, + "grad_norm": 9.05044937133789, + "learning_rate": 4.4927729767224616e-07, + "logits/chosen": 2.390625, + "logits/rejected": 2.515625, + "logps/chosen": -346.0, + "logps/rejected": -322.0, + "loss": 0.654, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5546875, + "rewards/margins": 0.244140625, + "rewards/rejected": -0.80078125, + "step": 335 + }, + { + "epoch": 0.7032967032967034, + "grad_norm": 10.08155632019043, + "learning_rate": 4.489333901780587e-07, + "logits/chosen": 2.28125, + "logits/rejected": 1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -552.0, + "loss": 0.6719, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.71875, + "rewards/margins": 0.125, + "rewards/rejected": -0.84375, + "step": 336 + }, + { + "epoch": 0.7053898482469911, + "grad_norm": 8.750693321228027, + "learning_rate": 4.4858845332857485e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.8125, + "logps/chosen": -576.0, + "logps/rejected": -592.0, + "loss": 0.6136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.73828125, + "rewards/margins": 0.5078125, + "rewards/rejected": -1.2421875, + "step": 337 + }, + { + "epoch": 0.7074829931972789, + "grad_norm": 9.24592399597168, + "learning_rate": 4.4824248890885044e-07, + "logits/chosen": 2.8125, + "logits/rejected": 3.15625, + "logps/chosen": -544.0, + "logps/rejected": -362.0, + "loss": 0.6244, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5234375, + "rewards/margins": 0.12890625, + "rewards/rejected": -0.65234375, + "step": 338 + }, + { + "epoch": 0.7095761381475667, + "grad_norm": 9.083433151245117, + "learning_rate": 4.478954987092588e-07, + "logits/chosen": 2.25, + "logits/rejected": 2.0625, + "logps/chosen": -346.0, + "logps/rejected": -226.0, + "loss": 0.6079, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.0, + "rewards/margins": -0.1474609375, + "rewards/rejected": -0.8515625, + "step": 339 + }, + { + "epoch": 0.7116692830978545, + "grad_norm": 9.71336841583252, + "learning_rate": 4.4754748452548186e-07, + "logits/chosen": 2.578125, + "logits/rejected": 2.9375, + "logps/chosen": -576.0, + "logps/rejected": -378.0, + "loss": 0.6613, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.03125, + "rewards/margins": -0.08203125, + "rewards/rejected": -0.953125, + "step": 340 + }, + { + "epoch": 0.7137624280481424, + "grad_norm": 10.675765037536621, + "learning_rate": 4.4719844815850084e-07, + "logits/chosen": 2.8125, + "logits/rejected": 3.390625, + "logps/chosen": -556.0, + "logps/rejected": -744.0, + "loss": 0.6692, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.890625, + "rewards/margins": 0.30859375, + "rewards/rejected": -1.203125, + "step": 341 + }, + { + "epoch": 0.7158555729984302, + "grad_norm": 9.6324462890625, + "learning_rate": 4.468483914145865e-07, + "logits/chosen": 1.8359375, + "logits/rejected": 2.40625, + "logps/chosen": -360.0, + "logps/rejected": -306.0, + "loss": 0.6413, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.1015625, + "rewards/margins": -0.251953125, + "rewards/rejected": -0.8515625, + "step": 342 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 10.155203819274902, + "learning_rate": 4.464973161052901e-07, + "logits/chosen": 1.9453125, + "logits/rejected": 2.046875, + "logps/chosen": -270.0, + "logps/rejected": -284.0, + "loss": 0.6607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.60546875, + "rewards/margins": 0.1669921875, + "rewards/rejected": -0.7734375, + "step": 343 + }, + { + "epoch": 0.7200418628990057, + "grad_norm": 9.593433380126953, + "learning_rate": 4.461452240474343e-07, + "logits/chosen": 2.5, + "logits/rejected": 3.0, + "logps/chosen": -612.0, + "logps/rejected": -580.0, + "loss": 0.6474, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7890625, + "rewards/margins": 0.37109375, + "rewards/rejected": -1.1640625, + "step": 344 + }, + { + "epoch": 0.7221350078492935, + "grad_norm": 9.28181266784668, + "learning_rate": 4.457921170631032e-07, + "logits/chosen": 2.046875, + "logits/rejected": 1.953125, + "logps/chosen": -492.0, + "logps/rejected": -506.0, + "loss": 0.6416, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6171875, + "rewards/margins": 0.24609375, + "rewards/rejected": -0.86328125, + "step": 345 + }, + { + "epoch": 0.7242281527995814, + "grad_norm": 9.405036926269531, + "learning_rate": 4.45437996979633e-07, + "logits/chosen": 1.4609375, + "logits/rejected": 1.90625, + "logps/chosen": -224.0, + "logps/rejected": -186.0, + "loss": 0.6443, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.85546875, + "rewards/margins": -0.16015625, + "rewards/rejected": -0.6953125, + "step": 346 + }, + { + "epoch": 0.7263212977498692, + "grad_norm": 9.424813270568848, + "learning_rate": 4.4508286562960327e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.765625, + "logps/chosen": -326.0, + "logps/rejected": -199.0, + "loss": 0.6361, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.96875, + "rewards/margins": -0.390625, + "rewards/rejected": -0.578125, + "step": 347 + }, + { + "epoch": 0.728414442700157, + "grad_norm": 9.514280319213867, + "learning_rate": 4.447267248508263e-07, + "logits/chosen": 2.859375, + "logits/rejected": 3.421875, + "logps/chosen": -528.0, + "logps/rejected": -458.0, + "loss": 0.6744, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7109375, + "rewards/margins": 0.0849609375, + "rewards/rejected": -0.796875, + "step": 348 + }, + { + "epoch": 0.7305075876504448, + "grad_norm": 9.275189399719238, + "learning_rate": 4.4436957648633847e-07, + "logits/chosen": 2.921875, + "logits/rejected": 3.0625, + "logps/chosen": -370.0, + "logps/rejected": -406.0, + "loss": 0.6114, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7421875, + "rewards/margins": 0.4375, + "rewards/rejected": -1.1796875, + "step": 349 + }, + { + "epoch": 0.7326007326007326, + "grad_norm": 9.640008926391602, + "learning_rate": 4.440114223843906e-07, + "logits/chosen": 2.0625, + "logits/rejected": 3.015625, + "logps/chosen": -398.0, + "logps/rejected": -208.0, + "loss": 0.6386, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7109375, + "rewards/margins": 0.07958984375, + "rewards/rejected": -0.79296875, + "step": 350 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 10.19519329071045, + "learning_rate": 4.436522643984378e-07, + "logits/chosen": 0.310546875, + "logits/rejected": 0.4140625, + "logps/chosen": -186.0, + "logps/rejected": -286.0, + "loss": 0.6782, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.96875, + "rewards/margins": 0.2353515625, + "rewards/rejected": -1.203125, + "step": 351 + }, + { + "epoch": 0.7367870225013082, + "grad_norm": 10.587912559509277, + "learning_rate": 4.4329210438713085e-07, + "logits/chosen": 3.25, + "logits/rejected": 4.0, + "logps/chosen": -540.0, + "logps/rejected": -712.0, + "loss": 0.6384, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.74609375, + "rewards/margins": -0.009765625, + "rewards/rejected": -0.734375, + "step": 352 + }, + { + "epoch": 0.738880167451596, + "grad_norm": 9.61915397644043, + "learning_rate": 4.429309442143055e-07, + "logits/chosen": 2.171875, + "logits/rejected": 2.5, + "logps/chosen": -330.0, + "logps/rejected": -204.0, + "loss": 0.5989, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.64453125, + "rewards/margins": 0.1484375, + "rewards/rejected": -0.796875, + "step": 353 + }, + { + "epoch": 0.7409733124018838, + "grad_norm": 9.3129243850708, + "learning_rate": 4.4256878574897375e-07, + "logits/chosen": 1.875, + "logits/rejected": 2.59375, + "logps/chosen": -185.0, + "logps/rejected": -155.0, + "loss": 0.6421, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5546875, + "rewards/margins": 0.016357421875, + "rewards/rejected": -0.5703125, + "step": 354 + }, + { + "epoch": 0.7430664573521716, + "grad_norm": 9.82224178314209, + "learning_rate": 4.4220563086531347e-07, + "logits/chosen": 2.71875, + "logits/rejected": 2.5625, + "logps/chosen": -452.0, + "logps/rejected": -472.0, + "loss": 0.6432, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.64453125, + "rewards/margins": 0.298828125, + "rewards/rejected": -0.9453125, + "step": 355 + }, + { + "epoch": 0.7451596023024595, + "grad_norm": 7.942782878875732, + "learning_rate": 4.418414814426593e-07, + "logits/chosen": 1.5234375, + "logits/rejected": 1.890625, + "logps/chosen": -236.0, + "logps/rejected": -236.0, + "loss": 0.6293, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7265625, + "rewards/margins": 0.154296875, + "rewards/rejected": -0.8828125, + "step": 356 + }, + { + "epoch": 0.7472527472527473, + "grad_norm": 9.060127258300781, + "learning_rate": 4.414763393654924e-07, + "logits/chosen": 2.796875, + "logits/rejected": 3.078125, + "logps/chosen": -324.0, + "logps/rejected": -340.0, + "loss": 0.6477, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.76171875, + "rewards/margins": 0.2099609375, + "rewards/rejected": -0.97265625, + "step": 357 + }, + { + "epoch": 0.749345892203035, + "grad_norm": 9.260727882385254, + "learning_rate": 4.4111020652343117e-07, + "logits/chosen": 2.71875, + "logits/rejected": 3.21875, + "logps/chosen": -564.0, + "logps/rejected": -370.0, + "loss": 0.6309, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5546875, + "rewards/margins": 0.04541015625, + "rewards/rejected": -0.59765625, + "step": 358 + }, + { + "epoch": 0.7514390371533228, + "grad_norm": 10.070478439331055, + "learning_rate": 4.4074308481122106e-07, + "logits/chosen": 1.953125, + "logits/rejected": 2.625, + "logps/chosen": -418.0, + "logps/rejected": -452.0, + "loss": 0.6358, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.73828125, + "rewards/margins": -0.046875, + "rewards/rejected": -0.69140625, + "step": 359 + }, + { + "epoch": 0.7535321821036107, + "grad_norm": 9.476323127746582, + "learning_rate": 4.4037497612872504e-07, + "logits/chosen": 2.15625, + "logits/rejected": 2.125, + "logps/chosen": -174.0, + "logps/rejected": -528.0, + "loss": 0.6452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6875, + "rewards/margins": 0.859375, + "rewards/rejected": -1.546875, + "step": 360 + }, + { + "epoch": 0.7556253270538985, + "grad_norm": 9.520855903625488, + "learning_rate": 4.4000588238091365e-07, + "logits/chosen": 2.328125, + "logits/rejected": 2.1875, + "logps/chosen": -184.0, + "logps/rejected": -245.0, + "loss": 0.6271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.484375, + "rewards/margins": 0.224609375, + "rewards/rejected": -0.7109375, + "step": 361 + }, + { + "epoch": 0.7577184720041863, + "grad_norm": 9.436513900756836, + "learning_rate": 4.3963580547785513e-07, + "logits/chosen": 2.515625, + "logits/rejected": 2.953125, + "logps/chosen": -560.0, + "logps/rejected": -330.0, + "loss": 0.6483, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.97265625, + "rewards/margins": 0.0458984375, + "rewards/rejected": -1.015625, + "step": 362 + }, + { + "epoch": 0.7598116169544741, + "grad_norm": 9.640151023864746, + "learning_rate": 4.3926474733470554e-07, + "logits/chosen": 2.796875, + "logits/rejected": 3.53125, + "logps/chosen": -636.0, + "logps/rejected": -494.0, + "loss": 0.6107, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.53515625, + "rewards/margins": 0.21875, + "rewards/rejected": -0.75390625, + "step": 363 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 10.11239242553711, + "learning_rate": 4.3889270987169904e-07, + "logits/chosen": 2.09375, + "logits/rejected": 2.59375, + "logps/chosen": -382.0, + "logps/rejected": -460.0, + "loss": 0.6359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.78515625, + "rewards/margins": 0.70703125, + "rewards/rejected": -1.4921875, + "step": 364 + }, + { + "epoch": 0.7639979068550498, + "grad_norm": 10.607931137084961, + "learning_rate": 4.385196950141377e-07, + "logits/chosen": 2.09375, + "logits/rejected": 1.953125, + "logps/chosen": -348.0, + "logps/rejected": -264.0, + "loss": 0.6619, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.54296875, + "rewards/margins": 0.171875, + "rewards/rejected": -0.71484375, + "step": 365 + }, + { + "epoch": 0.7660910518053375, + "grad_norm": 10.365743637084961, + "learning_rate": 4.381457046923815e-07, + "logits/chosen": 2.109375, + "logits/rejected": 2.34375, + "logps/chosen": -500.0, + "logps/rejected": -482.0, + "loss": 0.6543, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.078125, + "rewards/margins": 0.134765625, + "rewards/rejected": -1.2109375, + "step": 366 + }, + { + "epoch": 0.7681841967556253, + "grad_norm": 9.481950759887695, + "learning_rate": 4.377707408418387e-07, + "logits/chosen": 2.09375, + "logits/rejected": 2.65625, + "logps/chosen": -452.0, + "logps/rejected": -312.0, + "loss": 0.6312, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.90234375, + "rewards/margins": 0.08203125, + "rewards/rejected": -0.984375, + "step": 367 + }, + { + "epoch": 0.7702773417059131, + "grad_norm": 10.416890144348145, + "learning_rate": 4.373948054029554e-07, + "logits/chosen": 2.765625, + "logits/rejected": 3.4375, + "logps/chosen": -820.0, + "logps/rejected": -780.0, + "loss": 0.5978, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.349609375, + "rewards/margins": 0.51953125, + "rewards/rejected": -0.8671875, + "step": 368 + }, + { + "epoch": 0.7723704866562009, + "grad_norm": 10.182770729064941, + "learning_rate": 4.3701790032120584e-07, + "logits/chosen": 2.421875, + "logits/rejected": 2.484375, + "logps/chosen": -452.0, + "logps/rejected": -548.0, + "loss": 0.6289, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9453125, + "rewards/margins": 0.1630859375, + "rewards/rejected": -1.109375, + "step": 369 + }, + { + "epoch": 0.7744636316064888, + "grad_norm": 8.755770683288574, + "learning_rate": 4.3664002754708203e-07, + "logits/chosen": 2.375, + "logits/rejected": 2.875, + "logps/chosen": -408.0, + "logps/rejected": -362.0, + "loss": 0.618, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9375, + "rewards/margins": -0.022216796875, + "rewards/rejected": -0.91796875, + "step": 370 + }, + { + "epoch": 0.7765567765567766, + "grad_norm": 10.320544242858887, + "learning_rate": 4.362611890360839e-07, + "logits/chosen": 2.6875, + "logits/rejected": 3.171875, + "logps/chosen": -450.0, + "logps/rejected": -408.0, + "loss": 0.6703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.69140625, + "rewards/margins": -0.0234375, + "rewards/rejected": -0.66796875, + "step": 371 + }, + { + "epoch": 0.7786499215070644, + "grad_norm": 9.803793907165527, + "learning_rate": 4.358813867487092e-07, + "logits/chosen": 2.421875, + "logits/rejected": 2.234375, + "logps/chosen": -1004.0, + "logps/rejected": -540.0, + "loss": 0.6332, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.64453125, + "rewards/margins": 0.271484375, + "rewards/rejected": -0.9140625, + "step": 372 + }, + { + "epoch": 0.7807430664573521, + "grad_norm": 9.712671279907227, + "learning_rate": 4.3550062265044304e-07, + "logits/chosen": 1.9296875, + "logits/rejected": 1.6875, + "logps/chosen": -660.0, + "logps/rejected": -508.0, + "loss": 0.6387, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5625, + "rewards/margins": 0.0849609375, + "rewards/rejected": -0.6484375, + "step": 373 + }, + { + "epoch": 0.7828362114076399, + "grad_norm": 9.95979118347168, + "learning_rate": 4.351188987117479e-07, + "logits/chosen": 2.953125, + "logits/rejected": 3.15625, + "logps/chosen": -648.0, + "logps/rejected": -536.0, + "loss": 0.6454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.65234375, + "rewards/margins": 0.8828125, + "rewards/rejected": -1.53125, + "step": 374 + }, + { + "epoch": 0.7849293563579278, + "grad_norm": 9.249582290649414, + "learning_rate": 4.3473621690805376e-07, + "logits/chosen": 2.625, + "logits/rejected": 2.96875, + "logps/chosen": -328.0, + "logps/rejected": -193.0, + "loss": 0.6431, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.77734375, + "rewards/margins": 0.07373046875, + "rewards/rejected": -0.8515625, + "step": 375 + }, + { + "epoch": 0.7870225013082156, + "grad_norm": 9.266115188598633, + "learning_rate": 4.343525792197472e-07, + "logits/chosen": 2.203125, + "logits/rejected": 2.109375, + "logps/chosen": -266.0, + "logps/rejected": -330.0, + "loss": 0.6174, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.84375, + "rewards/margins": 0.578125, + "rewards/rejected": -1.421875, + "step": 376 + }, + { + "epoch": 0.7891156462585034, + "grad_norm": 9.573073387145996, + "learning_rate": 4.339679876321619e-07, + "logits/chosen": 2.8125, + "logits/rejected": 3.0625, + "logps/chosen": -700.0, + "logps/rejected": -494.0, + "loss": 0.6442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8359375, + "rewards/margins": 0.4453125, + "rewards/rejected": -1.28125, + "step": 377 + }, + { + "epoch": 0.7912087912087912, + "grad_norm": 9.77106761932373, + "learning_rate": 4.335824441355677e-07, + "logits/chosen": 1.8203125, + "logits/rejected": 2.453125, + "logps/chosen": -624.0, + "logps/rejected": -376.0, + "loss": 0.6366, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.78125, + "rewards/margins": -0.1123046875, + "rewards/rejected": -0.66796875, + "step": 378 + }, + { + "epoch": 0.7933019361590791, + "grad_norm": 9.449440002441406, + "learning_rate": 4.331959507251606e-07, + "logits/chosen": 2.09375, + "logits/rejected": 2.328125, + "logps/chosen": -162.0, + "logps/rejected": -162.0, + "loss": 0.6182, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5, + "rewards/margins": 0.0927734375, + "rewards/rejected": -0.59375, + "step": 379 + }, + { + "epoch": 0.7953950811093669, + "grad_norm": 9.60571575164795, + "learning_rate": 4.3280850940105243e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.53125, + "logps/chosen": -418.0, + "logps/rejected": -506.0, + "loss": 0.6176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.474609375, + "rewards/margins": 0.71484375, + "rewards/rejected": -1.1875, + "step": 380 + }, + { + "epoch": 0.7974882260596546, + "grad_norm": 9.75421142578125, + "learning_rate": 4.3242012216826084e-07, + "logits/chosen": 2.65625, + "logits/rejected": 2.578125, + "logps/chosen": -418.0, + "logps/rejected": -344.0, + "loss": 0.6361, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4609375, + "rewards/margins": 0.234375, + "rewards/rejected": -0.6953125, + "step": 381 + }, + { + "epoch": 0.7995813710099424, + "grad_norm": 9.220489501953125, + "learning_rate": 4.3203079103669807e-07, + "logits/chosen": 2.125, + "logits/rejected": 1.9453125, + "logps/chosen": -171.0, + "logps/rejected": -264.0, + "loss": 0.5963, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8125, + "rewards/margins": 0.17578125, + "rewards/rejected": -0.98828125, + "step": 382 + }, + { + "epoch": 0.8016745159602302, + "grad_norm": 9.26289176940918, + "learning_rate": 4.316405180211615e-07, + "logits/chosen": 2.6875, + "logits/rejected": 2.296875, + "logps/chosen": -334.0, + "logps/rejected": -520.0, + "loss": 0.6377, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7265625, + "rewards/margins": 0.0205078125, + "rewards/rejected": -0.75, + "step": 383 + }, + { + "epoch": 0.8037676609105181, + "grad_norm": 10.633591651916504, + "learning_rate": 4.312493051413224e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.6875, + "logps/chosen": -372.0, + "logps/rejected": -342.0, + "loss": 0.6234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.69140625, + "rewards/margins": 0.337890625, + "rewards/rejected": -1.03125, + "step": 384 + }, + { + "epoch": 0.8058608058608059, + "grad_norm": 9.378695487976074, + "learning_rate": 4.308571544217161e-07, + "logits/chosen": 2.8125, + "logits/rejected": 3.59375, + "logps/chosen": -592.0, + "logps/rejected": -592.0, + "loss": 0.6395, + "rewards/accuracies": 0.0, + "rewards/chosen": -1.1796875, + "rewards/margins": -0.185546875, + "rewards/rejected": -0.99609375, + "step": 385 + }, + { + "epoch": 0.8079539508110937, + "grad_norm": 10.046592712402344, + "learning_rate": 4.3046406789173123e-07, + "logits/chosen": 2.3125, + "logits/rejected": 2.21875, + "logps/chosen": -572.0, + "logps/rejected": -560.0, + "loss": 0.6069, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0078125, + "rewards/margins": 0.0029296875, + "rewards/rejected": -1.015625, + "step": 386 + }, + { + "epoch": 0.8100470957613815, + "grad_norm": 9.885677337646484, + "learning_rate": 4.300700475855992e-07, + "logits/chosen": 2.5625, + "logits/rejected": 2.78125, + "logps/chosen": -274.0, + "logps/rejected": -199.0, + "loss": 0.6354, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.95703125, + "rewards/margins": -0.248046875, + "rewards/rejected": -0.7109375, + "step": 387 + }, + { + "epoch": 0.8121402407116692, + "grad_norm": 10.309823036193848, + "learning_rate": 4.296750955423837e-07, + "logits/chosen": 3.34375, + "logits/rejected": 2.859375, + "logps/chosen": -580.0, + "logps/rejected": -672.0, + "loss": 0.6669, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.80859375, + "rewards/margins": -0.0576171875, + "rewards/rejected": -0.75, + "step": 388 + }, + { + "epoch": 0.8142333856619571, + "grad_norm": 9.416850090026855, + "learning_rate": 4.2927921380597037e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.828125, + "logps/chosen": -240.0, + "logps/rejected": -256.0, + "loss": 0.6156, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.75, + "rewards/margins": -0.09375, + "rewards/rejected": -0.65625, + "step": 389 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 9.89782428741455, + "learning_rate": 4.288824044250558e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.765625, + "logps/chosen": -516.0, + "logps/rejected": -716.0, + "loss": 0.609, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.59765625, + "rewards/margins": 0.7734375, + "rewards/rejected": -1.375, + "step": 390 + }, + { + "epoch": 0.8184196755625327, + "grad_norm": 8.746621131896973, + "learning_rate": 4.284846694531373e-07, + "logits/chosen": 1.921875, + "logits/rejected": 2.40625, + "logps/chosen": -260.0, + "logps/rejected": -193.0, + "loss": 0.6066, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.98828125, + "rewards/margins": 0.087890625, + "rewards/rejected": -1.078125, + "step": 391 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 9.77097225189209, + "learning_rate": 4.2808601094850214e-07, + "logits/chosen": 2.59375, + "logits/rejected": 3.53125, + "logps/chosen": -792.0, + "logps/rejected": -470.0, + "loss": 0.575, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.03125, + "rewards/margins": 0.1416015625, + "rewards/rejected": -1.171875, + "step": 392 + }, + { + "epoch": 0.8226059654631083, + "grad_norm": 9.895403861999512, + "learning_rate": 4.276864309742169e-07, + "logits/chosen": 2.328125, + "logits/rejected": 2.21875, + "logps/chosen": -572.0, + "logps/rejected": -468.0, + "loss": 0.658, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0390625, + "rewards/margins": 0.046875, + "rewards/rejected": -1.0859375, + "step": 393 + }, + { + "epoch": 0.8246991104133962, + "grad_norm": 10.531278610229492, + "learning_rate": 4.2728593159811667e-07, + "logits/chosen": 2.140625, + "logits/rejected": 2.65625, + "logps/chosen": -412.0, + "logps/rejected": -255.0, + "loss": 0.6505, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6796875, + "rewards/margins": 0.2109375, + "rewards/rejected": -0.890625, + "step": 394 + }, + { + "epoch": 0.826792255363684, + "grad_norm": 9.876930236816406, + "learning_rate": 4.268845148927945e-07, + "logits/chosen": 1.7890625, + "logits/rejected": 1.78125, + "logps/chosen": -536.0, + "logps/rejected": -672.0, + "loss": 0.6202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8515625, + "rewards/margins": 0.87109375, + "rewards/rejected": -1.7265625, + "step": 395 + }, + { + "epoch": 0.8288854003139717, + "grad_norm": 9.344927787780762, + "learning_rate": 4.264821829355908e-07, + "logits/chosen": 2.734375, + "logits/rejected": 3.3125, + "logps/chosen": -564.0, + "logps/rejected": -410.0, + "loss": 0.6163, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6953125, + "rewards/margins": 0.359375, + "rewards/rejected": -1.0546875, + "step": 396 + }, + { + "epoch": 0.8309785452642595, + "grad_norm": 11.048480987548828, + "learning_rate": 4.260789378085821e-07, + "logits/chosen": 1.890625, + "logits/rejected": 1.8671875, + "logps/chosen": -238.0, + "logps/rejected": -228.0, + "loss": 0.6577, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.890625, + "rewards/margins": 0.0693359375, + "rewards/rejected": -0.9609375, + "step": 397 + }, + { + "epoch": 0.8330716902145474, + "grad_norm": 10.133160591125488, + "learning_rate": 4.2567478159857087e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.390625, + "logps/chosen": -640.0, + "logps/rejected": -608.0, + "loss": 0.6498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.80078125, + "rewards/margins": 0.349609375, + "rewards/rejected": -1.1484375, + "step": 398 + }, + { + "epoch": 0.8351648351648352, + "grad_norm": 10.660299301147461, + "learning_rate": 4.2526971639707456e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.8125, + "logps/chosen": -664.0, + "logps/rejected": -506.0, + "loss": 0.6444, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.046875, + "rewards/margins": -0.0458984375, + "rewards/rejected": -1.0, + "step": 399 + }, + { + "epoch": 0.837257980115123, + "grad_norm": 9.645748138427734, + "learning_rate": 4.248637443003144e-07, + "logits/chosen": 2.203125, + "logits/rejected": 2.515625, + "logps/chosen": -302.0, + "logps/rejected": -219.0, + "loss": 0.6131, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.69140625, + "rewards/margins": 0.06103515625, + "rewards/rejected": -0.75390625, + "step": 400 + }, + { + "epoch": 0.8393511250654108, + "grad_norm": 10.709028244018555, + "learning_rate": 4.2445686740920484e-07, + "logits/chosen": 3.25, + "logits/rejected": 2.9375, + "logps/chosen": -450.0, + "logps/rejected": -480.0, + "loss": 0.6438, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.046875, + "rewards/margins": -0.052734375, + "rewards/rejected": -0.9921875, + "step": 401 + }, + { + "epoch": 0.8414442700156985, + "grad_norm": 9.625758171081543, + "learning_rate": 4.240490878293428e-07, + "logits/chosen": 2.1875, + "logits/rejected": 2.734375, + "logps/chosen": -296.0, + "logps/rejected": -207.0, + "loss": 0.625, + "rewards/accuracies": 0.0, + "rewards/chosen": -1.1328125, + "rewards/margins": -0.181640625, + "rewards/rejected": -0.953125, + "step": 402 + }, + { + "epoch": 0.8435374149659864, + "grad_norm": 9.67353630065918, + "learning_rate": 4.236404076709967e-07, + "logits/chosen": 1.734375, + "logits/rejected": 2.25, + "logps/chosen": -338.0, + "logps/rejected": -430.0, + "loss": 0.5896, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.875, + "rewards/margins": 0.40234375, + "rewards/rejected": -1.28125, + "step": 403 + }, + { + "epoch": 0.8456305599162742, + "grad_norm": 10.118279457092285, + "learning_rate": 4.232308290490952e-07, + "logits/chosen": 2.40625, + "logits/rejected": 2.75, + "logps/chosen": -808.0, + "logps/rejected": -692.0, + "loss": 0.6401, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.77734375, + "rewards/margins": 0.42578125, + "rewards/rejected": -1.203125, + "step": 404 + }, + { + "epoch": 0.847723704866562, + "grad_norm": 9.055681228637695, + "learning_rate": 4.2282035408321663e-07, + "logits/chosen": 2.484375, + "logits/rejected": 3.046875, + "logps/chosen": -600.0, + "logps/rejected": -684.0, + "loss": 0.6226, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0234375, + "rewards/margins": -0.005859375, + "rewards/rejected": -1.015625, + "step": 405 + }, + { + "epoch": 0.8498168498168498, + "grad_norm": 10.447412490844727, + "learning_rate": 4.2240898489757816e-07, + "logits/chosen": 1.671875, + "logits/rejected": 1.6875, + "logps/chosen": -394.0, + "logps/rejected": -314.0, + "loss": 0.6685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.703125, + "rewards/margins": 0.59375, + "rewards/rejected": -1.296875, + "step": 406 + }, + { + "epoch": 0.8519099947671376, + "grad_norm": 9.07568359375, + "learning_rate": 4.2199672362102435e-07, + "logits/chosen": 1.828125, + "logits/rejected": 2.546875, + "logps/chosen": -416.0, + "logps/rejected": -328.0, + "loss": 0.6175, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.015625, + "rewards/margins": 0.01171875, + "rewards/rejected": -1.03125, + "step": 407 + }, + { + "epoch": 0.8540031397174255, + "grad_norm": 11.152243614196777, + "learning_rate": 4.215835723870162e-07, + "logits/chosen": 2.421875, + "logits/rejected": 2.421875, + "logps/chosen": -330.0, + "logps/rejected": -608.0, + "loss": 0.6663, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.69140625, + "rewards/margins": 0.5, + "rewards/rejected": -1.1875, + "step": 408 + }, + { + "epoch": 0.8560962846677133, + "grad_norm": 9.441628456115723, + "learning_rate": 4.211695333336206e-07, + "logits/chosen": 3.328125, + "logits/rejected": 2.9375, + "logps/chosen": -648.0, + "logps/rejected": -528.0, + "loss": 0.6234, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0546875, + "rewards/margins": 0.046875, + "rewards/rejected": -1.1015625, + "step": 409 + }, + { + "epoch": 0.858189429618001, + "grad_norm": 9.586783409118652, + "learning_rate": 4.207546086034987e-07, + "logits/chosen": 2.6875, + "logits/rejected": 2.234375, + "logps/chosen": -588.0, + "logps/rejected": -892.0, + "loss": 0.6212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.62109375, + "rewards/margins": 0.38671875, + "rewards/rejected": -1.0078125, + "step": 410 + }, + { + "epoch": 0.8602825745682888, + "grad_norm": 9.743223190307617, + "learning_rate": 4.203388003438951e-07, + "logits/chosen": 1.4921875, + "logits/rejected": 1.625, + "logps/chosen": -264.0, + "logps/rejected": -245.0, + "loss": 0.6136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7578125, + "rewards/margins": 0.255859375, + "rewards/rejected": -1.015625, + "step": 411 + }, + { + "epoch": 0.8623757195185766, + "grad_norm": 10.374858856201172, + "learning_rate": 4.1992211070662686e-07, + "logits/chosen": 1.3984375, + "logits/rejected": 1.1015625, + "logps/chosen": -372.0, + "logps/rejected": -488.0, + "loss": 0.6324, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9921875, + "rewards/margins": 0.0126953125, + "rewards/rejected": -1.0078125, + "step": 412 + }, + { + "epoch": 0.8644688644688645, + "grad_norm": 9.237102508544922, + "learning_rate": 4.195045418480717e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.09375, + "logps/chosen": -416.0, + "logps/rejected": -434.0, + "loss": 0.614, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.0234375, + "rewards/margins": -0.2109375, + "rewards/rejected": -0.8125, + "step": 413 + }, + { + "epoch": 0.8665620094191523, + "grad_norm": 10.228826522827148, + "learning_rate": 4.19086095929158e-07, + "logits/chosen": 2.3125, + "logits/rejected": 2.28125, + "logps/chosen": -360.0, + "logps/rejected": -368.0, + "loss": 0.6384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.890625, + "rewards/margins": 0.1767578125, + "rewards/rejected": -1.0625, + "step": 414 + }, + { + "epoch": 0.8686551543694401, + "grad_norm": 9.450304985046387, + "learning_rate": 4.1866677511535237e-07, + "logits/chosen": 1.953125, + "logits/rejected": 1.4609375, + "logps/chosen": -154.0, + "logps/rejected": -292.0, + "loss": 0.6115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.91796875, + "rewards/margins": 0.35546875, + "rewards/rejected": -1.2734375, + "step": 415 + }, + { + "epoch": 0.8707482993197279, + "grad_norm": 10.266785621643066, + "learning_rate": 4.1824658157664935e-07, + "logits/chosen": 2.359375, + "logits/rejected": 2.140625, + "logps/chosen": -400.0, + "logps/rejected": -490.0, + "loss": 0.6201, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1875, + "rewards/margins": 0.1982421875, + "rewards/rejected": -1.3828125, + "step": 416 + }, + { + "epoch": 0.8728414442700158, + "grad_norm": 11.08837890625, + "learning_rate": 4.1782551748755954e-07, + "logits/chosen": 1.5, + "logits/rejected": 2.1875, + "logps/chosen": -350.0, + "logps/rejected": -278.0, + "loss": 0.6378, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.890625, + "rewards/margins": 0.1796875, + "rewards/rejected": -1.0703125, + "step": 417 + }, + { + "epoch": 0.8749345892203035, + "grad_norm": 9.815693855285645, + "learning_rate": 4.174035850270993e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.109375, + "logps/chosen": -792.0, + "logps/rejected": -420.0, + "loss": 0.6368, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.28125, + "rewards/margins": -0.50390625, + "rewards/rejected": -0.77734375, + "step": 418 + }, + { + "epoch": 0.8770277341705913, + "grad_norm": 9.853411674499512, + "learning_rate": 4.1698078637877795e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.796875, + "logps/chosen": -928.0, + "logps/rejected": -800.0, + "loss": 0.64, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7265625, + "rewards/margins": 0.31640625, + "rewards/rejected": -1.046875, + "step": 419 + }, + { + "epoch": 0.8791208791208791, + "grad_norm": 8.987032890319824, + "learning_rate": 4.165571237305881e-07, + "logits/chosen": 1.7890625, + "logits/rejected": 2.046875, + "logps/chosen": -312.0, + "logps/rejected": -196.0, + "loss": 0.618, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.96484375, + "rewards/margins": -0.1591796875, + "rewards/rejected": -0.8046875, + "step": 420 + }, + { + "epoch": 0.8812140240711669, + "grad_norm": 9.354534149169922, + "learning_rate": 4.161325992749931e-07, + "logits/chosen": 2.34375, + "logits/rejected": 2.375, + "logps/chosen": -840.0, + "logps/rejected": -496.0, + "loss": 0.6295, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.921875, + "rewards/margins": 0.30859375, + "rewards/rejected": -1.2265625, + "step": 421 + }, + { + "epoch": 0.8833071690214548, + "grad_norm": 9.199723243713379, + "learning_rate": 4.1570721520891646e-07, + "logits/chosen": 3.453125, + "logits/rejected": 3.078125, + "logps/chosen": -688.0, + "logps/rejected": -792.0, + "loss": 0.6236, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.9296875, + "rewards/margins": -0.1298828125, + "rewards/rejected": -0.80078125, + "step": 422 + }, + { + "epoch": 0.8854003139717426, + "grad_norm": 9.82619571685791, + "learning_rate": 4.1528097373373e-07, + "logits/chosen": 1.921875, + "logits/rejected": 2.125, + "logps/chosen": -616.0, + "logps/rejected": -524.0, + "loss": 0.5998, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.85546875, + "rewards/margins": 0.0830078125, + "rewards/rejected": -0.9375, + "step": 423 + }, + { + "epoch": 0.8874934589220304, + "grad_norm": 9.572153091430664, + "learning_rate": 4.1485387705524277e-07, + "logits/chosen": 1.8671875, + "logits/rejected": 1.65625, + "logps/chosen": -476.0, + "logps/rejected": -508.0, + "loss": 0.6332, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.83203125, + "rewards/margins": 0.328125, + "rewards/rejected": -1.15625, + "step": 424 + }, + { + "epoch": 0.8895866038723181, + "grad_norm": 9.886564254760742, + "learning_rate": 4.144259273836896e-07, + "logits/chosen": 2.609375, + "logits/rejected": 2.921875, + "logps/chosen": -442.0, + "logps/rejected": -296.0, + "loss": 0.6237, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.78125, + "rewards/margins": 0.0859375, + "rewards/rejected": -0.8671875, + "step": 425 + }, + { + "epoch": 0.8916797488226059, + "grad_norm": 9.998958587646484, + "learning_rate": 4.139971269337192e-07, + "logits/chosen": 2.640625, + "logits/rejected": 2.6875, + "logps/chosen": -360.0, + "logps/rejected": -416.0, + "loss": 0.6264, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.87890625, + "rewards/margins": 0.0615234375, + "rewards/rejected": -0.94140625, + "step": 426 + }, + { + "epoch": 0.8937728937728938, + "grad_norm": 10.484004974365234, + "learning_rate": 4.135674779243835e-07, + "logits/chosen": 2.65625, + "logits/rejected": 2.6875, + "logps/chosen": -270.0, + "logps/rejected": -440.0, + "loss": 0.5778, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7109375, + "rewards/margins": 0.453125, + "rewards/rejected": -1.1640625, + "step": 427 + }, + { + "epoch": 0.8958660387231816, + "grad_norm": 9.723721504211426, + "learning_rate": 4.131369825791256e-07, + "logits/chosen": 2.515625, + "logits/rejected": 2.59375, + "logps/chosen": -564.0, + "logps/rejected": -488.0, + "loss": 0.5828, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8671875, + "rewards/margins": 0.4765625, + "rewards/rejected": -1.34375, + "step": 428 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 10.536715507507324, + "learning_rate": 4.127056431257683e-07, + "logits/chosen": 1.703125, + "logits/rejected": 2.125, + "logps/chosen": -334.0, + "logps/rejected": -241.0, + "loss": 0.6463, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.87109375, + "rewards/margins": -0.0751953125, + "rewards/rejected": -0.796875, + "step": 429 + }, + { + "epoch": 0.9000523286237572, + "grad_norm": 10.433703422546387, + "learning_rate": 4.1227346179650286e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.9375, + "logps/chosen": -540.0, + "logps/rejected": -720.0, + "loss": 0.6252, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.921875, + "rewards/margins": 0.169921875, + "rewards/rejected": -1.09375, + "step": 430 + }, + { + "epoch": 0.902145473574045, + "grad_norm": 10.272560119628906, + "learning_rate": 4.118404408278771e-07, + "logits/chosen": 3.28125, + "logits/rejected": 2.03125, + "logps/chosen": -442.0, + "logps/rejected": -588.0, + "loss": 0.6508, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.96484375, + "rewards/margins": -0.119140625, + "rewards/rejected": -0.84375, + "step": 431 + }, + { + "epoch": 0.9042386185243328, + "grad_norm": 9.89719009399414, + "learning_rate": 4.11406582460784e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.40625, + "logps/chosen": -924.0, + "logps/rejected": -624.0, + "loss": 0.6147, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0, + "rewards/margins": 0.052734375, + "rewards/rejected": -1.0546875, + "step": 432 + }, + { + "epoch": 0.9063317634746206, + "grad_norm": 9.447464942932129, + "learning_rate": 4.109718889404503e-07, + "logits/chosen": 1.109375, + "logits/rejected": 1.0078125, + "logps/chosen": -214.0, + "logps/rejected": -262.0, + "loss": 0.636, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.046875, + "rewards/margins": -0.0390625, + "rewards/rejected": -1.0078125, + "step": 433 + }, + { + "epoch": 0.9084249084249084, + "grad_norm": 10.362798690795898, + "learning_rate": 4.1053636251642456e-07, + "logits/chosen": 1.84375, + "logits/rejected": 2.703125, + "logps/chosen": -552.0, + "logps/rejected": -420.0, + "loss": 0.6307, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.53125, + "rewards/margins": -0.240234375, + "rewards/rejected": -1.296875, + "step": 434 + }, + { + "epoch": 0.9105180533751962, + "grad_norm": 9.440129280090332, + "learning_rate": 4.1010000544256536e-07, + "logits/chosen": 1.546875, + "logits/rejected": 1.4453125, + "logps/chosen": -456.0, + "logps/rejected": -390.0, + "loss": 0.6167, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0078125, + "rewards/margins": 0.08984375, + "rewards/rejected": -1.1015625, + "step": 435 + }, + { + "epoch": 0.912611198325484, + "grad_norm": 10.998126983642578, + "learning_rate": 4.096628199770304e-07, + "logits/chosen": 2.71875, + "logits/rejected": 2.671875, + "logps/chosen": -696.0, + "logps/rejected": -716.0, + "loss": 0.6675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7421875, + "rewards/margins": 0.390625, + "rewards/rejected": -1.1328125, + "step": 436 + }, + { + "epoch": 0.9147043432757719, + "grad_norm": 9.635455131530762, + "learning_rate": 4.0922480838226394e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.125, + "logps/chosen": -436.0, + "logps/rejected": -812.0, + "loss": 0.6078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.640625, + "rewards/margins": 0.50390625, + "rewards/rejected": -1.140625, + "step": 437 + }, + { + "epoch": 0.9167974882260597, + "grad_norm": 10.190224647521973, + "learning_rate": 4.0878597292498576e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.515625, + "logps/chosen": -346.0, + "logps/rejected": -524.0, + "loss": 0.6652, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7421875, + "rewards/margins": 0.27734375, + "rewards/rejected": -1.015625, + "step": 438 + }, + { + "epoch": 0.9188906331763474, + "grad_norm": 9.4074125289917, + "learning_rate": 4.083463158761789e-07, + "logits/chosen": 2.421875, + "logits/rejected": 2.1875, + "logps/chosen": -452.0, + "logps/rejected": -502.0, + "loss": 0.6276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7890625, + "rewards/margins": 0.328125, + "rewards/rejected": -1.1171875, + "step": 439 + }, + { + "epoch": 0.9209837781266352, + "grad_norm": 10.4319429397583, + "learning_rate": 4.079058395110782e-07, + "logits/chosen": 2.09375, + "logits/rejected": 2.421875, + "logps/chosen": -480.0, + "logps/rejected": -376.0, + "loss": 0.642, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.0234375, + "rewards/margins": 0.06396484375, + "rewards/rejected": -1.0859375, + "step": 440 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 10.221841812133789, + "learning_rate": 4.074645461091587e-07, + "logits/chosen": 2.5, + "logits/rejected": 2.671875, + "logps/chosen": -496.0, + "logps/rejected": -440.0, + "loss": 0.5748, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.640625, + "rewards/margins": 0.515625, + "rewards/rejected": -1.15625, + "step": 441 + }, + { + "epoch": 0.9251700680272109, + "grad_norm": 10.588394165039062, + "learning_rate": 4.0702243795412343e-07, + "logits/chosen": 2.734375, + "logits/rejected": 3.453125, + "logps/chosen": -608.0, + "logps/rejected": -456.0, + "loss": 0.6535, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2421875, + "rewards/margins": 0.23046875, + "rewards/rejected": -1.46875, + "step": 442 + }, + { + "epoch": 0.9272632129774987, + "grad_norm": 9.824782371520996, + "learning_rate": 4.065795173338918e-07, + "logits/chosen": 1.8671875, + "logits/rejected": 2.09375, + "logps/chosen": -476.0, + "logps/rejected": -476.0, + "loss": 0.6246, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.265625, + "rewards/margins": -0.26171875, + "rewards/rejected": -1.0078125, + "step": 443 + }, + { + "epoch": 0.9293563579277865, + "grad_norm": 10.064045906066895, + "learning_rate": 4.061357865405877e-07, + "logits/chosen": 1.9609375, + "logits/rejected": 2.171875, + "logps/chosen": -400.0, + "logps/rejected": -332.0, + "loss": 0.6211, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.86328125, + "rewards/margins": 0.23828125, + "rewards/rejected": -1.1015625, + "step": 444 + }, + { + "epoch": 0.9314495028780743, + "grad_norm": 9.577927589416504, + "learning_rate": 4.056912478705279e-07, + "logits/chosen": 2.21875, + "logits/rejected": 2.5625, + "logps/chosen": -560.0, + "logps/rejected": -342.0, + "loss": 0.6292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7734375, + "rewards/margins": 0.3828125, + "rewards/rejected": -1.15625, + "step": 445 + }, + { + "epoch": 0.9335426478283622, + "grad_norm": 9.735443115234375, + "learning_rate": 4.052459036242096e-07, + "logits/chosen": 2.0, + "logits/rejected": 2.0625, + "logps/chosen": -496.0, + "logps/rejected": -552.0, + "loss": 0.6157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.78515625, + "rewards/margins": 0.8359375, + "rewards/rejected": -1.6171875, + "step": 446 + }, + { + "epoch": 0.9356357927786499, + "grad_norm": 9.2816162109375, + "learning_rate": 4.047997561062993e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.78125, + "logps/chosen": -494.0, + "logps/rejected": -568.0, + "loss": 0.6006, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.0, + "rewards/margins": -0.169921875, + "rewards/rejected": -0.828125, + "step": 447 + }, + { + "epoch": 0.9377289377289377, + "grad_norm": 10.253028869628906, + "learning_rate": 4.0435280762562e-07, + "logits/chosen": 1.7890625, + "logits/rejected": 1.8671875, + "logps/chosen": -278.0, + "logps/rejected": -284.0, + "loss": 0.6109, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.703125, + "rewards/margins": 0.2265625, + "rewards/rejected": -0.9296875, + "step": 448 + }, + { + "epoch": 0.9398220826792255, + "grad_norm": 9.290606498718262, + "learning_rate": 4.039050604951401e-07, + "logits/chosen": 2.40625, + "logits/rejected": 2.78125, + "logps/chosen": -592.0, + "logps/rejected": -440.0, + "loss": 0.6194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.67578125, + "rewards/margins": 0.63671875, + "rewards/rejected": -1.3125, + "step": 449 + }, + { + "epoch": 0.9419152276295133, + "grad_norm": 10.120656967163086, + "learning_rate": 4.0345651703196084e-07, + "logits/chosen": 2.59375, + "logits/rejected": 2.875, + "logps/chosen": -430.0, + "logps/rejected": -175.0, + "loss": 0.6422, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.59765625, + "rewards/margins": 0.2314453125, + "rewards/rejected": -0.83203125, + "step": 450 + }, + { + "epoch": 0.9440083725798012, + "grad_norm": 10.762495994567871, + "learning_rate": 4.030071795573044e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.390625, + "logps/chosen": -704.0, + "logps/rejected": -584.0, + "loss": 0.6472, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9453125, + "rewards/margins": 0.32421875, + "rewards/rejected": -1.2734375, + "step": 451 + }, + { + "epoch": 0.946101517530089, + "grad_norm": 10.06574535369873, + "learning_rate": 4.025570503965021e-07, + "logits/chosen": 2.8125, + "logits/rejected": 3.25, + "logps/chosen": -820.0, + "logps/rejected": -440.0, + "loss": 0.6119, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.21875, + "rewards/margins": -0.150390625, + "rewards/rejected": -1.0625, + "step": 452 + }, + { + "epoch": 0.9481946624803768, + "grad_norm": 10.84588623046875, + "learning_rate": 4.0210613187898243e-07, + "logits/chosen": 2.359375, + "logits/rejected": 2.484375, + "logps/chosen": -528.0, + "logps/rejected": -516.0, + "loss": 0.6291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9375, + "rewards/margins": 0.478515625, + "rewards/rejected": -1.4140625, + "step": 453 + }, + { + "epoch": 0.9502878074306645, + "grad_norm": 10.217286109924316, + "learning_rate": 4.016544263382585e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.5, + "logps/chosen": -880.0, + "logps/rejected": -652.0, + "loss": 0.6135, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7421875, + "rewards/margins": -0.02685546875, + "rewards/rejected": -0.71484375, + "step": 454 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 10.715296745300293, + "learning_rate": 4.012019361119164e-07, + "logits/chosen": 1.828125, + "logits/rejected": 2.625, + "logps/chosen": -320.0, + "logps/rejected": -458.0, + "loss": 0.6359, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6953125, + "rewards/margins": 1.1796875, + "rewards/rejected": -1.8671875, + "step": 455 + }, + { + "epoch": 0.9544740973312402, + "grad_norm": 10.716880798339844, + "learning_rate": 4.0074866354160304e-07, + "logits/chosen": 2.15625, + "logits/rejected": 3.0, + "logps/chosen": -588.0, + "logps/rejected": -360.0, + "loss": 0.6413, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.83203125, + "rewards/margins": 0.306640625, + "rewards/rejected": -1.140625, + "step": 456 + }, + { + "epoch": 0.956567242281528, + "grad_norm": 9.964006423950195, + "learning_rate": 4.00294610973014e-07, + "logits/chosen": 2.46875, + "logits/rejected": 2.375, + "logps/chosen": -444.0, + "logps/rejected": -604.0, + "loss": 0.5844, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.83984375, + "rewards/margins": 0.515625, + "rewards/rejected": -1.359375, + "step": 457 + }, + { + "epoch": 0.9586603872318158, + "grad_norm": 9.94836711883545, + "learning_rate": 3.998397807558813e-07, + "logits/chosen": 2.328125, + "logits/rejected": 2.5, + "logps/chosen": -364.0, + "logps/rejected": -388.0, + "loss": 0.6311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.890625, + "rewards/margins": 0.28125, + "rewards/rejected": -1.171875, + "step": 458 + }, + { + "epoch": 0.9607535321821036, + "grad_norm": 10.611427307128906, + "learning_rate": 3.9938417524396124e-07, + "logits/chosen": 1.84375, + "logits/rejected": 2.328125, + "logps/chosen": -500.0, + "logps/rejected": -378.0, + "loss": 0.6265, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.171875, + "rewards/margins": 0.3125, + "rewards/rejected": -1.484375, + "step": 459 + }, + { + "epoch": 0.9628466771323915, + "grad_norm": 10.26854133605957, + "learning_rate": 3.9892779679502246e-07, + "logits/chosen": 2.203125, + "logits/rejected": 2.984375, + "logps/chosen": -688.0, + "logps/rejected": -672.0, + "loss": 0.6392, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4140625, + "rewards/margins": -0.40234375, + "rewards/rejected": -1.015625, + "step": 460 + }, + { + "epoch": 0.9649398220826793, + "grad_norm": 9.015223503112793, + "learning_rate": 3.984706477708335e-07, + "logits/chosen": 2.03125, + "logits/rejected": 2.265625, + "logps/chosen": -344.0, + "logps/rejected": -278.0, + "loss": 0.6155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.71875, + "rewards/margins": 0.353515625, + "rewards/rejected": -1.0703125, + "step": 461 + }, + { + "epoch": 0.967032967032967, + "grad_norm": 9.67743968963623, + "learning_rate": 3.9801273053715045e-07, + "logits/chosen": 1.109375, + "logits/rejected": 1.484375, + "logps/chosen": -166.0, + "logps/rejected": -181.0, + "loss": 0.6059, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.75, + "rewards/margins": 0.2119140625, + "rewards/rejected": -0.9609375, + "step": 462 + }, + { + "epoch": 0.9691261119832548, + "grad_norm": 9.850830078125, + "learning_rate": 3.975540474637053e-07, + "logits/chosen": 2.140625, + "logits/rejected": 2.109375, + "logps/chosen": -350.0, + "logps/rejected": -342.0, + "loss": 0.6226, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.98046875, + "rewards/margins": -0.02001953125, + "rewards/rejected": -0.9609375, + "step": 463 + }, + { + "epoch": 0.9712192569335426, + "grad_norm": 10.694343566894531, + "learning_rate": 3.970946009241929e-07, + "logits/chosen": 2.421875, + "logits/rejected": 2.578125, + "logps/chosen": -520.0, + "logps/rejected": -436.0, + "loss": 0.6407, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.859375, + "rewards/margins": 0.4296875, + "rewards/rejected": -1.2890625, + "step": 464 + }, + { + "epoch": 0.9733124018838305, + "grad_norm": 10.816640853881836, + "learning_rate": 3.9663439329625917e-07, + "logits/chosen": 2.265625, + "logits/rejected": 2.515625, + "logps/chosen": -664.0, + "logps/rejected": -476.0, + "loss": 0.6241, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.98828125, + "rewards/margins": 0.578125, + "rewards/rejected": -1.5625, + "step": 465 + }, + { + "epoch": 0.9754055468341183, + "grad_norm": 9.781015396118164, + "learning_rate": 3.961734269614889e-07, + "logits/chosen": 2.109375, + "logits/rejected": 2.28125, + "logps/chosen": -412.0, + "logps/rejected": -366.0, + "loss": 0.6075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9375, + "rewards/margins": 0.185546875, + "rewards/rejected": -1.125, + "step": 466 + }, + { + "epoch": 0.9774986917844061, + "grad_norm": 10.579108238220215, + "learning_rate": 3.9571170430539283e-07, + "logits/chosen": 2.09375, + "logits/rejected": 1.8359375, + "logps/chosen": -354.0, + "logps/rejected": -496.0, + "loss": 0.656, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.09375, + "rewards/margins": 0.25390625, + "rewards/rejected": -1.34375, + "step": 467 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 12.218819618225098, + "learning_rate": 3.952492277173959e-07, + "logits/chosen": 2.765625, + "logits/rejected": 3.484375, + "logps/chosen": -528.0, + "logps/rejected": -298.0, + "loss": 0.6665, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.015625, + "rewards/margins": -0.09375, + "rewards/rejected": -0.921875, + "step": 468 + }, + { + "epoch": 0.9816849816849816, + "grad_norm": 10.382241249084473, + "learning_rate": 3.947859995908248e-07, + "logits/chosen": 1.9921875, + "logits/rejected": 2.390625, + "logps/chosen": -288.0, + "logps/rejected": -344.0, + "loss": 0.6083, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.125, + "rewards/margins": 0.39453125, + "rewards/rejected": -1.5234375, + "step": 469 + }, + { + "epoch": 0.9837781266352695, + "grad_norm": 10.058098793029785, + "learning_rate": 3.9432202232289497e-07, + "logits/chosen": 2.546875, + "logits/rejected": 2.5, + "logps/chosen": -688.0, + "logps/rejected": -676.0, + "loss": 0.5984, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.859375, + "rewards/margins": 0.63671875, + "rewards/rejected": -1.5, + "step": 470 + }, + { + "epoch": 0.9858712715855573, + "grad_norm": 9.432204246520996, + "learning_rate": 3.938572983146993e-07, + "logits/chosen": 1.328125, + "logits/rejected": 1.6953125, + "logps/chosen": -346.0, + "logps/rejected": -338.0, + "loss": 0.5903, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0390625, + "rewards/margins": 0.2890625, + "rewards/rejected": -1.328125, + "step": 471 + }, + { + "epoch": 0.9879644165358451, + "grad_norm": 8.902270317077637, + "learning_rate": 3.9339182997119455e-07, + "logits/chosen": 2.421875, + "logits/rejected": 2.40625, + "logps/chosen": -388.0, + "logps/rejected": -616.0, + "loss": 0.6047, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8984375, + "rewards/margins": 0.33984375, + "rewards/rejected": -1.234375, + "step": 472 + }, + { + "epoch": 0.9900575614861329, + "grad_norm": 9.261591911315918, + "learning_rate": 3.9292561970118976e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.0625, + "logps/chosen": -600.0, + "logps/rejected": -516.0, + "loss": 0.5908, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9375, + "rewards/margins": 0.21484375, + "rewards/rejected": -1.1484375, + "step": 473 + }, + { + "epoch": 0.9921507064364207, + "grad_norm": 9.265706062316895, + "learning_rate": 3.9245866991733324e-07, + "logits/chosen": 3.1875, + "logits/rejected": 2.640625, + "logps/chosen": -290.0, + "logps/rejected": -446.0, + "loss": 0.599, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9921875, + "rewards/margins": 0.353515625, + "rewards/rejected": -1.34375, + "step": 474 + }, + { + "epoch": 0.9942438513867086, + "grad_norm": 9.12977123260498, + "learning_rate": 3.919909830361004e-07, + "logits/chosen": 1.3984375, + "logits/rejected": 2.265625, + "logps/chosen": -376.0, + "logps/rejected": -216.0, + "loss": 0.6122, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.86328125, + "rewards/margins": 0.2177734375, + "rewards/rejected": -1.078125, + "step": 475 + }, + { + "epoch": 0.9963369963369964, + "grad_norm": 10.491602897644043, + "learning_rate": 3.9152256147778124e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.84375, + "logps/chosen": -378.0, + "logps/rejected": -444.0, + "loss": 0.6401, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.83984375, + "rewards/margins": 0.0986328125, + "rewards/rejected": -0.9375, + "step": 476 + }, + { + "epoch": 0.9984301412872841, + "grad_norm": 9.917890548706055, + "learning_rate": 3.910534076664676e-07, + "logits/chosen": 1.609375, + "logits/rejected": 2.0625, + "logps/chosen": -528.0, + "logps/rejected": -490.0, + "loss": 0.6253, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.349609375, + "rewards/rejected": -1.484375, + "step": 477 + }, + { + "epoch": 1.000523286237572, + "grad_norm": 10.06885051727295, + "learning_rate": 3.905835240300407e-07, + "logits/chosen": 2.25, + "logits/rejected": 1.9375, + "logps/chosen": -470.0, + "logps/rejected": -488.0, + "loss": 0.6139, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.98046875, + "rewards/margins": 0.2578125, + "rewards/rejected": -1.234375, + "step": 478 + }, + { + "epoch": 1.0026164311878598, + "grad_norm": 9.318933486938477, + "learning_rate": 3.901129130001588e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.25, + "logps/chosen": -840.0, + "logps/rejected": -470.0, + "loss": 0.5939, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.09375, + "rewards/margins": 0.1708984375, + "rewards/rejected": -1.265625, + "step": 479 + }, + { + "epoch": 1.0047095761381475, + "grad_norm": 9.795000076293945, + "learning_rate": 3.896415770122443e-07, + "logits/chosen": 2.046875, + "logits/rejected": 2.296875, + "logps/chosen": -442.0, + "logps/rejected": -508.0, + "loss": 0.6037, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.015625, + "rewards/margins": 0.2109375, + "rewards/rejected": -1.2265625, + "step": 480 + }, + { + "epoch": 1.0068027210884354, + "grad_norm": 9.666790962219238, + "learning_rate": 3.891695185054712e-07, + "logits/chosen": 1.3125, + "logits/rejected": 1.7578125, + "logps/chosen": -253.0, + "logps/rejected": -264.0, + "loss": 0.5925, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.79296875, + "rewards/margins": 0.421875, + "rewards/rejected": -1.21875, + "step": 481 + }, + { + "epoch": 1.0088958660387233, + "grad_norm": 10.20748233795166, + "learning_rate": 3.886967399227529e-07, + "logits/chosen": 2.578125, + "logits/rejected": 2.75, + "logps/chosen": -696.0, + "logps/rejected": -418.0, + "loss": 0.6023, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.65625, + "rewards/margins": 0.671875, + "rewards/rejected": -1.328125, + "step": 482 + }, + { + "epoch": 1.010989010989011, + "grad_norm": 10.576319694519043, + "learning_rate": 3.8822324371072865e-07, + "logits/chosen": 2.046875, + "logits/rejected": 2.203125, + "logps/chosen": -344.0, + "logps/rejected": -332.0, + "loss": 0.6513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.81640625, + "rewards/margins": 0.314453125, + "rewards/rejected": -1.1328125, + "step": 483 + }, + { + "epoch": 1.0130821559392988, + "grad_norm": 10.309803009033203, + "learning_rate": 3.877490323197521e-07, + "logits/chosen": 1.96875, + "logits/rejected": 1.7578125, + "logps/chosen": -426.0, + "logps/rejected": -494.0, + "loss": 0.6231, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8984375, + "rewards/margins": 0.12109375, + "rewards/rejected": -1.0234375, + "step": 484 + }, + { + "epoch": 1.0151753008895865, + "grad_norm": 10.046106338500977, + "learning_rate": 3.872741082038774e-07, + "logits/chosen": 2.171875, + "logits/rejected": 2.4375, + "logps/chosen": -536.0, + "logps/rejected": -556.0, + "loss": 0.6388, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.09375, + "rewards/margins": -0.00927734375, + "rewards/rejected": -1.0859375, + "step": 485 + }, + { + "epoch": 1.0172684458398744, + "grad_norm": 10.342788696289062, + "learning_rate": 3.8679847382084747e-07, + "logits/chosen": 2.234375, + "logits/rejected": 2.625, + "logps/chosen": -496.0, + "logps/rejected": -352.0, + "loss": 0.6206, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.34375, + "rewards/margins": -0.0185546875, + "rewards/rejected": -1.328125, + "step": 486 + }, + { + "epoch": 1.0193615907901623, + "grad_norm": 9.817326545715332, + "learning_rate": 3.8632213163208053e-07, + "logits/chosen": 1.8359375, + "logits/rejected": 1.2578125, + "logps/chosen": -209.0, + "logps/rejected": -378.0, + "loss": 0.5834, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.98046875, + "rewards/margins": 0.31640625, + "rewards/rejected": -1.296875, + "step": 487 + }, + { + "epoch": 1.02145473574045, + "grad_norm": 9.901391983032227, + "learning_rate": 3.85845084102658e-07, + "logits/chosen": 2.609375, + "logits/rejected": 2.359375, + "logps/chosen": -370.0, + "logps/rejected": -596.0, + "loss": 0.6091, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.125, + "rewards/margins": 0.197265625, + "rewards/rejected": -1.3203125, + "step": 488 + }, + { + "epoch": 1.0235478806907379, + "grad_norm": 10.532851219177246, + "learning_rate": 3.853673337013113e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.78125, + "logps/chosen": -820.0, + "logps/rejected": -848.0, + "loss": 0.607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6484375, + "rewards/margins": 0.482421875, + "rewards/rejected": -1.1328125, + "step": 489 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 10.164332389831543, + "learning_rate": 3.8488888290040944e-07, + "logits/chosen": 2.53125, + "logits/rejected": 2.359375, + "logps/chosen": -584.0, + "logps/rejected": -608.0, + "loss": 0.6228, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1875, + "rewards/margins": 0.29296875, + "rewards/rejected": -1.484375, + "step": 490 + }, + { + "epoch": 1.0277341705913134, + "grad_norm": 10.77008056640625, + "learning_rate": 3.844097341759455e-07, + "logits/chosen": 2.1875, + "logits/rejected": 2.25, + "logps/chosen": -294.0, + "logps/rejected": -320.0, + "loss": 0.6264, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.89453125, + "rewards/margins": 0.22265625, + "rewards/rejected": -1.1171875, + "step": 491 + }, + { + "epoch": 1.0298273155416013, + "grad_norm": 10.060259819030762, + "learning_rate": 3.8392989000752504e-07, + "logits/chosen": 2.140625, + "logits/rejected": 2.40625, + "logps/chosen": -394.0, + "logps/rejected": -306.0, + "loss": 0.617, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.078125, + "rewards/margins": 0.26953125, + "rewards/rejected": -1.34375, + "step": 492 + }, + { + "epoch": 1.031920460491889, + "grad_norm": 10.199516296386719, + "learning_rate": 3.834493528783519e-07, + "logits/chosen": 2.515625, + "logits/rejected": 3.171875, + "logps/chosen": -440.0, + "logps/rejected": -324.0, + "loss": 0.6474, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.203125, + "rewards/margins": 0.1416015625, + "rewards/rejected": -1.34375, + "step": 493 + }, + { + "epoch": 1.034013605442177, + "grad_norm": 9.630121231079102, + "learning_rate": 3.829681252752165e-07, + "logits/chosen": 1.0625, + "logits/rejected": 1.6875, + "logps/chosen": -446.0, + "logps/rejected": -366.0, + "loss": 0.587, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.3984375, + "rewards/margins": -0.08984375, + "rewards/rejected": -1.3125, + "step": 494 + }, + { + "epoch": 1.0361067503924646, + "grad_norm": 9.805791854858398, + "learning_rate": 3.824862096884822e-07, + "logits/chosen": 2.5625, + "logits/rejected": 2.59375, + "logps/chosen": -298.0, + "logps/rejected": -408.0, + "loss": 0.6153, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.9765625, + "rewards/margins": -0.076171875, + "rewards/rejected": -0.8984375, + "step": 495 + }, + { + "epoch": 1.0381998953427525, + "grad_norm": 10.049763679504395, + "learning_rate": 3.820036086120726e-07, + "logits/chosen": 2.28125, + "logits/rejected": 2.875, + "logps/chosen": -540.0, + "logps/rejected": -350.0, + "loss": 0.6468, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.78515625, + "rewards/margins": 0.5390625, + "rewards/rejected": -1.328125, + "step": 496 + }, + { + "epoch": 1.0402930402930404, + "grad_norm": 10.213252067565918, + "learning_rate": 3.815203245434593e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.546875, + "logps/chosen": -528.0, + "logps/rejected": -456.0, + "loss": 0.6571, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.421875, + "rewards/margins": -0.28515625, + "rewards/rejected": -1.1328125, + "step": 497 + }, + { + "epoch": 1.042386185243328, + "grad_norm": 9.96432113647461, + "learning_rate": 3.8103635998364756e-07, + "logits/chosen": 2.53125, + "logits/rejected": 3.25, + "logps/chosen": -736.0, + "logps/rejected": -496.0, + "loss": 0.5566, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3203125, + "rewards/margins": 0.3359375, + "rewards/rejected": -1.65625, + "step": 498 + }, + { + "epoch": 1.044479330193616, + "grad_norm": 10.319202423095703, + "learning_rate": 3.805517174371649e-07, + "logits/chosen": 2.390625, + "logits/rejected": 2.703125, + "logps/chosen": -308.0, + "logps/rejected": -302.0, + "loss": 0.6423, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.83203125, + "rewards/margins": 0.140625, + "rewards/rejected": -0.97265625, + "step": 499 + }, + { + "epoch": 1.0465724751439036, + "grad_norm": 10.619379043579102, + "learning_rate": 3.8006639941204707e-07, + "logits/chosen": 1.84375, + "logits/rejected": 1.1328125, + "logps/chosen": -456.0, + "logps/rejected": -728.0, + "loss": 0.593, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.15625, + "rewards/margins": 0.67578125, + "rewards/rejected": -1.828125, + "step": 500 + }, + { + "epoch": 1.0486656200941915, + "grad_norm": 9.219826698303223, + "learning_rate": 3.7958040841982554e-07, + "logits/chosen": 1.7734375, + "logits/rejected": 1.7890625, + "logps/chosen": -388.0, + "logps/rejected": -468.0, + "loss": 0.5655, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0625, + "rewards/margins": 1.0859375, + "rewards/rejected": -2.140625, + "step": 501 + }, + { + "epoch": 1.0507587650444794, + "grad_norm": 10.068828582763672, + "learning_rate": 3.7909374697551437e-07, + "logits/chosen": 1.5078125, + "logits/rejected": 1.4140625, + "logps/chosen": -450.0, + "logps/rejected": -440.0, + "loss": 0.6082, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9140625, + "rewards/margins": 0.0166015625, + "rewards/rejected": -1.9296875, + "step": 502 + }, + { + "epoch": 1.052851909994767, + "grad_norm": 10.750775337219238, + "learning_rate": 3.786064175975972e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.59375, + "logps/chosen": -486.0, + "logps/rejected": -494.0, + "loss": 0.625, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.109375, + "rewards/margins": -0.0263671875, + "rewards/rejected": -1.0859375, + "step": 503 + }, + { + "epoch": 1.054945054945055, + "grad_norm": 10.124489784240723, + "learning_rate": 3.781184228080145e-07, + "logits/chosen": 2.171875, + "logits/rejected": 2.71875, + "logps/chosen": -656.0, + "logps/rejected": -318.0, + "loss": 0.6313, + "rewards/accuracies": 0.25, + "rewards/chosen": -2.0, + "rewards/margins": -0.3359375, + "rewards/rejected": -1.671875, + "step": 504 + }, + { + "epoch": 1.0570381998953426, + "grad_norm": 10.40312385559082, + "learning_rate": 3.7762976513214966e-07, + "logits/chosen": 2.140625, + "logits/rejected": 2.109375, + "logps/chosen": -262.0, + "logps/rejected": -386.0, + "loss": 0.6014, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.21875, + "rewards/margins": 0.37890625, + "rewards/rejected": -1.6015625, + "step": 505 + }, + { + "epoch": 1.0591313448456305, + "grad_norm": 10.114124298095703, + "learning_rate": 3.771404470988174e-07, + "logits/chosen": 2.390625, + "logits/rejected": 2.0, + "logps/chosen": -290.0, + "logps/rejected": -348.0, + "loss": 0.5869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9140625, + "rewards/margins": 0.1962890625, + "rewards/rejected": -1.109375, + "step": 506 + }, + { + "epoch": 1.0612244897959184, + "grad_norm": 9.699861526489258, + "learning_rate": 3.766504712402488e-07, + "logits/chosen": 2.40625, + "logits/rejected": 2.75, + "logps/chosen": -220.0, + "logps/rejected": -210.0, + "loss": 0.5878, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.8125, + "rewards/margins": -0.09765625, + "rewards/rejected": -0.71875, + "step": 507 + }, + { + "epoch": 1.063317634746206, + "grad_norm": 10.216354370117188, + "learning_rate": 3.7615984009208006e-07, + "logits/chosen": 2.75, + "logits/rejected": 3.15625, + "logps/chosen": -482.0, + "logps/rejected": -466.0, + "loss": 0.6022, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.015625, + "rewards/margins": 0.02734375, + "rewards/rejected": -1.046875, + "step": 508 + }, + { + "epoch": 1.065410779696494, + "grad_norm": 10.050541877746582, + "learning_rate": 3.7566855619333816e-07, + "logits/chosen": 2.59375, + "logits/rejected": 3.15625, + "logps/chosen": -330.0, + "logps/rejected": -348.0, + "loss": 0.5886, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.109375, + "rewards/margins": 0.3046875, + "rewards/rejected": -1.4140625, + "step": 509 + }, + { + "epoch": 1.0675039246467817, + "grad_norm": 11.169584274291992, + "learning_rate": 3.7517662208642783e-07, + "logits/chosen": 0.93359375, + "logits/rejected": 1.453125, + "logps/chosen": -506.0, + "logps/rejected": -346.0, + "loss": 0.6547, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.63671875, + "rewards/margins": 0.4453125, + "rewards/rejected": -1.0859375, + "step": 510 + }, + { + "epoch": 1.0695970695970696, + "grad_norm": 10.254963874816895, + "learning_rate": 3.7468404031711924e-07, + "logits/chosen": 1.25, + "logits/rejected": 2.03125, + "logps/chosen": -324.0, + "logps/rejected": -324.0, + "loss": 0.5988, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.609375, + "rewards/rejected": -1.7578125, + "step": 511 + }, + { + "epoch": 1.0716902145473575, + "grad_norm": 10.694738388061523, + "learning_rate": 3.741908134345335e-07, + "logits/chosen": 1.53125, + "logits/rejected": 2.0, + "logps/chosen": -354.0, + "logps/rejected": -490.0, + "loss": 0.618, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.15625, + "rewards/margins": 0.33203125, + "rewards/rejected": -1.4921875, + "step": 512 + }, + { + "epoch": 1.0737833594976451, + "grad_norm": 10.05710506439209, + "learning_rate": 3.736969439911309e-07, + "logits/chosen": 2.3125, + "logits/rejected": 2.359375, + "logps/chosen": -470.0, + "logps/rejected": -402.0, + "loss": 0.6105, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.203125, + "rewards/margins": 0.0185546875, + "rewards/rejected": -1.21875, + "step": 513 + }, + { + "epoch": 1.075876504447933, + "grad_norm": 9.925804138183594, + "learning_rate": 3.732024345426966e-07, + "logits/chosen": 1.4453125, + "logits/rejected": 1.4296875, + "logps/chosen": -360.0, + "logps/rejected": -450.0, + "loss": 0.6363, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1875, + "rewards/margins": 0.181640625, + "rewards/rejected": -1.3671875, + "step": 514 + }, + { + "epoch": 1.077969649398221, + "grad_norm": 9.118669509887695, + "learning_rate": 3.727072876483278e-07, + "logits/chosen": 2.578125, + "logits/rejected": 2.15625, + "logps/chosen": -328.0, + "logps/rejected": -458.0, + "loss": 0.5867, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.96484375, + "rewards/margins": 0.6640625, + "rewards/rejected": -1.625, + "step": 515 + }, + { + "epoch": 1.0800627943485086, + "grad_norm": 10.814146041870117, + "learning_rate": 3.722115058704207e-07, + "logits/chosen": 2.828125, + "logits/rejected": 3.0625, + "logps/chosen": -840.0, + "logps/rejected": -608.0, + "loss": 0.621, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3984375, + "rewards/margins": -0.0390625, + "rewards/rejected": -1.359375, + "step": 516 + }, + { + "epoch": 1.0821559392987965, + "grad_norm": 10.450875282287598, + "learning_rate": 3.7171509177465676e-07, + "logits/chosen": 1.953125, + "logits/rejected": 2.015625, + "logps/chosen": -480.0, + "logps/rejected": -520.0, + "loss": 0.6005, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.88671875, + "rewards/margins": 0.5703125, + "rewards/rejected": -1.453125, + "step": 517 + }, + { + "epoch": 1.0842490842490842, + "grad_norm": 10.862897872924805, + "learning_rate": 3.7121804792998995e-07, + "logits/chosen": 1.9140625, + "logits/rejected": 2.015625, + "logps/chosen": -414.0, + "logps/rejected": -382.0, + "loss": 0.6515, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2578125, + "rewards/margins": -0.111328125, + "rewards/rejected": -1.1484375, + "step": 518 + }, + { + "epoch": 1.086342229199372, + "grad_norm": 9.62572193145752, + "learning_rate": 3.7072037690863306e-07, + "logits/chosen": 2.65625, + "logits/rejected": 2.75, + "logps/chosen": -504.0, + "logps/rejected": -572.0, + "loss": 0.5884, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3125, + "rewards/margins": 0.0947265625, + "rewards/rejected": -1.40625, + "step": 519 + }, + { + "epoch": 1.08843537414966, + "grad_norm": 10.811123847961426, + "learning_rate": 3.7022208128604453e-07, + "logits/chosen": 2.03125, + "logits/rejected": 2.71875, + "logps/chosen": -648.0, + "logps/rejected": -408.0, + "loss": 0.5946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9296875, + "rewards/margins": 0.4921875, + "rewards/rejected": -1.421875, + "step": 520 + }, + { + "epoch": 1.0905285190999476, + "grad_norm": 10.257491111755371, + "learning_rate": 3.6972316364091525e-07, + "logits/chosen": 2.046875, + "logits/rejected": 2.484375, + "logps/chosen": -276.0, + "logps/rejected": -312.0, + "loss": 0.605, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.09375, + "rewards/margins": 0.390625, + "rewards/rejected": -1.484375, + "step": 521 + }, + { + "epoch": 1.0926216640502355, + "grad_norm": 11.21011734008789, + "learning_rate": 3.6922362655515507e-07, + "logits/chosen": 2.71875, + "logits/rejected": 2.75, + "logps/chosen": -520.0, + "logps/rejected": -540.0, + "loss": 0.6377, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.64453125, + "rewards/margins": 0.1865234375, + "rewards/rejected": -0.828125, + "step": 522 + }, + { + "epoch": 1.0947148090005232, + "grad_norm": 11.032417297363281, + "learning_rate": 3.687234726138793e-07, + "logits/chosen": 1.890625, + "logits/rejected": 2.5, + "logps/chosen": -434.0, + "logps/rejected": -296.0, + "loss": 0.6326, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.1328125, + "rewards/margins": -0.1064453125, + "rewards/rejected": -1.03125, + "step": 523 + }, + { + "epoch": 1.096807953950811, + "grad_norm": 9.880134582519531, + "learning_rate": 3.682227044053957e-07, + "logits/chosen": 2.203125, + "logits/rejected": 2.390625, + "logps/chosen": -628.0, + "logps/rejected": -528.0, + "loss": 0.5774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.478515625, + "rewards/margins": 1.0546875, + "rewards/rejected": -1.5390625, + "step": 524 + }, + { + "epoch": 1.098901098901099, + "grad_norm": 10.90539264678955, + "learning_rate": 3.677213245211906e-07, + "logits/chosen": 2.109375, + "logits/rejected": 2.203125, + "logps/chosen": -640.0, + "logps/rejected": -660.0, + "loss": 0.6084, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5546875, + "rewards/margins": -0.154296875, + "rewards/rejected": -1.40625, + "step": 525 + }, + { + "epoch": 1.1009942438513867, + "grad_norm": 11.089640617370605, + "learning_rate": 3.6721933555591603e-07, + "logits/chosen": 1.703125, + "logits/rejected": 2.234375, + "logps/chosen": -416.0, + "logps/rejected": -276.0, + "loss": 0.6526, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.40625, + "rewards/margins": -0.177734375, + "rewards/rejected": -1.2265625, + "step": 526 + }, + { + "epoch": 1.1030873888016746, + "grad_norm": 9.621665954589844, + "learning_rate": 3.6671674010737596e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.328125, + "logps/chosen": -424.0, + "logps/rejected": -456.0, + "loss": 0.6194, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1640625, + "rewards/margins": 0.2890625, + "rewards/rejected": -1.453125, + "step": 527 + }, + { + "epoch": 1.1051805337519622, + "grad_norm": 10.657153129577637, + "learning_rate": 3.6621354077651293e-07, + "logits/chosen": 2.125, + "logits/rejected": 1.7890625, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.6154, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.85546875, + "rewards/margins": 0.28515625, + "rewards/rejected": -1.140625, + "step": 528 + }, + { + "epoch": 1.1072736787022501, + "grad_norm": 10.405318260192871, + "learning_rate": 3.657097401673944e-07, + "logits/chosen": 1.9375, + "logits/rejected": 2.984375, + "logps/chosen": -816.0, + "logps/rejected": -484.0, + "loss": 0.5704, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.671875, + "rewards/margins": 0.41015625, + "rewards/rejected": -1.078125, + "step": 529 + }, + { + "epoch": 1.109366823652538, + "grad_norm": 10.833739280700684, + "learning_rate": 3.6520534088719963e-07, + "logits/chosen": 2.40625, + "logits/rejected": 2.421875, + "logps/chosen": -402.0, + "logps/rejected": -392.0, + "loss": 0.6021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.15625, + "rewards/margins": 0.369140625, + "rewards/rejected": -1.5234375, + "step": 530 + }, + { + "epoch": 1.1114599686028257, + "grad_norm": 10.032989501953125, + "learning_rate": 3.6470034554620614e-07, + "logits/chosen": 1.9609375, + "logits/rejected": 2.046875, + "logps/chosen": -344.0, + "logps/rejected": -286.0, + "loss": 0.6215, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.90234375, + "rewards/margins": 0.072265625, + "rewards/rejected": -0.9765625, + "step": 531 + }, + { + "epoch": 1.1135531135531136, + "grad_norm": 9.768916130065918, + "learning_rate": 3.6419475675777587e-07, + "logits/chosen": 2.0625, + "logits/rejected": 1.7734375, + "logps/chosen": -294.0, + "logps/rejected": -320.0, + "loss": 0.6178, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.265625, + "rewards/rejected": -1.3984375, + "step": 532 + }, + { + "epoch": 1.1156462585034013, + "grad_norm": 9.991143226623535, + "learning_rate": 3.636885771383419e-07, + "logits/chosen": 1.6953125, + "logits/rejected": 2.203125, + "logps/chosen": -296.0, + "logps/rejected": -552.0, + "loss": 0.6119, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.6484375, + "rewards/rejected": -1.78125, + "step": 533 + }, + { + "epoch": 1.1177394034536892, + "grad_norm": 9.989991188049316, + "learning_rate": 3.631818093073948e-07, + "logits/chosen": 2.453125, + "logits/rejected": 2.859375, + "logps/chosen": -572.0, + "logps/rejected": -446.0, + "loss": 0.6055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.66796875, + "rewards/margins": 0.515625, + "rewards/rejected": -1.1875, + "step": 534 + }, + { + "epoch": 1.119832548403977, + "grad_norm": 9.190662384033203, + "learning_rate": 3.626744558874696e-07, + "logits/chosen": 2.3125, + "logits/rejected": 2.65625, + "logps/chosen": -344.0, + "logps/rejected": -350.0, + "loss": 0.6081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.78125, + "rewards/margins": 0.19140625, + "rewards/rejected": -0.97265625, + "step": 535 + }, + { + "epoch": 1.1219256933542647, + "grad_norm": 10.211576461791992, + "learning_rate": 3.6216651950413097e-07, + "logits/chosen": 2.0625, + "logits/rejected": 2.328125, + "logps/chosen": -438.0, + "logps/rejected": -350.0, + "loss": 0.6157, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.94140625, + "rewards/margins": 0.3515625, + "rewards/rejected": -1.2890625, + "step": 536 + }, + { + "epoch": 1.1240188383045526, + "grad_norm": 11.025035858154297, + "learning_rate": 3.6165800278596116e-07, + "logits/chosen": 2.265625, + "logits/rejected": 2.78125, + "logps/chosen": -502.0, + "logps/rejected": -448.0, + "loss": 0.6248, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0625, + "rewards/margins": 0.28515625, + "rewards/rejected": -1.34375, + "step": 537 + }, + { + "epoch": 1.1261119832548405, + "grad_norm": 10.022388458251953, + "learning_rate": 3.611489083645453e-07, + "logits/chosen": 2.578125, + "logits/rejected": 2.34375, + "logps/chosen": -652.0, + "logps/rejected": -764.0, + "loss": 0.5888, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.734375, + "rewards/margins": 0.17578125, + "rewards/rejected": -0.91015625, + "step": 538 + }, + { + "epoch": 1.1282051282051282, + "grad_norm": 9.915376663208008, + "learning_rate": 3.6063923887445815e-07, + "logits/chosen": 1.8046875, + "logits/rejected": 1.8203125, + "logps/chosen": -314.0, + "logps/rejected": -382.0, + "loss": 0.5849, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.71875, + "rewards/margins": 0.75390625, + "rewards/rejected": -1.4765625, + "step": 539 + }, + { + "epoch": 1.130298273155416, + "grad_norm": 10.21751880645752, + "learning_rate": 3.601289969532506e-07, + "logits/chosen": 1.8984375, + "logits/rejected": 3.09375, + "logps/chosen": -328.0, + "logps/rejected": -408.0, + "loss": 0.585, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.09375, + "rewards/margins": 0.3203125, + "rewards/rejected": -1.4140625, + "step": 540 + }, + { + "epoch": 1.1323914181057038, + "grad_norm": 9.77088451385498, + "learning_rate": 3.596181852414358e-07, + "logits/chosen": 2.40625, + "logits/rejected": 2.703125, + "logps/chosen": -496.0, + "logps/rejected": -500.0, + "loss": 0.5663, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8125, + "rewards/margins": 0.671875, + "rewards/rejected": -1.484375, + "step": 541 + }, + { + "epoch": 1.1344845630559917, + "grad_norm": 10.153401374816895, + "learning_rate": 3.591068063824757e-07, + "logits/chosen": 3.296875, + "logits/rejected": 2.578125, + "logps/chosen": -342.0, + "logps/rejected": -420.0, + "loss": 0.596, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9375, + "rewards/margins": 0.2421875, + "rewards/rejected": -1.1796875, + "step": 542 + }, + { + "epoch": 1.1365777080062793, + "grad_norm": 10.042383193969727, + "learning_rate": 3.5859486302276697e-07, + "logits/chosen": 2.265625, + "logits/rejected": 2.6875, + "logps/chosen": -340.0, + "logps/rejected": -328.0, + "loss": 0.611, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.28125, + "rewards/margins": -0.0625, + "rewards/rejected": -1.21875, + "step": 543 + }, + { + "epoch": 1.1386708529565672, + "grad_norm": 10.027029037475586, + "learning_rate": 3.5808235781162794e-07, + "logits/chosen": 1.546875, + "logits/rejected": 1.6875, + "logps/chosen": -244.0, + "logps/rejected": -468.0, + "loss": 0.6011, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.875, + "rewards/margins": 0.35546875, + "rewards/rejected": -1.2265625, + "step": 544 + }, + { + "epoch": 1.1407639979068551, + "grad_norm": 10.477700233459473, + "learning_rate": 3.575692934012843e-07, + "logits/chosen": 2.5625, + "logits/rejected": 2.609375, + "logps/chosen": -308.0, + "logps/rejected": -332.0, + "loss": 0.6235, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.140625, + "rewards/margins": 0.3515625, + "rewards/rejected": -1.4921875, + "step": 545 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 10.700268745422363, + "learning_rate": 3.570556724468556e-07, + "logits/chosen": 1.84375, + "logits/rejected": 1.703125, + "logps/chosen": -266.0, + "logps/rejected": -222.0, + "loss": 0.6432, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.46875, + "rewards/margins": -0.025390625, + "rewards/rejected": -1.4453125, + "step": 546 + }, + { + "epoch": 1.1449502878074307, + "grad_norm": 9.420391082763672, + "learning_rate": 3.5654149760634167e-07, + "logits/chosen": 1.0625, + "logits/rejected": 1.4375, + "logps/chosen": -302.0, + "logps/rejected": -396.0, + "loss": 0.5872, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.015625, + "rewards/margins": 0.51953125, + "rewards/rejected": -1.53125, + "step": 547 + }, + { + "epoch": 1.1470434327577186, + "grad_norm": 10.092485427856445, + "learning_rate": 3.560267715406085e-07, + "logits/chosen": 1.1015625, + "logits/rejected": 1.90625, + "logps/chosen": -396.0, + "logps/rejected": -344.0, + "loss": 0.5838, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.91015625, + "rewards/margins": 0.5546875, + "rewards/rejected": -1.46875, + "step": 548 + }, + { + "epoch": 1.1491365777080063, + "grad_norm": 10.177468299865723, + "learning_rate": 3.5551149691337496e-07, + "logits/chosen": 1.34375, + "logits/rejected": 1.5703125, + "logps/chosen": -233.0, + "logps/rejected": -197.0, + "loss": 0.6062, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.203125, + "rewards/margins": -0.095703125, + "rewards/rejected": -1.109375, + "step": 549 + }, + { + "epoch": 1.1512297226582942, + "grad_norm": 9.664475440979004, + "learning_rate": 3.549956763911985e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.390625, + "logps/chosen": -504.0, + "logps/rejected": -512.0, + "loss": 0.5928, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0625, + "rewards/margins": 0.177734375, + "rewards/rejected": -1.2421875, + "step": 550 + }, + { + "epoch": 1.1533228676085818, + "grad_norm": 10.4424467086792, + "learning_rate": 3.5447931264346163e-07, + "logits/chosen": 1.515625, + "logits/rejected": 1.921875, + "logps/chosen": -332.0, + "logps/rejected": -374.0, + "loss": 0.5921, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.203125, + "rewards/margins": 0.3671875, + "rewards/rejected": -1.5703125, + "step": 551 + }, + { + "epoch": 1.1554160125588697, + "grad_norm": 10.784751892089844, + "learning_rate": 3.539624083423582e-07, + "logits/chosen": 2.125, + "logits/rejected": 2.71875, + "logps/chosen": -624.0, + "logps/rejected": -452.0, + "loss": 0.6126, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.453125, + "rewards/margins": -0.330078125, + "rewards/rejected": -1.125, + "step": 552 + }, + { + "epoch": 1.1575091575091574, + "grad_norm": 10.372719764709473, + "learning_rate": 3.534449661628793e-07, + "logits/chosen": 2.859375, + "logits/rejected": 3.125, + "logps/chosen": -592.0, + "logps/rejected": -652.0, + "loss": 0.6005, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3125, + "rewards/margins": 0.63671875, + "rewards/rejected": -1.953125, + "step": 553 + }, + { + "epoch": 1.1596023024594453, + "grad_norm": 10.629199981689453, + "learning_rate": 3.5292698878279964e-07, + "logits/chosen": 2.0, + "logits/rejected": 2.515625, + "logps/chosen": -418.0, + "logps/rejected": -414.0, + "loss": 0.5892, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0703125, + "rewards/margins": 0.27734375, + "rewards/rejected": -1.3515625, + "step": 554 + }, + { + "epoch": 1.1616954474097332, + "grad_norm": 10.558868408203125, + "learning_rate": 3.524084788826635e-07, + "logits/chosen": 1.875, + "logits/rejected": 1.90625, + "logps/chosen": -416.0, + "logps/rejected": -496.0, + "loss": 0.6155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8203125, + "rewards/margins": 0.55859375, + "rewards/rejected": -1.375, + "step": 555 + }, + { + "epoch": 1.1637885923600209, + "grad_norm": 10.18245792388916, + "learning_rate": 3.5188943914577097e-07, + "logits/chosen": 1.8359375, + "logits/rejected": 1.4453125, + "logps/chosen": -266.0, + "logps/rejected": -320.0, + "loss": 0.614, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.296875, + "rewards/margins": 0.1416015625, + "rewards/rejected": -1.4375, + "step": 556 + }, + { + "epoch": 1.1658817373103088, + "grad_norm": 11.512836456298828, + "learning_rate": 3.5136987225816433e-07, + "logits/chosen": 1.9140625, + "logits/rejected": 1.734375, + "logps/chosen": -326.0, + "logps/rejected": -464.0, + "loss": 0.6043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.125, + "rewards/margins": 0.36328125, + "rewards/rejected": -1.484375, + "step": 557 + }, + { + "epoch": 1.1679748822605966, + "grad_norm": 9.522254943847656, + "learning_rate": 3.508497809086134e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.921875, + "logps/chosen": -568.0, + "logps/rejected": -704.0, + "loss": 0.5811, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0859375, + "rewards/margins": 1.2734375, + "rewards/rejected": -2.359375, + "step": 558 + }, + { + "epoch": 1.1700680272108843, + "grad_norm": 10.200825691223145, + "learning_rate": 3.5032916778860253e-07, + "logits/chosen": 0.99609375, + "logits/rejected": 1.421875, + "logps/chosen": -189.0, + "logps/rejected": -167.0, + "loss": 0.5896, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.82421875, + "rewards/margins": 0.0400390625, + "rewards/rejected": -0.86328125, + "step": 559 + }, + { + "epoch": 1.1721611721611722, + "grad_norm": 11.027997970581055, + "learning_rate": 3.4980803559231595e-07, + "logits/chosen": 1.0546875, + "logits/rejected": 1.8828125, + "logps/chosen": -300.0, + "logps/rejected": -250.0, + "loss": 0.6103, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.125, + "rewards/margins": -0.021484375, + "rewards/rejected": -1.1015625, + "step": 560 + }, + { + "epoch": 1.1742543171114599, + "grad_norm": 11.398727416992188, + "learning_rate": 3.4928638701662445e-07, + "logits/chosen": 0.79296875, + "logits/rejected": 0.86328125, + "logps/chosen": -201.0, + "logps/rejected": -304.0, + "loss": 0.5375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.94921875, + "rewards/margins": 0.7578125, + "rewards/rejected": -1.703125, + "step": 561 + }, + { + "epoch": 1.1763474620617478, + "grad_norm": 10.500263214111328, + "learning_rate": 3.4876422476107057e-07, + "logits/chosen": 1.1640625, + "logits/rejected": 1.3828125, + "logps/chosen": -164.0, + "logps/rejected": -334.0, + "loss": 0.6236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8203125, + "rewards/margins": 0.73046875, + "rewards/rejected": -1.5546875, + "step": 562 + }, + { + "epoch": 1.1784406070120357, + "grad_norm": 11.702860832214355, + "learning_rate": 3.482415515278558e-07, + "logits/chosen": 2.25, + "logits/rejected": 2.390625, + "logps/chosen": -272.0, + "logps/rejected": -370.0, + "loss": 0.6412, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0546875, + "rewards/margins": 0.6328125, + "rewards/rejected": -1.6875, + "step": 563 + }, + { + "epoch": 1.1805337519623234, + "grad_norm": 10.307535171508789, + "learning_rate": 3.477183700218254e-07, + "logits/chosen": 1.703125, + "logits/rejected": 2.375, + "logps/chosen": -520.0, + "logps/rejected": -580.0, + "loss": 0.5652, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.98828125, + "rewards/margins": 1.0703125, + "rewards/rejected": -2.0625, + "step": 564 + }, + { + "epoch": 1.1826268969126112, + "grad_norm": 10.563407897949219, + "learning_rate": 3.471946829504553e-07, + "logits/chosen": 3.09375, + "logits/rejected": 2.765625, + "logps/chosen": -420.0, + "logps/rejected": -596.0, + "loss": 0.6133, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.15625, + "rewards/margins": 0.015625, + "rewards/rejected": -1.171875, + "step": 565 + }, + { + "epoch": 1.184720041862899, + "grad_norm": 9.85606575012207, + "learning_rate": 3.4667049302383763e-07, + "logits/chosen": 2.53125, + "logits/rejected": 3.28125, + "logps/chosen": -588.0, + "logps/rejected": -476.0, + "loss": 0.5743, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.0908203125, + "rewards/rejected": -1.28125, + "step": 566 + }, + { + "epoch": 1.1868131868131868, + "grad_norm": 10.523385047912598, + "learning_rate": 3.461458029546666e-07, + "logits/chosen": 1.4296875, + "logits/rejected": 2.546875, + "logps/chosen": -408.0, + "logps/rejected": -300.0, + "loss": 0.616, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.19921875, + "rewards/rejected": -1.3359375, + "step": 567 + }, + { + "epoch": 1.1889063317634747, + "grad_norm": 10.355939865112305, + "learning_rate": 3.456206154582251e-07, + "logits/chosen": 2.203125, + "logits/rejected": 2.90625, + "logps/chosen": -636.0, + "logps/rejected": -580.0, + "loss": 0.5749, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.09375, + "rewards/margins": 0.71875, + "rewards/rejected": -1.8125, + "step": 568 + }, + { + "epoch": 1.1909994767137624, + "grad_norm": 10.845210075378418, + "learning_rate": 3.4509493325236984e-07, + "logits/chosen": 2.140625, + "logits/rejected": 1.8671875, + "logps/chosen": -416.0, + "logps/rejected": -420.0, + "loss": 0.6238, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0546875, + "rewards/margins": 0.2890625, + "rewards/rejected": -1.34375, + "step": 569 + }, + { + "epoch": 1.1930926216640503, + "grad_norm": 10.860997200012207, + "learning_rate": 3.445687590575179e-07, + "logits/chosen": 2.296875, + "logits/rejected": 2.5, + "logps/chosen": -652.0, + "logps/rejected": -344.0, + "loss": 0.6565, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9765625, + "rewards/margins": 0.384765625, + "rewards/rejected": -1.359375, + "step": 570 + }, + { + "epoch": 1.195185766614338, + "grad_norm": 10.557795524597168, + "learning_rate": 3.440420955966322e-07, + "logits/chosen": 2.4375, + "logits/rejected": 1.4296875, + "logps/chosen": -416.0, + "logps/rejected": -560.0, + "loss": 0.5388, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.078125, + "rewards/margins": 0.16796875, + "rewards/rejected": -1.25, + "step": 571 + }, + { + "epoch": 1.1972789115646258, + "grad_norm": 10.709152221679688, + "learning_rate": 3.435149455952078e-07, + "logits/chosen": 1.90625, + "logits/rejected": 2.375, + "logps/chosen": -370.0, + "logps/rejected": -312.0, + "loss": 0.5801, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.171875, + "rewards/margins": 0.40234375, + "rewards/rejected": -1.578125, + "step": 572 + }, + { + "epoch": 1.1993720565149137, + "grad_norm": 11.39714241027832, + "learning_rate": 3.429873117812576e-07, + "logits/chosen": 0.59765625, + "logits/rejected": 0.87109375, + "logps/chosen": -424.0, + "logps/rejected": -286.0, + "loss": 0.613, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.09375, + "rewards/margins": 0.04296875, + "rewards/rejected": -1.140625, + "step": 573 + }, + { + "epoch": 1.2014652014652014, + "grad_norm": 10.997570991516113, + "learning_rate": 3.4245919688529825e-07, + "logits/chosen": 1.609375, + "logits/rejected": 2.203125, + "logps/chosen": -510.0, + "logps/rejected": -432.0, + "loss": 0.5696, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3203125, + "rewards/margins": 0.3046875, + "rewards/rejected": -1.625, + "step": 574 + }, + { + "epoch": 1.2035583464154893, + "grad_norm": 10.565017700195312, + "learning_rate": 3.419306036403357e-07, + "logits/chosen": 1.828125, + "logits/rejected": 2.078125, + "logps/chosen": -414.0, + "logps/rejected": -596.0, + "loss": 0.587, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.94140625, + "rewards/margins": 0.75, + "rewards/rejected": -1.6953125, + "step": 575 + }, + { + "epoch": 1.205651491365777, + "grad_norm": 10.412652969360352, + "learning_rate": 3.4140153478185194e-07, + "logits/chosen": 0.6953125, + "logits/rejected": 0.8515625, + "logps/chosen": -184.0, + "logps/rejected": -328.0, + "loss": 0.6048, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.265625, + "rewards/margins": 0.43359375, + "rewards/rejected": -1.6953125, + "step": 576 + }, + { + "epoch": 1.2077446363160649, + "grad_norm": 11.035755157470703, + "learning_rate": 3.408719930477898e-07, + "logits/chosen": 2.625, + "logits/rejected": 3.203125, + "logps/chosen": -680.0, + "logps/rejected": -620.0, + "loss": 0.6422, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9921875, + "rewards/margins": 0.291015625, + "rewards/rejected": -1.28125, + "step": 577 + }, + { + "epoch": 1.2098377812663528, + "grad_norm": 11.217260360717773, + "learning_rate": 3.4034198117853933e-07, + "logits/chosen": 1.578125, + "logits/rejected": 1.9765625, + "logps/chosen": -410.0, + "logps/rejected": -318.0, + "loss": 0.6314, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.94921875, + "rewards/margins": 0.08349609375, + "rewards/rejected": -1.03125, + "step": 578 + }, + { + "epoch": 1.2119309262166404, + "grad_norm": 10.444401741027832, + "learning_rate": 3.398115019169238e-07, + "logits/chosen": 2.203125, + "logits/rejected": 1.9765625, + "logps/chosen": -404.0, + "logps/rejected": -384.0, + "loss": 0.6165, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.5234375, + "rewards/margins": -0.55078125, + "rewards/rejected": -0.96875, + "step": 579 + }, + { + "epoch": 1.2140240711669283, + "grad_norm": 11.714608192443848, + "learning_rate": 3.3928055800818484e-07, + "logits/chosen": 1.4609375, + "logits/rejected": 1.84375, + "logps/chosen": -438.0, + "logps/rejected": -400.0, + "loss": 0.6471, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.40625, + "rewards/margins": 0.2041015625, + "rewards/rejected": -1.609375, + "step": 580 + }, + { + "epoch": 1.2161172161172162, + "grad_norm": 12.41568660736084, + "learning_rate": 3.387491521999692e-07, + "logits/chosen": 1.65625, + "logits/rejected": 2.21875, + "logps/chosen": -572.0, + "logps/rejected": -500.0, + "loss": 0.635, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.7578125, + "rewards/margins": 0.029296875, + "rewards/rejected": -1.7890625, + "step": 581 + }, + { + "epoch": 1.218210361067504, + "grad_norm": 10.436721801757812, + "learning_rate": 3.382172872423132e-07, + "logits/chosen": 2.34375, + "logits/rejected": 3.296875, + "logps/chosen": -760.0, + "logps/rejected": -344.0, + "loss": 0.6416, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0, + "rewards/margins": 0.267578125, + "rewards/rejected": -1.2734375, + "step": 582 + }, + { + "epoch": 1.2203035060177918, + "grad_norm": 9.642477989196777, + "learning_rate": 3.3768496588763007e-07, + "logits/chosen": 2.28125, + "logits/rejected": 2.15625, + "logps/chosen": -548.0, + "logps/rejected": -724.0, + "loss": 0.5784, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.82421875, + "rewards/margins": 0.57421875, + "rewards/rejected": -1.3984375, + "step": 583 + }, + { + "epoch": 1.2223966509680795, + "grad_norm": 9.932283401489258, + "learning_rate": 3.371521908906943e-07, + "logits/chosen": 2.15625, + "logits/rejected": 2.703125, + "logps/chosen": -536.0, + "logps/rejected": -564.0, + "loss": 0.5857, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8828125, + "rewards/margins": 0.470703125, + "rewards/rejected": -1.3515625, + "step": 584 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 10.983428001403809, + "learning_rate": 3.366189650086284e-07, + "logits/chosen": 2.15625, + "logits/rejected": 2.4375, + "logps/chosen": -444.0, + "logps/rejected": -380.0, + "loss": 0.6206, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.046875, + "rewards/margins": 0.6640625, + "rewards/rejected": -1.703125, + "step": 585 + }, + { + "epoch": 1.226582940868655, + "grad_norm": 10.217317581176758, + "learning_rate": 3.360852910008879e-07, + "logits/chosen": 1.21875, + "logits/rejected": 1.515625, + "logps/chosen": -360.0, + "logps/rejected": -432.0, + "loss": 0.6135, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9921875, + "rewards/margins": 0.65234375, + "rewards/rejected": -1.640625, + "step": 586 + }, + { + "epoch": 1.228676085818943, + "grad_norm": 10.667376518249512, + "learning_rate": 3.3555117162924756e-07, + "logits/chosen": 2.140625, + "logits/rejected": 2.109375, + "logps/chosen": -290.0, + "logps/rejected": -462.0, + "loss": 0.6056, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.15625, + "rewards/margins": -0.1552734375, + "rewards/rejected": -1.0, + "step": 587 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 10.79523754119873, + "learning_rate": 3.3501660965778707e-07, + "logits/chosen": 2.125, + "logits/rejected": 2.703125, + "logps/chosen": -592.0, + "logps/rejected": -652.0, + "loss": 0.5988, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9453125, + "rewards/margins": 0.53125, + "rewards/rejected": -1.4765625, + "step": 588 + }, + { + "epoch": 1.2328623757195185, + "grad_norm": 11.859354019165039, + "learning_rate": 3.34481607852876e-07, + "logits/chosen": 2.59375, + "logits/rejected": 2.796875, + "logps/chosen": -486.0, + "logps/rejected": -350.0, + "loss": 0.6067, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.140625, + "rewards/margins": 0.1669921875, + "rewards/rejected": -1.3125, + "step": 589 + }, + { + "epoch": 1.2349555206698064, + "grad_norm": 10.522369384765625, + "learning_rate": 3.3394616898316085e-07, + "logits/chosen": 1.625, + "logits/rejected": 2.203125, + "logps/chosen": -636.0, + "logps/rejected": -528.0, + "loss": 0.6135, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.96484375, + "rewards/margins": 0.119140625, + "rewards/rejected": -1.0859375, + "step": 590 + }, + { + "epoch": 1.2370486656200943, + "grad_norm": 10.819913864135742, + "learning_rate": 3.3341029581954946e-07, + "logits/chosen": 1.6484375, + "logits/rejected": 1.140625, + "logps/chosen": -270.0, + "logps/rejected": -512.0, + "loss": 0.5995, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.296875, + "rewards/margins": 1.078125, + "rewards/rejected": -2.375, + "step": 591 + }, + { + "epoch": 1.239141810570382, + "grad_norm": 10.902482986450195, + "learning_rate": 3.3287399113519706e-07, + "logits/chosen": 2.71875, + "logits/rejected": 3.3125, + "logps/chosen": -752.0, + "logps/rejected": -600.0, + "loss": 0.6019, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.021484375, + "rewards/rejected": -1.15625, + "step": 592 + }, + { + "epoch": 1.2412349555206699, + "grad_norm": 10.952805519104004, + "learning_rate": 3.323372577054924e-07, + "logits/chosen": 3.375, + "logits/rejected": 2.921875, + "logps/chosen": -374.0, + "logps/rejected": -552.0, + "loss": 0.6354, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1640625, + "rewards/margins": -0.0009765625, + "rewards/rejected": -1.1640625, + "step": 593 + }, + { + "epoch": 1.2433281004709575, + "grad_norm": 11.068962097167969, + "learning_rate": 3.318000983080426e-07, + "logits/chosen": 2.203125, + "logits/rejected": 1.65625, + "logps/chosen": -290.0, + "logps/rejected": -444.0, + "loss": 0.5645, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.359375, + "rewards/margins": 0.66015625, + "rewards/rejected": -2.015625, + "step": 594 + }, + { + "epoch": 1.2454212454212454, + "grad_norm": 10.889310836791992, + "learning_rate": 3.312625157226597e-07, + "logits/chosen": 2.015625, + "logits/rejected": 2.875, + "logps/chosen": -524.0, + "logps/rejected": -400.0, + "loss": 0.6028, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9140625, + "rewards/margins": 0.32421875, + "rewards/rejected": -1.234375, + "step": 595 + }, + { + "epoch": 1.247514390371533, + "grad_norm": 11.577537536621094, + "learning_rate": 3.3072451273134497e-07, + "logits/chosen": 2.578125, + "logits/rejected": 2.65625, + "logps/chosen": -700.0, + "logps/rejected": -500.0, + "loss": 0.6479, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.94921875, + "rewards/margins": 0.77734375, + "rewards/rejected": -1.7265625, + "step": 596 + }, + { + "epoch": 1.249607535321821, + "grad_norm": 10.762384414672852, + "learning_rate": 3.3018609211827606e-07, + "logits/chosen": 2.578125, + "logits/rejected": 2.296875, + "logps/chosen": -440.0, + "logps/rejected": -704.0, + "loss": 0.5725, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.203125, + "rewards/margins": 0.396484375, + "rewards/rejected": -1.6015625, + "step": 597 + }, + { + "epoch": 1.251700680272109, + "grad_norm": 10.538718223571777, + "learning_rate": 3.296472566697914e-07, + "logits/chosen": 1.90625, + "logits/rejected": 2.65625, + "logps/chosen": -454.0, + "logps/rejected": -294.0, + "loss": 0.6096, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1875, + "rewards/margins": 0.1826171875, + "rewards/rejected": -1.3671875, + "step": 598 + }, + { + "epoch": 1.2537938252223966, + "grad_norm": 10.4353609085083, + "learning_rate": 3.291080091743762e-07, + "logits/chosen": 1.703125, + "logits/rejected": 3.234375, + "logps/chosen": -656.0, + "logps/rejected": -426.0, + "loss": 0.5777, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.109375, + "rewards/margins": 0.2470703125, + "rewards/rejected": -1.359375, + "step": 599 + }, + { + "epoch": 1.2558869701726845, + "grad_norm": 11.074726104736328, + "learning_rate": 3.2856835242264825e-07, + "logits/chosen": 2.140625, + "logits/rejected": 1.7578125, + "logps/chosen": -458.0, + "logps/rejected": -416.0, + "loss": 0.6433, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9140625, + "rewards/margins": 0.30078125, + "rewards/rejected": -1.2109375, + "step": 600 + }, + { + "epoch": 1.2579801151229724, + "grad_norm": 11.208061218261719, + "learning_rate": 3.2802828920734297e-07, + "logits/chosen": 1.7421875, + "logits/rejected": 1.953125, + "logps/chosen": -450.0, + "logps/rejected": -464.0, + "loss": 0.6085, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9609375, + "rewards/margins": 0.494140625, + "rewards/rejected": -1.453125, + "step": 601 + }, + { + "epoch": 1.26007326007326, + "grad_norm": 10.752561569213867, + "learning_rate": 3.274878223232996e-07, + "logits/chosen": 2.59375, + "logits/rejected": 2.671875, + "logps/chosen": -364.0, + "logps/rejected": -266.0, + "loss": 0.6114, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.25, + "rewards/margins": 0.0263671875, + "rewards/rejected": -1.28125, + "step": 602 + }, + { + "epoch": 1.262166405023548, + "grad_norm": 10.955470085144043, + "learning_rate": 3.269469545674459e-07, + "logits/chosen": 1.359375, + "logits/rejected": 2.09375, + "logps/chosen": -494.0, + "logps/rejected": -372.0, + "loss": 0.6107, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1171875, + "rewards/margins": 0.58203125, + "rewards/rejected": -1.703125, + "step": 603 + }, + { + "epoch": 1.2642595499738356, + "grad_norm": 12.255962371826172, + "learning_rate": 3.2640568873878457e-07, + "logits/chosen": 1.7578125, + "logits/rejected": 2.34375, + "logps/chosen": -540.0, + "logps/rejected": -412.0, + "loss": 0.6545, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.0234375, + "rewards/margins": -0.0986328125, + "rewards/rejected": -0.92578125, + "step": 604 + }, + { + "epoch": 1.2663526949241235, + "grad_norm": 10.650092124938965, + "learning_rate": 3.258640276383781e-07, + "logits/chosen": 1.515625, + "logits/rejected": 1.359375, + "logps/chosen": -224.0, + "logps/rejected": -280.0, + "loss": 0.6096, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1015625, + "rewards/margins": 0.1435546875, + "rewards/rejected": -1.2421875, + "step": 605 + }, + { + "epoch": 1.2684458398744112, + "grad_norm": 10.870160102844238, + "learning_rate": 3.2532197406933475e-07, + "logits/chosen": 2.140625, + "logits/rejected": 2.90625, + "logps/chosen": -560.0, + "logps/rejected": -472.0, + "loss": 0.5933, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.046875, + "rewards/margins": 0.328125, + "rewards/rejected": -1.375, + "step": 606 + }, + { + "epoch": 1.270538984824699, + "grad_norm": 11.42349910736084, + "learning_rate": 3.247795308367936e-07, + "logits/chosen": 2.4375, + "logits/rejected": 2.59375, + "logps/chosen": -376.0, + "logps/rejected": -320.0, + "loss": 0.6287, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.5390625, + "rewards/margins": -0.2255859375, + "rewards/rejected": -1.3125, + "step": 607 + }, + { + "epoch": 1.272632129774987, + "grad_norm": 11.02907943725586, + "learning_rate": 3.242367007479103e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.21875, + "logps/chosen": -492.0, + "logps/rejected": -548.0, + "loss": 0.6036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9140625, + "rewards/margins": 0.70703125, + "rewards/rejected": -1.6171875, + "step": 608 + }, + { + "epoch": 1.2747252747252746, + "grad_norm": 10.953951835632324, + "learning_rate": 3.2369348661184234e-07, + "logits/chosen": 1.34375, + "logits/rejected": 1.5625, + "logps/chosen": -384.0, + "logps/rejected": -372.0, + "loss": 0.5955, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3125, + "rewards/margins": 0.6953125, + "rewards/rejected": -2.0, + "step": 609 + }, + { + "epoch": 1.2768184196755625, + "grad_norm": 10.425783157348633, + "learning_rate": 3.2314989123973505e-07, + "logits/chosen": 1.703125, + "logits/rejected": 1.78125, + "logps/chosen": -234.0, + "logps/rejected": -330.0, + "loss": 0.6275, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.109375, + "rewards/margins": 0.0361328125, + "rewards/rejected": -1.1484375, + "step": 610 + }, + { + "epoch": 1.2789115646258504, + "grad_norm": 9.83671760559082, + "learning_rate": 3.2260591744470634e-07, + "logits/chosen": 2.34375, + "logits/rejected": 1.71875, + "logps/chosen": -488.0, + "logps/rejected": -544.0, + "loss": 0.5903, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6328125, + "rewards/margins": 0.484375, + "rewards/rejected": -1.1171875, + "step": 611 + }, + { + "epoch": 1.281004709576138, + "grad_norm": 11.166946411132812, + "learning_rate": 3.2206156804183277e-07, + "logits/chosen": 1.5625, + "logits/rejected": 1.7734375, + "logps/chosen": -308.0, + "logps/rejected": -308.0, + "loss": 0.5882, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.140625, + "rewards/margins": 0.68359375, + "rewards/rejected": -1.828125, + "step": 612 + }, + { + "epoch": 1.283097854526426, + "grad_norm": 11.616917610168457, + "learning_rate": 3.2151684584813417e-07, + "logits/chosen": 1.78125, + "logits/rejected": 1.3984375, + "logps/chosen": -252.0, + "logps/rejected": -304.0, + "loss": 0.6398, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3125, + "rewards/margins": -0.017578125, + "rewards/rejected": -1.296875, + "step": 613 + }, + { + "epoch": 1.285190999476714, + "grad_norm": 10.931349754333496, + "learning_rate": 3.2097175368256006e-07, + "logits/chosen": 2.203125, + "logits/rejected": 2.546875, + "logps/chosen": -512.0, + "logps/rejected": -444.0, + "loss": 0.5923, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1171875, + "rewards/margins": 0.453125, + "rewards/rejected": -1.5703125, + "step": 614 + }, + { + "epoch": 1.2872841444270016, + "grad_norm": 10.645867347717285, + "learning_rate": 3.204262943659744e-07, + "logits/chosen": 2.46875, + "logits/rejected": 3.015625, + "logps/chosen": -664.0, + "logps/rejected": -576.0, + "loss": 0.5853, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4609375, + "rewards/margins": -0.390625, + "rewards/rejected": -1.0703125, + "step": 615 + }, + { + "epoch": 1.2893772893772895, + "grad_norm": 10.39663028717041, + "learning_rate": 3.1988047072114097e-07, + "logits/chosen": 2.421875, + "logits/rejected": 2.234375, + "logps/chosen": -466.0, + "logps/rejected": -736.0, + "loss": 0.592, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0, + "rewards/margins": 0.130859375, + "rewards/rejected": -1.1328125, + "step": 616 + }, + { + "epoch": 1.2914704343275771, + "grad_norm": 10.462268829345703, + "learning_rate": 3.193342855727095e-07, + "logits/chosen": 1.8515625, + "logits/rejected": 2.53125, + "logps/chosen": -460.0, + "logps/rejected": -452.0, + "loss": 0.5816, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.921875, + "rewards/margins": 0.1083984375, + "rewards/rejected": -1.03125, + "step": 617 + }, + { + "epoch": 1.293563579277865, + "grad_norm": 10.846890449523926, + "learning_rate": 3.187877417471998e-07, + "logits/chosen": 2.03125, + "logits/rejected": 1.9765625, + "logps/chosen": -211.0, + "logps/rejected": -308.0, + "loss": 0.6014, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.84375, + "rewards/margins": 0.3828125, + "rewards/rejected": -1.2265625, + "step": 618 + }, + { + "epoch": 1.2956567242281527, + "grad_norm": 11.082700729370117, + "learning_rate": 3.182408420729884e-07, + "logits/chosen": 2.375, + "logits/rejected": 2.640625, + "logps/chosen": -424.0, + "logps/rejected": -436.0, + "loss": 0.6238, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.328125, + "rewards/margins": 0.341796875, + "rewards/rejected": -1.671875, + "step": 619 + }, + { + "epoch": 1.2977498691784406, + "grad_norm": 10.534208297729492, + "learning_rate": 3.17693589380293e-07, + "logits/chosen": 3.5, + "logits/rejected": 2.609375, + "logps/chosen": -444.0, + "logps/rejected": -656.0, + "loss": 0.6237, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.3828125, + "rewards/margins": -0.2451171875, + "rewards/rejected": -1.1328125, + "step": 620 + }, + { + "epoch": 1.2998430141287285, + "grad_norm": 12.856078147888184, + "learning_rate": 3.1714598650115853e-07, + "logits/chosen": 2.46875, + "logits/rejected": 2.265625, + "logps/chosen": -456.0, + "logps/rejected": -604.0, + "loss": 0.6824, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.375, + "rewards/margins": 0.26171875, + "rewards/rejected": -1.640625, + "step": 621 + }, + { + "epoch": 1.3019361590790162, + "grad_norm": 10.476717948913574, + "learning_rate": 3.1659803626944175e-07, + "logits/chosen": 1.3203125, + "logits/rejected": 1.2734375, + "logps/chosen": -248.0, + "logps/rejected": -306.0, + "loss": 0.6038, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0546875, + "rewards/margins": 0.080078125, + "rewards/rejected": -1.140625, + "step": 622 + }, + { + "epoch": 1.304029304029304, + "grad_norm": 10.692028045654297, + "learning_rate": 3.1604974152079724e-07, + "logits/chosen": 1.0546875, + "logits/rejected": 1.234375, + "logps/chosen": -328.0, + "logps/rejected": -388.0, + "loss": 0.6181, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.09375, + "rewards/margins": 0.1826171875, + "rewards/rejected": -1.28125, + "step": 623 + }, + { + "epoch": 1.306122448979592, + "grad_norm": 10.150449752807617, + "learning_rate": 3.155011050926624e-07, + "logits/chosen": 1.796875, + "logits/rejected": 1.9765625, + "logps/chosen": -434.0, + "logps/rejected": -304.0, + "loss": 0.5883, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8828125, + "rewards/margins": -0.0341796875, + "rewards/rejected": -0.84765625, + "step": 624 + }, + { + "epoch": 1.3082155939298796, + "grad_norm": 11.333098411560059, + "learning_rate": 3.1495212982424283e-07, + "logits/chosen": 1.359375, + "logits/rejected": 2.15625, + "logps/chosen": -540.0, + "logps/rejected": -342.0, + "loss": 0.6208, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0625, + "rewards/margins": -0.3828125, + "rewards/rejected": -1.6875, + "step": 625 + }, + { + "epoch": 1.3103087388801675, + "grad_norm": 10.467708587646484, + "learning_rate": 3.1440281855649764e-07, + "logits/chosen": 2.640625, + "logits/rejected": 2.296875, + "logps/chosen": -520.0, + "logps/rejected": -528.0, + "loss": 0.5716, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3671875, + "rewards/margins": -0.056640625, + "rewards/rejected": -1.3125, + "step": 626 + }, + { + "epoch": 1.3124018838304552, + "grad_norm": 10.712902069091797, + "learning_rate": 3.138531741321246e-07, + "logits/chosen": 2.078125, + "logits/rejected": 1.734375, + "logps/chosen": -312.0, + "logps/rejected": -600.0, + "loss": 0.577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.91796875, + "rewards/margins": 0.2109375, + "rewards/rejected": -1.125, + "step": 627 + }, + { + "epoch": 1.314495028780743, + "grad_norm": 10.024105072021484, + "learning_rate": 3.1330319939554585e-07, + "logits/chosen": 0.46875, + "logits/rejected": 0.92578125, + "logps/chosen": -296.0, + "logps/rejected": -364.0, + "loss": 0.5768, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1015625, + "rewards/margins": 0.1025390625, + "rewards/rejected": -1.203125, + "step": 628 + }, + { + "epoch": 1.3165881737310308, + "grad_norm": 11.217373847961426, + "learning_rate": 3.1275289719289266e-07, + "logits/chosen": 2.59375, + "logits/rejected": 3.640625, + "logps/chosen": -944.0, + "logps/rejected": -416.0, + "loss": 0.6388, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.828125, + "rewards/margins": 0.453125, + "rewards/rejected": -1.28125, + "step": 629 + }, + { + "epoch": 1.3186813186813187, + "grad_norm": 10.900577545166016, + "learning_rate": 3.122022703719912e-07, + "logits/chosen": 2.1875, + "logits/rejected": 2.484375, + "logps/chosen": -476.0, + "logps/rejected": -506.0, + "loss": 0.6337, + "rewards/accuracies": 0.0, + "rewards/chosen": -1.6171875, + "rewards/margins": -0.7265625, + "rewards/rejected": -0.890625, + "step": 630 + }, + { + "epoch": 1.3207744636316066, + "grad_norm": 10.464300155639648, + "learning_rate": 3.116513217823471e-07, + "logits/chosen": 2.390625, + "logits/rejected": 3.21875, + "logps/chosen": -612.0, + "logps/rejected": -406.0, + "loss": 0.5849, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.46875, + "rewards/rejected": -1.6015625, + "step": 631 + }, + { + "epoch": 1.3228676085818942, + "grad_norm": 10.725011825561523, + "learning_rate": 3.111000542751317e-07, + "logits/chosen": 1.0859375, + "logits/rejected": 1.25, + "logps/chosen": -568.0, + "logps/rejected": -500.0, + "loss": 0.6162, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1875, + "rewards/margins": 0.439453125, + "rewards/rejected": -1.625, + "step": 632 + }, + { + "epoch": 1.3249607535321821, + "grad_norm": 10.770613670349121, + "learning_rate": 3.105484707031663e-07, + "logits/chosen": 1.3125, + "logits/rejected": 1.6171875, + "logps/chosen": -442.0, + "logps/rejected": -392.0, + "loss": 0.6139, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.546875, + "rewards/margins": -0.4375, + "rewards/rejected": -1.1015625, + "step": 633 + }, + { + "epoch": 1.32705389848247, + "grad_norm": 11.20041561126709, + "learning_rate": 3.0999657392090826e-07, + "logits/chosen": 3.21875, + "logits/rejected": 2.515625, + "logps/chosen": -536.0, + "logps/rejected": -688.0, + "loss": 0.6004, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8203125, + "rewards/margins": -0.03271484375, + "rewards/rejected": -0.7890625, + "step": 634 + }, + { + "epoch": 1.3291470434327577, + "grad_norm": 10.969812393188477, + "learning_rate": 3.0944436678443526e-07, + "logits/chosen": 1.625, + "logits/rejected": 2.75, + "logps/chosen": -284.0, + "logps/rejected": -392.0, + "loss": 0.6037, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0625, + "rewards/margins": 0.078125, + "rewards/rejected": -1.140625, + "step": 635 + }, + { + "epoch": 1.3312401883830456, + "grad_norm": 12.59915828704834, + "learning_rate": 3.088918521514317e-07, + "logits/chosen": 1.8671875, + "logits/rejected": 1.46875, + "logps/chosen": -324.0, + "logps/rejected": -368.0, + "loss": 0.6147, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.015625, + "rewards/margins": -0.044921875, + "rewards/rejected": -0.96875, + "step": 636 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 12.617871284484863, + "learning_rate": 3.083390328811726e-07, + "logits/chosen": 2.125, + "logits/rejected": 2.84375, + "logps/chosen": -398.0, + "logps/rejected": -328.0, + "loss": 0.6532, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.34375, + "rewards/margins": 0.294921875, + "rewards/rejected": -1.640625, + "step": 637 + }, + { + "epoch": 1.3354264782836212, + "grad_norm": 11.845966339111328, + "learning_rate": 3.077859118345102e-07, + "logits/chosen": 1.59375, + "logits/rejected": 2.5625, + "logps/chosen": -388.0, + "logps/rejected": -251.0, + "loss": 0.6508, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.28125, + "rewards/margins": -0.1728515625, + "rewards/rejected": -1.109375, + "step": 638 + }, + { + "epoch": 1.3375196232339088, + "grad_norm": 11.422259330749512, + "learning_rate": 3.072324918738579e-07, + "logits/chosen": 1.8359375, + "logits/rejected": 2.046875, + "logps/chosen": -390.0, + "logps/rejected": -414.0, + "loss": 0.6063, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.95703125, + "rewards/margins": 0.625, + "rewards/rejected": -1.578125, + "step": 639 + }, + { + "epoch": 1.3396127681841967, + "grad_norm": 9.103655815124512, + "learning_rate": 3.066787758631763e-07, + "logits/chosen": 1.8984375, + "logits/rejected": 2.234375, + "logps/chosen": -528.0, + "logps/rejected": -428.0, + "loss": 0.5832, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.59375, + "rewards/margins": -0.32421875, + "rewards/rejected": -1.2734375, + "step": 640 + }, + { + "epoch": 1.3417059131344846, + "grad_norm": 11.316737174987793, + "learning_rate": 3.0612476666795776e-07, + "logits/chosen": 1.421875, + "logits/rejected": 1.1484375, + "logps/chosen": -368.0, + "logps/rejected": -556.0, + "loss": 0.6204, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.6171875, + "rewards/rejected": -1.75, + "step": 641 + }, + { + "epoch": 1.3437990580847723, + "grad_norm": 11.12604808807373, + "learning_rate": 3.055704671552122e-07, + "logits/chosen": 2.0625, + "logits/rejected": 2.421875, + "logps/chosen": -456.0, + "logps/rejected": -362.0, + "loss": 0.5931, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.3671875, + "rewards/margins": -0.30859375, + "rewards/rejected": -1.0546875, + "step": 642 + }, + { + "epoch": 1.3458922030350602, + "grad_norm": 11.656798362731934, + "learning_rate": 3.0501588019345174e-07, + "logits/chosen": 2.25, + "logits/rejected": 2.90625, + "logps/chosen": -502.0, + "logps/rejected": -408.0, + "loss": 0.6395, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.234375, + "rewards/margins": 0.1474609375, + "rewards/rejected": -1.375, + "step": 643 + }, + { + "epoch": 1.347985347985348, + "grad_norm": 10.157209396362305, + "learning_rate": 3.0446100865267617e-07, + "logits/chosen": 2.3125, + "logits/rejected": 2.21875, + "logps/chosen": -516.0, + "logps/rejected": -704.0, + "loss": 0.5799, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3125, + "rewards/margins": 0.337890625, + "rewards/rejected": -1.6484375, + "step": 644 + }, + { + "epoch": 1.3500784929356358, + "grad_norm": 10.407575607299805, + "learning_rate": 3.039058554043579e-07, + "logits/chosen": 2.046875, + "logits/rejected": 2.75, + "logps/chosen": -482.0, + "logps/rejected": -474.0, + "loss": 0.5835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.921875, + "rewards/margins": 0.6171875, + "rewards/rejected": -1.546875, + "step": 645 + }, + { + "epoch": 1.3521716378859236, + "grad_norm": 10.747139930725098, + "learning_rate": 3.0335042332142706e-07, + "logits/chosen": 1.609375, + "logits/rejected": 1.5390625, + "logps/chosen": -372.0, + "logps/rejected": -227.0, + "loss": 0.6214, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.515625, + "rewards/margins": -0.322265625, + "rewards/rejected": -1.1875, + "step": 646 + }, + { + "epoch": 1.3542647828362115, + "grad_norm": 10.498771667480469, + "learning_rate": 3.0279471527825713e-07, + "logits/chosen": 2.0625, + "logits/rejected": 1.9140625, + "logps/chosen": -412.0, + "logps/rejected": -512.0, + "loss": 0.587, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1171875, + "rewards/margins": 0.33203125, + "rewards/rejected": -1.453125, + "step": 647 + }, + { + "epoch": 1.3563579277864992, + "grad_norm": 10.430938720703125, + "learning_rate": 3.022387341506493e-07, + "logits/chosen": 2.53125, + "logits/rejected": 2.6875, + "logps/chosen": -612.0, + "logps/rejected": -704.0, + "loss": 0.6009, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.953125, + "rewards/margins": -0.43359375, + "rewards/rejected": -1.5234375, + "step": 648 + }, + { + "epoch": 1.358451072736787, + "grad_norm": 10.411450386047363, + "learning_rate": 3.016824828158182e-07, + "logits/chosen": 1.90625, + "logits/rejected": 2.765625, + "logps/chosen": -320.0, + "logps/rejected": -362.0, + "loss": 0.5785, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6953125, + "rewards/margins": 0.8515625, + "rewards/rejected": -1.546875, + "step": 649 + }, + { + "epoch": 1.3605442176870748, + "grad_norm": 10.431605339050293, + "learning_rate": 3.0112596415237685e-07, + "logits/chosen": 1.5, + "logits/rejected": 1.5390625, + "logps/chosen": -440.0, + "logps/rejected": -506.0, + "loss": 0.5886, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.98046875, + "rewards/margins": 0.53515625, + "rewards/rejected": -1.515625, + "step": 650 + }, + { + "epoch": 1.3626373626373627, + "grad_norm": 10.654319763183594, + "learning_rate": 3.0056918104032135e-07, + "logits/chosen": 1.2265625, + "logits/rejected": 1.140625, + "logps/chosen": -253.0, + "logps/rejected": -406.0, + "loss": 0.5936, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.31640625, + "rewards/rejected": -1.46875, + "step": 651 + }, + { + "epoch": 1.3647305075876504, + "grad_norm": 11.069323539733887, + "learning_rate": 3.000121363610167e-07, + "logits/chosen": 1.640625, + "logits/rejected": 2.0625, + "logps/chosen": -253.0, + "logps/rejected": -231.0, + "loss": 0.616, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9609375, + "rewards/margins": 0.0654296875, + "rewards/rejected": -1.0234375, + "step": 652 + }, + { + "epoch": 1.3668236525379382, + "grad_norm": 11.488618850708008, + "learning_rate": 2.994548329971814e-07, + "logits/chosen": 1.640625, + "logits/rejected": 2.828125, + "logps/chosen": -620.0, + "logps/rejected": -424.0, + "loss": 0.6375, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.88671875, + "rewards/margins": 0.1123046875, + "rewards/rejected": -1.0, + "step": 653 + }, + { + "epoch": 1.3689167974882261, + "grad_norm": 10.386863708496094, + "learning_rate": 2.988972738328724e-07, + "logits/chosen": 1.828125, + "logits/rejected": 2.21875, + "logps/chosen": -502.0, + "logps/rejected": -322.0, + "loss": 0.6062, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5, + "rewards/margins": -0.052734375, + "rewards/rejected": -1.4453125, + "step": 654 + }, + { + "epoch": 1.3710099424385138, + "grad_norm": 11.457762718200684, + "learning_rate": 2.98339461753471e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.796875, + "logps/chosen": -540.0, + "logps/rejected": -438.0, + "loss": 0.5958, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.84375, + "rewards/margins": 0.52734375, + "rewards/rejected": -1.375, + "step": 655 + }, + { + "epoch": 1.3731030873888017, + "grad_norm": 10.398492813110352, + "learning_rate": 2.9778139964566675e-07, + "logits/chosen": 2.546875, + "logits/rejected": 2.96875, + "logps/chosen": -672.0, + "logps/rejected": -684.0, + "loss": 0.5755, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.328125, + "rewards/margins": 0.375, + "rewards/rejected": -1.703125, + "step": 656 + }, + { + "epoch": 1.3751962323390896, + "grad_norm": 11.246747970581055, + "learning_rate": 2.972230903974433e-07, + "logits/chosen": 2.078125, + "logits/rejected": 1.984375, + "logps/chosen": -394.0, + "logps/rejected": -366.0, + "loss": 0.6048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.76171875, + "rewards/margins": 0.2734375, + "rewards/rejected": -1.03125, + "step": 657 + }, + { + "epoch": 1.3772893772893773, + "grad_norm": 10.025164604187012, + "learning_rate": 2.9666453689806345e-07, + "logits/chosen": 1.6640625, + "logits/rejected": 1.78125, + "logps/chosen": -438.0, + "logps/rejected": -302.0, + "loss": 0.6108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.76953125, + "rewards/margins": 0.455078125, + "rewards/rejected": -1.2265625, + "step": 658 + }, + { + "epoch": 1.3793825222396652, + "grad_norm": 10.379528045654297, + "learning_rate": 2.961057420380538e-07, + "logits/chosen": 2.546875, + "logits/rejected": 2.4375, + "logps/chosen": -904.0, + "logps/rejected": -712.0, + "loss": 0.5591, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.40625, + "rewards/rejected": -1.5390625, + "step": 659 + }, + { + "epoch": 1.3814756671899528, + "grad_norm": 10.235441207885742, + "learning_rate": 2.9554670870919e-07, + "logits/chosen": 2.21875, + "logits/rejected": 2.765625, + "logps/chosen": -354.0, + "logps/rejected": -380.0, + "loss": 0.569, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.2109375, + "rewards/margins": -0.0458984375, + "rewards/rejected": -1.1640625, + "step": 660 + }, + { + "epoch": 1.3835688121402407, + "grad_norm": 10.565560340881348, + "learning_rate": 2.949874398044818e-07, + "logits/chosen": 1.75, + "logits/rejected": 1.5546875, + "logps/chosen": -510.0, + "logps/rejected": -556.0, + "loss": 0.6289, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5234375, + "rewards/margins": 0.40234375, + "rewards/rejected": -1.9296875, + "step": 661 + }, + { + "epoch": 1.3856619570905284, + "grad_norm": 9.682659149169922, + "learning_rate": 2.944279382181582e-07, + "logits/chosen": 2.625, + "logits/rejected": 2.6875, + "logps/chosen": -532.0, + "logps/rejected": -408.0, + "loss": 0.5742, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.97265625, + "rewards/margins": 0.396484375, + "rewards/rejected": -1.3671875, + "step": 662 + }, + { + "epoch": 1.3877551020408163, + "grad_norm": 10.06523609161377, + "learning_rate": 2.938682068456522e-07, + "logits/chosen": 1.859375, + "logits/rejected": 2.09375, + "logps/chosen": -406.0, + "logps/rejected": -432.0, + "loss": 0.5693, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.5234375, + "rewards/rejected": -1.65625, + "step": 663 + }, + { + "epoch": 1.3898482469911042, + "grad_norm": 11.815163612365723, + "learning_rate": 2.9330824858358587e-07, + "logits/chosen": 2.0, + "logits/rejected": 2.28125, + "logps/chosen": -376.0, + "logps/rejected": -362.0, + "loss": 0.6123, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3046875, + "rewards/margins": 0.14453125, + "rewards/rejected": -1.453125, + "step": 664 + }, + { + "epoch": 1.3919413919413919, + "grad_norm": 11.66609001159668, + "learning_rate": 2.9274806632975575e-07, + "logits/chosen": 2.390625, + "logits/rejected": 2.515625, + "logps/chosen": -414.0, + "logps/rejected": -496.0, + "loss": 0.6305, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.80859375, + "rewards/margins": 0.515625, + "rewards/rejected": -1.3203125, + "step": 665 + }, + { + "epoch": 1.3940345368916798, + "grad_norm": 10.703742027282715, + "learning_rate": 2.92187662983117e-07, + "logits/chosen": 2.75, + "logits/rejected": 3.0, + "logps/chosen": -588.0, + "logps/rejected": -520.0, + "loss": 0.6147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.796875, + "rewards/margins": 0.59375, + "rewards/rejected": -1.390625, + "step": 666 + }, + { + "epoch": 1.3961276818419677, + "grad_norm": 10.139862060546875, + "learning_rate": 2.916270414437696e-07, + "logits/chosen": 2.015625, + "logits/rejected": 2.15625, + "logps/chosen": -458.0, + "logps/rejected": -428.0, + "loss": 0.586, + "rewards/accuracies": 0.0, + "rewards/chosen": -1.203125, + "rewards/margins": -0.15625, + "rewards/rejected": -1.046875, + "step": 667 + }, + { + "epoch": 1.3982208267922553, + "grad_norm": 10.547820091247559, + "learning_rate": 2.9106620461294223e-07, + "logits/chosen": 1.53125, + "logits/rejected": 1.265625, + "logps/chosen": -249.0, + "logps/rejected": -484.0, + "loss": 0.602, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1796875, + "rewards/margins": 0.470703125, + "rewards/rejected": -1.65625, + "step": 668 + }, + { + "epoch": 1.4003139717425432, + "grad_norm": 10.163110733032227, + "learning_rate": 2.905051553929778e-07, + "logits/chosen": 1.828125, + "logits/rejected": 2.578125, + "logps/chosen": -760.0, + "logps/rejected": -424.0, + "loss": 0.5665, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.83984375, + "rewards/margins": 0.359375, + "rewards/rejected": -1.1953125, + "step": 669 + }, + { + "epoch": 1.402407116692831, + "grad_norm": 10.655150413513184, + "learning_rate": 2.899438966873183e-07, + "logits/chosen": 2.375, + "logits/rejected": 2.03125, + "logps/chosen": -382.0, + "logps/rejected": -540.0, + "loss": 0.626, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.40625, + "rewards/margins": -0.1181640625, + "rewards/rejected": -1.2890625, + "step": 670 + }, + { + "epoch": 1.4045002616431188, + "grad_norm": 10.15519905090332, + "learning_rate": 2.8938243140049003e-07, + "logits/chosen": 0.87109375, + "logits/rejected": 1.0859375, + "logps/chosen": -200.0, + "logps/rejected": -216.0, + "loss": 0.5746, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.91796875, + "rewards/margins": 0.20703125, + "rewards/rejected": -1.125, + "step": 671 + }, + { + "epoch": 1.4065934065934065, + "grad_norm": 11.310110092163086, + "learning_rate": 2.8882076243808817e-07, + "logits/chosen": 1.859375, + "logits/rejected": 2.59375, + "logps/chosen": -652.0, + "logps/rejected": -532.0, + "loss": 0.5545, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4140625, + "rewards/margins": 0.46484375, + "rewards/rejected": -1.8828125, + "step": 672 + }, + { + "epoch": 1.4086865515436944, + "grad_norm": 10.634568214416504, + "learning_rate": 2.8825889270676193e-07, + "logits/chosen": 1.4765625, + "logits/rejected": 1.578125, + "logps/chosen": -251.0, + "logps/rejected": -306.0, + "loss": 0.6162, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.171875, + "rewards/margins": 0.234375, + "rewards/rejected": -1.40625, + "step": 673 + }, + { + "epoch": 1.4107796964939823, + "grad_norm": 10.30864429473877, + "learning_rate": 2.8769682511419946e-07, + "logits/chosen": 2.5625, + "logits/rejected": 2.96875, + "logps/chosen": -564.0, + "logps/rejected": -436.0, + "loss": 0.6162, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0234375, + "rewards/margins": 0.4140625, + "rewards/rejected": -1.4375, + "step": 674 + }, + { + "epoch": 1.41287284144427, + "grad_norm": 10.896045684814453, + "learning_rate": 2.8713456256911306e-07, + "logits/chosen": 2.84375, + "logits/rejected": 1.96875, + "logps/chosen": -596.0, + "logps/rejected": -748.0, + "loss": 0.576, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.109375, + "rewards/margins": 0.462890625, + "rewards/rejected": -1.5703125, + "step": 675 + }, + { + "epoch": 1.4149659863945578, + "grad_norm": 10.742579460144043, + "learning_rate": 2.8657210798122374e-07, + "logits/chosen": 2.53125, + "logits/rejected": 2.4375, + "logps/chosen": -752.0, + "logps/rejected": -628.0, + "loss": 0.5836, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.234375, + "rewards/margins": 0.59375, + "rewards/rejected": -1.828125, + "step": 676 + }, + { + "epoch": 1.4170591313448457, + "grad_norm": 10.400110244750977, + "learning_rate": 2.860094642612463e-07, + "logits/chosen": 1.875, + "logits/rejected": 1.5859375, + "logps/chosen": -520.0, + "logps/rejected": -482.0, + "loss": 0.5986, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.3984375, + "rewards/margins": -0.35546875, + "rewards/rejected": -1.046875, + "step": 677 + }, + { + "epoch": 1.4191522762951334, + "grad_norm": 10.457825660705566, + "learning_rate": 2.854466343208745e-07, + "logits/chosen": 1.640625, + "logits/rejected": 2.5, + "logps/chosen": -600.0, + "logps/rejected": -384.0, + "loss": 0.5562, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.21875, + "rewards/margins": 0.17578125, + "rewards/rejected": -1.3984375, + "step": 678 + }, + { + "epoch": 1.4212454212454213, + "grad_norm": 9.994818687438965, + "learning_rate": 2.848836210727655e-07, + "logits/chosen": 1.796875, + "logits/rejected": 1.4296875, + "logps/chosen": -414.0, + "logps/rejected": -426.0, + "loss": 0.5831, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.203125, + "rewards/margins": 0.1728515625, + "rewards/rejected": -1.3828125, + "step": 679 + }, + { + "epoch": 1.423338566195709, + "grad_norm": 11.893540382385254, + "learning_rate": 2.843204274305253e-07, + "logits/chosen": 2.328125, + "logits/rejected": 2.8125, + "logps/chosen": -576.0, + "logps/rejected": -450.0, + "loss": 0.645, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1640625, + "rewards/margins": 0.0302734375, + "rewards/rejected": -1.1875, + "step": 680 + }, + { + "epoch": 1.4254317111459969, + "grad_norm": 11.3270902633667, + "learning_rate": 2.837570563086935e-07, + "logits/chosen": 1.9375, + "logits/rejected": 1.5, + "logps/chosen": -249.0, + "logps/rejected": -394.0, + "loss": 0.6374, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.234375, + "rewards/margins": 0.28125, + "rewards/rejected": -1.515625, + "step": 681 + }, + { + "epoch": 1.4275248560962845, + "grad_norm": 11.636581420898438, + "learning_rate": 2.8319351062272794e-07, + "logits/chosen": 2.59375, + "logits/rejected": 2.546875, + "logps/chosen": -380.0, + "logps/rejected": -520.0, + "loss": 0.6472, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.89453125, + "rewards/margins": 0.6171875, + "rewards/rejected": -1.515625, + "step": 682 + }, + { + "epoch": 1.4296180010465724, + "grad_norm": 10.399662971496582, + "learning_rate": 2.8262979328899004e-07, + "logits/chosen": 1.9140625, + "logits/rejected": 1.84375, + "logps/chosen": -800.0, + "logps/rejected": -716.0, + "loss": 0.6063, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.875, + "rewards/margins": 0.328125, + "rewards/rejected": -1.203125, + "step": 683 + }, + { + "epoch": 1.4317111459968603, + "grad_norm": 11.006073951721191, + "learning_rate": 2.820659072247294e-07, + "logits/chosen": 1.796875, + "logits/rejected": 1.9921875, + "logps/chosen": -304.0, + "logps/rejected": -352.0, + "loss": 0.6032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9609375, + "rewards/margins": 0.2197265625, + "rewards/rejected": -1.1796875, + "step": 684 + }, + { + "epoch": 1.433804290947148, + "grad_norm": 11.377279281616211, + "learning_rate": 2.8150185534806863e-07, + "logits/chosen": 2.03125, + "logits/rejected": 2.8125, + "logps/chosen": -664.0, + "logps/rejected": -344.0, + "loss": 0.6, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3203125, + "rewards/margins": 0.109375, + "rewards/rejected": -1.4296875, + "step": 685 + }, + { + "epoch": 1.435897435897436, + "grad_norm": 11.495552062988281, + "learning_rate": 2.8093764057798885e-07, + "logits/chosen": 2.765625, + "logits/rejected": 3.203125, + "logps/chosen": -980.0, + "logps/rejected": -768.0, + "loss": 0.6084, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5234375, + "rewards/margins": 0.0234375, + "rewards/rejected": -1.546875, + "step": 686 + }, + { + "epoch": 1.4379905808477238, + "grad_norm": 11.461719512939453, + "learning_rate": 2.803732658343138e-07, + "logits/chosen": 2.34375, + "logits/rejected": 2.984375, + "logps/chosen": -478.0, + "logps/rejected": -452.0, + "loss": 0.5996, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1171875, + "rewards/margins": 0.2109375, + "rewards/rejected": -1.328125, + "step": 687 + }, + { + "epoch": 1.4400837257980115, + "grad_norm": 12.534832954406738, + "learning_rate": 2.7980873403769506e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.09375, + "logps/chosen": -948.0, + "logps/rejected": -548.0, + "loss": 0.6669, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.890625, + "rewards/margins": 1.0859375, + "rewards/rejected": -1.9765625, + "step": 688 + }, + { + "epoch": 1.4421768707482994, + "grad_norm": 10.652071952819824, + "learning_rate": 2.792440481095974e-07, + "logits/chosen": 2.21875, + "logits/rejected": 2.046875, + "logps/chosen": -286.0, + "logps/rejected": -532.0, + "loss": 0.5648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9609375, + "rewards/margins": 0.443359375, + "rewards/rejected": -1.40625, + "step": 689 + }, + { + "epoch": 1.4442700156985873, + "grad_norm": 11.058365821838379, + "learning_rate": 2.786792109722827e-07, + "logits/chosen": 1.9375, + "logits/rejected": 2.40625, + "logps/chosen": -540.0, + "logps/rejected": -446.0, + "loss": 0.5799, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.03125, + "rewards/margins": 0.294921875, + "rewards/rejected": -1.328125, + "step": 690 + }, + { + "epoch": 1.446363160648875, + "grad_norm": 10.38504695892334, + "learning_rate": 2.7811422554879563e-07, + "logits/chosen": 2.59375, + "logits/rejected": 2.984375, + "logps/chosen": -1072.0, + "logps/rejected": -688.0, + "loss": 0.6002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.828125, + "rewards/margins": 1.1640625, + "rewards/rejected": -1.9921875, + "step": 691 + }, + { + "epoch": 1.4484563055991626, + "grad_norm": 10.090402603149414, + "learning_rate": 2.7754909476294824e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.78125, + "logps/chosen": -592.0, + "logps/rejected": -612.0, + "loss": 0.6002, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.984375, + "rewards/margins": 0.1787109375, + "rewards/rejected": -1.1640625, + "step": 692 + }, + { + "epoch": 1.4505494505494505, + "grad_norm": 10.569539070129395, + "learning_rate": 2.769838215393047e-07, + "logits/chosen": 1.8515625, + "logits/rejected": 2.625, + "logps/chosen": -498.0, + "logps/rejected": -552.0, + "loss": 0.6024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0625, + "rewards/margins": 0.734375, + "rewards/rejected": -1.796875, + "step": 693 + }, + { + "epoch": 1.4526425954997384, + "grad_norm": 11.858864784240723, + "learning_rate": 2.7641840880316647e-07, + "logits/chosen": 1.5625, + "logits/rejected": 1.4140625, + "logps/chosen": -239.0, + "logps/rejected": -350.0, + "loss": 0.6419, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.890625, + "rewards/margins": 0.26953125, + "rewards/rejected": -1.15625, + "step": 694 + }, + { + "epoch": 1.454735740450026, + "grad_norm": 11.208900451660156, + "learning_rate": 2.758528594805568e-07, + "logits/chosen": 1.6484375, + "logits/rejected": 1.8203125, + "logps/chosen": -428.0, + "logps/rejected": -512.0, + "loss": 0.6163, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3984375, + "rewards/margins": 0.2333984375, + "rewards/rejected": -1.6328125, + "step": 695 + }, + { + "epoch": 1.456828885400314, + "grad_norm": 10.76639461517334, + "learning_rate": 2.7528717649820604e-07, + "logits/chosen": 1.6171875, + "logits/rejected": 2.140625, + "logps/chosen": -400.0, + "logps/rejected": -280.0, + "loss": 0.5738, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.2265625, + "rewards/rejected": -1.359375, + "step": 696 + }, + { + "epoch": 1.4589220303506019, + "grad_norm": 11.675130844116211, + "learning_rate": 2.7472136278353584e-07, + "logits/chosen": 2.609375, + "logits/rejected": 2.4375, + "logps/chosen": -324.0, + "logps/rejected": -624.0, + "loss": 0.5779, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.34375, + "rewards/margins": 0.466796875, + "rewards/rejected": -1.8125, + "step": 697 + }, + { + "epoch": 1.4610151753008895, + "grad_norm": 11.958902359008789, + "learning_rate": 2.741554212646449e-07, + "logits/chosen": 2.359375, + "logits/rejected": 2.40625, + "logps/chosen": -612.0, + "logps/rejected": -648.0, + "loss": 0.6082, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3125, + "rewards/margins": 0.00390625, + "rewards/rejected": -1.3125, + "step": 698 + }, + { + "epoch": 1.4631083202511774, + "grad_norm": 11.79665470123291, + "learning_rate": 2.735893548702928e-07, + "logits/chosen": 2.375, + "logits/rejected": 2.0625, + "logps/chosen": -344.0, + "logps/rejected": -520.0, + "loss": 0.6233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.99609375, + "rewards/margins": 0.431640625, + "rewards/rejected": -1.4296875, + "step": 699 + }, + { + "epoch": 1.4652014652014653, + "grad_norm": 10.444296836853027, + "learning_rate": 2.730231665298857e-07, + "logits/chosen": 2.328125, + "logits/rejected": 2.8125, + "logps/chosen": -536.0, + "logps/rejected": -390.0, + "loss": 0.6263, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.078125, + "rewards/margins": 0.25390625, + "rewards/rejected": -1.328125, + "step": 700 + }, + { + "epoch": 1.467294610151753, + "grad_norm": 10.155138969421387, + "learning_rate": 2.724568591734607e-07, + "logits/chosen": 3.03125, + "logits/rejected": 2.9375, + "logps/chosen": -548.0, + "logps/rejected": -652.0, + "loss": 0.6076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.140625, + "rewards/margins": 0.9375, + "rewards/rejected": -2.078125, + "step": 701 + }, + { + "epoch": 1.469387755102041, + "grad_norm": 11.027820587158203, + "learning_rate": 2.7189043573167084e-07, + "logits/chosen": 2.5625, + "logits/rejected": 2.265625, + "logps/chosen": -588.0, + "logps/rejected": -584.0, + "loss": 0.5687, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.046875, + "rewards/margins": 0.30859375, + "rewards/rejected": -1.3515625, + "step": 702 + }, + { + "epoch": 1.4714809000523286, + "grad_norm": 9.480152130126953, + "learning_rate": 2.7132389913576983e-07, + "logits/chosen": 2.015625, + "logits/rejected": 2.578125, + "logps/chosen": -452.0, + "logps/rejected": -362.0, + "loss": 0.5668, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.578125, + "rewards/rejected": -1.7109375, + "step": 703 + }, + { + "epoch": 1.4735740450026165, + "grad_norm": 11.405426025390625, + "learning_rate": 2.7075725231759713e-07, + "logits/chosen": 2.296875, + "logits/rejected": 3.234375, + "logps/chosen": -592.0, + "logps/rejected": -476.0, + "loss": 0.5817, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9765625, + "rewards/margins": 0.609375, + "rewards/rejected": -1.5859375, + "step": 704 + }, + { + "epoch": 1.4756671899529041, + "grad_norm": 10.888693809509277, + "learning_rate": 2.701904982095625e-07, + "logits/chosen": 2.203125, + "logits/rejected": 2.578125, + "logps/chosen": -464.0, + "logps/rejected": -438.0, + "loss": 0.5896, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.921875, + "rewards/margins": 0.53515625, + "rewards/rejected": -1.453125, + "step": 705 + }, + { + "epoch": 1.477760334903192, + "grad_norm": 10.568636894226074, + "learning_rate": 2.696236397446308e-07, + "logits/chosen": 1.5078125, + "logits/rejected": 1.640625, + "logps/chosen": -334.0, + "logps/rejected": -332.0, + "loss": 0.6034, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.1923828125, + "rewards/rejected": -1.34375, + "step": 706 + }, + { + "epoch": 1.47985347985348, + "grad_norm": 10.257476806640625, + "learning_rate": 2.6905667985630703e-07, + "logits/chosen": 2.0, + "logits/rejected": 1.4140625, + "logps/chosen": -316.0, + "logps/rejected": -656.0, + "loss": 0.5853, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.015625, + "rewards/margins": 0.671875, + "rewards/rejected": -1.6875, + "step": 707 + }, + { + "epoch": 1.4819466248037676, + "grad_norm": 10.183833122253418, + "learning_rate": 2.684896214786214e-07, + "logits/chosen": 2.75, + "logits/rejected": 2.296875, + "logps/chosen": -696.0, + "logps/rejected": -468.0, + "loss": 0.5695, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3515625, + "rewards/margins": 0.365234375, + "rewards/rejected": -1.71875, + "step": 708 + }, + { + "epoch": 1.4840397697540555, + "grad_norm": 11.091374397277832, + "learning_rate": 2.6792246754611315e-07, + "logits/chosen": 1.6640625, + "logits/rejected": 1.796875, + "logps/chosen": -400.0, + "logps/rejected": -434.0, + "loss": 0.5943, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3203125, + "rewards/margins": 0.453125, + "rewards/rejected": -1.7734375, + "step": 709 + }, + { + "epoch": 1.4861329147043434, + "grad_norm": 10.51530647277832, + "learning_rate": 2.673552209938165e-07, + "logits/chosen": 1.25, + "logits/rejected": 2.15625, + "logps/chosen": -524.0, + "logps/rejected": -376.0, + "loss": 0.5623, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.578125, + "rewards/margins": 0.37890625, + "rewards/rejected": -1.953125, + "step": 710 + }, + { + "epoch": 1.488226059654631, + "grad_norm": 10.477109909057617, + "learning_rate": 2.667878847572448e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.09375, + "logps/chosen": -448.0, + "logps/rejected": -604.0, + "loss": 0.606, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.40625, + "rewards/margins": 0.38671875, + "rewards/rejected": -1.796875, + "step": 711 + }, + { + "epoch": 1.490319204604919, + "grad_norm": 10.01471996307373, + "learning_rate": 2.662204617723756e-07, + "logits/chosen": 1.9140625, + "logits/rejected": 2.109375, + "logps/chosen": -452.0, + "logps/rejected": -432.0, + "loss": 0.5975, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.2265625, + "rewards/margins": -0.0439453125, + "rewards/rejected": -1.1875, + "step": 712 + }, + { + "epoch": 1.4924123495552066, + "grad_norm": 10.801339149475098, + "learning_rate": 2.656529549756354e-07, + "logits/chosen": 1.1953125, + "logits/rejected": 1.0625, + "logps/chosen": -231.0, + "logps/rejected": -278.0, + "loss": 0.5738, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.359375, + "rewards/margins": 0.0078125, + "rewards/rejected": -1.3671875, + "step": 713 + }, + { + "epoch": 1.4945054945054945, + "grad_norm": 10.13692569732666, + "learning_rate": 2.6508536730388416e-07, + "logits/chosen": 2.0, + "logits/rejected": 2.09375, + "logps/chosen": -380.0, + "logps/rejected": -346.0, + "loss": 0.5886, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1171875, + "rewards/margins": 0.0009765625, + "rewards/rejected": -1.1171875, + "step": 714 + }, + { + "epoch": 1.4965986394557822, + "grad_norm": 11.655537605285645, + "learning_rate": 2.6451770169440085e-07, + "logits/chosen": 2.078125, + "logits/rejected": 2.125, + "logps/chosen": -472.0, + "logps/rejected": -544.0, + "loss": 0.6434, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1796875, + "rewards/margins": 0.380859375, + "rewards/rejected": -1.5625, + "step": 715 + }, + { + "epoch": 1.49869178440607, + "grad_norm": 10.766107559204102, + "learning_rate": 2.639499610848673e-07, + "logits/chosen": 1.203125, + "logits/rejected": 2.03125, + "logps/chosen": -388.0, + "logps/rejected": -286.0, + "loss": 0.5704, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.94921875, + "rewards/margins": 0.71484375, + "rewards/rejected": -1.6640625, + "step": 716 + }, + { + "epoch": 1.500784929356358, + "grad_norm": 12.128816604614258, + "learning_rate": 2.6338214841335364e-07, + "logits/chosen": 2.15625, + "logits/rejected": 2.65625, + "logps/chosen": -348.0, + "logps/rejected": -504.0, + "loss": 0.6176, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8828125, + "rewards/margins": -0.0078125, + "rewards/rejected": -0.875, + "step": 717 + }, + { + "epoch": 1.5028780743066457, + "grad_norm": 9.875317573547363, + "learning_rate": 2.6281426661830295e-07, + "logits/chosen": 1.9140625, + "logits/rejected": 2.375, + "logps/chosen": -424.0, + "logps/rejected": -330.0, + "loss": 0.6105, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.03125, + "rewards/margins": 0.208984375, + "rewards/rejected": -1.2421875, + "step": 718 + }, + { + "epoch": 1.5049712192569336, + "grad_norm": 11.082642555236816, + "learning_rate": 2.622463186385161e-07, + "logits/chosen": 2.25, + "logits/rejected": 2.671875, + "logps/chosen": -572.0, + "logps/rejected": -548.0, + "loss": 0.6359, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.3515625, + "rewards/margins": 0.013671875, + "rewards/rejected": -1.3671875, + "step": 719 + }, + { + "epoch": 1.5070643642072215, + "grad_norm": 11.48105525970459, + "learning_rate": 2.616783074131364e-07, + "logits/chosen": 1.765625, + "logits/rejected": 1.1953125, + "logps/chosen": -186.0, + "logps/rejected": -360.0, + "loss": 0.5563, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.25, + "rewards/margins": 0.375, + "rewards/rejected": -1.625, + "step": 720 + }, + { + "epoch": 1.5091575091575091, + "grad_norm": 11.445796012878418, + "learning_rate": 2.6111023588163445e-07, + "logits/chosen": 1.9765625, + "logits/rejected": 2.453125, + "logps/chosen": -444.0, + "logps/rejected": -376.0, + "loss": 0.5152, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.046875, + "rewards/margins": 0.33203125, + "rewards/rejected": -1.3828125, + "step": 721 + }, + { + "epoch": 1.511250654107797, + "grad_norm": 12.939420700073242, + "learning_rate": 2.6054210698379276e-07, + "logits/chosen": 2.03125, + "logits/rejected": 2.0, + "logps/chosen": -460.0, + "logps/rejected": -340.0, + "loss": 0.6704, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.125, + "rewards/margins": 0.017578125, + "rewards/rejected": -1.1484375, + "step": 722 + }, + { + "epoch": 1.513343799058085, + "grad_norm": 11.441888809204102, + "learning_rate": 2.5997392365969097e-07, + "logits/chosen": 2.375, + "logits/rejected": 1.90625, + "logps/chosen": -302.0, + "logps/rejected": -420.0, + "loss": 0.608, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.12890625, + "rewards/rejected": -1.5625, + "step": 723 + }, + { + "epoch": 1.5154369440083726, + "grad_norm": 10.66491985321045, + "learning_rate": 2.5940568884969035e-07, + "logits/chosen": 1.140625, + "logits/rejected": 1.4609375, + "logps/chosen": -438.0, + "logps/rejected": -386.0, + "loss": 0.5736, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3203125, + "rewards/margins": 0.3828125, + "rewards/rejected": -1.703125, + "step": 724 + }, + { + "epoch": 1.5175300889586603, + "grad_norm": 10.134530067443848, + "learning_rate": 2.5883740549441844e-07, + "logits/chosen": 2.109375, + "logits/rejected": 1.671875, + "logps/chosen": -294.0, + "logps/rejected": -322.0, + "loss": 0.5838, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2578125, + "rewards/margins": 0.029296875, + "rewards/rejected": -1.2890625, + "step": 725 + }, + { + "epoch": 1.5196232339089482, + "grad_norm": 10.396265983581543, + "learning_rate": 2.582690765347542e-07, + "logits/chosen": 2.4375, + "logits/rejected": 2.96875, + "logps/chosen": -808.0, + "logps/rejected": -564.0, + "loss": 0.5766, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6328125, + "rewards/margins": 0.205078125, + "rewards/rejected": -1.8359375, + "step": 726 + }, + { + "epoch": 1.521716378859236, + "grad_norm": 11.76471996307373, + "learning_rate": 2.577007049118125e-07, + "logits/chosen": 2.1875, + "logits/rejected": 2.296875, + "logps/chosen": -276.0, + "logps/rejected": -1012.0, + "loss": 0.6159, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3671875, + "rewards/margins": 0.53125, + "rewards/rejected": -1.8984375, + "step": 727 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 11.774922370910645, + "learning_rate": 2.57132293566929e-07, + "logits/chosen": 2.71875, + "logits/rejected": 2.859375, + "logps/chosen": -712.0, + "logps/rejected": -712.0, + "loss": 0.6371, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.25, + "rewards/margins": 0.87109375, + "rewards/rejected": -2.125, + "step": 728 + }, + { + "epoch": 1.5259026687598116, + "grad_norm": 11.291149139404297, + "learning_rate": 2.565638454416448e-07, + "logits/chosen": 1.9296875, + "logits/rejected": 2.40625, + "logps/chosen": -680.0, + "logps/rejected": -616.0, + "loss": 0.5991, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0390625, + "rewards/margins": 0.8515625, + "rewards/rejected": -1.890625, + "step": 729 + }, + { + "epoch": 1.5279958137100995, + "grad_norm": 10.837662696838379, + "learning_rate": 2.5599536347769157e-07, + "logits/chosen": 1.71875, + "logits/rejected": 1.390625, + "logps/chosen": -616.0, + "logps/rejected": -616.0, + "loss": 0.6112, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.15625, + "rewards/margins": 0.0986328125, + "rewards/rejected": -1.2578125, + "step": 730 + }, + { + "epoch": 1.5300889586603872, + "grad_norm": 10.20396614074707, + "learning_rate": 2.5542685061697595e-07, + "logits/chosen": 2.078125, + "logits/rejected": 2.15625, + "logps/chosen": -680.0, + "logps/rejected": -568.0, + "loss": 0.5881, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9765625, + "rewards/margins": 0.0205078125, + "rewards/rejected": -0.99609375, + "step": 731 + }, + { + "epoch": 1.532182103610675, + "grad_norm": 11.000545501708984, + "learning_rate": 2.548583098015646e-07, + "logits/chosen": 1.96875, + "logits/rejected": 2.078125, + "logps/chosen": -408.0, + "logps/rejected": -600.0, + "loss": 0.582, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3984375, + "rewards/margins": 0.515625, + "rewards/rejected": -1.9140625, + "step": 732 + }, + { + "epoch": 1.534275248560963, + "grad_norm": 11.277151107788086, + "learning_rate": 2.5428974397366856e-07, + "logits/chosen": 1.5703125, + "logits/rejected": 2.15625, + "logps/chosen": -532.0, + "logps/rejected": -494.0, + "loss": 0.6567, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.296875, + "rewards/margins": 0.58984375, + "rewards/rejected": -1.890625, + "step": 733 + }, + { + "epoch": 1.5363683935112507, + "grad_norm": 11.331924438476562, + "learning_rate": 2.537211560756286e-07, + "logits/chosen": 2.75, + "logits/rejected": 2.3125, + "logps/chosen": -430.0, + "logps/rejected": -476.0, + "loss": 0.6137, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1171875, + "rewards/margins": 0.50390625, + "rewards/rejected": -1.625, + "step": 734 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 12.324798583984375, + "learning_rate": 2.531525490498997e-07, + "logits/chosen": 2.25, + "logits/rejected": 3.015625, + "logps/chosen": -720.0, + "logps/rejected": -442.0, + "loss": 0.631, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.6171875, + "rewards/rejected": -1.75, + "step": 735 + }, + { + "epoch": 1.5405546834118262, + "grad_norm": 10.492572784423828, + "learning_rate": 2.525839258390355e-07, + "logits/chosen": 2.46875, + "logits/rejected": 3.21875, + "logps/chosen": -768.0, + "logps/rejected": -608.0, + "loss": 0.5506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.75, + "rewards/margins": 1.3671875, + "rewards/rejected": -2.125, + "step": 736 + }, + { + "epoch": 1.5426478283621141, + "grad_norm": 10.195070266723633, + "learning_rate": 2.520152893856739e-07, + "logits/chosen": 1.1953125, + "logits/rejected": 1.0546875, + "logps/chosen": -298.0, + "logps/rejected": -378.0, + "loss": 0.577, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1796875, + "rewards/margins": 0.33203125, + "rewards/rejected": -1.5078125, + "step": 737 + }, + { + "epoch": 1.5447409733124018, + "grad_norm": 10.100964546203613, + "learning_rate": 2.514466426325209e-07, + "logits/chosen": 1.4765625, + "logits/rejected": 1.953125, + "logps/chosen": -368.0, + "logps/rejected": -368.0, + "loss": 0.5793, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.21875, + "rewards/margins": 0.498046875, + "rewards/rejected": -1.7109375, + "step": 738 + }, + { + "epoch": 1.5468341182626897, + "grad_norm": 10.938796043395996, + "learning_rate": 2.5087798852233593e-07, + "logits/chosen": 1.359375, + "logits/rejected": 1.6171875, + "logps/chosen": -436.0, + "logps/rejected": -358.0, + "loss": 0.6109, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1796875, + "rewards/margins": 0.46484375, + "rewards/rejected": -1.640625, + "step": 739 + }, + { + "epoch": 1.5489272632129776, + "grad_norm": 11.056641578674316, + "learning_rate": 2.503093299979166e-07, + "logits/chosen": 2.65625, + "logits/rejected": 2.875, + "logps/chosen": -552.0, + "logps/rejected": -720.0, + "loss": 0.5651, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.34375, + "rewards/margins": 0.2392578125, + "rewards/rejected": -1.5859375, + "step": 740 + }, + { + "epoch": 1.5510204081632653, + "grad_norm": 10.961803436279297, + "learning_rate": 2.4974067000208334e-07, + "logits/chosen": 2.203125, + "logits/rejected": 2.15625, + "logps/chosen": -468.0, + "logps/rejected": -510.0, + "loss": 0.595, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.34375, + "rewards/margins": 0.03515625, + "rewards/rejected": -1.375, + "step": 741 + }, + { + "epoch": 1.5531135531135531, + "grad_norm": 11.027541160583496, + "learning_rate": 2.491720114776641e-07, + "logits/chosen": 1.15625, + "logits/rejected": 1.3125, + "logps/chosen": -276.0, + "logps/rejected": -324.0, + "loss": 0.6082, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9609375, + "rewards/margins": 0.3125, + "rewards/rejected": -1.2734375, + "step": 742 + }, + { + "epoch": 1.555206698063841, + "grad_norm": 11.360031127929688, + "learning_rate": 2.4860335736747915e-07, + "logits/chosen": 1.9296875, + "logits/rejected": 2.1875, + "logps/chosen": -336.0, + "logps/rejected": -468.0, + "loss": 0.6007, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.015625, + "rewards/margins": 0.365234375, + "rewards/rejected": -1.375, + "step": 743 + }, + { + "epoch": 1.5572998430141287, + "grad_norm": 10.475598335266113, + "learning_rate": 2.480347106143261e-07, + "logits/chosen": 2.140625, + "logits/rejected": 2.953125, + "logps/chosen": -400.0, + "logps/rejected": -412.0, + "loss": 0.5641, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3828125, + "rewards/margins": 0.5078125, + "rewards/rejected": -1.890625, + "step": 744 + }, + { + "epoch": 1.5593929879644164, + "grad_norm": 10.709178924560547, + "learning_rate": 2.474660741609645e-07, + "logits/chosen": 1.1171875, + "logits/rejected": 1.0234375, + "logps/chosen": -234.0, + "logps/rejected": -248.0, + "loss": 0.5916, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.7265625, + "rewards/margins": -0.01953125, + "rewards/rejected": -1.703125, + "step": 745 + }, + { + "epoch": 1.5614861329147045, + "grad_norm": 11.80731201171875, + "learning_rate": 2.468974509501004e-07, + "logits/chosen": 2.0, + "logits/rejected": 1.5703125, + "logps/chosen": -458.0, + "logps/rejected": -412.0, + "loss": 0.6428, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.3125, + "rewards/margins": -0.224609375, + "rewards/rejected": -1.0859375, + "step": 746 + }, + { + "epoch": 1.5635792778649922, + "grad_norm": 11.310405731201172, + "learning_rate": 2.463288439243714e-07, + "logits/chosen": 1.859375, + "logits/rejected": 2.5, + "logps/chosen": -444.0, + "logps/rejected": -324.0, + "loss": 0.617, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.265625, + "rewards/margins": 0.2109375, + "rewards/rejected": -1.4765625, + "step": 747 + }, + { + "epoch": 1.5656724228152799, + "grad_norm": 9.95641040802002, + "learning_rate": 2.457602560263314e-07, + "logits/chosen": 2.1875, + "logits/rejected": 2.21875, + "logps/chosen": -448.0, + "logps/rejected": -552.0, + "loss": 0.5878, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0859375, + "rewards/margins": 0.3203125, + "rewards/rejected": -1.40625, + "step": 748 + }, + { + "epoch": 1.5677655677655677, + "grad_norm": 12.05051040649414, + "learning_rate": 2.451916901984355e-07, + "logits/chosen": 1.2578125, + "logits/rejected": 1.8125, + "logps/chosen": -444.0, + "logps/rejected": -302.0, + "loss": 0.6069, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.92578125, + "rewards/margins": 0.37109375, + "rewards/rejected": -1.296875, + "step": 749 + }, + { + "epoch": 1.5698587127158556, + "grad_norm": 11.73843002319336, + "learning_rate": 2.446231493830241e-07, + "logits/chosen": 2.15625, + "logits/rejected": 2.0, + "logps/chosen": -482.0, + "logps/rejected": -512.0, + "loss": 0.611, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4609375, + "rewards/margins": 0.1826171875, + "rewards/rejected": -1.640625, + "step": 750 + }, + { + "epoch": 1.5719518576661433, + "grad_norm": 11.1854829788208, + "learning_rate": 2.440546365223084e-07, + "logits/chosen": 1.3828125, + "logits/rejected": 1.7109375, + "logps/chosen": -288.0, + "logps/rejected": -318.0, + "loss": 0.5909, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.296875, + "rewards/margins": 0.013671875, + "rewards/rejected": -1.3125, + "step": 751 + }, + { + "epoch": 1.5740450026164312, + "grad_norm": 10.895852088928223, + "learning_rate": 2.4348615455835516e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.765625, + "logps/chosen": -604.0, + "logps/rejected": -632.0, + "loss": 0.6138, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.046875, + "rewards/margins": 0.1845703125, + "rewards/rejected": -1.234375, + "step": 752 + }, + { + "epoch": 1.576138147566719, + "grad_norm": 11.637216567993164, + "learning_rate": 2.42917706433071e-07, + "logits/chosen": 1.6484375, + "logits/rejected": 1.859375, + "logps/chosen": -498.0, + "logps/rejected": -536.0, + "loss": 0.6268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.97265625, + "rewards/margins": 0.5, + "rewards/rejected": -1.46875, + "step": 753 + }, + { + "epoch": 1.5782312925170068, + "grad_norm": 11.550223350524902, + "learning_rate": 2.423492950881875e-07, + "logits/chosen": 2.09375, + "logits/rejected": 1.7109375, + "logps/chosen": -460.0, + "logps/rejected": -664.0, + "loss": 0.6323, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.03125, + "rewards/margins": 0.49609375, + "rewards/rejected": -1.53125, + "step": 754 + }, + { + "epoch": 1.5803244374672945, + "grad_norm": 12.201874732971191, + "learning_rate": 2.417809234652457e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.625, + "logps/chosen": -872.0, + "logps/rejected": -540.0, + "loss": 0.5869, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.28125, + "rewards/margins": 0.38671875, + "rewards/rejected": -1.671875, + "step": 755 + }, + { + "epoch": 1.5824175824175826, + "grad_norm": 11.802955627441406, + "learning_rate": 2.412125945055816e-07, + "logits/chosen": 1.9765625, + "logits/rejected": 2.984375, + "logps/chosen": -652.0, + "logps/rejected": -408.0, + "loss": 0.6082, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0625, + "rewards/margins": 0.43359375, + "rewards/rejected": -1.5, + "step": 756 + }, + { + "epoch": 1.5845107273678702, + "grad_norm": 12.770798683166504, + "learning_rate": 2.406443111503097e-07, + "logits/chosen": 2.15625, + "logits/rejected": 3.140625, + "logps/chosen": -548.0, + "logps/rejected": -500.0, + "loss": 0.6227, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.03125, + "rewards/margins": 0.6171875, + "rewards/rejected": -1.65625, + "step": 757 + }, + { + "epoch": 1.586603872318158, + "grad_norm": 11.293933868408203, + "learning_rate": 2.40076076340309e-07, + "logits/chosen": 2.484375, + "logits/rejected": 2.6875, + "logps/chosen": -776.0, + "logps/rejected": -584.0, + "loss": 0.5771, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.296875, + "rewards/margins": 0.5234375, + "rewards/rejected": -1.8203125, + "step": 758 + }, + { + "epoch": 1.5886970172684458, + "grad_norm": 11.184715270996094, + "learning_rate": 2.3950789301620727e-07, + "logits/chosen": 2.40625, + "logits/rejected": 2.609375, + "logps/chosen": -744.0, + "logps/rejected": -560.0, + "loss": 0.6186, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7734375, + "rewards/margins": 0.03125, + "rewards/rejected": -1.8046875, + "step": 759 + }, + { + "epoch": 1.5907901622187337, + "grad_norm": 10.282952308654785, + "learning_rate": 2.389397641183656e-07, + "logits/chosen": 1.265625, + "logits/rejected": 2.125, + "logps/chosen": -390.0, + "logps/rejected": -388.0, + "loss": 0.5607, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.015625, + "rewards/margins": 0.578125, + "rewards/rejected": -1.59375, + "step": 760 + }, + { + "epoch": 1.5928833071690214, + "grad_norm": 11.480621337890625, + "learning_rate": 2.383716925868636e-07, + "logits/chosen": 2.1875, + "logits/rejected": 2.234375, + "logps/chosen": -440.0, + "logps/rejected": -498.0, + "loss": 0.6184, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6953125, + "rewards/margins": -0.072265625, + "rewards/rejected": -1.625, + "step": 761 + }, + { + "epoch": 1.5949764521193093, + "grad_norm": 11.712589263916016, + "learning_rate": 2.3780368136148381e-07, + "logits/chosen": 1.9296875, + "logits/rejected": 2.515625, + "logps/chosen": -302.0, + "logps/rejected": -228.0, + "loss": 0.6187, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.21875, + "rewards/margins": 0.083984375, + "rewards/rejected": -1.296875, + "step": 762 + }, + { + "epoch": 1.5970695970695972, + "grad_norm": 10.707878112792969, + "learning_rate": 2.37235733381697e-07, + "logits/chosen": 2.265625, + "logits/rejected": 2.0, + "logps/chosen": -272.0, + "logps/rejected": -340.0, + "loss": 0.5598, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.453125, + "rewards/margins": -0.3046875, + "rewards/rejected": -1.15625, + "step": 763 + }, + { + "epoch": 1.5991627420198848, + "grad_norm": 11.1841402053833, + "learning_rate": 2.3666785158664644e-07, + "logits/chosen": 1.265625, + "logits/rejected": 1.1171875, + "logps/chosen": -346.0, + "logps/rejected": -380.0, + "loss": 0.6387, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.671875, + "rewards/margins": 0.142578125, + "rewards/rejected": -1.8125, + "step": 764 + }, + { + "epoch": 1.6012558869701727, + "grad_norm": 11.163543701171875, + "learning_rate": 2.3610003891513274e-07, + "logits/chosen": 2.203125, + "logits/rejected": 2.421875, + "logps/chosen": -640.0, + "logps/rejected": -628.0, + "loss": 0.5559, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9140625, + "rewards/margins": 0.6953125, + "rewards/rejected": -1.609375, + "step": 765 + }, + { + "epoch": 1.6033490319204606, + "grad_norm": 11.096171379089355, + "learning_rate": 2.3553229830559918e-07, + "logits/chosen": 2.078125, + "logits/rejected": 2.375, + "logps/chosen": -580.0, + "logps/rejected": -474.0, + "loss": 0.6042, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.376953125, + "rewards/rejected": -1.8046875, + "step": 766 + }, + { + "epoch": 1.6054421768707483, + "grad_norm": 11.36347770690918, + "learning_rate": 2.3496463269611577e-07, + "logits/chosen": 2.484375, + "logits/rejected": 3.140625, + "logps/chosen": -784.0, + "logps/rejected": -536.0, + "loss": 0.5814, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.9375, + "rewards/margins": 0.07958984375, + "rewards/rejected": -1.015625, + "step": 767 + }, + { + "epoch": 1.607535321821036, + "grad_norm": 10.495102882385254, + "learning_rate": 2.3439704502436462e-07, + "logits/chosen": 1.6796875, + "logits/rejected": 1.796875, + "logps/chosen": -376.0, + "logps/rejected": -552.0, + "loss": 0.5767, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.03125, + "rewards/margins": 0.609375, + "rewards/rejected": -1.640625, + "step": 768 + }, + { + "epoch": 1.6096284667713239, + "grad_norm": 11.483415603637695, + "learning_rate": 2.3382953822762432e-07, + "logits/chosen": 1.78125, + "logits/rejected": 1.140625, + "logps/chosen": -334.0, + "logps/rejected": -592.0, + "loss": 0.6309, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.390625, + "rewards/margins": 0.15234375, + "rewards/rejected": -1.5390625, + "step": 769 + }, + { + "epoch": 1.6117216117216118, + "grad_norm": 10.574986457824707, + "learning_rate": 2.3326211524275515e-07, + "logits/chosen": 2.34375, + "logits/rejected": 1.84375, + "logps/chosen": -462.0, + "logps/rejected": -548.0, + "loss": 0.599, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.453125, + "rewards/margins": 0.0673828125, + "rewards/rejected": -1.5234375, + "step": 770 + }, + { + "epoch": 1.6138147566718994, + "grad_norm": 10.306511878967285, + "learning_rate": 2.3269477900618355e-07, + "logits/chosen": 1.28125, + "logits/rejected": 1.75, + "logps/chosen": -342.0, + "logps/rejected": -412.0, + "loss": 0.5745, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.28125, + "rewards/margins": 0.765625, + "rewards/rejected": -2.046875, + "step": 771 + }, + { + "epoch": 1.6159079016221873, + "grad_norm": 10.39566707611084, + "learning_rate": 2.3212753245388691e-07, + "logits/chosen": 2.0625, + "logits/rejected": 2.375, + "logps/chosen": -640.0, + "logps/rejected": -476.0, + "loss": 0.5766, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.34375, + "rewards/margins": 0.30859375, + "rewards/rejected": -1.65625, + "step": 772 + }, + { + "epoch": 1.6180010465724752, + "grad_norm": 11.270380020141602, + "learning_rate": 2.3156037852137865e-07, + "logits/chosen": 1.5, + "logits/rejected": 1.46875, + "logps/chosen": -510.0, + "logps/rejected": -492.0, + "loss": 0.589, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.6015625, + "rewards/margins": -0.119140625, + "rewards/rejected": -1.484375, + "step": 773 + }, + { + "epoch": 1.620094191522763, + "grad_norm": 10.048487663269043, + "learning_rate": 2.3099332014369287e-07, + "logits/chosen": 2.71875, + "logits/rejected": 2.703125, + "logps/chosen": -500.0, + "logps/rejected": -468.0, + "loss": 0.5616, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.109375, + "rewards/margins": 0.8359375, + "rewards/rejected": -1.9453125, + "step": 774 + }, + { + "epoch": 1.6221873364730508, + "grad_norm": 11.019427299499512, + "learning_rate": 2.3042636025536925e-07, + "logits/chosen": 1.1640625, + "logits/rejected": 0.6953125, + "logps/chosen": -244.0, + "logps/rejected": -402.0, + "loss": 0.5983, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3125, + "rewards/margins": 0.58203125, + "rewards/rejected": -1.890625, + "step": 775 + }, + { + "epoch": 1.6242804814233387, + "grad_norm": 11.41428279876709, + "learning_rate": 2.298595017904375e-07, + "logits/chosen": 2.3125, + "logits/rejected": 1.78125, + "logps/chosen": -452.0, + "logps/rejected": -448.0, + "loss": 0.6019, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.296875, + "rewards/margins": 0.212890625, + "rewards/rejected": -1.5078125, + "step": 776 + }, + { + "epoch": 1.6263736263736264, + "grad_norm": 10.68587875366211, + "learning_rate": 2.292927476824028e-07, + "logits/chosen": 1.6796875, + "logits/rejected": 1.546875, + "logps/chosen": -362.0, + "logps/rejected": -264.0, + "loss": 0.5849, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8671875, + "rewards/margins": 0.5859375, + "rewards/rejected": -1.453125, + "step": 777 + }, + { + "epoch": 1.628466771323914, + "grad_norm": 11.420637130737305, + "learning_rate": 2.287261008642302e-07, + "logits/chosen": 2.078125, + "logits/rejected": 2.875, + "logps/chosen": -476.0, + "logps/rejected": -362.0, + "loss": 0.5739, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.25, + "rewards/rejected": -1.4609375, + "step": 778 + }, + { + "epoch": 1.630559916274202, + "grad_norm": 11.029525756835938, + "learning_rate": 2.2815956426832922e-07, + "logits/chosen": 2.28125, + "logits/rejected": 2.359375, + "logps/chosen": -446.0, + "logps/rejected": -460.0, + "loss": 0.6079, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.484375, + "rewards/margins": -0.34375, + "rewards/rejected": -1.140625, + "step": 779 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 11.36279010772705, + "learning_rate": 2.275931408265393e-07, + "logits/chosen": 2.46875, + "logits/rejected": 2.03125, + "logps/chosen": -270.0, + "logps/rejected": -510.0, + "loss": 0.5937, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.015625, + "rewards/margins": 0.048828125, + "rewards/rejected": -1.0625, + "step": 780 + }, + { + "epoch": 1.6347462061747775, + "grad_norm": 10.862942695617676, + "learning_rate": 2.270268334701143e-07, + "logits/chosen": 2.5625, + "logits/rejected": 2.8125, + "logps/chosen": -784.0, + "logps/rejected": -584.0, + "loss": 0.6022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.84375, + "rewards/margins": 0.86328125, + "rewards/rejected": -1.7109375, + "step": 781 + }, + { + "epoch": 1.6368393511250654, + "grad_norm": 10.789078712463379, + "learning_rate": 2.264606451297072e-07, + "logits/chosen": 2.203125, + "logits/rejected": 3.125, + "logps/chosen": -464.0, + "logps/rejected": -251.0, + "loss": 0.5859, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.203125, + "rewards/margins": 0.13671875, + "rewards/rejected": -1.34375, + "step": 782 + }, + { + "epoch": 1.6389324960753533, + "grad_norm": 10.766769409179688, + "learning_rate": 2.258945787353552e-07, + "logits/chosen": 1.140625, + "logits/rejected": 1.5703125, + "logps/chosen": -492.0, + "logps/rejected": -310.0, + "loss": 0.5794, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.328125, + "rewards/margins": -0.0068359375, + "rewards/rejected": -1.328125, + "step": 783 + }, + { + "epoch": 1.641025641025641, + "grad_norm": 10.560734748840332, + "learning_rate": 2.2532863721646409e-07, + "logits/chosen": 1.7890625, + "logits/rejected": 1.7578125, + "logps/chosen": -448.0, + "logps/rejected": -592.0, + "loss": 0.6047, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2265625, + "rewards/margins": 0.287109375, + "rewards/rejected": -1.515625, + "step": 784 + }, + { + "epoch": 1.6431187859759289, + "grad_norm": 11.903189659118652, + "learning_rate": 2.2476282350179402e-07, + "logits/chosen": 1.5546875, + "logits/rejected": 3.125, + "logps/chosen": -516.0, + "logps/rejected": -296.0, + "loss": 0.6025, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.265625, + "rewards/margins": 0.283203125, + "rewards/rejected": -1.5546875, + "step": 785 + }, + { + "epoch": 1.6452119309262168, + "grad_norm": 11.229724884033203, + "learning_rate": 2.2419714051944323e-07, + "logits/chosen": 1.359375, + "logits/rejected": 1.7890625, + "logps/chosen": -318.0, + "logps/rejected": -370.0, + "loss": 0.6236, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.125, + "rewards/margins": 0.53125, + "rewards/rejected": -1.65625, + "step": 786 + }, + { + "epoch": 1.6473050758765044, + "grad_norm": 11.278830528259277, + "learning_rate": 2.2363159119683352e-07, + "logits/chosen": 1.0859375, + "logits/rejected": 1.8671875, + "logps/chosen": -270.0, + "logps/rejected": -286.0, + "loss": 0.5618, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.984375, + "rewards/margins": 0.1904296875, + "rewards/rejected": -1.1796875, + "step": 787 + }, + { + "epoch": 1.649398220826792, + "grad_norm": 11.758581161499023, + "learning_rate": 2.2306617846069524e-07, + "logits/chosen": 2.40625, + "logits/rejected": 2.96875, + "logps/chosen": -576.0, + "logps/rejected": -408.0, + "loss": 0.6119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.94921875, + "rewards/margins": 0.37890625, + "rewards/rejected": -1.328125, + "step": 788 + }, + { + "epoch": 1.6514913657770802, + "grad_norm": 10.333982467651367, + "learning_rate": 2.2250090523705177e-07, + "logits/chosen": 1.765625, + "logits/rejected": 2.234375, + "logps/chosen": -472.0, + "logps/rejected": -464.0, + "loss": 0.6051, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.96484375, + "rewards/margins": 0.41796875, + "rewards/rejected": -1.3828125, + "step": 789 + }, + { + "epoch": 1.653584510727368, + "grad_norm": 11.33622932434082, + "learning_rate": 2.2193577445120443e-07, + "logits/chosen": 1.8203125, + "logits/rejected": 3.28125, + "logps/chosen": -664.0, + "logps/rejected": -426.0, + "loss": 0.6247, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.421875, + "rewards/margins": 0.099609375, + "rewards/rejected": -1.5234375, + "step": 790 + }, + { + "epoch": 1.6556776556776556, + "grad_norm": 10.776433944702148, + "learning_rate": 2.2137078902771728e-07, + "logits/chosen": 2.25, + "logits/rejected": 2.34375, + "logps/chosen": -284.0, + "logps/rejected": -304.0, + "loss": 0.621, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.34375, + "rewards/margins": -0.06640625, + "rewards/rejected": -1.2734375, + "step": 791 + }, + { + "epoch": 1.6577708006279435, + "grad_norm": 11.183871269226074, + "learning_rate": 2.2080595189040263e-07, + "logits/chosen": 1.0390625, + "logits/rejected": 1.0625, + "logps/chosen": -406.0, + "logps/rejected": -632.0, + "loss": 0.6021, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4375, + "rewards/margins": 1.1015625, + "rewards/rejected": -2.546875, + "step": 792 + }, + { + "epoch": 1.6598639455782314, + "grad_norm": 11.59293270111084, + "learning_rate": 2.2024126596230492e-07, + "logits/chosen": 1.9453125, + "logits/rejected": 1.3203125, + "logps/chosen": -368.0, + "logps/rejected": -490.0, + "loss": 0.6159, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3125, + "rewards/margins": 0.5, + "rewards/rejected": -1.8203125, + "step": 793 + }, + { + "epoch": 1.661957090528519, + "grad_norm": 11.104989051818848, + "learning_rate": 2.196767341656863e-07, + "logits/chosen": 2.578125, + "logits/rejected": 2.4375, + "logps/chosen": -508.0, + "logps/rejected": -824.0, + "loss": 0.566, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.28125, + "rewards/margins": 0.251953125, + "rewards/rejected": -1.53125, + "step": 794 + }, + { + "epoch": 1.664050235478807, + "grad_norm": 11.907608032226562, + "learning_rate": 2.1911235942201115e-07, + "logits/chosen": 1.4375, + "logits/rejected": 1.453125, + "logps/chosen": -368.0, + "logps/rejected": -332.0, + "loss": 0.6334, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.625, + "rewards/rejected": -1.8359375, + "step": 795 + }, + { + "epoch": 1.6661433804290948, + "grad_norm": 10.751801490783691, + "learning_rate": 2.1854814465193132e-07, + "logits/chosen": 2.46875, + "logits/rejected": 2.28125, + "logps/chosen": -362.0, + "logps/rejected": -374.0, + "loss": 0.5655, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.94140625, + "rewards/margins": 0.33984375, + "rewards/rejected": -1.28125, + "step": 796 + }, + { + "epoch": 1.6682365253793825, + "grad_norm": 10.60224437713623, + "learning_rate": 2.1798409277527064e-07, + "logits/chosen": 1.2421875, + "logits/rejected": 1.296875, + "logps/chosen": -588.0, + "logps/rejected": -552.0, + "loss": 0.5637, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.98046875, + "rewards/margins": 0.58984375, + "rewards/rejected": -1.5703125, + "step": 797 + }, + { + "epoch": 1.6703296703296702, + "grad_norm": 10.94421672821045, + "learning_rate": 2.174202067110099e-07, + "logits/chosen": 2.5625, + "logits/rejected": 2.75, + "logps/chosen": -648.0, + "logps/rejected": -624.0, + "loss": 0.595, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.796875, + "rewards/margins": 0.013671875, + "rewards/rejected": -1.8125, + "step": 798 + }, + { + "epoch": 1.6724228152799583, + "grad_norm": 13.429807662963867, + "learning_rate": 2.1685648937727202e-07, + "logits/chosen": 2.0625, + "logits/rejected": 1.6171875, + "logps/chosen": -350.0, + "logps/rejected": -510.0, + "loss": 0.6668, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.1279296875, + "rewards/rejected": -1.328125, + "step": 799 + }, + { + "epoch": 1.674515960230246, + "grad_norm": 10.412588119506836, + "learning_rate": 2.162929436913065e-07, + "logits/chosen": 2.125, + "logits/rejected": 2.078125, + "logps/chosen": -584.0, + "logps/rejected": -498.0, + "loss": 0.5531, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.796875, + "rewards/rejected": -2.0, + "step": 800 + }, + { + "epoch": 1.6766091051805336, + "grad_norm": 10.976048469543457, + "learning_rate": 2.157295725694747e-07, + "logits/chosen": 1.5625, + "logits/rejected": 1.890625, + "logps/chosen": -241.0, + "logps/rejected": -296.0, + "loss": 0.6109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.96875, + "rewards/margins": 0.6171875, + "rewards/rejected": -1.5859375, + "step": 801 + }, + { + "epoch": 1.6787022501308215, + "grad_norm": 11.032082557678223, + "learning_rate": 2.1516637892723453e-07, + "logits/chosen": 1.78125, + "logits/rejected": 2.453125, + "logps/chosen": -362.0, + "logps/rejected": -378.0, + "loss": 0.6015, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1875, + "rewards/margins": 0.44921875, + "rewards/rejected": -1.640625, + "step": 802 + }, + { + "epoch": 1.6807953950811094, + "grad_norm": 10.239130973815918, + "learning_rate": 2.1460336567912553e-07, + "logits/chosen": 2.5625, + "logits/rejected": 3.21875, + "logps/chosen": -492.0, + "logps/rejected": -532.0, + "loss": 0.5695, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.8671875, + "rewards/rejected": -2.015625, + "step": 803 + }, + { + "epoch": 1.682888540031397, + "grad_norm": 11.244982719421387, + "learning_rate": 2.140405357387537e-07, + "logits/chosen": 1.8828125, + "logits/rejected": 2.375, + "logps/chosen": -500.0, + "logps/rejected": -458.0, + "loss": 0.5852, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.171875, + "rewards/margins": 0.296875, + "rewards/rejected": -1.46875, + "step": 804 + }, + { + "epoch": 1.684981684981685, + "grad_norm": 10.452801704406738, + "learning_rate": 2.1347789201877634e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.5, + "logps/chosen": -536.0, + "logps/rejected": -494.0, + "loss": 0.5971, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.96484375, + "rewards/margins": 0.400390625, + "rewards/rejected": -1.3671875, + "step": 805 + }, + { + "epoch": 1.6870748299319729, + "grad_norm": 12.595290184020996, + "learning_rate": 2.1291543743088687e-07, + "logits/chosen": 2.09375, + "logits/rejected": 2.265625, + "logps/chosen": -668.0, + "logps/rejected": -482.0, + "loss": 0.653, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.296875, + "rewards/margins": 0.380859375, + "rewards/rejected": -1.671875, + "step": 806 + }, + { + "epoch": 1.6891679748822606, + "grad_norm": 10.873169898986816, + "learning_rate": 2.1235317488580055e-07, + "logits/chosen": 2.515625, + "logits/rejected": 3.34375, + "logps/chosen": -712.0, + "logps/rejected": -552.0, + "loss": 0.5862, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.265625, + "rewards/margins": -0.224609375, + "rewards/rejected": -1.046875, + "step": 807 + }, + { + "epoch": 1.6912611198325485, + "grad_norm": 12.01298713684082, + "learning_rate": 2.1179110729323816e-07, + "logits/chosen": 0.89453125, + "logits/rejected": 1.3359375, + "logps/chosen": -400.0, + "logps/rejected": -298.0, + "loss": 0.6112, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6796875, + "rewards/margins": -0.1689453125, + "rewards/rejected": -1.515625, + "step": 808 + }, + { + "epoch": 1.6933542647828363, + "grad_norm": 10.95003604888916, + "learning_rate": 2.1122923756191181e-07, + "logits/chosen": 1.4765625, + "logits/rejected": 1.984375, + "logps/chosen": -708.0, + "logps/rejected": -486.0, + "loss": 0.6002, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.46875, + "rewards/margins": 0.30078125, + "rewards/rejected": -1.765625, + "step": 809 + }, + { + "epoch": 1.695447409733124, + "grad_norm": 11.224799156188965, + "learning_rate": 2.1066756859950995e-07, + "logits/chosen": 2.046875, + "logits/rejected": 2.453125, + "logps/chosen": -548.0, + "logps/rejected": -510.0, + "loss": 0.6023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.84765625, + "rewards/margins": 0.63671875, + "rewards/rejected": -1.484375, + "step": 810 + }, + { + "epoch": 1.6975405546834117, + "grad_norm": 13.100383758544922, + "learning_rate": 2.1010610331268168e-07, + "logits/chosen": 2.21875, + "logits/rejected": 2.96875, + "logps/chosen": -520.0, + "logps/rejected": -524.0, + "loss": 0.6265, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.328125, + "rewards/margins": 0.390625, + "rewards/rejected": -1.71875, + "step": 811 + }, + { + "epoch": 1.6996336996336996, + "grad_norm": 10.363519668579102, + "learning_rate": 2.0954484460702233e-07, + "logits/chosen": 1.6171875, + "logits/rejected": 2.015625, + "logps/chosen": -552.0, + "logps/rejected": -480.0, + "loss": 0.6076, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.125, + "rewards/margins": 0.0859375, + "rewards/rejected": -1.2109375, + "step": 812 + }, + { + "epoch": 1.7017268445839875, + "grad_norm": 11.107074737548828, + "learning_rate": 2.0898379538705773e-07, + "logits/chosen": 3.125, + "logits/rejected": 2.484375, + "logps/chosen": -668.0, + "logps/rejected": -960.0, + "loss": 0.5918, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.484375, + "rewards/margins": 0.36328125, + "rewards/rejected": -1.84375, + "step": 813 + }, + { + "epoch": 1.7038199895342752, + "grad_norm": 10.36357307434082, + "learning_rate": 2.0842295855623038e-07, + "logits/chosen": 1.40625, + "logits/rejected": 0.96484375, + "logps/chosen": -308.0, + "logps/rejected": -334.0, + "loss": 0.5855, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0546875, + "rewards/margins": 0.138671875, + "rewards/rejected": -1.1953125, + "step": 814 + }, + { + "epoch": 1.705913134484563, + "grad_norm": 10.832947731018066, + "learning_rate": 2.0786233701688295e-07, + "logits/chosen": 2.03125, + "logits/rejected": 2.046875, + "logps/chosen": -616.0, + "logps/rejected": -548.0, + "loss": 0.5809, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4921875, + "rewards/margins": 0.384765625, + "rewards/rejected": -1.875, + "step": 815 + }, + { + "epoch": 1.708006279434851, + "grad_norm": 11.328465461730957, + "learning_rate": 2.073019336702443e-07, + "logits/chosen": 1.6015625, + "logits/rejected": 1.1953125, + "logps/chosen": -310.0, + "logps/rejected": -334.0, + "loss": 0.6189, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.984375, + "rewards/margins": 0.271484375, + "rewards/rejected": -1.2578125, + "step": 816 + }, + { + "epoch": 1.7100994243851386, + "grad_norm": 10.60431957244873, + "learning_rate": 2.0674175141641406e-07, + "logits/chosen": 2.359375, + "logits/rejected": 2.453125, + "logps/chosen": -446.0, + "logps/rejected": -312.0, + "loss": 0.6431, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1171875, + "rewards/margins": 0.0634765625, + "rewards/rejected": -1.1796875, + "step": 817 + }, + { + "epoch": 1.7121925693354265, + "grad_norm": 10.303520202636719, + "learning_rate": 2.0618179315434778e-07, + "logits/chosen": 2.21875, + "logits/rejected": 2.96875, + "logps/chosen": -660.0, + "logps/rejected": -372.0, + "loss": 0.5334, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.03125, + "rewards/margins": 0.61328125, + "rewards/rejected": -1.640625, + "step": 818 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 10.645200729370117, + "learning_rate": 2.056220617818418e-07, + "logits/chosen": 1.5546875, + "logits/rejected": 2.296875, + "logps/chosen": -380.0, + "logps/rejected": -398.0, + "loss": 0.5867, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.7734375, + "rewards/rejected": -1.984375, + "step": 819 + }, + { + "epoch": 1.716378859236002, + "grad_norm": 10.647467613220215, + "learning_rate": 2.0506256019551813e-07, + "logits/chosen": 1.0078125, + "logits/rejected": 1.484375, + "logps/chosen": -450.0, + "logps/rejected": -416.0, + "loss": 0.5738, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.671875, + "rewards/margins": 0.2265625, + "rewards/rejected": -1.890625, + "step": 820 + }, + { + "epoch": 1.7184720041862898, + "grad_norm": 11.647187232971191, + "learning_rate": 2.0450329129081003e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.828125, + "logps/chosen": -604.0, + "logps/rejected": -504.0, + "loss": 0.642, + "rewards/accuracies": 0.25, + "rewards/chosen": -2.03125, + "rewards/margins": -0.57421875, + "rewards/rejected": -1.453125, + "step": 821 + }, + { + "epoch": 1.7205651491365777, + "grad_norm": 10.969862937927246, + "learning_rate": 2.0394425796194625e-07, + "logits/chosen": 2.046875, + "logits/rejected": 2.6875, + "logps/chosen": -560.0, + "logps/rejected": -446.0, + "loss": 0.5625, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.234375, + "rewards/margins": 0.50390625, + "rewards/rejected": -1.734375, + "step": 822 + }, + { + "epoch": 1.7226582940868655, + "grad_norm": 10.765937805175781, + "learning_rate": 2.0338546310193655e-07, + "logits/chosen": 1.8984375, + "logits/rejected": 1.75, + "logps/chosen": -468.0, + "logps/rejected": -572.0, + "loss": 0.588, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.109375, + "rewards/margins": 0.54296875, + "rewards/rejected": -1.65625, + "step": 823 + }, + { + "epoch": 1.7247514390371532, + "grad_norm": 11.399531364440918, + "learning_rate": 2.0282690960255667e-07, + "logits/chosen": 1.765625, + "logits/rejected": 2.75, + "logps/chosen": -452.0, + "logps/rejected": -452.0, + "loss": 0.6237, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.203125, + "rewards/margins": 0.36328125, + "rewards/rejected": -1.5703125, + "step": 824 + }, + { + "epoch": 1.7268445839874411, + "grad_norm": 11.497965812683105, + "learning_rate": 2.0226860035433326e-07, + "logits/chosen": 2.46875, + "logits/rejected": 2.421875, + "logps/chosen": -556.0, + "logps/rejected": -420.0, + "loss": 0.6331, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2578125, + "rewards/margins": 0.2119140625, + "rewards/rejected": -1.46875, + "step": 825 + }, + { + "epoch": 1.728937728937729, + "grad_norm": 10.845474243164062, + "learning_rate": 2.0171053824652906e-07, + "logits/chosen": 1.984375, + "logits/rejected": 2.1875, + "logps/chosen": -330.0, + "logps/rejected": -490.0, + "loss": 0.5722, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.21875, + "rewards/margins": 0.3359375, + "rewards/rejected": -1.5546875, + "step": 826 + }, + { + "epoch": 1.7310308738880167, + "grad_norm": 11.209230422973633, + "learning_rate": 2.0115272616712755e-07, + "logits/chosen": 2.40625, + "logits/rejected": 3.46875, + "logps/chosen": -824.0, + "logps/rejected": -580.0, + "loss": 0.5757, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3125, + "rewards/margins": 0.431640625, + "rewards/rejected": -1.7421875, + "step": 827 + }, + { + "epoch": 1.7331240188383046, + "grad_norm": 14.437784194946289, + "learning_rate": 2.0059516700281864e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.859375, + "logps/chosen": -856.0, + "logps/rejected": -856.0, + "loss": 0.6504, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.109375, + "rewards/margins": 1.3359375, + "rewards/rejected": -2.4375, + "step": 828 + }, + { + "epoch": 1.7352171637885925, + "grad_norm": 11.006444931030273, + "learning_rate": 2.0003786363898327e-07, + "logits/chosen": 1.6953125, + "logits/rejected": 2.59375, + "logps/chosen": -506.0, + "logps/rejected": -406.0, + "loss": 0.5937, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.03125, + "rewards/margins": 0.5625, + "rewards/rejected": -1.59375, + "step": 829 + }, + { + "epoch": 1.7373103087388801, + "grad_norm": 12.13193130493164, + "learning_rate": 1.9948081895967863e-07, + "logits/chosen": 1.9453125, + "logits/rejected": 2.4375, + "logps/chosen": -548.0, + "logps/rejected": -600.0, + "loss": 0.6022, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4765625, + "rewards/margins": 0.392578125, + "rewards/rejected": -1.875, + "step": 830 + }, + { + "epoch": 1.7394034536891678, + "grad_norm": 11.159494400024414, + "learning_rate": 1.9892403584762313e-07, + "logits/chosen": 1.90625, + "logits/rejected": 1.6328125, + "logps/chosen": -728.0, + "logps/rejected": -588.0, + "loss": 0.6099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.234375, + "rewards/margins": 0.70703125, + "rewards/rejected": -1.9375, + "step": 831 + }, + { + "epoch": 1.741496598639456, + "grad_norm": 11.207942008972168, + "learning_rate": 1.9836751718418172e-07, + "logits/chosen": 1.8046875, + "logits/rejected": 2.046875, + "logps/chosen": -360.0, + "logps/rejected": -196.0, + "loss": 0.6046, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1796875, + "rewards/margins": -0.1640625, + "rewards/rejected": -1.015625, + "step": 832 + }, + { + "epoch": 1.7435897435897436, + "grad_norm": 11.720377922058105, + "learning_rate": 1.978112658493507e-07, + "logits/chosen": 1.71875, + "logits/rejected": 2.25, + "logps/chosen": -728.0, + "logps/rejected": -568.0, + "loss": 0.6135, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.4375, + "rewards/margins": 0.19140625, + "rewards/rejected": -2.625, + "step": 833 + }, + { + "epoch": 1.7456828885400313, + "grad_norm": 11.53357219696045, + "learning_rate": 1.972552847217429e-07, + "logits/chosen": 2.046875, + "logits/rejected": 2.375, + "logps/chosen": -428.0, + "logps/rejected": -386.0, + "loss": 0.6111, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.28125, + "rewards/margins": -0.138671875, + "rewards/rejected": -1.1484375, + "step": 834 + }, + { + "epoch": 1.7477760334903192, + "grad_norm": 11.31391716003418, + "learning_rate": 1.9669957667857292e-07, + "logits/chosen": 1.1484375, + "logits/rejected": 1.40625, + "logps/chosen": -240.0, + "logps/rejected": -224.0, + "loss": 0.6174, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.359375, + "rewards/margins": 0.0400390625, + "rewards/rejected": -1.40625, + "step": 835 + }, + { + "epoch": 1.749869178440607, + "grad_norm": 12.320887565612793, + "learning_rate": 1.9614414459564215e-07, + "logits/chosen": 1.53125, + "logits/rejected": 1.3984375, + "logps/chosen": -350.0, + "logps/rejected": -308.0, + "loss": 0.6136, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4375, + "rewards/margins": 0.14453125, + "rewards/rejected": -1.5859375, + "step": 836 + }, + { + "epoch": 1.7519623233908947, + "grad_norm": 13.618435859680176, + "learning_rate": 1.955889913473238e-07, + "logits/chosen": 1.875, + "logits/rejected": 1.8671875, + "logps/chosen": -294.0, + "logps/rejected": -402.0, + "loss": 0.6388, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4140625, + "rewards/margins": 0.40625, + "rewards/rejected": -1.8203125, + "step": 837 + }, + { + "epoch": 1.7540554683411826, + "grad_norm": 10.987975120544434, + "learning_rate": 1.9503411980654825e-07, + "logits/chosen": 2.125, + "logits/rejected": 1.8046875, + "logps/chosen": -524.0, + "logps/rejected": -486.0, + "loss": 0.6343, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0078125, + "rewards/margins": 0.5234375, + "rewards/rejected": -1.53125, + "step": 838 + }, + { + "epoch": 1.7561486132914705, + "grad_norm": 10.229272842407227, + "learning_rate": 1.9447953284478773e-07, + "logits/chosen": 1.6171875, + "logits/rejected": 2.546875, + "logps/chosen": -446.0, + "logps/rejected": -368.0, + "loss": 0.5864, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.953125, + "rewards/margins": 0.2333984375, + "rewards/rejected": -1.1875, + "step": 839 + }, + { + "epoch": 1.7582417582417582, + "grad_norm": 11.316136360168457, + "learning_rate": 1.939252333320422e-07, + "logits/chosen": 1.25, + "logits/rejected": 1.1796875, + "logps/chosen": -272.0, + "logps/rejected": -468.0, + "loss": 0.5621, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1171875, + "rewards/margins": 1.1953125, + "rewards/rejected": -2.3125, + "step": 840 + }, + { + "epoch": 1.7603349031920459, + "grad_norm": 11.072029113769531, + "learning_rate": 1.9337122413682376e-07, + "logits/chosen": 2.5625, + "logits/rejected": 3.140625, + "logps/chosen": -1168.0, + "logps/rejected": -656.0, + "loss": 0.5701, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.296875, + "rewards/margins": 0.26953125, + "rewards/rejected": -1.5703125, + "step": 841 + }, + { + "epoch": 1.762428048142334, + "grad_norm": 11.808143615722656, + "learning_rate": 1.9281750812614204e-07, + "logits/chosen": 3.125, + "logits/rejected": 2.484375, + "logps/chosen": -572.0, + "logps/rejected": -776.0, + "loss": 0.6283, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3125, + "rewards/margins": 0.32421875, + "rewards/rejected": -1.640625, + "step": 842 + }, + { + "epoch": 1.7645211930926217, + "grad_norm": 11.55233383178711, + "learning_rate": 1.9226408816548979e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.5, + "logps/chosen": -760.0, + "logps/rejected": -704.0, + "loss": 0.5926, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.44140625, + "rewards/rejected": -1.875, + "step": 843 + }, + { + "epoch": 1.7666143380429093, + "grad_norm": 10.958243370056152, + "learning_rate": 1.9171096711882734e-07, + "logits/chosen": 2.0625, + "logits/rejected": 2.296875, + "logps/chosen": -470.0, + "logps/rejected": -426.0, + "loss": 0.5719, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9296875, + "rewards/margins": 0.81640625, + "rewards/rejected": -1.75, + "step": 844 + }, + { + "epoch": 1.7687074829931972, + "grad_norm": 10.45136833190918, + "learning_rate": 1.9115814784856838e-07, + "logits/chosen": 2.265625, + "logits/rejected": 2.828125, + "logps/chosen": -494.0, + "logps/rejected": -460.0, + "loss": 0.6025, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0390625, + "rewards/margins": 0.9375, + "rewards/rejected": -1.9765625, + "step": 845 + }, + { + "epoch": 1.7708006279434851, + "grad_norm": 10.515970230102539, + "learning_rate": 1.9060563321556467e-07, + "logits/chosen": 3.03125, + "logits/rejected": 2.6875, + "logps/chosen": -700.0, + "logps/rejected": -684.0, + "loss": 0.5836, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9609375, + "rewards/margins": 1.015625, + "rewards/rejected": -1.96875, + "step": 846 + }, + { + "epoch": 1.7728937728937728, + "grad_norm": 12.13499927520752, + "learning_rate": 1.9005342607909175e-07, + "logits/chosen": 1.8671875, + "logits/rejected": 1.3203125, + "logps/chosen": -244.0, + "logps/rejected": -354.0, + "loss": 0.6331, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.0859375, + "rewards/rejected": -1.28125, + "step": 847 + }, + { + "epoch": 1.7749869178440607, + "grad_norm": 11.108990669250488, + "learning_rate": 1.8950152929683365e-07, + "logits/chosen": 1.453125, + "logits/rejected": 1.828125, + "logps/chosen": -306.0, + "logps/rejected": -262.0, + "loss": 0.6347, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.234375, + "rewards/margins": 0.0, + "rewards/rejected": -1.234375, + "step": 848 + }, + { + "epoch": 1.7770800627943486, + "grad_norm": 10.588078498840332, + "learning_rate": 1.8894994572486834e-07, + "logits/chosen": 1.5546875, + "logits/rejected": 2.046875, + "logps/chosen": -414.0, + "logps/rejected": -576.0, + "loss": 0.5927, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0234375, + "rewards/margins": 0.53515625, + "rewards/rejected": -1.5546875, + "step": 849 + }, + { + "epoch": 1.7791732077446363, + "grad_norm": 10.220610618591309, + "learning_rate": 1.8839867821765289e-07, + "logits/chosen": 2.625, + "logits/rejected": 2.71875, + "logps/chosen": -1128.0, + "logps/rejected": -656.0, + "loss": 0.5764, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.279296875, + "rewards/rejected": -1.46875, + "step": 850 + }, + { + "epoch": 1.7812663526949242, + "grad_norm": 10.536320686340332, + "learning_rate": 1.8784772962800886e-07, + "logits/chosen": 2.6875, + "logits/rejected": 2.265625, + "logps/chosen": -298.0, + "logps/rejected": -576.0, + "loss": 0.6031, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1015625, + "rewards/margins": 1.0, + "rewards/rejected": -2.09375, + "step": 851 + }, + { + "epoch": 1.783359497645212, + "grad_norm": 10.931295394897461, + "learning_rate": 1.8729710280710732e-07, + "logits/chosen": 1.96875, + "logits/rejected": 2.15625, + "logps/chosen": -474.0, + "logps/rejected": -418.0, + "loss": 0.6327, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.15625, + "rewards/margins": 0.060546875, + "rewards/rejected": -1.2109375, + "step": 852 + }, + { + "epoch": 1.7854526425954997, + "grad_norm": 10.23117446899414, + "learning_rate": 1.867468006044541e-07, + "logits/chosen": 2.796875, + "logits/rejected": 3.125, + "logps/chosen": -948.0, + "logps/rejected": -948.0, + "loss": 0.5796, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2890625, + "rewards/margins": 0.625, + "rewards/rejected": -1.90625, + "step": 853 + }, + { + "epoch": 1.7875457875457874, + "grad_norm": 10.816021919250488, + "learning_rate": 1.8619682586787537e-07, + "logits/chosen": 1.4140625, + "logits/rejected": 1.8828125, + "logps/chosen": -628.0, + "logps/rejected": -544.0, + "loss": 0.5762, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3125, + "rewards/margins": 0.546875, + "rewards/rejected": -1.859375, + "step": 854 + }, + { + "epoch": 1.7896389324960753, + "grad_norm": 11.199675559997559, + "learning_rate": 1.8564718144350244e-07, + "logits/chosen": 2.25, + "logits/rejected": 3.671875, + "logps/chosen": -760.0, + "logps/rejected": -480.0, + "loss": 0.5731, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3046875, + "rewards/margins": 0.21484375, + "rewards/rejected": -1.515625, + "step": 855 + }, + { + "epoch": 1.7917320774463632, + "grad_norm": 10.781134605407715, + "learning_rate": 1.850978701757572e-07, + "logits/chosen": 2.390625, + "logits/rejected": 2.953125, + "logps/chosen": -732.0, + "logps/rejected": -332.0, + "loss": 0.5674, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.390625, + "rewards/margins": -0.0244140625, + "rewards/rejected": -1.3671875, + "step": 856 + }, + { + "epoch": 1.7938252223966509, + "grad_norm": 10.734904289245605, + "learning_rate": 1.8454889490733757e-07, + "logits/chosen": 1.8203125, + "logits/rejected": 1.9453125, + "logps/chosen": -596.0, + "logps/rejected": -440.0, + "loss": 0.5771, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2890625, + "rewards/margins": -0.1591796875, + "rewards/rejected": -1.1328125, + "step": 857 + }, + { + "epoch": 1.7959183673469388, + "grad_norm": 10.833751678466797, + "learning_rate": 1.840002584792027e-07, + "logits/chosen": 1.3046875, + "logits/rejected": 2.1875, + "logps/chosen": -418.0, + "logps/rejected": -436.0, + "loss": 0.5985, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.578125, + "rewards/margins": 0.2890625, + "rewards/rejected": -1.8671875, + "step": 858 + }, + { + "epoch": 1.7980115122972267, + "grad_norm": 10.765353202819824, + "learning_rate": 1.8345196373055826e-07, + "logits/chosen": 1.375, + "logits/rejected": 1.4296875, + "logps/chosen": -612.0, + "logps/rejected": -342.0, + "loss": 0.5849, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.9296875, + "rewards/margins": -0.6796875, + "rewards/rejected": -1.2421875, + "step": 859 + }, + { + "epoch": 1.8001046572475143, + "grad_norm": 10.382110595703125, + "learning_rate": 1.8290401349884158e-07, + "logits/chosen": 2.109375, + "logits/rejected": 2.671875, + "logps/chosen": -492.0, + "logps/rejected": -326.0, + "loss": 0.5628, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2265625, + "rewards/margins": 0.375, + "rewards/rejected": -1.6015625, + "step": 860 + }, + { + "epoch": 1.8021978021978022, + "grad_norm": 10.998440742492676, + "learning_rate": 1.8235641061970693e-07, + "logits/chosen": 2.5, + "logits/rejected": 1.71875, + "logps/chosen": -320.0, + "logps/rejected": -536.0, + "loss": 0.585, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.7734375, + "rewards/rejected": -2.203125, + "step": 861 + }, + { + "epoch": 1.8042909471480901, + "grad_norm": 10.22006607055664, + "learning_rate": 1.8180915792701165e-07, + "logits/chosen": 1.5390625, + "logits/rejected": 1.7734375, + "logps/chosen": -616.0, + "logps/rejected": -280.0, + "loss": 0.6156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7578125, + "rewards/margins": 0.30078125, + "rewards/rejected": -1.0625, + "step": 862 + }, + { + "epoch": 1.8063840920983778, + "grad_norm": 11.575730323791504, + "learning_rate": 1.8126225825280022e-07, + "logits/chosen": 1.9609375, + "logits/rejected": 2.28125, + "logps/chosen": -544.0, + "logps/rejected": -438.0, + "loss": 0.6018, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.21875, + "rewards/margins": 0.2890625, + "rewards/rejected": -1.5, + "step": 863 + }, + { + "epoch": 1.8084772370486655, + "grad_norm": 10.908926010131836, + "learning_rate": 1.807157144272905e-07, + "logits/chosen": 1.78125, + "logits/rejected": 2.21875, + "logps/chosen": -386.0, + "logps/rejected": -402.0, + "loss": 0.5911, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0625, + "rewards/margins": 0.2060546875, + "rewards/rejected": -1.265625, + "step": 864 + }, + { + "epoch": 1.8105703819989536, + "grad_norm": 11.957222938537598, + "learning_rate": 1.8016952927885893e-07, + "logits/chosen": 2.28125, + "logits/rejected": 2.125, + "logps/chosen": -568.0, + "logps/rejected": -676.0, + "loss": 0.6458, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.578125, + "rewards/margins": 0.31640625, + "rewards/rejected": -1.890625, + "step": 865 + }, + { + "epoch": 1.8126635269492413, + "grad_norm": 10.530887603759766, + "learning_rate": 1.7962370563402566e-07, + "logits/chosen": 1.3359375, + "logits/rejected": 2.109375, + "logps/chosen": -390.0, + "logps/rejected": -246.0, + "loss": 0.5985, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.96875, + "rewards/margins": 0.09375, + "rewards/rejected": -1.0625, + "step": 866 + }, + { + "epoch": 1.814756671899529, + "grad_norm": 10.46704387664795, + "learning_rate": 1.7907824631744e-07, + "logits/chosen": 2.375, + "logits/rejected": 1.953125, + "logps/chosen": -544.0, + "logps/rejected": -450.0, + "loss": 0.5895, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.03125, + "rewards/margins": 0.6328125, + "rewards/rejected": -1.65625, + "step": 867 + }, + { + "epoch": 1.8168498168498168, + "grad_norm": 11.886927604675293, + "learning_rate": 1.7853315415186579e-07, + "logits/chosen": 1.7734375, + "logits/rejected": 1.78125, + "logps/chosen": -508.0, + "logps/rejected": -394.0, + "loss": 0.6206, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1796875, + "rewards/margins": 0.482421875, + "rewards/rejected": -1.6640625, + "step": 868 + }, + { + "epoch": 1.8189429618001047, + "grad_norm": 10.469581604003906, + "learning_rate": 1.779884319581673e-07, + "logits/chosen": 1.90625, + "logits/rejected": 1.875, + "logps/chosen": -440.0, + "logps/rejected": -456.0, + "loss": 0.5681, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.63671875, + "rewards/margins": 0.55078125, + "rewards/rejected": -1.1875, + "step": 869 + }, + { + "epoch": 1.8210361067503924, + "grad_norm": 10.928384780883789, + "learning_rate": 1.7744408255529361e-07, + "logits/chosen": 1.34375, + "logits/rejected": 2.265625, + "logps/chosen": -580.0, + "logps/rejected": -478.0, + "loss": 0.5957, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4921875, + "rewards/margins": 0.310546875, + "rewards/rejected": -1.8046875, + "step": 870 + }, + { + "epoch": 1.8231292517006803, + "grad_norm": 11.822400093078613, + "learning_rate": 1.7690010876026495e-07, + "logits/chosen": 2.28125, + "logits/rejected": 2.21875, + "logps/chosen": -552.0, + "logps/rejected": -442.0, + "loss": 0.642, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.515625, + "rewards/margins": -0.140625, + "rewards/rejected": -1.375, + "step": 871 + }, + { + "epoch": 1.8252223966509682, + "grad_norm": 11.517511367797852, + "learning_rate": 1.7635651338815767e-07, + "logits/chosen": 1.4921875, + "logits/rejected": 1.53125, + "logps/chosen": -350.0, + "logps/rejected": -320.0, + "loss": 0.5991, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.328125, + "rewards/margins": 0.29296875, + "rewards/rejected": -1.6171875, + "step": 872 + }, + { + "epoch": 1.8273155416012559, + "grad_norm": 11.321386337280273, + "learning_rate": 1.758132992520898e-07, + "logits/chosen": 0.82421875, + "logits/rejected": 0.80078125, + "logps/chosen": -312.0, + "logps/rejected": -296.0, + "loss": 0.5651, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1796875, + "rewards/margins": -0.001953125, + "rewards/rejected": -1.171875, + "step": 873 + }, + { + "epoch": 1.8294086865515435, + "grad_norm": 10.976082801818848, + "learning_rate": 1.7527046916320643e-07, + "logits/chosen": 1.4296875, + "logits/rejected": 2.296875, + "logps/chosen": -620.0, + "logps/rejected": -492.0, + "loss": 0.6224, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4140625, + "rewards/margins": 0.09033203125, + "rewards/rejected": -1.5078125, + "step": 874 + }, + { + "epoch": 1.8315018315018317, + "grad_norm": 10.818897247314453, + "learning_rate": 1.7472802593066518e-07, + "logits/chosen": 1.6796875, + "logits/rejected": 1.5859375, + "logps/chosen": -446.0, + "logps/rejected": -418.0, + "loss": 0.625, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.453125, + "rewards/margins": 0.3046875, + "rewards/rejected": -1.765625, + "step": 875 + }, + { + "epoch": 1.8335949764521193, + "grad_norm": 10.937468528747559, + "learning_rate": 1.7418597236162187e-07, + "logits/chosen": 1.5859375, + "logits/rejected": 2.0625, + "logps/chosen": -448.0, + "logps/rejected": -988.0, + "loss": 0.6065, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8671875, + "rewards/margins": 1.140625, + "rewards/rejected": -3.0, + "step": 876 + }, + { + "epoch": 1.835688121402407, + "grad_norm": 11.407981872558594, + "learning_rate": 1.7364431126121546e-07, + "logits/chosen": 1.21875, + "logits/rejected": 1.921875, + "logps/chosen": -292.0, + "logps/rejected": -201.0, + "loss": 0.6084, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.09375, + "rewards/margins": 0.09375, + "rewards/rejected": -1.1875, + "step": 877 + }, + { + "epoch": 1.837781266352695, + "grad_norm": 9.613057136535645, + "learning_rate": 1.7310304543255417e-07, + "logits/chosen": 2.140625, + "logits/rejected": 2.21875, + "logps/chosen": -584.0, + "logps/rejected": -384.0, + "loss": 0.5748, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.359375, + "rewards/margins": 0.40625, + "rewards/rejected": -1.765625, + "step": 878 + }, + { + "epoch": 1.8398744113029828, + "grad_norm": 11.72396469116211, + "learning_rate": 1.7256217767670046e-07, + "logits/chosen": 1.9609375, + "logits/rejected": 2.078125, + "logps/chosen": -498.0, + "logps/rejected": -576.0, + "loss": 0.6049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.03125, + "rewards/margins": 1.03125, + "rewards/rejected": -2.0625, + "step": 879 + }, + { + "epoch": 1.8419675562532705, + "grad_norm": 11.825212478637695, + "learning_rate": 1.7202171079265702e-07, + "logits/chosen": 2.25, + "logits/rejected": 1.640625, + "logps/chosen": -396.0, + "logps/rejected": -408.0, + "loss": 0.6028, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1875, + "rewards/margins": 0.20703125, + "rewards/rejected": -1.390625, + "step": 880 + }, + { + "epoch": 1.8440607012035584, + "grad_norm": 13.535147666931152, + "learning_rate": 1.7148164757735178e-07, + "logits/chosen": 1.4296875, + "logits/rejected": 1.9140625, + "logps/chosen": -492.0, + "logps/rejected": -450.0, + "loss": 0.6377, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.765625, + "rewards/margins": 0.515625, + "rewards/rejected": -1.28125, + "step": 881 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 10.785517692565918, + "learning_rate": 1.7094199082562378e-07, + "logits/chosen": 1.3125, + "logits/rejected": 2.203125, + "logps/chosen": -374.0, + "logps/rejected": -320.0, + "loss": 0.6003, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.91796875, + "rewards/margins": 0.244140625, + "rewards/rejected": -1.1640625, + "step": 882 + }, + { + "epoch": 1.848246991104134, + "grad_norm": 10.35688591003418, + "learning_rate": 1.7040274333020858e-07, + "logits/chosen": 1.4296875, + "logits/rejected": 1.8203125, + "logps/chosen": -616.0, + "logps/rejected": -468.0, + "loss": 0.5577, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.29296875, + "rewards/rejected": -1.4296875, + "step": 883 + }, + { + "epoch": 1.8503401360544216, + "grad_norm": 10.81713581085205, + "learning_rate": 1.6986390788172395e-07, + "logits/chosen": 1.5234375, + "logits/rejected": 1.8984375, + "logps/chosen": -322.0, + "logps/rejected": -320.0, + "loss": 0.5617, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.234375, + "rewards/margins": 0.2490234375, + "rewards/rejected": -1.484375, + "step": 884 + }, + { + "epoch": 1.8524332810047097, + "grad_norm": 12.084059715270996, + "learning_rate": 1.6932548726865504e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.828125, + "logps/chosen": -756.0, + "logps/rejected": -904.0, + "loss": 0.6348, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5390625, + "rewards/margins": 0.78125, + "rewards/rejected": -2.3125, + "step": 885 + }, + { + "epoch": 1.8545264259549974, + "grad_norm": 11.9383544921875, + "learning_rate": 1.687874842773403e-07, + "logits/chosen": 1.8359375, + "logits/rejected": 2.875, + "logps/chosen": -528.0, + "logps/rejected": -392.0, + "loss": 0.6028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.75, + "rewards/margins": 0.6875, + "rewards/rejected": -1.4375, + "step": 886 + }, + { + "epoch": 1.856619570905285, + "grad_norm": 11.799210548400879, + "learning_rate": 1.682499016919573e-07, + "logits/chosen": 1.6484375, + "logits/rejected": 1.5703125, + "logps/chosen": -340.0, + "logps/rejected": -364.0, + "loss": 0.6263, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.296875, + "rewards/margins": 0.3671875, + "rewards/rejected": -1.6640625, + "step": 887 + }, + { + "epoch": 1.858712715855573, + "grad_norm": 10.321686744689941, + "learning_rate": 1.6771274229450764e-07, + "logits/chosen": 2.171875, + "logits/rejected": 2.09375, + "logps/chosen": -624.0, + "logps/rejected": -600.0, + "loss": 0.5805, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0546875, + "rewards/margins": 0.46484375, + "rewards/rejected": -1.5234375, + "step": 888 + }, + { + "epoch": 1.8608058608058609, + "grad_norm": 11.320591926574707, + "learning_rate": 1.6717600886480297e-07, + "logits/chosen": 1.7734375, + "logits/rejected": 2.546875, + "logps/chosen": -612.0, + "logps/rejected": -792.0, + "loss": 0.6008, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.5, + "rewards/margins": -0.0703125, + "rewards/rejected": -1.421875, + "step": 889 + }, + { + "epoch": 1.8628990057561485, + "grad_norm": 11.110066413879395, + "learning_rate": 1.6663970418045052e-07, + "logits/chosen": 1.6953125, + "logits/rejected": 2.375, + "logps/chosen": -552.0, + "logps/rejected": -414.0, + "loss": 0.5938, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5, + "rewards/margins": 0.140625, + "rewards/rejected": -1.640625, + "step": 890 + }, + { + "epoch": 1.8649921507064364, + "grad_norm": 12.742599487304688, + "learning_rate": 1.6610383101683913e-07, + "logits/chosen": 1.9140625, + "logits/rejected": 1.6328125, + "logps/chosen": -314.0, + "logps/rejected": -608.0, + "loss": 0.6081, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.234375, + "rewards/margins": 0.47265625, + "rewards/rejected": -1.7109375, + "step": 891 + }, + { + "epoch": 1.8670852956567243, + "grad_norm": 10.694188117980957, + "learning_rate": 1.6556839214712397e-07, + "logits/chosen": 1.6875, + "logits/rejected": 2.109375, + "logps/chosen": -444.0, + "logps/rejected": -458.0, + "loss": 0.5673, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.140625, + "rewards/margins": 0.80078125, + "rewards/rejected": -1.9375, + "step": 892 + }, + { + "epoch": 1.869178440607012, + "grad_norm": 11.569079399108887, + "learning_rate": 1.6503339034221296e-07, + "logits/chosen": 1.703125, + "logits/rejected": 1.5234375, + "logps/chosen": -592.0, + "logps/rejected": -744.0, + "loss": 0.6228, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3203125, + "rewards/margins": 0.33203125, + "rewards/rejected": -1.65625, + "step": 893 + }, + { + "epoch": 1.8712715855572999, + "grad_norm": 11.102907180786133, + "learning_rate": 1.644988283707524e-07, + "logits/chosen": 2.5625, + "logits/rejected": 2.203125, + "logps/chosen": -504.0, + "logps/rejected": -656.0, + "loss": 0.5926, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.21875, + "rewards/margins": 0.87109375, + "rewards/rejected": -2.09375, + "step": 894 + }, + { + "epoch": 1.8733647305075878, + "grad_norm": 10.342639923095703, + "learning_rate": 1.639647089991121e-07, + "logits/chosen": 2.21875, + "logits/rejected": 2.59375, + "logps/chosen": -348.0, + "logps/rejected": -376.0, + "loss": 0.5621, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.236328125, + "rewards/rejected": -1.4296875, + "step": 895 + }, + { + "epoch": 1.8754578754578755, + "grad_norm": 11.535297393798828, + "learning_rate": 1.6343103499137167e-07, + "logits/chosen": 1.671875, + "logits/rejected": 1.90625, + "logps/chosen": -336.0, + "logps/rejected": -390.0, + "loss": 0.6047, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.703125, + "rewards/margins": 0.48828125, + "rewards/rejected": -2.1875, + "step": 896 + }, + { + "epoch": 1.8775510204081631, + "grad_norm": 10.489069938659668, + "learning_rate": 1.628978091093056e-07, + "logits/chosen": 2.09375, + "logits/rejected": 2.609375, + "logps/chosen": -788.0, + "logps/rejected": -536.0, + "loss": 0.5512, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.359375, + "rewards/margins": 0.1796875, + "rewards/rejected": -1.5390625, + "step": 897 + }, + { + "epoch": 1.879644165358451, + "grad_norm": 11.661781311035156, + "learning_rate": 1.6236503411236996e-07, + "logits/chosen": 2.15625, + "logits/rejected": 2.46875, + "logps/chosen": -358.0, + "logps/rejected": -376.0, + "loss": 0.5957, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.158203125, + "rewards/rejected": -1.3671875, + "step": 898 + }, + { + "epoch": 1.881737310308739, + "grad_norm": 11.3607816696167, + "learning_rate": 1.6183271275768678e-07, + "logits/chosen": 1.9765625, + "logits/rejected": 1.953125, + "logps/chosen": -330.0, + "logps/rejected": -342.0, + "loss": 0.6138, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.234375, + "rewards/margins": 0.30859375, + "rewards/rejected": -1.546875, + "step": 899 + }, + { + "epoch": 1.8838304552590266, + "grad_norm": 10.10186767578125, + "learning_rate": 1.6130084780003093e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.09375, + "logps/chosen": -960.0, + "logps/rejected": -948.0, + "loss": 0.5809, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5, + "rewards/margins": 0.056640625, + "rewards/rejected": -1.5625, + "step": 900 + }, + { + "epoch": 1.8859236002093145, + "grad_norm": 11.025842666625977, + "learning_rate": 1.607694419918151e-07, + "logits/chosen": 1.7890625, + "logits/rejected": 1.96875, + "logps/chosen": -652.0, + "logps/rejected": -684.0, + "loss": 0.5718, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8203125, + "rewards/margins": 0.68359375, + "rewards/rejected": -2.5, + "step": 901 + }, + { + "epoch": 1.8880167451596024, + "grad_norm": 10.691457748413086, + "learning_rate": 1.602384980830762e-07, + "logits/chosen": 2.34375, + "logits/rejected": 1.5390625, + "logps/chosen": -460.0, + "logps/rejected": -402.0, + "loss": 0.5914, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.015625, + "rewards/margins": 0.7578125, + "rewards/rejected": -1.7734375, + "step": 902 + }, + { + "epoch": 1.89010989010989, + "grad_norm": 10.016210556030273, + "learning_rate": 1.597080188214607e-07, + "logits/chosen": 1.390625, + "logits/rejected": 2.40625, + "logps/chosen": -380.0, + "logps/rejected": -376.0, + "loss": 0.559, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.98046875, + "rewards/margins": 0.173828125, + "rewards/rejected": -1.15625, + "step": 903 + }, + { + "epoch": 1.892203035060178, + "grad_norm": 12.242632865905762, + "learning_rate": 1.5917800695221019e-07, + "logits/chosen": 2.203125, + "logits/rejected": 2.65625, + "logps/chosen": -516.0, + "logps/rejected": -368.0, + "loss": 0.605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9765625, + "rewards/margins": 0.359375, + "rewards/rejected": -1.3359375, + "step": 904 + }, + { + "epoch": 1.8942961800104658, + "grad_norm": 11.27692699432373, + "learning_rate": 1.5864846521814807e-07, + "logits/chosen": 1.671875, + "logits/rejected": 1.5859375, + "logps/chosen": -286.0, + "logps/rejected": -584.0, + "loss": 0.6068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.15625, + "rewards/margins": 0.578125, + "rewards/rejected": -1.734375, + "step": 905 + }, + { + "epoch": 1.8963893249607535, + "grad_norm": 10.626609802246094, + "learning_rate": 1.5811939635966424e-07, + "logits/chosen": 1.8671875, + "logits/rejected": 2.4375, + "logps/chosen": -436.0, + "logps/rejected": -272.0, + "loss": 0.5666, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.99609375, + "rewards/margins": 0.53125, + "rewards/rejected": -1.5234375, + "step": 906 + }, + { + "epoch": 1.8984824699110412, + "grad_norm": 10.588828086853027, + "learning_rate": 1.5759080311470184e-07, + "logits/chosen": 1.921875, + "logits/rejected": 1.3671875, + "logps/chosen": -470.0, + "logps/rejected": -510.0, + "loss": 0.6039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.39453125, + "rewards/rejected": -1.59375, + "step": 907 + }, + { + "epoch": 1.9005756148613293, + "grad_norm": 10.58249568939209, + "learning_rate": 1.570626882187423e-07, + "logits/chosen": 1.671875, + "logits/rejected": 1.75, + "logps/chosen": -230.0, + "logps/rejected": -360.0, + "loss": 0.5565, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3515625, + "rewards/margins": 0.578125, + "rewards/rejected": -1.921875, + "step": 908 + }, + { + "epoch": 1.902668759811617, + "grad_norm": 11.328306198120117, + "learning_rate": 1.5653505440479215e-07, + "logits/chosen": 2.703125, + "logits/rejected": 2.71875, + "logps/chosen": -832.0, + "logps/rejected": -584.0, + "loss": 0.6241, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9453125, + "rewards/margins": 0.41015625, + "rewards/rejected": -1.359375, + "step": 909 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 12.08849811553955, + "learning_rate": 1.5600790440336784e-07, + "logits/chosen": 2.140625, + "logits/rejected": 2.296875, + "logps/chosen": -596.0, + "logps/rejected": -576.0, + "loss": 0.6246, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0390625, + "rewards/margins": 0.44921875, + "rewards/rejected": -1.4921875, + "step": 910 + }, + { + "epoch": 1.9068550497121926, + "grad_norm": 10.502798080444336, + "learning_rate": 1.554812409424822e-07, + "logits/chosen": 2.21875, + "logits/rejected": 3.71875, + "logps/chosen": -736.0, + "logps/rejected": -632.0, + "loss": 0.5988, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6171875, + "rewards/margins": 0.416015625, + "rewards/rejected": -2.03125, + "step": 911 + }, + { + "epoch": 1.9089481946624804, + "grad_norm": 11.264755249023438, + "learning_rate": 1.5495506674763014e-07, + "logits/chosen": 1.359375, + "logits/rejected": 1.8125, + "logps/chosen": -228.0, + "logps/rejected": -388.0, + "loss": 0.5653, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1875, + "rewards/margins": 0.75, + "rewards/rejected": -1.9375, + "step": 912 + }, + { + "epoch": 1.9110413396127681, + "grad_norm": 14.091341972351074, + "learning_rate": 1.544293845417749e-07, + "logits/chosen": 1.6171875, + "logits/rejected": 1.953125, + "logps/chosen": -592.0, + "logps/rejected": -284.0, + "loss": 0.6636, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7109375, + "rewards/margins": -0.28125, + "rewards/rejected": -1.4375, + "step": 913 + }, + { + "epoch": 1.913134484563056, + "grad_norm": 10.890253067016602, + "learning_rate": 1.5390419704533341e-07, + "logits/chosen": 2.671875, + "logits/rejected": 3.359375, + "logps/chosen": -800.0, + "logps/rejected": -776.0, + "loss": 0.5613, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.796875, + "rewards/margins": 0.484375, + "rewards/rejected": -2.28125, + "step": 914 + }, + { + "epoch": 1.915227629513344, + "grad_norm": 11.80103588104248, + "learning_rate": 1.5337950697616237e-07, + "logits/chosen": 1.53125, + "logits/rejected": 2.921875, + "logps/chosen": -552.0, + "logps/rejected": -580.0, + "loss": 0.5861, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0, + "rewards/margins": 0.5234375, + "rewards/rejected": -1.5234375, + "step": 915 + }, + { + "epoch": 1.9173207744636316, + "grad_norm": 11.781697273254395, + "learning_rate": 1.5285531704954466e-07, + "logits/chosen": 1.96875, + "logits/rejected": 2.265625, + "logps/chosen": -308.0, + "logps/rejected": -296.0, + "loss": 0.6292, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.03125, + "rewards/margins": 0.4296875, + "rewards/rejected": -1.4609375, + "step": 916 + }, + { + "epoch": 1.9194139194139193, + "grad_norm": 11.060654640197754, + "learning_rate": 1.5233162997817455e-07, + "logits/chosen": 2.4375, + "logits/rejected": 2.125, + "logps/chosen": -302.0, + "logps/rejected": -460.0, + "loss": 0.5788, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.453125, + "rewards/margins": 0.23046875, + "rewards/rejected": -1.6875, + "step": 917 + }, + { + "epoch": 1.9215070643642074, + "grad_norm": 10.4055814743042, + "learning_rate": 1.5180844847214423e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.25, + "logps/chosen": -816.0, + "logps/rejected": -486.0, + "loss": 0.5728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9765625, + "rewards/margins": 0.68359375, + "rewards/rejected": -1.65625, + "step": 918 + }, + { + "epoch": 1.923600209314495, + "grad_norm": 10.363648414611816, + "learning_rate": 1.5128577523892936e-07, + "logits/chosen": 1.84375, + "logits/rejected": 1.7734375, + "logps/chosen": -302.0, + "logps/rejected": -264.0, + "loss": 0.5987, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3671875, + "rewards/margins": -0.1220703125, + "rewards/rejected": -1.2421875, + "step": 919 + }, + { + "epoch": 1.9256933542647827, + "grad_norm": 11.018433570861816, + "learning_rate": 1.5076361298337561e-07, + "logits/chosen": 2.40625, + "logits/rejected": 2.421875, + "logps/chosen": -544.0, + "logps/rejected": -452.0, + "loss": 0.5838, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.09375, + "rewards/margins": 0.09765625, + "rewards/rejected": -1.1875, + "step": 920 + }, + { + "epoch": 1.9277864992150706, + "grad_norm": 11.115631103515625, + "learning_rate": 1.50241964407684e-07, + "logits/chosen": 1.5078125, + "logits/rejected": 1.640625, + "logps/chosen": -388.0, + "logps/rejected": -524.0, + "loss": 0.5934, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5625, + "rewards/margins": 0.345703125, + "rewards/rejected": -1.90625, + "step": 921 + }, + { + "epoch": 1.9298796441653585, + "grad_norm": 11.797347068786621, + "learning_rate": 1.4972083221139747e-07, + "logits/chosen": 2.46875, + "logits/rejected": 2.234375, + "logps/chosen": -616.0, + "logps/rejected": -512.0, + "loss": 0.6202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9609375, + "rewards/margins": 0.5546875, + "rewards/rejected": -1.515625, + "step": 922 + }, + { + "epoch": 1.9319727891156462, + "grad_norm": 11.789385795593262, + "learning_rate": 1.4920021909138656e-07, + "logits/chosen": 2.1875, + "logits/rejected": 2.6875, + "logps/chosen": -404.0, + "logps/rejected": -318.0, + "loss": 0.5951, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.17578125, + "rewards/rejected": -1.390625, + "step": 923 + }, + { + "epoch": 1.934065934065934, + "grad_norm": 11.329683303833008, + "learning_rate": 1.4868012774183568e-07, + "logits/chosen": 1.6484375, + "logits/rejected": 1.4140625, + "logps/chosen": -324.0, + "logps/rejected": -548.0, + "loss": 0.611, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.28125, + "rewards/margins": 0.890625, + "rewards/rejected": -2.171875, + "step": 924 + }, + { + "epoch": 1.936159079016222, + "grad_norm": 11.30130672454834, + "learning_rate": 1.4816056085422904e-07, + "logits/chosen": 2.21875, + "logits/rejected": 2.75, + "logps/chosen": -438.0, + "logps/rejected": -496.0, + "loss": 0.5717, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3125, + "rewards/margins": 0.291015625, + "rewards/rejected": -1.609375, + "step": 925 + }, + { + "epoch": 1.9382522239665096, + "grad_norm": 11.121234893798828, + "learning_rate": 1.4764152111733649e-07, + "logits/chosen": 1.9296875, + "logits/rejected": 2.296875, + "logps/chosen": -380.0, + "logps/rejected": -382.0, + "loss": 0.5575, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.265625, + "rewards/margins": 0.6328125, + "rewards/rejected": -1.890625, + "step": 926 + }, + { + "epoch": 1.9403453689167975, + "grad_norm": 11.710637092590332, + "learning_rate": 1.471230112172004e-07, + "logits/chosen": 2.640625, + "logits/rejected": 3.015625, + "logps/chosen": -744.0, + "logps/rejected": -580.0, + "loss": 0.5562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.890625, + "rewards/margins": 1.6875, + "rewards/rejected": -2.578125, + "step": 927 + }, + { + "epoch": 1.9424385138670854, + "grad_norm": 11.492278099060059, + "learning_rate": 1.466050338371207e-07, + "logits/chosen": 2.59375, + "logits/rejected": 3.15625, + "logps/chosen": -524.0, + "logps/rejected": -380.0, + "loss": 0.5853, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.75, + "rewards/margins": 0.07421875, + "rewards/rejected": -1.8203125, + "step": 928 + }, + { + "epoch": 1.944531658817373, + "grad_norm": 11.663466453552246, + "learning_rate": 1.460875916576418e-07, + "logits/chosen": 2.0625, + "logits/rejected": 3.46875, + "logps/chosen": -756.0, + "logps/rejected": -580.0, + "loss": 0.59, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.328125, + "rewards/margins": 0.06640625, + "rewards/rejected": -1.3984375, + "step": 929 + }, + { + "epoch": 1.9466248037676608, + "grad_norm": 11.041068077087402, + "learning_rate": 1.4557068735653835e-07, + "logits/chosen": 1.59375, + "logits/rejected": 1.5703125, + "logps/chosen": -466.0, + "logps/rejected": -382.0, + "loss": 0.6007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0625, + "rewards/margins": 0.41015625, + "rewards/rejected": -1.4765625, + "step": 930 + }, + { + "epoch": 1.9487179487179487, + "grad_norm": 11.396171569824219, + "learning_rate": 1.4505432360880155e-07, + "logits/chosen": 2.59375, + "logits/rejected": 2.640625, + "logps/chosen": -664.0, + "logps/rejected": -568.0, + "loss": 0.5673, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.625, + "rewards/rejected": -2.0625, + "step": 931 + }, + { + "epoch": 1.9508110936682366, + "grad_norm": 11.42209243774414, + "learning_rate": 1.4453850308662502e-07, + "logits/chosen": 2.65625, + "logits/rejected": 2.421875, + "logps/chosen": -406.0, + "logps/rejected": -418.0, + "loss": 0.5992, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.25, + "rewards/margins": 0.0673828125, + "rewards/rejected": -1.3203125, + "step": 932 + }, + { + "epoch": 1.9529042386185242, + "grad_norm": 11.87649154663086, + "learning_rate": 1.4402322845939152e-07, + "logits/chosen": 1.171875, + "logits/rejected": 1.328125, + "logps/chosen": -310.0, + "logps/rejected": -552.0, + "loss": 0.6093, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4140625, + "rewards/margins": 0.4296875, + "rewards/rejected": -1.84375, + "step": 933 + }, + { + "epoch": 1.9549973835688121, + "grad_norm": 12.434530258178711, + "learning_rate": 1.4350850239365836e-07, + "logits/chosen": 1.484375, + "logits/rejected": 1.7421875, + "logps/chosen": -488.0, + "logps/rejected": -484.0, + "loss": 0.5743, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6875, + "rewards/margins": 0.5703125, + "rewards/rejected": -2.25, + "step": 934 + }, + { + "epoch": 1.9570905285191, + "grad_norm": 11.32255744934082, + "learning_rate": 1.4299432755314434e-07, + "logits/chosen": 1.59375, + "logits/rejected": 1.7265625, + "logps/chosen": -298.0, + "logps/rejected": -268.0, + "loss": 0.5831, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9296875, + "rewards/margins": 0.345703125, + "rewards/rejected": -1.2734375, + "step": 935 + }, + { + "epoch": 1.9591836734693877, + "grad_norm": 10.894527435302734, + "learning_rate": 1.424807065987157e-07, + "logits/chosen": 1.265625, + "logits/rejected": 1.7109375, + "logps/chosen": -326.0, + "logps/rejected": -544.0, + "loss": 0.5713, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0234375, + "rewards/margins": 0.6328125, + "rewards/rejected": -1.65625, + "step": 936 + }, + { + "epoch": 1.9612768184196756, + "grad_norm": 12.204416275024414, + "learning_rate": 1.41967642188372e-07, + "logits/chosen": 2.1875, + "logits/rejected": 3.375, + "logps/chosen": -556.0, + "logps/rejected": -412.0, + "loss": 0.6099, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.6171875, + "rewards/margins": 0.09765625, + "rewards/rejected": -1.71875, + "step": 937 + }, + { + "epoch": 1.9633699633699635, + "grad_norm": 11.9826078414917, + "learning_rate": 1.4145513697723298e-07, + "logits/chosen": 1.0078125, + "logits/rejected": 1.2265625, + "logps/chosen": -532.0, + "logps/rejected": -370.0, + "loss": 0.5968, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.015625, + "rewards/margins": -0.4140625, + "rewards/rejected": -1.6015625, + "step": 938 + }, + { + "epoch": 1.9654631083202512, + "grad_norm": 11.560630798339844, + "learning_rate": 1.409431936175243e-07, + "logits/chosen": 1.90625, + "logits/rejected": 2.140625, + "logps/chosen": -532.0, + "logps/rejected": -540.0, + "loss": 0.5679, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4921875, + "rewards/margins": 0.5859375, + "rewards/rejected": -2.078125, + "step": 939 + }, + { + "epoch": 1.9675562532705388, + "grad_norm": 12.335697174072266, + "learning_rate": 1.404318147585642e-07, + "logits/chosen": 2.140625, + "logits/rejected": 3.125, + "logps/chosen": -580.0, + "logps/rejected": -552.0, + "loss": 0.6404, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.515625, + "rewards/margins": 0.8671875, + "rewards/rejected": -2.375, + "step": 940 + }, + { + "epoch": 1.9696493982208267, + "grad_norm": 11.62963581085205, + "learning_rate": 1.399210030467494e-07, + "logits/chosen": 2.21875, + "logits/rejected": 2.40625, + "logps/chosen": -744.0, + "logps/rejected": -352.0, + "loss": 0.6233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8984375, + "rewards/margins": 0.5703125, + "rewards/rejected": -1.46875, + "step": 941 + }, + { + "epoch": 1.9717425431711146, + "grad_norm": 11.435396194458008, + "learning_rate": 1.3941076112554183e-07, + "logits/chosen": 2.203125, + "logits/rejected": 2.484375, + "logps/chosen": -788.0, + "logps/rejected": -448.0, + "loss": 0.5569, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2734375, + "rewards/margins": 0.42578125, + "rewards/rejected": -1.703125, + "step": 942 + }, + { + "epoch": 1.9738356881214023, + "grad_norm": 11.883343696594238, + "learning_rate": 1.3890109163545475e-07, + "logits/chosen": 2.03125, + "logits/rejected": 1.8046875, + "logps/chosen": -460.0, + "logps/rejected": -512.0, + "loss": 0.6044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0078125, + "rewards/margins": 0.65234375, + "rewards/rejected": -1.6640625, + "step": 943 + }, + { + "epoch": 1.9759288330716902, + "grad_norm": 10.637882232666016, + "learning_rate": 1.3839199721403893e-07, + "logits/chosen": 2.5625, + "logits/rejected": 2.53125, + "logps/chosen": -406.0, + "logps/rejected": -412.0, + "loss": 0.5791, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1875, + "rewards/margins": 0.90234375, + "rewards/rejected": -2.09375, + "step": 944 + }, + { + "epoch": 1.978021978021978, + "grad_norm": 13.070167541503906, + "learning_rate": 1.37883480495869e-07, + "logits/chosen": 1.421875, + "logits/rejected": 1.109375, + "logps/chosen": -402.0, + "logps/rejected": -460.0, + "loss": 0.638, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.84375, + "rewards/margins": 0.150390625, + "rewards/rejected": -1.9921875, + "step": 945 + }, + { + "epoch": 1.9801151229722658, + "grad_norm": 11.330702781677246, + "learning_rate": 1.373755441125304e-07, + "logits/chosen": 2.015625, + "logits/rejected": 2.578125, + "logps/chosen": -528.0, + "logps/rejected": -358.0, + "loss": 0.597, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5546875, + "rewards/margins": 0.189453125, + "rewards/rejected": -1.75, + "step": 946 + }, + { + "epoch": 1.9822082679225537, + "grad_norm": 12.942609786987305, + "learning_rate": 1.368681906926051e-07, + "logits/chosen": 2.53125, + "logits/rejected": 3.125, + "logps/chosen": -516.0, + "logps/rejected": -316.0, + "loss": 0.6289, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.75, + "rewards/margins": -0.248046875, + "rewards/rejected": -1.5078125, + "step": 947 + }, + { + "epoch": 1.9843014128728416, + "grad_norm": 10.92066478729248, + "learning_rate": 1.363614228616581e-07, + "logits/chosen": 2.515625, + "logits/rejected": 2.125, + "logps/chosen": -378.0, + "logps/rejected": -512.0, + "loss": 0.5687, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.25, + "rewards/margins": 0.208984375, + "rewards/rejected": -1.453125, + "step": 948 + }, + { + "epoch": 1.9863945578231292, + "grad_norm": 12.061305046081543, + "learning_rate": 1.3585524324222406e-07, + "logits/chosen": 1.8359375, + "logits/rejected": 1.5078125, + "logps/chosen": -496.0, + "logps/rejected": -396.0, + "loss": 0.6059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.53125, + "rewards/margins": 0.494140625, + "rewards/rejected": -2.03125, + "step": 949 + }, + { + "epoch": 1.988487702773417, + "grad_norm": 11.782267570495605, + "learning_rate": 1.3534965445379382e-07, + "logits/chosen": 2.125, + "logits/rejected": 2.5, + "logps/chosen": -768.0, + "logps/rejected": -568.0, + "loss": 0.5928, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.875, + "rewards/margins": -0.25390625, + "rewards/rejected": -1.6171875, + "step": 950 + }, + { + "epoch": 1.990580847723705, + "grad_norm": 11.26547908782959, + "learning_rate": 1.3484465911280038e-07, + "logits/chosen": 1.34375, + "logits/rejected": 1.875, + "logps/chosen": -544.0, + "logps/rejected": -572.0, + "loss": 0.5709, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0546875, + "rewards/margins": 1.3125, + "rewards/rejected": -2.359375, + "step": 951 + }, + { + "epoch": 1.9926739926739927, + "grad_norm": 11.085479736328125, + "learning_rate": 1.3434025983260566e-07, + "logits/chosen": 1.578125, + "logits/rejected": 1.6015625, + "logps/chosen": -406.0, + "logps/rejected": -564.0, + "loss": 0.595, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.15625, + "rewards/margins": 0.5859375, + "rewards/rejected": -1.7421875, + "step": 952 + }, + { + "epoch": 1.9947671376242804, + "grad_norm": 11.633567810058594, + "learning_rate": 1.338364592234871e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.34375, + "logps/chosen": -748.0, + "logps/rejected": -600.0, + "loss": 0.6095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.265625, + "rewards/margins": 0.392578125, + "rewards/rejected": -1.65625, + "step": 953 + }, + { + "epoch": 1.9968602825745683, + "grad_norm": 11.245035171508789, + "learning_rate": 1.3333325989262405e-07, + "logits/chosen": 2.65625, + "logits/rejected": 3.3125, + "logps/chosen": -644.0, + "logps/rejected": -672.0, + "loss": 0.5893, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.21875, + "rewards/margins": 0.78125, + "rewards/rejected": -2.0, + "step": 954 + }, + { + "epoch": 1.9989534275248562, + "grad_norm": 11.77606201171875, + "learning_rate": 1.3283066444408403e-07, + "logits/chosen": 1.5625, + "logits/rejected": 1.4140625, + "logps/chosen": -238.0, + "logps/rejected": -316.0, + "loss": 0.6104, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.265625, + "rewards/margins": 0.462890625, + "rewards/rejected": -1.734375, + "step": 955 + }, + { + "epoch": 2.001046572475144, + "grad_norm": 11.85208511352539, + "learning_rate": 1.3232867547880933e-07, + "logits/chosen": 2.0625, + "logits/rejected": 3.078125, + "logps/chosen": -556.0, + "logps/rejected": -342.0, + "loss": 0.581, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7109375, + "rewards/margins": -0.08203125, + "rewards/rejected": -1.6328125, + "step": 956 + }, + { + "epoch": 2.0031397174254315, + "grad_norm": 12.282042503356934, + "learning_rate": 1.318272955946043e-07, + "logits/chosen": 0.625, + "logits/rejected": 0.41015625, + "logps/chosen": -222.0, + "logps/rejected": -294.0, + "loss": 0.6083, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.453125, + "rewards/margins": 0.357421875, + "rewards/rejected": -1.8125, + "step": 957 + }, + { + "epoch": 2.0052328623757196, + "grad_norm": 10.775773048400879, + "learning_rate": 1.3132652738612068e-07, + "logits/chosen": 2.421875, + "logits/rejected": 2.46875, + "logps/chosen": -442.0, + "logps/rejected": -390.0, + "loss": 0.6026, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.46875, + "rewards/margins": 0.44921875, + "rewards/rejected": -1.921875, + "step": 958 + }, + { + "epoch": 2.0073260073260073, + "grad_norm": 11.614090919494629, + "learning_rate": 1.308263734448449e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.65625, + "logps/chosen": -676.0, + "logps/rejected": -812.0, + "loss": 0.6351, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.125, + "rewards/margins": 0.1513671875, + "rewards/rejected": -1.2734375, + "step": 959 + }, + { + "epoch": 2.009419152276295, + "grad_norm": 10.593873977661133, + "learning_rate": 1.3032683635908465e-07, + "logits/chosen": 1.2109375, + "logits/rejected": 0.734375, + "logps/chosen": -252.0, + "logps/rejected": -426.0, + "loss": 0.5732, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.484375, + "rewards/margins": -0.033203125, + "rewards/rejected": -1.453125, + "step": 960 + }, + { + "epoch": 2.011512297226583, + "grad_norm": 11.892288208007812, + "learning_rate": 1.2982791871395545e-07, + "logits/chosen": 2.515625, + "logits/rejected": 2.5, + "logps/chosen": -692.0, + "logps/rejected": -788.0, + "loss": 0.5866, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.375, + "rewards/margins": 0.314453125, + "rewards/rejected": -1.6875, + "step": 961 + }, + { + "epoch": 2.0136054421768708, + "grad_norm": 10.537877082824707, + "learning_rate": 1.2932962309136702e-07, + "logits/chosen": 1.734375, + "logits/rejected": 2.03125, + "logps/chosen": -584.0, + "logps/rejected": -476.0, + "loss": 0.568, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.34375, + "rewards/margins": -0.2109375, + "rewards/rejected": -1.1328125, + "step": 962 + }, + { + "epoch": 2.0156985871271584, + "grad_norm": 10.745752334594727, + "learning_rate": 1.2883195207001e-07, + "logits/chosen": 0.98828125, + "logits/rejected": 1.15625, + "logps/chosen": -338.0, + "logps/rejected": -260.0, + "loss": 0.595, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2890625, + "rewards/margins": 0.0810546875, + "rewards/rejected": -1.375, + "step": 963 + }, + { + "epoch": 2.0177917320774466, + "grad_norm": 10.625443458557129, + "learning_rate": 1.2833490822534327e-07, + "logits/chosen": 2.453125, + "logits/rejected": 2.640625, + "logps/chosen": -458.0, + "logps/rejected": -326.0, + "loss": 0.5472, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.515625, + "rewards/margins": 0.158203125, + "rewards/rejected": -1.671875, + "step": 964 + }, + { + "epoch": 2.0198848770277342, + "grad_norm": 10.723146438598633, + "learning_rate": 1.2783849412957937e-07, + "logits/chosen": 2.609375, + "logits/rejected": 2.359375, + "logps/chosen": -380.0, + "logps/rejected": -506.0, + "loss": 0.5894, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.21875, + "rewards/margins": 0.435546875, + "rewards/rejected": -1.6484375, + "step": 965 + }, + { + "epoch": 2.021978021978022, + "grad_norm": 11.235821723937988, + "learning_rate": 1.2734271235167214e-07, + "logits/chosen": 1.53125, + "logits/rejected": 1.4765625, + "logps/chosen": -414.0, + "logps/rejected": -588.0, + "loss": 0.5805, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.390625, + "rewards/margins": 0.208984375, + "rewards/rejected": -1.59375, + "step": 966 + }, + { + "epoch": 2.0240711669283096, + "grad_norm": 11.411491394042969, + "learning_rate": 1.2684756545730336e-07, + "logits/chosen": 0.5390625, + "logits/rejected": 1.0859375, + "logps/chosen": -204.0, + "logps/rejected": -188.0, + "loss": 0.5965, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.234375, + "rewards/margins": 0.125, + "rewards/rejected": -1.359375, + "step": 967 + }, + { + "epoch": 2.0261643118785977, + "grad_norm": 11.736352920532227, + "learning_rate": 1.2635305600886905e-07, + "logits/chosen": 1.796875, + "logits/rejected": 2.03125, + "logps/chosen": -604.0, + "logps/rejected": -462.0, + "loss": 0.5857, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.15625, + "rewards/margins": 0.1669921875, + "rewards/rejected": -1.328125, + "step": 968 + }, + { + "epoch": 2.0282574568288854, + "grad_norm": 11.645047187805176, + "learning_rate": 1.2585918656546644e-07, + "logits/chosen": 2.40625, + "logits/rejected": 3.21875, + "logps/chosen": -708.0, + "logps/rejected": -460.0, + "loss": 0.5444, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2421875, + "rewards/margins": 0.6015625, + "rewards/rejected": -1.84375, + "step": 969 + }, + { + "epoch": 2.030350601779173, + "grad_norm": 12.109193801879883, + "learning_rate": 1.2536595968288074e-07, + "logits/chosen": 1.046875, + "logits/rejected": 0.828125, + "logps/chosen": -308.0, + "logps/rejected": -364.0, + "loss": 0.6242, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3125, + "rewards/margins": 0.1259765625, + "rewards/rejected": -1.4375, + "step": 970 + }, + { + "epoch": 2.032443746729461, + "grad_norm": 10.74459171295166, + "learning_rate": 1.248733779135721e-07, + "logits/chosen": 1.53125, + "logits/rejected": 1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -588.0, + "loss": 0.5712, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4375, + "rewards/margins": 1.015625, + "rewards/rejected": -2.453125, + "step": 971 + }, + { + "epoch": 2.034536891679749, + "grad_norm": 12.447172164916992, + "learning_rate": 1.243814438066619e-07, + "logits/chosen": 1.6328125, + "logits/rejected": 2.765625, + "logps/chosen": -608.0, + "logps/rejected": -382.0, + "loss": 0.5826, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3046875, + "rewards/margins": 0.275390625, + "rewards/rejected": -1.578125, + "step": 972 + }, + { + "epoch": 2.0366300366300365, + "grad_norm": 12.21883773803711, + "learning_rate": 1.2389015990791987e-07, + "logits/chosen": 1.9609375, + "logits/rejected": 1.203125, + "logps/chosen": -412.0, + "logps/rejected": -976.0, + "loss": 0.5723, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.390625, + "rewards/margins": 0.6328125, + "rewards/rejected": -2.03125, + "step": 973 + }, + { + "epoch": 2.0387231815803246, + "grad_norm": 11.15044116973877, + "learning_rate": 1.2339952875975111e-07, + "logits/chosen": 1.3359375, + "logits/rejected": 1.234375, + "logps/chosen": -548.0, + "logps/rejected": -460.0, + "loss": 0.5791, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.46875, + "rewards/margins": 0.6484375, + "rewards/rejected": -2.125, + "step": 974 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 11.227256774902344, + "learning_rate": 1.229095529011827e-07, + "logits/chosen": 1.796875, + "logits/rejected": 1.7890625, + "logps/chosen": -348.0, + "logps/rejected": -416.0, + "loss": 0.6088, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4765625, + "rewards/margins": -0.12890625, + "rewards/rejected": -1.3515625, + "step": 975 + }, + { + "epoch": 2.0429094714809, + "grad_norm": 11.456323623657227, + "learning_rate": 1.2242023486785027e-07, + "logits/chosen": 1.5546875, + "logits/rejected": 1.7890625, + "logps/chosen": -684.0, + "logps/rejected": -498.0, + "loss": 0.5806, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.546875, + "rewards/margins": 0.35546875, + "rewards/rejected": -1.8984375, + "step": 976 + }, + { + "epoch": 2.045002616431188, + "grad_norm": 11.750733375549316, + "learning_rate": 1.219315771919856e-07, + "logits/chosen": 1.3515625, + "logits/rejected": 1.640625, + "logps/chosen": -434.0, + "logps/rejected": -450.0, + "loss": 0.5408, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2421875, + "rewards/margins": 0.75390625, + "rewards/rejected": -2.0, + "step": 977 + }, + { + "epoch": 2.0470957613814758, + "grad_norm": 11.425230979919434, + "learning_rate": 1.2144358240240275e-07, + "logits/chosen": 2.578125, + "logits/rejected": 2.4375, + "logps/chosen": -510.0, + "logps/rejected": -680.0, + "loss": 0.5966, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.359375, + "rewards/margins": 1.015625, + "rewards/rejected": -2.375, + "step": 978 + }, + { + "epoch": 2.0491889063317634, + "grad_norm": 10.762430191040039, + "learning_rate": 1.209562530244857e-07, + "logits/chosen": 2.46875, + "logits/rejected": 2.46875, + "logps/chosen": -644.0, + "logps/rejected": -720.0, + "loss": 0.567, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3671875, + "rewards/margins": 0.515625, + "rewards/rejected": -1.8828125, + "step": 979 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 11.131372451782227, + "learning_rate": 1.2046959158017447e-07, + "logits/chosen": 2.015625, + "logits/rejected": 2.3125, + "logps/chosen": -478.0, + "logps/rejected": -502.0, + "loss": 0.5882, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5703125, + "rewards/margins": 0.26953125, + "rewards/rejected": -1.84375, + "step": 980 + }, + { + "epoch": 2.053375196232339, + "grad_norm": 10.981115341186523, + "learning_rate": 1.199836005879529e-07, + "logits/chosen": 2.140625, + "logits/rejected": 2.5625, + "logps/chosen": -544.0, + "logps/rejected": -440.0, + "loss": 0.6054, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.25, + "rewards/margins": 0.2080078125, + "rewards/rejected": -1.4609375, + "step": 981 + }, + { + "epoch": 2.055468341182627, + "grad_norm": 10.351678848266602, + "learning_rate": 1.194982825628351e-07, + "logits/chosen": 2.015625, + "logits/rejected": 2.015625, + "logps/chosen": -358.0, + "logps/rejected": -227.0, + "loss": 0.585, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.34375, + "rewards/margins": 0.0400390625, + "rewards/rejected": -1.3828125, + "step": 982 + }, + { + "epoch": 2.0575614861329146, + "grad_norm": 11.870306968688965, + "learning_rate": 1.1901364001635238e-07, + "logits/chosen": 1.15625, + "logits/rejected": 1.796875, + "logps/chosen": -422.0, + "logps/rejected": -324.0, + "loss": 0.6144, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7421875, + "rewards/margins": -0.3984375, + "rewards/rejected": -1.34375, + "step": 983 + }, + { + "epoch": 2.0596546310832027, + "grad_norm": 12.057429313659668, + "learning_rate": 1.1852967545654076e-07, + "logits/chosen": 2.65625, + "logits/rejected": 3.171875, + "logps/chosen": -600.0, + "logps/rejected": -490.0, + "loss": 0.641, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0625, + "rewards/margins": 0.396484375, + "rewards/rejected": -1.453125, + "step": 984 + }, + { + "epoch": 2.0617477760334904, + "grad_norm": 11.112940788269043, + "learning_rate": 1.1804639138792731e-07, + "logits/chosen": 2.109375, + "logits/rejected": 2.59375, + "logps/chosen": -466.0, + "logps/rejected": -392.0, + "loss": 0.5666, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.859375, + "rewards/margins": 0.44140625, + "rewards/rejected": -1.296875, + "step": 985 + }, + { + "epoch": 2.063840920983778, + "grad_norm": 10.575416564941406, + "learning_rate": 1.1756379031151787e-07, + "logits/chosen": 2.59375, + "logits/rejected": 1.8359375, + "logps/chosen": -440.0, + "logps/rejected": -520.0, + "loss": 0.5904, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.71875, + "rewards/margins": -0.080078125, + "rewards/rejected": -1.640625, + "step": 986 + }, + { + "epoch": 2.065934065934066, + "grad_norm": 10.981159210205078, + "learning_rate": 1.170818747247835e-07, + "logits/chosen": 2.796875, + "logits/rejected": 2.578125, + "logps/chosen": -524.0, + "logps/rejected": -688.0, + "loss": 0.6016, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.484375, + "rewards/margins": 0.123046875, + "rewards/rejected": -1.609375, + "step": 987 + }, + { + "epoch": 2.068027210884354, + "grad_norm": 11.04464340209961, + "learning_rate": 1.1660064712164814e-07, + "logits/chosen": 1.7421875, + "logits/rejected": 1.8359375, + "logps/chosen": -620.0, + "logps/rejected": -548.0, + "loss": 0.5942, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5625, + "rewards/margins": 0.271484375, + "rewards/rejected": -1.8359375, + "step": 988 + }, + { + "epoch": 2.0701203558346415, + "grad_norm": 12.590449333190918, + "learning_rate": 1.16120109992475e-07, + "logits/chosen": 1.7109375, + "logits/rejected": 2.03125, + "logps/chosen": -784.0, + "logps/rejected": -640.0, + "loss": 0.5942, + "rewards/accuracies": 0.25, + "rewards/chosen": -2.359375, + "rewards/margins": -0.70703125, + "rewards/rejected": -1.65625, + "step": 989 + }, + { + "epoch": 2.072213500784929, + "grad_norm": 10.410261154174805, + "learning_rate": 1.156402658240544e-07, + "logits/chosen": 1.71875, + "logits/rejected": 1.625, + "logps/chosen": -364.0, + "logps/rejected": -352.0, + "loss": 0.5884, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.28125, + "rewards/margins": -0.0927734375, + "rewards/rejected": -1.1875, + "step": 990 + }, + { + "epoch": 2.0743066457352173, + "grad_norm": 10.231704711914062, + "learning_rate": 1.1516111709959061e-07, + "logits/chosen": 2.0625, + "logits/rejected": 1.9921875, + "logps/chosen": -660.0, + "logps/rejected": -434.0, + "loss": 0.5557, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7578125, + "rewards/margins": -0.0791015625, + "rewards/rejected": -1.6796875, + "step": 991 + }, + { + "epoch": 2.076399790685505, + "grad_norm": 10.845260620117188, + "learning_rate": 1.1468266629868861e-07, + "logits/chosen": 1.34375, + "logits/rejected": 1.5390625, + "logps/chosen": -428.0, + "logps/rejected": -386.0, + "loss": 0.5847, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0859375, + "rewards/margins": 0.4765625, + "rewards/rejected": -1.5625, + "step": 992 + }, + { + "epoch": 2.0784929356357926, + "grad_norm": 11.413115501403809, + "learning_rate": 1.1420491589734201e-07, + "logits/chosen": 1.8984375, + "logits/rejected": 2.5, + "logps/chosen": -480.0, + "logps/rejected": -352.0, + "loss": 0.6148, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1875, + "rewards/margins": 0.1171875, + "rewards/rejected": -1.296875, + "step": 993 + }, + { + "epoch": 2.0805860805860807, + "grad_norm": 11.16401195526123, + "learning_rate": 1.1372786836791945e-07, + "logits/chosen": 1.875, + "logits/rejected": 2.203125, + "logps/chosen": -836.0, + "logps/rejected": -382.0, + "loss": 0.5992, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.984375, + "rewards/margins": 0.2734375, + "rewards/rejected": -1.2578125, + "step": 994 + }, + { + "epoch": 2.0826792255363684, + "grad_norm": 11.828091621398926, + "learning_rate": 1.132515261791526e-07, + "logits/chosen": 2.5, + "logits/rejected": 2.5, + "logps/chosen": -720.0, + "logps/rejected": -572.0, + "loss": 0.5973, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9375, + "rewards/margins": 0.8046875, + "rewards/rejected": -1.7421875, + "step": 995 + }, + { + "epoch": 2.084772370486656, + "grad_norm": 11.524788856506348, + "learning_rate": 1.1277589179612257e-07, + "logits/chosen": 1.9453125, + "logits/rejected": 1.8125, + "logps/chosen": -356.0, + "logps/rejected": -462.0, + "loss": 0.5597, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.3984375, + "rewards/margins": 0.01171875, + "rewards/rejected": -1.4140625, + "step": 996 + }, + { + "epoch": 2.086865515436944, + "grad_norm": 11.08915901184082, + "learning_rate": 1.1230096768024787e-07, + "logits/chosen": 1.9140625, + "logits/rejected": 1.8359375, + "logps/chosen": -434.0, + "logps/rejected": -656.0, + "loss": 0.5984, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.546875, + "rewards/margins": -0.099609375, + "rewards/rejected": -1.4453125, + "step": 997 + }, + { + "epoch": 2.088958660387232, + "grad_norm": 11.421136856079102, + "learning_rate": 1.1182675628927133e-07, + "logits/chosen": 1.7421875, + "logits/rejected": 2.5, + "logps/chosen": -472.0, + "logps/rejected": -472.0, + "loss": 0.5609, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.984375, + "rewards/margins": 1.1328125, + "rewards/rejected": -2.125, + "step": 998 + }, + { + "epoch": 2.0910518053375196, + "grad_norm": 10.709765434265137, + "learning_rate": 1.1135326007724723e-07, + "logits/chosen": 2.234375, + "logits/rejected": 1.8828125, + "logps/chosen": -342.0, + "logps/rejected": -524.0, + "loss": 0.5907, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1171875, + "rewards/margins": 0.59765625, + "rewards/rejected": -1.7109375, + "step": 999 + }, + { + "epoch": 2.0931449502878072, + "grad_norm": 11.891133308410645, + "learning_rate": 1.1088048149452881e-07, + "logits/chosen": 1.71875, + "logits/rejected": 2.4375, + "logps/chosen": -490.0, + "logps/rejected": -504.0, + "loss": 0.6031, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4609375, + "rewards/margins": -0.1337890625, + "rewards/rejected": -1.328125, + "step": 1000 + }, + { + "epoch": 2.0952380952380953, + "grad_norm": 12.135580062866211, + "learning_rate": 1.1040842298775572e-07, + "logits/chosen": 2.109375, + "logits/rejected": 1.78125, + "logps/chosen": -300.0, + "logps/rejected": -496.0, + "loss": 0.5992, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.28125, + "rewards/margins": 0.41015625, + "rewards/rejected": -1.6875, + "step": 1001 + }, + { + "epoch": 2.097331240188383, + "grad_norm": 11.127384185791016, + "learning_rate": 1.0993708699984125e-07, + "logits/chosen": 0.69921875, + "logits/rejected": 1.03125, + "logps/chosen": -388.0, + "logps/rejected": -564.0, + "loss": 0.5675, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5703125, + "rewards/margins": 0.478515625, + "rewards/rejected": -2.046875, + "step": 1002 + }, + { + "epoch": 2.0994243851386707, + "grad_norm": 12.559244155883789, + "learning_rate": 1.0946647596995929e-07, + "logits/chosen": 2.015625, + "logits/rejected": 1.7265625, + "logps/chosen": -328.0, + "logps/rejected": -342.0, + "loss": 0.6289, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.265625, + "rewards/margins": 0.0029296875, + "rewards/rejected": -1.2734375, + "step": 1003 + }, + { + "epoch": 2.101517530088959, + "grad_norm": 11.606091499328613, + "learning_rate": 1.0899659233353235e-07, + "logits/chosen": 2.515625, + "logits/rejected": 2.796875, + "logps/chosen": -752.0, + "logps/rejected": -532.0, + "loss": 0.5846, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.34375, + "rewards/margins": 0.1796875, + "rewards/rejected": -1.5234375, + "step": 1004 + }, + { + "epoch": 2.1036106750392465, + "grad_norm": 11.270895004272461, + "learning_rate": 1.0852743852221874e-07, + "logits/chosen": 2.03125, + "logits/rejected": 2.78125, + "logps/chosen": -600.0, + "logps/rejected": -304.0, + "loss": 0.5836, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1875, + "rewards/margins": 0.34375, + "rewards/rejected": -1.53125, + "step": 1005 + }, + { + "epoch": 2.105703819989534, + "grad_norm": 11.021132469177246, + "learning_rate": 1.0805901696389961e-07, + "logits/chosen": 1.765625, + "logits/rejected": 2.375, + "logps/chosen": -294.0, + "logps/rejected": -340.0, + "loss": 0.5985, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.56640625, + "rewards/rejected": -1.78125, + "step": 1006 + }, + { + "epoch": 2.1077969649398223, + "grad_norm": 12.249751091003418, + "learning_rate": 1.075913300826668e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.75, + "logps/chosen": -692.0, + "logps/rejected": -908.0, + "loss": 0.6222, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4375, + "rewards/margins": 0.30078125, + "rewards/rejected": -1.734375, + "step": 1007 + }, + { + "epoch": 2.10989010989011, + "grad_norm": 11.67434310913086, + "learning_rate": 1.0712438029881024e-07, + "logits/chosen": 2.578125, + "logits/rejected": 2.984375, + "logps/chosen": -692.0, + "logps/rejected": -592.0, + "loss": 0.5725, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3515625, + "rewards/margins": 0.296875, + "rewards/rejected": -1.6484375, + "step": 1008 + }, + { + "epoch": 2.1119832548403976, + "grad_norm": 11.299335479736328, + "learning_rate": 1.0665817002880547e-07, + "logits/chosen": 2.109375, + "logits/rejected": 2.09375, + "logps/chosen": -378.0, + "logps/rejected": -302.0, + "loss": 0.5981, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1875, + "rewards/margins": 0.0654296875, + "rewards/rejected": -1.25, + "step": 1009 + }, + { + "epoch": 2.1140763997906853, + "grad_norm": 11.45093822479248, + "learning_rate": 1.0619270168530069e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.796875, + "logps/chosen": -808.0, + "logps/rejected": -1016.0, + "loss": 0.6147, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4765625, + "rewards/margins": 0.69921875, + "rewards/rejected": -2.171875, + "step": 1010 + }, + { + "epoch": 2.1161695447409734, + "grad_norm": 10.814870834350586, + "learning_rate": 1.0572797767710492e-07, + "logits/chosen": 1.6796875, + "logits/rejected": 2.1875, + "logps/chosen": -392.0, + "logps/rejected": -302.0, + "loss": 0.5401, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.640625, + "rewards/margins": -0.15234375, + "rewards/rejected": -1.484375, + "step": 1011 + }, + { + "epoch": 2.118262689691261, + "grad_norm": 10.025163650512695, + "learning_rate": 1.0526400040917522e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.140625, + "logps/chosen": -468.0, + "logps/rejected": -560.0, + "loss": 0.5737, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0390625, + "rewards/margins": 0.34765625, + "rewards/rejected": -1.3828125, + "step": 1012 + }, + { + "epoch": 2.1203558346415488, + "grad_norm": 11.721614837646484, + "learning_rate": 1.048007722826041e-07, + "logits/chosen": 2.21875, + "logits/rejected": 3.125, + "logps/chosen": -644.0, + "logps/rejected": -420.0, + "loss": 0.5554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9765625, + "rewards/margins": 1.0625, + "rewards/rejected": -2.03125, + "step": 1013 + }, + { + "epoch": 2.122448979591837, + "grad_norm": 10.800631523132324, + "learning_rate": 1.0433829569460719e-07, + "logits/chosen": 2.421875, + "logits/rejected": 2.8125, + "logps/chosen": -500.0, + "logps/rejected": -388.0, + "loss": 0.6006, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.82421875, + "rewards/margins": 0.1044921875, + "rewards/rejected": -0.9296875, + "step": 1014 + }, + { + "epoch": 2.1245421245421245, + "grad_norm": 12.868189811706543, + "learning_rate": 1.038765730385111e-07, + "logits/chosen": 1.875, + "logits/rejected": 2.03125, + "logps/chosen": -500.0, + "logps/rejected": -300.0, + "loss": 0.6121, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6640625, + "rewards/margins": -0.0888671875, + "rewards/rejected": -1.578125, + "step": 1015 + }, + { + "epoch": 2.126635269492412, + "grad_norm": 10.62038803100586, + "learning_rate": 1.0341560670374084e-07, + "logits/chosen": 1.75, + "logits/rejected": 1.7421875, + "logps/chosen": -376.0, + "logps/rejected": -504.0, + "loss": 0.5804, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.205078125, + "rewards/rejected": -1.6328125, + "step": 1016 + }, + { + "epoch": 2.1287284144427003, + "grad_norm": 11.46533489227295, + "learning_rate": 1.0295539907580711e-07, + "logits/chosen": 2.3125, + "logits/rejected": 2.6875, + "logps/chosen": -704.0, + "logps/rejected": -588.0, + "loss": 0.596, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2890625, + "rewards/margins": 1.234375, + "rewards/rejected": -2.53125, + "step": 1017 + }, + { + "epoch": 2.130821559392988, + "grad_norm": 12.483968734741211, + "learning_rate": 1.0249595253629467e-07, + "logits/chosen": 2.0, + "logits/rejected": 2.234375, + "logps/chosen": -340.0, + "logps/rejected": -474.0, + "loss": 0.6082, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.46875, + "rewards/margins": 0.8828125, + "rewards/rejected": -2.34375, + "step": 1018 + }, + { + "epoch": 2.1329147043432757, + "grad_norm": 12.161870002746582, + "learning_rate": 1.0203726946284953e-07, + "logits/chosen": 2.15625, + "logits/rejected": 3.046875, + "logps/chosen": -728.0, + "logps/rejected": -592.0, + "loss": 0.5685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.91015625, + "rewards/margins": 0.8984375, + "rewards/rejected": -1.8125, + "step": 1019 + }, + { + "epoch": 2.1350078492935634, + "grad_norm": 11.978885650634766, + "learning_rate": 1.015793522291666e-07, + "logits/chosen": 2.1875, + "logits/rejected": 2.71875, + "logps/chosen": -596.0, + "logps/rejected": -612.0, + "loss": 0.6069, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.125, + "rewards/margins": -0.2734375, + "rewards/rejected": -1.8515625, + "step": 1020 + }, + { + "epoch": 2.1371009942438515, + "grad_norm": 10.298712730407715, + "learning_rate": 1.0112220320497752e-07, + "logits/chosen": 1.21875, + "logits/rejected": 0.703125, + "logps/chosen": -160.0, + "logps/rejected": -278.0, + "loss": 0.5887, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1015625, + "rewards/margins": 0.390625, + "rewards/rejected": -1.484375, + "step": 1021 + }, + { + "epoch": 2.139194139194139, + "grad_norm": 11.2387113571167, + "learning_rate": 1.0066582475603872e-07, + "logits/chosen": 2.453125, + "logits/rejected": 1.9375, + "logps/chosen": -428.0, + "logps/rejected": -458.0, + "loss": 0.5564, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3515625, + "rewards/margins": 0.294921875, + "rewards/rejected": -1.640625, + "step": 1022 + }, + { + "epoch": 2.141287284144427, + "grad_norm": 10.46700668334961, + "learning_rate": 1.0021021924411874e-07, + "logits/chosen": 2.015625, + "logits/rejected": 1.6640625, + "logps/chosen": -430.0, + "logps/rejected": -644.0, + "loss": 0.586, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3828125, + "rewards/margins": 0.1650390625, + "rewards/rejected": -1.546875, + "step": 1023 + }, + { + "epoch": 2.143380429094715, + "grad_norm": 10.900199890136719, + "learning_rate": 9.975538902698597e-08, + "logits/chosen": 1.65625, + "logits/rejected": 2.28125, + "logps/chosen": -510.0, + "logps/rejected": -462.0, + "loss": 0.597, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4140625, + "rewards/margins": 0.232421875, + "rewards/rejected": -1.6484375, + "step": 1024 + }, + { + "epoch": 2.1454735740450026, + "grad_norm": 11.70801830291748, + "learning_rate": 9.930133645839689e-08, + "logits/chosen": 1.9453125, + "logits/rejected": 1.6796875, + "logps/chosen": -568.0, + "logps/rejected": -608.0, + "loss": 0.6152, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3359375, + "rewards/margins": 0.150390625, + "rewards/rejected": -1.484375, + "step": 1025 + }, + { + "epoch": 2.1475667189952903, + "grad_norm": 10.580524444580078, + "learning_rate": 9.884806388808362e-08, + "logits/chosen": 2.484375, + "logits/rejected": 2.953125, + "logps/chosen": -488.0, + "logps/rejected": -508.0, + "loss": 0.5511, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.40625, + "rewards/margins": 0.126953125, + "rewards/rejected": -1.5390625, + "step": 1026 + }, + { + "epoch": 2.1496598639455784, + "grad_norm": 10.129754066467285, + "learning_rate": 9.83955736617416e-08, + "logits/chosen": 2.421875, + "logits/rejected": 3.1875, + "logps/chosen": -612.0, + "logps/rejected": -482.0, + "loss": 0.5524, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.375, + "rewards/margins": 0.8046875, + "rewards/rejected": -2.171875, + "step": 1027 + }, + { + "epoch": 2.151753008895866, + "grad_norm": 10.40230941772461, + "learning_rate": 9.794386812101759e-08, + "logits/chosen": 2.59375, + "logits/rejected": 2.3125, + "logps/chosen": -390.0, + "logps/rejected": -716.0, + "loss": 0.5755, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.15625, + "rewards/margins": 0.796875, + "rewards/rejected": -1.953125, + "step": 1028 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 12.058321952819824, + "learning_rate": 9.749294960349783e-08, + "logits/chosen": 1.390625, + "logits/rejected": 2.546875, + "logps/chosen": -388.0, + "logps/rejected": -344.0, + "loss": 0.5656, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.03125, + "rewards/margins": 0.1201171875, + "rewards/rejected": -1.1484375, + "step": 1029 + }, + { + "epoch": 2.155939298796442, + "grad_norm": 11.147902488708496, + "learning_rate": 9.704282044269563e-08, + "logits/chosen": 1.4765625, + "logits/rejected": 1.671875, + "logps/chosen": -412.0, + "logps/rejected": -576.0, + "loss": 0.6192, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6640625, + "rewards/margins": 1.0234375, + "rewards/rejected": -2.6875, + "step": 1030 + }, + { + "epoch": 2.1580324437467295, + "grad_norm": 10.608804702758789, + "learning_rate": 9.659348296803916e-08, + "logits/chosen": 1.859375, + "logits/rejected": 1.71875, + "logps/chosen": -394.0, + "logps/rejected": -468.0, + "loss": 0.5777, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3125, + "rewards/margins": 0.65625, + "rewards/rejected": -1.96875, + "step": 1031 + }, + { + "epoch": 2.160125588697017, + "grad_norm": 11.858631134033203, + "learning_rate": 9.61449395048598e-08, + "logits/chosen": 1.8984375, + "logits/rejected": 2.5625, + "logps/chosen": -832.0, + "logps/rejected": -760.0, + "loss": 0.5936, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.03125, + "rewards/margins": 0.259765625, + "rewards/rejected": -2.28125, + "step": 1032 + }, + { + "epoch": 2.162218733647305, + "grad_norm": 12.69530200958252, + "learning_rate": 9.569719237437995e-08, + "logits/chosen": 0.765625, + "logits/rejected": 1.1171875, + "logps/chosen": -294.0, + "logps/rejected": -278.0, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.734375, + "rewards/margins": 0.0712890625, + "rewards/rejected": -1.8046875, + "step": 1033 + }, + { + "epoch": 2.164311878597593, + "grad_norm": 11.752124786376953, + "learning_rate": 9.525024389370076e-08, + "logits/chosen": 2.46875, + "logits/rejected": 2.59375, + "logps/chosen": -708.0, + "logps/rejected": -592.0, + "loss": 0.5949, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.5625, + "rewards/margins": -0.0126953125, + "rewards/rejected": -1.546875, + "step": 1034 + }, + { + "epoch": 2.1664050235478807, + "grad_norm": 11.493417739868164, + "learning_rate": 9.480409637579037e-08, + "logits/chosen": 2.015625, + "logits/rejected": 1.78125, + "logps/chosen": -552.0, + "logps/rejected": -668.0, + "loss": 0.5822, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3515625, + "rewards/margins": 0.8671875, + "rewards/rejected": -2.21875, + "step": 1035 + }, + { + "epoch": 2.1684981684981683, + "grad_norm": 10.749217987060547, + "learning_rate": 9.43587521294721e-08, + "logits/chosen": 2.796875, + "logits/rejected": 2.90625, + "logps/chosen": -736.0, + "logps/rejected": -576.0, + "loss": 0.5537, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.359375, + "rewards/margins": 0.56640625, + "rewards/rejected": -1.9296875, + "step": 1036 + }, + { + "epoch": 2.1705913134484565, + "grad_norm": 11.612716674804688, + "learning_rate": 9.39142134594123e-08, + "logits/chosen": 2.375, + "logits/rejected": 2.40625, + "logps/chosen": -588.0, + "logps/rejected": -364.0, + "loss": 0.6297, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.375, + "rewards/margins": 0.107421875, + "rewards/rejected": -1.484375, + "step": 1037 + }, + { + "epoch": 2.172684458398744, + "grad_norm": 11.497761726379395, + "learning_rate": 9.34704826661082e-08, + "logits/chosen": 1.28125, + "logits/rejected": 1.8203125, + "logps/chosen": -414.0, + "logps/rejected": -496.0, + "loss": 0.5917, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.421875, + "rewards/margins": 0.515625, + "rewards/rejected": -1.9375, + "step": 1038 + }, + { + "epoch": 2.174777603349032, + "grad_norm": 11.313504219055176, + "learning_rate": 9.302756204587662e-08, + "logits/chosen": 0.98046875, + "logits/rejected": 1.3984375, + "logps/chosen": -240.0, + "logps/rejected": -246.0, + "loss": 0.5883, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.109375, + "rewards/margins": 0.189453125, + "rewards/rejected": -1.3046875, + "step": 1039 + }, + { + "epoch": 2.17687074829932, + "grad_norm": 12.031342506408691, + "learning_rate": 9.25854538908413e-08, + "logits/chosen": 2.015625, + "logits/rejected": 2.0625, + "logps/chosen": -616.0, + "logps/rejected": -576.0, + "loss": 0.5908, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.28125, + "rewards/margins": 0.212890625, + "rewards/rejected": -1.5, + "step": 1040 + }, + { + "epoch": 2.1789638932496076, + "grad_norm": 11.418907165527344, + "learning_rate": 9.214416048892185e-08, + "logits/chosen": 1.4921875, + "logits/rejected": 1.2734375, + "logps/chosen": -336.0, + "logps/rejected": -382.0, + "loss": 0.6313, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.203125, + "rewards/margins": 0.125, + "rewards/rejected": -1.328125, + "step": 1041 + }, + { + "epoch": 2.1810570381998953, + "grad_norm": 11.404754638671875, + "learning_rate": 9.170368412382117e-08, + "logits/chosen": 2.171875, + "logits/rejected": 1.71875, + "logps/chosen": -384.0, + "logps/rejected": -624.0, + "loss": 0.5772, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.625, + "rewards/margins": 0.234375, + "rewards/rejected": -1.859375, + "step": 1042 + }, + { + "epoch": 2.183150183150183, + "grad_norm": 12.32582950592041, + "learning_rate": 9.126402707501426e-08, + "logits/chosen": 2.46875, + "logits/rejected": 3.390625, + "logps/chosen": -576.0, + "logps/rejected": -360.0, + "loss": 0.5829, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.375, + "rewards/margins": 0.146484375, + "rewards/rejected": -1.5234375, + "step": 1043 + }, + { + "epoch": 2.185243328100471, + "grad_norm": 11.554696083068848, + "learning_rate": 9.08251916177361e-08, + "logits/chosen": 1.796875, + "logits/rejected": 2.0625, + "logps/chosen": -239.0, + "logps/rejected": -284.0, + "loss": 0.6263, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.423828125, + "rewards/rejected": -1.5546875, + "step": 1044 + }, + { + "epoch": 2.1873364730507587, + "grad_norm": 13.371793746948242, + "learning_rate": 9.038718002296962e-08, + "logits/chosen": 2.8125, + "logits/rejected": 2.671875, + "logps/chosen": -408.0, + "logps/rejected": -456.0, + "loss": 0.575, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1015625, + "rewards/margins": 0.36328125, + "rewards/rejected": -1.4609375, + "step": 1045 + }, + { + "epoch": 2.1894296180010464, + "grad_norm": 10.569348335266113, + "learning_rate": 8.994999455743467e-08, + "logits/chosen": 1.71875, + "logits/rejected": 1.671875, + "logps/chosen": -406.0, + "logps/rejected": -440.0, + "loss": 0.5697, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4375, + "rewards/margins": 0.328125, + "rewards/rejected": -1.765625, + "step": 1046 + }, + { + "epoch": 2.1915227629513345, + "grad_norm": 10.752891540527344, + "learning_rate": 8.951363748357547e-08, + "logits/chosen": 0.55859375, + "logits/rejected": 1.203125, + "logps/chosen": -205.0, + "logps/rejected": -202.0, + "loss": 0.5561, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.171875, + "rewards/margins": 0.224609375, + "rewards/rejected": -1.3984375, + "step": 1047 + }, + { + "epoch": 2.193615907901622, + "grad_norm": 10.378030776977539, + "learning_rate": 8.907811105954968e-08, + "logits/chosen": 1.640625, + "logits/rejected": 1.8046875, + "logps/chosen": -486.0, + "logps/rejected": -636.0, + "loss": 0.5671, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.140625, + "rewards/margins": 0.703125, + "rewards/rejected": -1.84375, + "step": 1048 + }, + { + "epoch": 2.19570905285191, + "grad_norm": 11.485549926757812, + "learning_rate": 8.864341753921596e-08, + "logits/chosen": 1.1875, + "logits/rejected": 1.90625, + "logps/chosen": -360.0, + "logps/rejected": -376.0, + "loss": 0.5675, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.421875, + "rewards/margins": 0.35546875, + "rewards/rejected": -1.78125, + "step": 1049 + }, + { + "epoch": 2.197802197802198, + "grad_norm": 12.059419631958008, + "learning_rate": 8.820955917212295e-08, + "logits/chosen": 1.6796875, + "logits/rejected": 1.9296875, + "logps/chosen": -508.0, + "logps/rejected": -580.0, + "loss": 0.6126, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8515625, + "rewards/margins": 0.50390625, + "rewards/rejected": -1.359375, + "step": 1050 + }, + { + "epoch": 2.1998953427524857, + "grad_norm": 10.468564987182617, + "learning_rate": 8.777653820349714e-08, + "logits/chosen": 1.8125, + "logits/rejected": 1.6875, + "logps/chosen": -368.0, + "logps/rejected": -544.0, + "loss": 0.5885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9921875, + "rewards/margins": 0.9296875, + "rewards/rejected": -1.921875, + "step": 1051 + }, + { + "epoch": 2.2019884877027733, + "grad_norm": 11.059528350830078, + "learning_rate": 8.734435687423162e-08, + "logits/chosen": 1.96875, + "logits/rejected": 0.75, + "logps/chosen": -235.0, + "logps/rejected": -372.0, + "loss": 0.6141, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.546875, + "rewards/margins": -0.056640625, + "rewards/rejected": -1.484375, + "step": 1052 + }, + { + "epoch": 2.204081632653061, + "grad_norm": 11.069193840026855, + "learning_rate": 8.691301742087442e-08, + "logits/chosen": 2.359375, + "logits/rejected": 2.125, + "logps/chosen": -426.0, + "logps/rejected": -460.0, + "loss": 0.6007, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3203125, + "rewards/margins": 0.27734375, + "rewards/rejected": -1.6015625, + "step": 1053 + }, + { + "epoch": 2.206174777603349, + "grad_norm": 11.647639274597168, + "learning_rate": 8.648252207561646e-08, + "logits/chosen": 2.34375, + "logits/rejected": 2.421875, + "logps/chosen": -438.0, + "logps/rejected": -548.0, + "loss": 0.591, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6484375, + "rewards/margins": 0.1826171875, + "rewards/rejected": -1.8359375, + "step": 1054 + }, + { + "epoch": 2.208267922553637, + "grad_norm": 12.953960418701172, + "learning_rate": 8.605287306628074e-08, + "logits/chosen": 2.5, + "logits/rejected": 1.8984375, + "logps/chosen": -404.0, + "logps/rejected": -572.0, + "loss": 0.6395, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.375, + "rewards/margins": 0.0546875, + "rewards/rejected": -1.4296875, + "step": 1055 + }, + { + "epoch": 2.2103610675039245, + "grad_norm": 12.250329971313477, + "learning_rate": 8.562407261631043e-08, + "logits/chosen": 2.203125, + "logits/rejected": 1.359375, + "logps/chosen": -384.0, + "logps/rejected": -468.0, + "loss": 0.5657, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1015625, + "rewards/margins": 0.453125, + "rewards/rejected": -1.5546875, + "step": 1056 + }, + { + "epoch": 2.2124542124542126, + "grad_norm": 11.850432395935059, + "learning_rate": 8.519612294475724e-08, + "logits/chosen": 2.046875, + "logits/rejected": 1.9375, + "logps/chosen": -336.0, + "logps/rejected": -456.0, + "loss": 0.617, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.609375, + "rewards/margins": -0.0546875, + "rewards/rejected": -1.5546875, + "step": 1057 + }, + { + "epoch": 2.2145473574045003, + "grad_norm": 11.546448707580566, + "learning_rate": 8.476902626626997e-08, + "logits/chosen": 1.859375, + "logits/rejected": 1.5703125, + "logps/chosen": -388.0, + "logps/rejected": -400.0, + "loss": 0.5888, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.359375, + "rewards/margins": 0.177734375, + "rewards/rejected": -1.53125, + "step": 1058 + }, + { + "epoch": 2.216640502354788, + "grad_norm": 11.809260368347168, + "learning_rate": 8.434278479108352e-08, + "logits/chosen": 1.46875, + "logits/rejected": 1.859375, + "logps/chosen": -416.0, + "logps/rejected": -440.0, + "loss": 0.6061, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.099609375, + "rewards/rejected": -1.296875, + "step": 1059 + }, + { + "epoch": 2.218733647305076, + "grad_norm": 10.582293510437012, + "learning_rate": 8.39174007250069e-08, + "logits/chosen": 2.09375, + "logits/rejected": 2.421875, + "logps/chosen": -616.0, + "logps/rejected": -438.0, + "loss": 0.549, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.390625, + "rewards/margins": 0.16015625, + "rewards/rejected": -1.546875, + "step": 1060 + }, + { + "epoch": 2.2208267922553637, + "grad_norm": 11.779565811157227, + "learning_rate": 8.349287626941198e-08, + "logits/chosen": 2.375, + "logits/rejected": 2.78125, + "logps/chosen": -624.0, + "logps/rejected": -480.0, + "loss": 0.6026, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4609375, + "rewards/margins": 0.126953125, + "rewards/rejected": -1.5859375, + "step": 1061 + }, + { + "epoch": 2.2229199372056514, + "grad_norm": 11.7490873336792, + "learning_rate": 8.306921362122195e-08, + "logits/chosen": 2.25, + "logits/rejected": 2.34375, + "logps/chosen": -544.0, + "logps/rejected": -660.0, + "loss": 0.5549, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5234375, + "rewards/margins": 0.5546875, + "rewards/rejected": -2.078125, + "step": 1062 + }, + { + "epoch": 2.2250130821559395, + "grad_norm": 11.424396514892578, + "learning_rate": 8.264641497290072e-08, + "logits/chosen": 1.828125, + "logits/rejected": 2.296875, + "logps/chosen": -468.0, + "logps/rejected": -476.0, + "loss": 0.5615, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.2314453125, + "rewards/rejected": -1.375, + "step": 1063 + }, + { + "epoch": 2.227106227106227, + "grad_norm": 10.509209632873535, + "learning_rate": 8.22244825124404e-08, + "logits/chosen": 2.375, + "logits/rejected": 2.265625, + "logps/chosen": -430.0, + "logps/rejected": -426.0, + "loss": 0.5448, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.3203125, + "rewards/margins": -0.0625, + "rewards/rejected": -1.2578125, + "step": 1064 + }, + { + "epoch": 2.229199372056515, + "grad_norm": 10.109917640686035, + "learning_rate": 8.18034184233507e-08, + "logits/chosen": 1.8046875, + "logits/rejected": 2.03125, + "logps/chosen": -442.0, + "logps/rejected": -434.0, + "loss": 0.5692, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5234375, + "rewards/margins": -0.03125, + "rewards/rejected": -1.4921875, + "step": 1065 + }, + { + "epoch": 2.2312925170068025, + "grad_norm": 11.283658027648926, + "learning_rate": 8.13832248846476e-08, + "logits/chosen": 2.03125, + "logits/rejected": 2.515625, + "logps/chosen": -474.0, + "logps/rejected": -612.0, + "loss": 0.6121, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.453125, + "rewards/margins": 1.1015625, + "rewards/rejected": -2.5625, + "step": 1066 + }, + { + "epoch": 2.2333856619570907, + "grad_norm": 12.197108268737793, + "learning_rate": 8.0963904070842e-08, + "logits/chosen": 1.84375, + "logits/rejected": 2.71875, + "logps/chosen": -648.0, + "logps/rejected": -478.0, + "loss": 0.6114, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4765625, + "rewards/margins": 0.75, + "rewards/rejected": -2.21875, + "step": 1067 + }, + { + "epoch": 2.2354788069073783, + "grad_norm": 11.025632858276367, + "learning_rate": 8.054545815192828e-08, + "logits/chosen": 0.71484375, + "logits/rejected": 0.984375, + "logps/chosen": -274.0, + "logps/rejected": -215.0, + "loss": 0.5513, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.375, + "rewards/margins": -0.26171875, + "rewards/rejected": -1.109375, + "step": 1068 + }, + { + "epoch": 2.237571951857666, + "grad_norm": 12.587658882141113, + "learning_rate": 8.01278892933731e-08, + "logits/chosen": 1.8203125, + "logits/rejected": 2.4375, + "logps/chosen": -484.0, + "logps/rejected": -410.0, + "loss": 0.6006, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.15625, + "rewards/margins": 0.341796875, + "rewards/rejected": -1.5, + "step": 1069 + }, + { + "epoch": 2.239665096807954, + "grad_norm": 11.071043968200684, + "learning_rate": 7.971119965610481e-08, + "logits/chosen": 1.765625, + "logits/rejected": 1.4296875, + "logps/chosen": -410.0, + "logps/rejected": -688.0, + "loss": 0.5699, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.6640625, + "rewards/margins": 0.73046875, + "rewards/rejected": -2.390625, + "step": 1070 + }, + { + "epoch": 2.241758241758242, + "grad_norm": 11.76019287109375, + "learning_rate": 7.929539139650132e-08, + "logits/chosen": 1.90625, + "logits/rejected": 2.75, + "logps/chosen": -460.0, + "logps/rejected": -516.0, + "loss": 0.6034, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.375, + "rewards/margins": 0.56640625, + "rewards/rejected": -1.9375, + "step": 1071 + }, + { + "epoch": 2.2438513867085295, + "grad_norm": 10.587785720825195, + "learning_rate": 7.888046666637941e-08, + "logits/chosen": 2.234375, + "logits/rejected": 2.3125, + "logps/chosen": -616.0, + "logps/rejected": -536.0, + "loss": 0.5602, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.515625, + "rewards/margins": 0.34375, + "rewards/rejected": -1.859375, + "step": 1072 + }, + { + "epoch": 2.2459445316588176, + "grad_norm": 10.504775047302246, + "learning_rate": 7.846642761298378e-08, + "logits/chosen": 2.296875, + "logits/rejected": 1.6640625, + "logps/chosen": -334.0, + "logps/rejected": -656.0, + "loss": 0.5856, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.578125, + "rewards/margins": 0.5390625, + "rewards/rejected": -2.125, + "step": 1073 + }, + { + "epoch": 2.2480376766091053, + "grad_norm": 11.749526023864746, + "learning_rate": 7.805327637897571e-08, + "logits/chosen": 2.1875, + "logits/rejected": 3.03125, + "logps/chosen": -596.0, + "logps/rejected": -430.0, + "loss": 0.5889, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.234375, + "rewards/margins": 0.138671875, + "rewards/rejected": -1.3671875, + "step": 1074 + }, + { + "epoch": 2.250130821559393, + "grad_norm": 11.638836860656738, + "learning_rate": 7.764101510242188e-08, + "logits/chosen": 1.53125, + "logits/rejected": 1.7578125, + "logps/chosen": -252.0, + "logps/rejected": -296.0, + "loss": 0.5808, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.65625, + "rewards/margins": 0.123046875, + "rewards/rejected": -1.78125, + "step": 1075 + }, + { + "epoch": 2.252223966509681, + "grad_norm": 11.232507705688477, + "learning_rate": 7.722964591678327e-08, + "logits/chosen": 2.828125, + "logits/rejected": 2.453125, + "logps/chosen": -428.0, + "logps/rejected": -504.0, + "loss": 0.5817, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4453125, + "rewards/margins": -0.0771484375, + "rewards/rejected": -1.375, + "step": 1076 + }, + { + "epoch": 2.2543171114599687, + "grad_norm": 10.541213035583496, + "learning_rate": 7.681917095090483e-08, + "logits/chosen": 1.65625, + "logits/rejected": 1.6484375, + "logps/chosen": -390.0, + "logps/rejected": -350.0, + "loss": 0.5898, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.3359375, + "rewards/margins": 0.0009765625, + "rewards/rejected": -1.3359375, + "step": 1077 + }, + { + "epoch": 2.2564102564102564, + "grad_norm": 10.962115287780762, + "learning_rate": 7.640959232900337e-08, + "logits/chosen": 2.234375, + "logits/rejected": 2.625, + "logps/chosen": -628.0, + "logps/rejected": -338.0, + "loss": 0.5776, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3984375, + "rewards/margins": 0.25390625, + "rewards/rejected": -1.6484375, + "step": 1078 + }, + { + "epoch": 2.258503401360544, + "grad_norm": 11.30371379852295, + "learning_rate": 7.600091217065716e-08, + "logits/chosen": 1.2265625, + "logits/rejected": 0.98046875, + "logps/chosen": -360.0, + "logps/rejected": -460.0, + "loss": 0.5604, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.265625, + "rewards/margins": 0.173828125, + "rewards/rejected": -1.4375, + "step": 1079 + }, + { + "epoch": 2.260596546310832, + "grad_norm": 12.675722122192383, + "learning_rate": 7.559313259079511e-08, + "logits/chosen": 1.546875, + "logits/rejected": 2.875, + "logps/chosen": -556.0, + "logps/rejected": -332.0, + "loss": 0.621, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1796875, + "rewards/margins": 0.60546875, + "rewards/rejected": -1.78125, + "step": 1080 + }, + { + "epoch": 2.26268969126112, + "grad_norm": 11.507694244384766, + "learning_rate": 7.518625569968563e-08, + "logits/chosen": 0.78515625, + "logits/rejected": 1.7578125, + "logps/chosen": -298.0, + "logps/rejected": -284.0, + "loss": 0.6126, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.046875, + "rewards/margins": 0.54296875, + "rewards/rejected": -1.59375, + "step": 1081 + }, + { + "epoch": 2.2647828362114075, + "grad_norm": 10.317126274108887, + "learning_rate": 7.478028360292546e-08, + "logits/chosen": 0.9296875, + "logits/rejected": 0.94140625, + "logps/chosen": -364.0, + "logps/rejected": -332.0, + "loss": 0.5658, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6015625, + "rewards/margins": -0.150390625, + "rewards/rejected": -1.453125, + "step": 1082 + }, + { + "epoch": 2.2668759811616956, + "grad_norm": 11.577229499816895, + "learning_rate": 7.437521840142908e-08, + "logits/chosen": 1.515625, + "logits/rejected": 2.09375, + "logps/chosen": -442.0, + "logps/rejected": -408.0, + "loss": 0.6232, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.578125, + "rewards/margins": 0.16796875, + "rewards/rejected": -1.7421875, + "step": 1083 + }, + { + "epoch": 2.2689691261119833, + "grad_norm": 11.189661979675293, + "learning_rate": 7.397106219141791e-08, + "logits/chosen": 2.359375, + "logits/rejected": 1.5859375, + "logps/chosen": -456.0, + "logps/rejected": -500.0, + "loss": 0.5966, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.375, + "rewards/margins": 0.46484375, + "rewards/rejected": -1.84375, + "step": 1084 + }, + { + "epoch": 2.271062271062271, + "grad_norm": 10.638973236083984, + "learning_rate": 7.356781706440928e-08, + "logits/chosen": 1.7421875, + "logits/rejected": 2.3125, + "logps/chosen": -576.0, + "logps/rejected": -490.0, + "loss": 0.5327, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3515625, + "rewards/margins": 0.435546875, + "rewards/rejected": -1.7890625, + "step": 1085 + }, + { + "epoch": 2.2731554160125587, + "grad_norm": 11.273877143859863, + "learning_rate": 7.316548510720549e-08, + "logits/chosen": 2.671875, + "logits/rejected": 1.8125, + "logps/chosen": -464.0, + "logps/rejected": -516.0, + "loss": 0.571, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.359375, + "rewards/margins": 0.431640625, + "rewards/rejected": -1.7890625, + "step": 1086 + }, + { + "epoch": 2.2752485609628468, + "grad_norm": 11.14000129699707, + "learning_rate": 7.276406840188328e-08, + "logits/chosen": 1.4609375, + "logits/rejected": 1.578125, + "logps/chosen": -488.0, + "logps/rejected": -588.0, + "loss": 0.5683, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5546875, + "rewards/margins": 0.02734375, + "rewards/rejected": -1.578125, + "step": 1087 + }, + { + "epoch": 2.2773417059131345, + "grad_norm": 13.463235855102539, + "learning_rate": 7.236356902578304e-08, + "logits/chosen": 2.1875, + "logits/rejected": 2.3125, + "logps/chosen": -556.0, + "logps/rejected": -568.0, + "loss": 0.6012, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4921875, + "rewards/margins": 0.0615234375, + "rewards/rejected": -1.5546875, + "step": 1088 + }, + { + "epoch": 2.279434850863422, + "grad_norm": 11.114773750305176, + "learning_rate": 7.196398905149775e-08, + "logits/chosen": 1.2734375, + "logits/rejected": 0.984375, + "logps/chosen": -274.0, + "logps/rejected": -348.0, + "loss": 0.5931, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.34375, + "rewards/rejected": -1.7734375, + "step": 1089 + }, + { + "epoch": 2.2815279958137102, + "grad_norm": 10.774499893188477, + "learning_rate": 7.156533054686264e-08, + "logits/chosen": 2.171875, + "logits/rejected": 2.09375, + "logps/chosen": -512.0, + "logps/rejected": -402.0, + "loss": 0.548, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.140625, + "rewards/margins": 0.2099609375, + "rewards/rejected": -1.34375, + "step": 1090 + }, + { + "epoch": 2.283621140763998, + "grad_norm": 11.815675735473633, + "learning_rate": 7.116759557494416e-08, + "logits/chosen": 0.9765625, + "logits/rejected": 1.65625, + "logps/chosen": -504.0, + "logps/rejected": -338.0, + "loss": 0.6095, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.390625, + "rewards/margins": 0.119140625, + "rewards/rejected": -1.5078125, + "step": 1091 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 11.589235305786133, + "learning_rate": 7.077078619402966e-08, + "logits/chosen": 2.78125, + "logits/rejected": 2.0625, + "logps/chosen": -608.0, + "logps/rejected": -752.0, + "loss": 0.5925, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.265625, + "rewards/margins": 0.26171875, + "rewards/rejected": -1.53125, + "step": 1092 + }, + { + "epoch": 2.2878074306645737, + "grad_norm": 11.489415168762207, + "learning_rate": 7.037490445761629e-08, + "logits/chosen": 1.7734375, + "logits/rejected": 2.65625, + "logps/chosen": -604.0, + "logps/rejected": -540.0, + "loss": 0.612, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.5, + "rewards/rejected": -1.6328125, + "step": 1093 + }, + { + "epoch": 2.2899005756148614, + "grad_norm": 11.35988712310791, + "learning_rate": 6.997995241440086e-08, + "logits/chosen": 2.78125, + "logits/rejected": 2.71875, + "logps/chosen": -1120.0, + "logps/rejected": -828.0, + "loss": 0.5576, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0078125, + "rewards/margins": 0.2890625, + "rewards/rejected": -1.296875, + "step": 1094 + }, + { + "epoch": 2.291993720565149, + "grad_norm": 11.50139331817627, + "learning_rate": 6.958593210826879e-08, + "logits/chosen": 1.5390625, + "logits/rejected": 1.46875, + "logps/chosen": -392.0, + "logps/rejected": -406.0, + "loss": 0.598, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3125, + "rewards/margins": 0.361328125, + "rewards/rejected": -1.671875, + "step": 1095 + }, + { + "epoch": 2.294086865515437, + "grad_norm": 11.497011184692383, + "learning_rate": 6.919284557828384e-08, + "logits/chosen": 2.65625, + "logits/rejected": 3.0, + "logps/chosen": -498.0, + "logps/rejected": -432.0, + "loss": 0.5669, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.53125, + "rewards/margins": 0.271484375, + "rewards/rejected": -1.796875, + "step": 1096 + }, + { + "epoch": 2.296180010465725, + "grad_norm": 11.839705467224121, + "learning_rate": 6.88006948586776e-08, + "logits/chosen": 2.0, + "logits/rejected": 1.9375, + "logps/chosen": -326.0, + "logps/rejected": -400.0, + "loss": 0.5985, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.484375, + "rewards/margins": 0.62109375, + "rewards/rejected": -2.09375, + "step": 1097 + }, + { + "epoch": 2.2982731554160125, + "grad_norm": 12.152945518493652, + "learning_rate": 6.840948197883847e-08, + "logits/chosen": 1.484375, + "logits/rejected": 1.21875, + "logps/chosen": -362.0, + "logps/rejected": -422.0, + "loss": 0.5717, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.171875, + "rewards/margins": 0.51171875, + "rewards/rejected": -1.6796875, + "step": 1098 + }, + { + "epoch": 2.3003663003663, + "grad_norm": 11.379349708557129, + "learning_rate": 6.80192089633019e-08, + "logits/chosen": 1.34375, + "logits/rejected": 2.0, + "logps/chosen": -466.0, + "logps/rejected": -424.0, + "loss": 0.6013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.234375, + "rewards/margins": 0.54296875, + "rewards/rejected": -1.7734375, + "step": 1099 + }, + { + "epoch": 2.3024594453165883, + "grad_norm": 11.67951488494873, + "learning_rate": 6.762987783173914e-08, + "logits/chosen": 2.8125, + "logits/rejected": 2.71875, + "logps/chosen": -772.0, + "logps/rejected": -464.0, + "loss": 0.576, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4140625, + "rewards/margins": 0.216796875, + "rewards/rejected": -1.625, + "step": 1100 + }, + { + "epoch": 2.304552590266876, + "grad_norm": 11.653464317321777, + "learning_rate": 6.724149059894758e-08, + "logits/chosen": 1.9375, + "logits/rejected": 2.671875, + "logps/chosen": -392.0, + "logps/rejected": -320.0, + "loss": 0.5662, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.515625, + "rewards/rejected": -1.6640625, + "step": 1101 + }, + { + "epoch": 2.3066457352171637, + "grad_norm": 10.961610794067383, + "learning_rate": 6.685404927483948e-08, + "logits/chosen": 2.046875, + "logits/rejected": 2.28125, + "logps/chosen": -728.0, + "logps/rejected": -516.0, + "loss": 0.5507, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.328125, + "rewards/margins": 0.080078125, + "rewards/rejected": -1.40625, + "step": 1102 + }, + { + "epoch": 2.3087388801674518, + "grad_norm": 12.406999588012695, + "learning_rate": 6.646755586443231e-08, + "logits/chosen": 2.375, + "logits/rejected": 3.78125, + "logps/chosen": -540.0, + "logps/rejected": -332.0, + "loss": 0.6069, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0625, + "rewards/margins": 0.2158203125, + "rewards/rejected": -1.28125, + "step": 1103 + }, + { + "epoch": 2.3108320251177394, + "grad_norm": 13.117595672607422, + "learning_rate": 6.60820123678381e-08, + "logits/chosen": 2.546875, + "logits/rejected": 3.328125, + "logps/chosen": -772.0, + "logps/rejected": -532.0, + "loss": 0.5875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5546875, + "rewards/margins": 0.55859375, + "rewards/rejected": -2.109375, + "step": 1104 + }, + { + "epoch": 2.312925170068027, + "grad_norm": 11.685809135437012, + "learning_rate": 6.56974207802528e-08, + "logits/chosen": 2.109375, + "logits/rejected": 2.046875, + "logps/chosen": -492.0, + "logps/rejected": -336.0, + "loss": 0.5702, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1015625, + "rewards/margins": 0.392578125, + "rewards/rejected": -1.5, + "step": 1105 + }, + { + "epoch": 2.315018315018315, + "grad_norm": 10.609480857849121, + "learning_rate": 6.531378309194625e-08, + "logits/chosen": 1.1328125, + "logits/rejected": 1.515625, + "logps/chosen": -394.0, + "logps/rejected": -248.0, + "loss": 0.5857, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3203125, + "rewards/margins": 0.234375, + "rewards/rejected": -1.5546875, + "step": 1106 + }, + { + "epoch": 2.317111459968603, + "grad_norm": 11.093987464904785, + "learning_rate": 6.493110128825207e-08, + "logits/chosen": 2.34375, + "logits/rejected": 2.40625, + "logps/chosen": -418.0, + "logps/rejected": -344.0, + "loss": 0.5887, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.453125, + "rewards/margins": -0.3125, + "rewards/rejected": -1.140625, + "step": 1107 + }, + { + "epoch": 2.3192046049188906, + "grad_norm": 11.48231029510498, + "learning_rate": 6.454937734955702e-08, + "logits/chosen": 2.265625, + "logits/rejected": 2.328125, + "logps/chosen": -600.0, + "logps/rejected": -494.0, + "loss": 0.5699, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.53125, + "rewards/margins": -0.0576171875, + "rewards/rejected": -1.4765625, + "step": 1108 + }, + { + "epoch": 2.3212977498691783, + "grad_norm": 11.149823188781738, + "learning_rate": 6.416861325129081e-08, + "logits/chosen": 2.4375, + "logits/rejected": 2.296875, + "logps/chosen": -544.0, + "logps/rejected": -506.0, + "loss": 0.5575, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.265625, + "rewards/margins": 0.52734375, + "rewards/rejected": -1.796875, + "step": 1109 + }, + { + "epoch": 2.3233908948194664, + "grad_norm": 12.775769233703613, + "learning_rate": 6.378881096391602e-08, + "logits/chosen": 1.859375, + "logits/rejected": 1.984375, + "logps/chosen": -398.0, + "logps/rejected": -470.0, + "loss": 0.5861, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.59375, + "rewards/margins": -0.033203125, + "rewards/rejected": -1.5625, + "step": 1110 + }, + { + "epoch": 2.325484039769754, + "grad_norm": 10.973607063293457, + "learning_rate": 6.340997245291798e-08, + "logits/chosen": 1.9296875, + "logits/rejected": 2.4375, + "logps/chosen": -488.0, + "logps/rejected": -452.0, + "loss": 0.5227, + "rewards/accuracies": 0.0, + "rewards/chosen": -1.546875, + "rewards/margins": -0.19921875, + "rewards/rejected": -1.3515625, + "step": 1111 + }, + { + "epoch": 2.3275771847200417, + "grad_norm": 12.10044002532959, + "learning_rate": 6.303209967879422e-08, + "logits/chosen": 1.78125, + "logits/rejected": 2.21875, + "logps/chosen": -536.0, + "logps/rejected": -584.0, + "loss": 0.5664, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.46875, + "rewards/margins": 0.404296875, + "rewards/rejected": -1.8671875, + "step": 1112 + }, + { + "epoch": 2.32967032967033, + "grad_norm": 12.980206489562988, + "learning_rate": 6.26551945970446e-08, + "logits/chosen": 1.6875, + "logits/rejected": 2.046875, + "logps/chosen": -492.0, + "logps/rejected": -412.0, + "loss": 0.6242, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.423828125, + "rewards/rejected": -1.640625, + "step": 1113 + }, + { + "epoch": 2.3317634746206175, + "grad_norm": 10.602160453796387, + "learning_rate": 6.22792591581613e-08, + "logits/chosen": 2.875, + "logits/rejected": 2.8125, + "logps/chosen": -540.0, + "logps/rejected": -568.0, + "loss": 0.5601, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.703125, + "rewards/margins": 0.048828125, + "rewards/rejected": -1.75, + "step": 1114 + }, + { + "epoch": 2.333856619570905, + "grad_norm": 12.55278205871582, + "learning_rate": 6.190429530761851e-08, + "logits/chosen": 1.1875, + "logits/rejected": 1.5, + "logps/chosen": -384.0, + "logps/rejected": -236.0, + "loss": 0.633, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.46875, + "rewards/margins": -0.0048828125, + "rewards/rejected": -1.46875, + "step": 1115 + }, + { + "epoch": 2.3359497645211933, + "grad_norm": 11.216866493225098, + "learning_rate": 6.153030498586239e-08, + "logits/chosen": 1.671875, + "logits/rejected": 1.671875, + "logps/chosen": -318.0, + "logps/rejected": -358.0, + "loss": 0.5519, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1015625, + "rewards/margins": 0.640625, + "rewards/rejected": -1.75, + "step": 1116 + }, + { + "epoch": 2.338042909471481, + "grad_norm": 11.164164543151855, + "learning_rate": 6.115729012830089e-08, + "logits/chosen": 1.171875, + "logits/rejected": 0.64453125, + "logps/chosen": -328.0, + "logps/rejected": -520.0, + "loss": 0.5421, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.640625, + "rewards/margins": 0.5078125, + "rewards/rejected": -2.15625, + "step": 1117 + }, + { + "epoch": 2.3401360544217686, + "grad_norm": 11.40911865234375, + "learning_rate": 6.078525266529446e-08, + "logits/chosen": 1.296875, + "logits/rejected": 0.609375, + "logps/chosen": -244.0, + "logps/rejected": -372.0, + "loss": 0.5939, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3359375, + "rewards/margins": 0.33984375, + "rewards/rejected": -1.671875, + "step": 1118 + }, + { + "epoch": 2.3422291993720563, + "grad_norm": 10.803315162658691, + "learning_rate": 6.041419452214497e-08, + "logits/chosen": 1.375, + "logits/rejected": 1.8203125, + "logps/chosen": -424.0, + "logps/rejected": -334.0, + "loss": 0.5617, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.1533203125, + "rewards/rejected": -1.34375, + "step": 1119 + }, + { + "epoch": 2.3443223443223444, + "grad_norm": 10.840127944946289, + "learning_rate": 6.00441176190864e-08, + "logits/chosen": 1.6484375, + "logits/rejected": 1.46875, + "logps/chosen": -406.0, + "logps/rejected": -528.0, + "loss": 0.5366, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.703125, + "rewards/margins": 0.4453125, + "rewards/rejected": -2.15625, + "step": 1120 + }, + { + "epoch": 2.346415489272632, + "grad_norm": 11.171231269836426, + "learning_rate": 5.967502387127494e-08, + "logits/chosen": 1.171875, + "logits/rejected": 1.3046875, + "logps/chosen": -332.0, + "logps/rejected": -344.0, + "loss": 0.5896, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.453125, + "rewards/margins": 0.474609375, + "rewards/rejected": -1.9296875, + "step": 1121 + }, + { + "epoch": 2.3485086342229198, + "grad_norm": 11.783544540405273, + "learning_rate": 5.930691518877897e-08, + "logits/chosen": 1.6875, + "logits/rejected": 1.640625, + "logps/chosen": -446.0, + "logps/rejected": -440.0, + "loss": 0.5661, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.421875, + "rewards/margins": 0.271484375, + "rewards/rejected": -1.6953125, + "step": 1122 + }, + { + "epoch": 2.350601779173208, + "grad_norm": 12.148418426513672, + "learning_rate": 5.8939793476568814e-08, + "logits/chosen": 2.203125, + "logits/rejected": 2.4375, + "logps/chosen": -520.0, + "logps/rejected": -752.0, + "loss": 0.5712, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.46875, + "rewards/margins": 0.384765625, + "rewards/rejected": -1.8515625, + "step": 1123 + }, + { + "epoch": 2.3526949241234956, + "grad_norm": 12.935588836669922, + "learning_rate": 5.857366063450755e-08, + "logits/chosen": 1.984375, + "logits/rejected": 2.515625, + "logps/chosen": -528.0, + "logps/rejected": -474.0, + "loss": 0.6003, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5, + "rewards/margins": 0.2265625, + "rewards/rejected": -1.71875, + "step": 1124 + }, + { + "epoch": 2.3547880690737832, + "grad_norm": 11.099451065063477, + "learning_rate": 5.8208518557340725e-08, + "logits/chosen": 0.9375, + "logits/rejected": 2.4375, + "logps/chosen": -772.0, + "logps/rejected": -492.0, + "loss": 0.5653, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3828125, + "rewards/margins": 0.5703125, + "rewards/rejected": -1.953125, + "step": 1125 + }, + { + "epoch": 2.3568812140240714, + "grad_norm": 11.670310974121094, + "learning_rate": 5.784436913468656e-08, + "logits/chosen": 1.5390625, + "logits/rejected": 1.125, + "logps/chosen": -278.0, + "logps/rejected": -378.0, + "loss": 0.6023, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.453125, + "rewards/margins": 0.60546875, + "rewards/rejected": -2.0625, + "step": 1126 + }, + { + "epoch": 2.358974358974359, + "grad_norm": 11.362000465393066, + "learning_rate": 5.7481214251026286e-08, + "logits/chosen": 2.78125, + "logits/rejected": 2.71875, + "logps/chosen": -400.0, + "logps/rejected": -446.0, + "loss": 0.5735, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2578125, + "rewards/margins": 0.59375, + "rewards/rejected": -1.8515625, + "step": 1127 + }, + { + "epoch": 2.3610675039246467, + "grad_norm": 11.208846092224121, + "learning_rate": 5.7119055785694426e-08, + "logits/chosen": 2.03125, + "logits/rejected": 1.71875, + "logps/chosen": -316.0, + "logps/rejected": -552.0, + "loss": 0.5515, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.203125, + "rewards/margins": 0.365234375, + "rewards/rejected": -1.5703125, + "step": 1128 + }, + { + "epoch": 2.363160648874935, + "grad_norm": 11.73038387298584, + "learning_rate": 5.675789561286913e-08, + "logits/chosen": 1.6640625, + "logits/rejected": 2.65625, + "logps/chosen": -464.0, + "logps/rejected": -280.0, + "loss": 0.5804, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5234375, + "rewards/margins": 0.060546875, + "rewards/rejected": -1.578125, + "step": 1129 + }, + { + "epoch": 2.3652537938252225, + "grad_norm": 10.981866836547852, + "learning_rate": 5.639773560156211e-08, + "logits/chosen": 2.5, + "logits/rejected": 1.9921875, + "logps/chosen": -828.0, + "logps/rejected": -892.0, + "loss": 0.5706, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0859375, + "rewards/margins": 0.921875, + "rewards/rejected": -2.0, + "step": 1130 + }, + { + "epoch": 2.36734693877551, + "grad_norm": 10.912532806396484, + "learning_rate": 5.6038577615609356e-08, + "logits/chosen": 2.359375, + "logits/rejected": 2.59375, + "logps/chosen": -532.0, + "logps/rejected": -428.0, + "loss": 0.5606, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.25, + "rewards/margins": 0.36328125, + "rewards/rejected": -1.6171875, + "step": 1131 + }, + { + "epoch": 2.369440083725798, + "grad_norm": 11.479551315307617, + "learning_rate": 5.5680423513661484e-08, + "logits/chosen": 2.28125, + "logits/rejected": 2.65625, + "logps/chosen": -544.0, + "logps/rejected": -500.0, + "loss": 0.5733, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2734375, + "rewards/margins": 0.125, + "rewards/rejected": -1.3984375, + "step": 1132 + }, + { + "epoch": 2.371533228676086, + "grad_norm": 11.486126899719238, + "learning_rate": 5.532327514917377e-08, + "logits/chosen": 2.171875, + "logits/rejected": 2.1875, + "logps/chosen": -688.0, + "logps/rejected": -418.0, + "loss": 0.5891, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3359375, + "rewards/margins": 0.369140625, + "rewards/rejected": -1.703125, + "step": 1133 + }, + { + "epoch": 2.3736263736263736, + "grad_norm": 11.69915771484375, + "learning_rate": 5.496713437039675e-08, + "logits/chosen": 2.6875, + "logits/rejected": 3.09375, + "logps/chosen": -480.0, + "logps/rejected": -464.0, + "loss": 0.6, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.28125, + "rewards/margins": 0.6640625, + "rewards/rejected": -1.9375, + "step": 1134 + }, + { + "epoch": 2.3757195185766613, + "grad_norm": 10.8568754196167, + "learning_rate": 5.461200302036689e-08, + "logits/chosen": 3.0, + "logits/rejected": 2.359375, + "logps/chosen": -440.0, + "logps/rejected": -656.0, + "loss": 0.6175, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.359375, + "rewards/rejected": -1.5078125, + "step": 1135 + }, + { + "epoch": 2.3778126635269494, + "grad_norm": 10.859973907470703, + "learning_rate": 5.4257882936896834e-08, + "logits/chosen": 0.388671875, + "logits/rejected": 0.380859375, + "logps/chosen": -193.0, + "logps/rejected": -226.0, + "loss": 0.574, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0703125, + "rewards/margins": 0.21875, + "rewards/rejected": -1.2890625, + "step": 1136 + }, + { + "epoch": 2.379905808477237, + "grad_norm": 11.677450180053711, + "learning_rate": 5.390477595256566e-08, + "logits/chosen": 1.5625, + "logits/rejected": 2.1875, + "logps/chosen": -528.0, + "logps/rejected": -464.0, + "loss": 0.5571, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4921875, + "rewards/margins": 0.69140625, + "rewards/rejected": -2.1875, + "step": 1137 + }, + { + "epoch": 2.3819989534275248, + "grad_norm": 11.75250244140625, + "learning_rate": 5.355268389470979e-08, + "logits/chosen": 2.328125, + "logits/rejected": 2.53125, + "logps/chosen": -680.0, + "logps/rejected": -396.0, + "loss": 0.5862, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.53125, + "rewards/margins": 0.048828125, + "rewards/rejected": -1.5859375, + "step": 1138 + }, + { + "epoch": 2.3840920983778124, + "grad_norm": 11.258225440979004, + "learning_rate": 5.320160858541352e-08, + "logits/chosen": 1.0859375, + "logits/rejected": 1.6328125, + "logps/chosen": -260.0, + "logps/rejected": -206.0, + "loss": 0.6036, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.15625, + "rewards/margins": 0.169921875, + "rewards/rejected": -1.328125, + "step": 1139 + }, + { + "epoch": 2.3861852433281006, + "grad_norm": 10.5697603225708, + "learning_rate": 5.285155184149918e-08, + "logits/chosen": 2.4375, + "logits/rejected": 3.21875, + "logps/chosen": -704.0, + "logps/rejected": -632.0, + "loss": 0.5534, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.82421875, + "rewards/margins": 0.828125, + "rewards/rejected": -1.6484375, + "step": 1140 + }, + { + "epoch": 2.3882783882783882, + "grad_norm": 10.948512077331543, + "learning_rate": 5.2502515474518105e-08, + "logits/chosen": 2.734375, + "logits/rejected": 2.125, + "logps/chosen": -498.0, + "logps/rejected": -640.0, + "loss": 0.5722, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.296875, + "rewards/margins": 0.390625, + "rewards/rejected": -1.6875, + "step": 1141 + }, + { + "epoch": 2.390371533228676, + "grad_norm": 10.868309020996094, + "learning_rate": 5.2154501290741196e-08, + "logits/chosen": 2.6875, + "logits/rejected": 2.3125, + "logps/chosen": -480.0, + "logps/rejected": -588.0, + "loss": 0.5845, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.9375, + "rewards/margins": -0.09375, + "rewards/rejected": -1.84375, + "step": 1142 + }, + { + "epoch": 2.392464678178964, + "grad_norm": 12.436022758483887, + "learning_rate": 5.180751109114958e-08, + "logits/chosen": 2.71875, + "logits/rejected": 3.015625, + "logps/chosen": -956.0, + "logps/rejected": -572.0, + "loss": 0.5957, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.109375, + "rewards/margins": -0.32421875, + "rewards/rejected": -1.7890625, + "step": 1143 + }, + { + "epoch": 2.3945578231292517, + "grad_norm": 11.055791854858398, + "learning_rate": 5.146154667142509e-08, + "logits/chosen": 2.1875, + "logits/rejected": 1.9296875, + "logps/chosen": -724.0, + "logps/rejected": -588.0, + "loss": 0.5652, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2265625, + "rewards/margins": 0.53125, + "rewards/rejected": -1.7578125, + "step": 1144 + }, + { + "epoch": 2.3966509680795394, + "grad_norm": 11.497798919677734, + "learning_rate": 5.1116609821941295e-08, + "logits/chosen": 1.5625, + "logits/rejected": 2.015625, + "logps/chosen": -444.0, + "logps/rejected": -258.0, + "loss": 0.5711, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.4765625, + "rewards/margins": -0.23828125, + "rewards/rejected": -1.234375, + "step": 1145 + }, + { + "epoch": 2.3987441130298275, + "grad_norm": 11.741530418395996, + "learning_rate": 5.0772702327753885e-08, + "logits/chosen": 1.03125, + "logits/rejected": 1.2890625, + "logps/chosen": -398.0, + "logps/rejected": -354.0, + "loss": 0.5506, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.203125, + "rewards/margins": 0.55078125, + "rewards/rejected": -1.7578125, + "step": 1146 + }, + { + "epoch": 2.400837257980115, + "grad_norm": 11.657857894897461, + "learning_rate": 5.042982596859181e-08, + "logits/chosen": 2.375, + "logits/rejected": 2.640625, + "logps/chosen": -840.0, + "logps/rejected": -422.0, + "loss": 0.5945, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.296875, + "rewards/margins": -0.0908203125, + "rewards/rejected": -2.203125, + "step": 1147 + }, + { + "epoch": 2.402930402930403, + "grad_norm": 12.115666389465332, + "learning_rate": 5.008798251884766e-08, + "logits/chosen": 1.765625, + "logits/rejected": 1.7265625, + "logps/chosen": -304.0, + "logps/rejected": -490.0, + "loss": 0.5824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9609375, + "rewards/margins": 0.56640625, + "rewards/rejected": -1.53125, + "step": 1148 + }, + { + "epoch": 2.405023547880691, + "grad_norm": 13.008218765258789, + "learning_rate": 4.97471737475689e-08, + "logits/chosen": 2.234375, + "logits/rejected": 2.5625, + "logps/chosen": -540.0, + "logps/rejected": -440.0, + "loss": 0.6402, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6640625, + "rewards/margins": 0.2021484375, + "rewards/rejected": -1.8671875, + "step": 1149 + }, + { + "epoch": 2.4071166928309786, + "grad_norm": 12.10905933380127, + "learning_rate": 4.940740141844843e-08, + "logits/chosen": 1.609375, + "logits/rejected": 2.328125, + "logps/chosen": -652.0, + "logps/rejected": -450.0, + "loss": 0.5795, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5625, + "rewards/margins": 0.1015625, + "rewards/rejected": -1.6640625, + "step": 1150 + }, + { + "epoch": 2.4092098377812663, + "grad_norm": 11.480064392089844, + "learning_rate": 4.9068667289815444e-08, + "logits/chosen": 1.6328125, + "logits/rejected": 2.109375, + "logps/chosen": -478.0, + "logps/rejected": -468.0, + "loss": 0.602, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.125, + "rewards/margins": -0.115234375, + "rewards/rejected": -2.0, + "step": 1151 + }, + { + "epoch": 2.411302982731554, + "grad_norm": 11.225536346435547, + "learning_rate": 4.873097311462662e-08, + "logits/chosen": 1.8203125, + "logits/rejected": 1.90625, + "logps/chosen": -286.0, + "logps/rejected": -362.0, + "loss": 0.597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9453125, + "rewards/margins": 1.0234375, + "rewards/rejected": -1.96875, + "step": 1152 + }, + { + "epoch": 2.413396127681842, + "grad_norm": 12.10788345336914, + "learning_rate": 4.839432064045664e-08, + "logits/chosen": 1.9375, + "logits/rejected": 2.1875, + "logps/chosen": -422.0, + "logps/rejected": -456.0, + "loss": 0.5994, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.390625, + "rewards/margins": 0.212890625, + "rewards/rejected": -1.6015625, + "step": 1153 + }, + { + "epoch": 2.4154892726321298, + "grad_norm": 12.278083801269531, + "learning_rate": 4.805871160948957e-08, + "logits/chosen": 2.4375, + "logits/rejected": 2.09375, + "logps/chosen": -460.0, + "logps/rejected": -472.0, + "loss": 0.5527, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.63671875, + "rewards/rejected": -1.828125, + "step": 1154 + }, + { + "epoch": 2.4175824175824174, + "grad_norm": 12.179211616516113, + "learning_rate": 4.772414775850942e-08, + "logits/chosen": 1.7734375, + "logits/rejected": 1.890625, + "logps/chosen": -304.0, + "logps/rejected": -1008.0, + "loss": 0.5881, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2734375, + "rewards/margins": 1.046875, + "rewards/rejected": -2.3125, + "step": 1155 + }, + { + "epoch": 2.4196755625327055, + "grad_norm": 12.0684175491333, + "learning_rate": 4.739063081889161e-08, + "logits/chosen": 2.734375, + "logits/rejected": 2.921875, + "logps/chosen": -552.0, + "logps/rejected": -592.0, + "loss": 0.5928, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.953125, + "rewards/margins": -0.208984375, + "rewards/rejected": -1.7421875, + "step": 1156 + }, + { + "epoch": 2.421768707482993, + "grad_norm": 13.287016868591309, + "learning_rate": 4.705816251659352e-08, + "logits/chosen": 2.359375, + "logits/rejected": 1.7578125, + "logps/chosen": -672.0, + "logps/rejected": -648.0, + "loss": 0.5639, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.326171875, + "rewards/rejected": -1.5390625, + "step": 1157 + }, + { + "epoch": 2.423861852433281, + "grad_norm": 10.981544494628906, + "learning_rate": 4.6726744572145964e-08, + "logits/chosen": 1.6015625, + "logits/rejected": 1.484375, + "logps/chosen": -464.0, + "logps/rejected": -470.0, + "loss": 0.559, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.609375, + "rewards/margins": 0.12890625, + "rewards/rejected": -1.734375, + "step": 1158 + }, + { + "epoch": 2.4259549973835686, + "grad_norm": 10.738006591796875, + "learning_rate": 4.639637870064416e-08, + "logits/chosen": 1.1015625, + "logits/rejected": 1.234375, + "logps/chosen": -372.0, + "logps/rejected": -382.0, + "loss": 0.5604, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.46875, + "rewards/margins": 0.189453125, + "rewards/rejected": -1.65625, + "step": 1159 + }, + { + "epoch": 2.4280481423338567, + "grad_norm": 11.973393440246582, + "learning_rate": 4.606706661173869e-08, + "logits/chosen": 2.125, + "logits/rejected": 2.171875, + "logps/chosen": -624.0, + "logps/rejected": -480.0, + "loss": 0.5848, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.375, + "rewards/margins": 0.59375, + "rewards/rejected": -1.96875, + "step": 1160 + }, + { + "epoch": 2.4301412872841444, + "grad_norm": 11.391327857971191, + "learning_rate": 4.573881000962693e-08, + "logits/chosen": 0.96875, + "logits/rejected": 1.359375, + "logps/chosen": -272.0, + "logps/rejected": -222.0, + "loss": 0.5727, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4375, + "rewards/margins": 0.1806640625, + "rewards/rejected": -1.6171875, + "step": 1161 + }, + { + "epoch": 2.4322344322344325, + "grad_norm": 11.80610466003418, + "learning_rate": 4.5411610593043916e-08, + "logits/chosen": 2.546875, + "logits/rejected": 2.9375, + "logps/chosen": -680.0, + "logps/rejected": -740.0, + "loss": 0.5857, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.5625, + "rewards/margins": -0.0625, + "rewards/rejected": -1.5, + "step": 1162 + }, + { + "epoch": 2.43432757718472, + "grad_norm": 11.733301162719727, + "learning_rate": 4.508547005525395e-08, + "logits/chosen": 2.3125, + "logits/rejected": 2.578125, + "logps/chosen": -636.0, + "logps/rejected": -498.0, + "loss": 0.5774, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6953125, + "rewards/margins": -0.0986328125, + "rewards/rejected": -1.59375, + "step": 1163 + }, + { + "epoch": 2.436420722135008, + "grad_norm": 11.919504165649414, + "learning_rate": 4.4760390084041395e-08, + "logits/chosen": 2.234375, + "logits/rejected": 3.0, + "logps/chosen": -474.0, + "logps/rejected": -466.0, + "loss": 0.5693, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.09375, + "rewards/margins": 0.38671875, + "rewards/rejected": -1.484375, + "step": 1164 + }, + { + "epoch": 2.4385138670852955, + "grad_norm": 10.880753517150879, + "learning_rate": 4.4436372361702287e-08, + "logits/chosen": 2.5, + "logits/rejected": 3.015625, + "logps/chosen": -752.0, + "logps/rejected": -604.0, + "loss": 0.5715, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2890625, + "rewards/margins": 0.578125, + "rewards/rejected": -1.859375, + "step": 1165 + }, + { + "epoch": 2.4406070120355836, + "grad_norm": 11.72057819366455, + "learning_rate": 4.4113418565035556e-08, + "logits/chosen": 1.3828125, + "logits/rejected": 1.21875, + "logps/chosen": -304.0, + "logps/rejected": -418.0, + "loss": 0.5687, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.625, + "rewards/margins": 0.1015625, + "rewards/rejected": -1.7265625, + "step": 1166 + }, + { + "epoch": 2.4427001569858713, + "grad_norm": 11.343137741088867, + "learning_rate": 4.379153036533411e-08, + "logits/chosen": 0.75, + "logits/rejected": 0.9921875, + "logps/chosen": -436.0, + "logps/rejected": -408.0, + "loss": 0.5857, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.421875, + "rewards/margins": 0.625, + "rewards/rejected": -2.046875, + "step": 1167 + }, + { + "epoch": 2.444793301936159, + "grad_norm": 11.733400344848633, + "learning_rate": 4.3470709428376414e-08, + "logits/chosen": 2.0625, + "logits/rejected": 2.25, + "logps/chosen": -470.0, + "logps/rejected": -416.0, + "loss": 0.5901, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4375, + "rewards/margins": 0.080078125, + "rewards/rejected": -1.515625, + "step": 1168 + }, + { + "epoch": 2.446886446886447, + "grad_norm": 12.811859130859375, + "learning_rate": 4.315095741441796e-08, + "logits/chosen": 1.359375, + "logits/rejected": 1.4765625, + "logps/chosen": -576.0, + "logps/rejected": -384.0, + "loss": 0.599, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7734375, + "rewards/margins": 0.072265625, + "rewards/rejected": -1.84375, + "step": 1169 + }, + { + "epoch": 2.4489795918367347, + "grad_norm": 11.338305473327637, + "learning_rate": 4.283227597818252e-08, + "logits/chosen": 1.8125, + "logits/rejected": 1.8828125, + "logps/chosen": -580.0, + "logps/rejected": -752.0, + "loss": 0.5989, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.34375, + "rewards/margins": 0.5625, + "rewards/rejected": -1.90625, + "step": 1170 + }, + { + "epoch": 2.4510727367870224, + "grad_norm": 11.298980712890625, + "learning_rate": 4.251466676885338e-08, + "logits/chosen": 2.4375, + "logits/rejected": 2.25, + "logps/chosen": -588.0, + "logps/rejected": -620.0, + "loss": 0.5822, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5546875, + "rewards/margins": 0.0263671875, + "rewards/rejected": -1.578125, + "step": 1171 + }, + { + "epoch": 2.45316588173731, + "grad_norm": 11.491308212280273, + "learning_rate": 4.21981314300653e-08, + "logits/chosen": 2.078125, + "logits/rejected": 2.65625, + "logps/chosen": -536.0, + "logps/rejected": -708.0, + "loss": 0.5373, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.1962890625, + "rewards/rejected": -1.34375, + "step": 1172 + }, + { + "epoch": 2.455259026687598, + "grad_norm": 11.813673973083496, + "learning_rate": 4.188267159989565e-08, + "logits/chosen": 1.5, + "logits/rejected": 2.234375, + "logps/chosen": -480.0, + "logps/rejected": -310.0, + "loss": 0.5628, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.578125, + "rewards/margins": 0.021484375, + "rewards/rejected": -1.59375, + "step": 1173 + }, + { + "epoch": 2.457352171637886, + "grad_norm": 12.04299259185791, + "learning_rate": 4.156828891085592e-08, + "logits/chosen": 1.640625, + "logits/rejected": 1.71875, + "logps/chosen": -336.0, + "logps/rejected": -456.0, + "loss": 0.5216, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1015625, + "rewards/margins": 0.4140625, + "rewards/rejected": -1.515625, + "step": 1174 + }, + { + "epoch": 2.4594453165881736, + "grad_norm": 12.253011703491211, + "learning_rate": 4.125498498988334e-08, + "logits/chosen": 1.765625, + "logits/rejected": 1.96875, + "logps/chosen": -572.0, + "logps/rejected": -488.0, + "loss": 0.57, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.265625, + "rewards/margins": 0.828125, + "rewards/rejected": -2.09375, + "step": 1175 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 10.873589515686035, + "learning_rate": 4.094276145833286e-08, + "logits/chosen": 2.46875, + "logits/rejected": 2.28125, + "logps/chosen": -398.0, + "logps/rejected": -572.0, + "loss": 0.5735, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.375, + "rewards/margins": 0.6328125, + "rewards/rejected": -2.0, + "step": 1176 + }, + { + "epoch": 2.4636316064887493, + "grad_norm": 12.69218921661377, + "learning_rate": 4.0631619931967995e-08, + "logits/chosen": 1.5, + "logits/rejected": 0.99609375, + "logps/chosen": -306.0, + "logps/rejected": -462.0, + "loss": 0.6132, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3125, + "rewards/margins": 0.875, + "rewards/rejected": -2.1875, + "step": 1177 + }, + { + "epoch": 2.465724751439037, + "grad_norm": 11.359230995178223, + "learning_rate": 4.032156202095291e-08, + "logits/chosen": 2.15625, + "logits/rejected": 2.21875, + "logps/chosen": -378.0, + "logps/rejected": -620.0, + "loss": 0.5502, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.265625, + "rewards/margins": 1.171875, + "rewards/rejected": -2.4375, + "step": 1178 + }, + { + "epoch": 2.467817896389325, + "grad_norm": 12.737220764160156, + "learning_rate": 4.001258932984418e-08, + "logits/chosen": 3.0625, + "logits/rejected": 2.125, + "logps/chosen": -768.0, + "logps/rejected": -636.0, + "loss": 0.5639, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2890625, + "rewards/margins": 0.27734375, + "rewards/rejected": -1.5625, + "step": 1179 + }, + { + "epoch": 2.469911041339613, + "grad_norm": 11.188276290893555, + "learning_rate": 3.970470345758236e-08, + "logits/chosen": 2.421875, + "logits/rejected": 1.9375, + "logps/chosen": -808.0, + "logps/rejected": -640.0, + "loss": 0.5684, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.09375, + "rewards/margins": 1.3203125, + "rewards/rejected": -2.40625, + "step": 1180 + }, + { + "epoch": 2.4720041862899005, + "grad_norm": 12.295984268188477, + "learning_rate": 3.939790599748357e-08, + "logits/chosen": 2.84375, + "logits/rejected": 2.90625, + "logps/chosen": -680.0, + "logps/rejected": -608.0, + "loss": 0.6257, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.390625, + "rewards/margins": 0.5703125, + "rewards/rejected": -1.9609375, + "step": 1181 + }, + { + "epoch": 2.4740973312401886, + "grad_norm": 12.252960205078125, + "learning_rate": 3.909219853723124e-08, + "logits/chosen": 2.375, + "logits/rejected": 2.5625, + "logps/chosen": -728.0, + "logps/rejected": -430.0, + "loss": 0.5755, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1796875, + "rewards/margins": 0.53515625, + "rewards/rejected": -1.71875, + "step": 1182 + }, + { + "epoch": 2.4761904761904763, + "grad_norm": 11.002705574035645, + "learning_rate": 3.878758265886848e-08, + "logits/chosen": 0.890625, + "logits/rejected": 0.40625, + "logps/chosen": -184.0, + "logps/rejected": -226.0, + "loss": 0.566, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0625, + "rewards/margins": 0.13671875, + "rewards/rejected": -1.203125, + "step": 1183 + }, + { + "epoch": 2.478283621140764, + "grad_norm": 12.764391899108887, + "learning_rate": 3.848405993878906e-08, + "logits/chosen": 1.46875, + "logits/rejected": 2.015625, + "logps/chosen": -528.0, + "logps/rejected": -628.0, + "loss": 0.6226, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1171875, + "rewards/margins": 1.0078125, + "rewards/rejected": -2.125, + "step": 1184 + }, + { + "epoch": 2.4803767660910516, + "grad_norm": 12.30169677734375, + "learning_rate": 3.818163194772964e-08, + "logits/chosen": 1.1640625, + "logits/rejected": 1.3203125, + "logps/chosen": -384.0, + "logps/rejected": -320.0, + "loss": 0.617, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.859375, + "rewards/margins": -0.072265625, + "rewards/rejected": -1.7890625, + "step": 1185 + }, + { + "epoch": 2.4824699110413397, + "grad_norm": 11.491966247558594, + "learning_rate": 3.788030025076183e-08, + "logits/chosen": 1.0390625, + "logits/rejected": 1.6015625, + "logps/chosen": -316.0, + "logps/rejected": -314.0, + "loss": 0.5852, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.83203125, + "rewards/margins": 0.80859375, + "rewards/rejected": -1.640625, + "step": 1186 + }, + { + "epoch": 2.4845630559916274, + "grad_norm": 11.358709335327148, + "learning_rate": 3.758006640728381e-08, + "logits/chosen": 1.7734375, + "logits/rejected": 1.890625, + "logps/chosen": -436.0, + "logps/rejected": -404.0, + "loss": 0.5957, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.203125, + "rewards/margins": 0.34375, + "rewards/rejected": -1.546875, + "step": 1187 + }, + { + "epoch": 2.486656200941915, + "grad_norm": 10.98845100402832, + "learning_rate": 3.728093197101228e-08, + "logits/chosen": 2.71875, + "logits/rejected": 2.921875, + "logps/chosen": -864.0, + "logps/rejected": -584.0, + "loss": 0.5767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9453125, + "rewards/margins": 0.9609375, + "rewards/rejected": -1.90625, + "step": 1188 + }, + { + "epoch": 2.488749345892203, + "grad_norm": 10.837740898132324, + "learning_rate": 3.698289848997448e-08, + "logits/chosen": 2.234375, + "logits/rejected": 2.765625, + "logps/chosen": -540.0, + "logps/rejected": -544.0, + "loss": 0.5729, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.203125, + "rewards/margins": 0.400390625, + "rewards/rejected": -1.6015625, + "step": 1189 + }, + { + "epoch": 2.490842490842491, + "grad_norm": 10.766701698303223, + "learning_rate": 3.6685967506500306e-08, + "logits/chosen": 1.1640625, + "logits/rejected": 0.87890625, + "logps/chosen": -314.0, + "logps/rejected": -348.0, + "loss": 0.5811, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.046875, + "rewards/margins": 0.234375, + "rewards/rejected": -1.28125, + "step": 1190 + }, + { + "epoch": 2.4929356357927785, + "grad_norm": 11.748739242553711, + "learning_rate": 3.639014055721417e-08, + "logits/chosen": 0.9765625, + "logits/rejected": 0.91015625, + "logps/chosen": -236.0, + "logps/rejected": -266.0, + "loss": 0.5899, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.03125, + "rewards/margins": 0.34765625, + "rewards/rejected": -1.3828125, + "step": 1191 + }, + { + "epoch": 2.495028780743066, + "grad_norm": 11.828984260559082, + "learning_rate": 3.609541917302693e-08, + "logits/chosen": 1.65625, + "logits/rejected": 1.5703125, + "logps/chosen": -368.0, + "logps/rejected": -504.0, + "loss": 0.5884, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1171875, + "rewards/margins": 0.173828125, + "rewards/rejected": -1.296875, + "step": 1192 + }, + { + "epoch": 2.4971219256933543, + "grad_norm": 12.25663948059082, + "learning_rate": 3.580180487912831e-08, + "logits/chosen": 1.9921875, + "logits/rejected": 2.5625, + "logps/chosen": -652.0, + "logps/rejected": -652.0, + "loss": 0.5784, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5625, + "rewards/margins": 0.609375, + "rewards/rejected": -2.171875, + "step": 1193 + }, + { + "epoch": 2.499215070643642, + "grad_norm": 13.72636604309082, + "learning_rate": 3.550929919497876e-08, + "logits/chosen": 1.5625, + "logits/rejected": 1.7421875, + "logps/chosen": -352.0, + "logps/rejected": -510.0, + "loss": 0.6214, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.078125, + "rewards/margins": 1.09375, + "rewards/rejected": -2.171875, + "step": 1194 + }, + { + "epoch": 2.50130821559393, + "grad_norm": 11.22259521484375, + "learning_rate": 3.521790363430161e-08, + "logits/chosen": 1.984375, + "logits/rejected": 2.78125, + "logps/chosen": -696.0, + "logps/rejected": -616.0, + "loss": 0.5602, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.125, + "rewards/margins": 0.61328125, + "rewards/rejected": -1.734375, + "step": 1195 + }, + { + "epoch": 2.503401360544218, + "grad_norm": 12.130874633789062, + "learning_rate": 3.4927619705075236e-08, + "logits/chosen": 1.9921875, + "logits/rejected": 2.0625, + "logps/chosen": -468.0, + "logps/rejected": -400.0, + "loss": 0.5579, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.34375, + "rewards/margins": 0.6953125, + "rewards/rejected": -2.03125, + "step": 1196 + }, + { + "epoch": 2.5054945054945055, + "grad_norm": 13.3442964553833, + "learning_rate": 3.463844890952541e-08, + "logits/chosen": 1.5625, + "logits/rejected": 2.375, + "logps/chosen": -498.0, + "logps/rejected": -540.0, + "loss": 0.6025, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3359375, + "rewards/margins": 0.76953125, + "rewards/rejected": -2.109375, + "step": 1197 + }, + { + "epoch": 2.507587650444793, + "grad_norm": 11.669981002807617, + "learning_rate": 3.4350392744117424e-08, + "logits/chosen": 2.46875, + "logits/rejected": 3.671875, + "logps/chosen": -796.0, + "logps/rejected": -368.0, + "loss": 0.6119, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.046875, + "rewards/margins": 0.478515625, + "rewards/rejected": -1.5234375, + "step": 1198 + }, + { + "epoch": 2.5096807953950813, + "grad_norm": 11.026544570922852, + "learning_rate": 3.406345269954817e-08, + "logits/chosen": 1.75, + "logits/rejected": 2.09375, + "logps/chosen": -484.0, + "logps/rejected": -312.0, + "loss": 0.5777, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.6875, + "rewards/margins": -0.025390625, + "rewards/rejected": -1.65625, + "step": 1199 + }, + { + "epoch": 2.511773940345369, + "grad_norm": 12.441347122192383, + "learning_rate": 3.3777630260738765e-08, + "logits/chosen": 1.1171875, + "logits/rejected": 1.46875, + "logps/chosen": -476.0, + "logps/rejected": -532.0, + "loss": 0.6326, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5078125, + "rewards/margins": 0.28125, + "rewards/rejected": -1.7890625, + "step": 1200 + }, + { + "epoch": 2.5138670852956566, + "grad_norm": 11.129204750061035, + "learning_rate": 3.349292690682657e-08, + "logits/chosen": 1.0546875, + "logits/rejected": 1.4140625, + "logps/chosen": -420.0, + "logps/rejected": -438.0, + "loss": 0.5981, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9140625, + "rewards/margins": 0.703125, + "rewards/rejected": -1.6171875, + "step": 1201 + }, + { + "epoch": 2.5159602302459447, + "grad_norm": 11.990527153015137, + "learning_rate": 3.320934411115776e-08, + "logits/chosen": 2.0, + "logits/rejected": 1.1796875, + "logps/chosen": -250.0, + "logps/rejected": -328.0, + "loss": 0.5998, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3359375, + "rewards/margins": 0.298828125, + "rewards/rejected": -1.6328125, + "step": 1202 + }, + { + "epoch": 2.5180533751962324, + "grad_norm": 11.312446594238281, + "learning_rate": 3.2926883341279474e-08, + "logits/chosen": 0.984375, + "logits/rejected": 1.1484375, + "logps/chosen": -372.0, + "logps/rejected": -544.0, + "loss": 0.5453, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4921875, + "rewards/margins": 0.2021484375, + "rewards/rejected": -1.6875, + "step": 1203 + }, + { + "epoch": 2.52014652014652, + "grad_norm": 11.853565216064453, + "learning_rate": 3.264554605893246e-08, + "logits/chosen": 2.375, + "logits/rejected": 2.59375, + "logps/chosen": -556.0, + "logps/rejected": -576.0, + "loss": 0.5507, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.34375, + "rewards/margins": -0.0029296875, + "rewards/rejected": -1.3359375, + "step": 1204 + }, + { + "epoch": 2.5222396650968077, + "grad_norm": 10.736586570739746, + "learning_rate": 3.236533372004338e-08, + "logits/chosen": 1.984375, + "logits/rejected": 2.28125, + "logps/chosen": -444.0, + "logps/rejected": -462.0, + "loss": 0.5873, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.375, + "rewards/margins": 0.244140625, + "rewards/rejected": -1.6171875, + "step": 1205 + }, + { + "epoch": 2.524332810047096, + "grad_norm": 12.196707725524902, + "learning_rate": 3.2086247774717155e-08, + "logits/chosen": 2.296875, + "logits/rejected": 2.953125, + "logps/chosen": -592.0, + "logps/rejected": -616.0, + "loss": 0.6162, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.375, + "rewards/margins": 0.734375, + "rewards/rejected": -2.109375, + "step": 1206 + }, + { + "epoch": 2.5264259549973835, + "grad_norm": 11.649630546569824, + "learning_rate": 3.1808289667229795e-08, + "logits/chosen": 1.671875, + "logits/rejected": 1.625, + "logps/chosen": -388.0, + "logps/rejected": -560.0, + "loss": 0.5946, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.359375, + "rewards/margins": 0.59375, + "rewards/rejected": -1.953125, + "step": 1207 + }, + { + "epoch": 2.528519099947671, + "grad_norm": 11.674649238586426, + "learning_rate": 3.153146083602052e-08, + "logits/chosen": 0.8515625, + "logits/rejected": 0.6171875, + "logps/chosen": -215.0, + "logps/rejected": -300.0, + "loss": 0.556, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.296875, + "rewards/margins": 0.2734375, + "rewards/rejected": -1.5703125, + "step": 1208 + }, + { + "epoch": 2.5306122448979593, + "grad_norm": 13.151446342468262, + "learning_rate": 3.12557627136847e-08, + "logits/chosen": 2.03125, + "logits/rejected": 1.171875, + "logps/chosen": -230.0, + "logps/rejected": -368.0, + "loss": 0.6325, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4453125, + "rewards/margins": 0.16015625, + "rewards/rejected": -1.609375, + "step": 1209 + }, + { + "epoch": 2.532705389848247, + "grad_norm": 11.518951416015625, + "learning_rate": 3.098119672696622e-08, + "logits/chosen": 1.09375, + "logits/rejected": 1.5078125, + "logps/chosen": -292.0, + "logps/rejected": -245.0, + "loss": 0.5736, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3046875, + "rewards/margins": 0.0732421875, + "rewards/rejected": -1.375, + "step": 1210 + }, + { + "epoch": 2.5347985347985347, + "grad_norm": 10.688014030456543, + "learning_rate": 3.070776429675003e-08, + "logits/chosen": 1.828125, + "logits/rejected": 2.34375, + "logps/chosen": -548.0, + "logps/rejected": -572.0, + "loss": 0.6095, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.53125, + "rewards/rejected": -1.953125, + "step": 1211 + }, + { + "epoch": 2.5368916797488223, + "grad_norm": 11.002942085266113, + "learning_rate": 3.0435466838054944e-08, + "logits/chosen": 2.0625, + "logits/rejected": 2.953125, + "logps/chosen": -716.0, + "logps/rejected": -544.0, + "loss": 0.5662, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.296875, + "rewards/margins": 0.173828125, + "rewards/rejected": -1.46875, + "step": 1212 + }, + { + "epoch": 2.5389848246991105, + "grad_norm": 11.257405281066895, + "learning_rate": 3.0164305760026364e-08, + "logits/chosen": 1.1640625, + "logits/rejected": 1.828125, + "logps/chosen": -340.0, + "logps/rejected": -292.0, + "loss": 0.5656, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.625, + "rewards/margins": 0.13671875, + "rewards/rejected": -1.765625, + "step": 1213 + }, + { + "epoch": 2.541077969649398, + "grad_norm": 10.610968589782715, + "learning_rate": 2.9894282465928896e-08, + "logits/chosen": 0.87890625, + "logits/rejected": 0.88671875, + "logps/chosen": -230.0, + "logps/rejected": -290.0, + "loss": 0.5164, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.359375, + "rewards/rejected": -1.7890625, + "step": 1214 + }, + { + "epoch": 2.5431711145996863, + "grad_norm": 12.95569133758545, + "learning_rate": 2.9625398353138885e-08, + "logits/chosen": 1.5703125, + "logits/rejected": 2.078125, + "logps/chosen": -396.0, + "logps/rejected": -386.0, + "loss": 0.5799, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.8046875, + "rewards/rejected": -1.953125, + "step": 1215 + }, + { + "epoch": 2.545264259549974, + "grad_norm": 12.78900146484375, + "learning_rate": 2.9357654813137606e-08, + "logits/chosen": 1.3359375, + "logits/rejected": 1.5, + "logps/chosen": -242.0, + "logps/rejected": -292.0, + "loss": 0.6043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1875, + "rewards/margins": 0.369140625, + "rewards/rejected": -1.5625, + "step": 1216 + }, + { + "epoch": 2.5473574045002616, + "grad_norm": 11.339468002319336, + "learning_rate": 2.9091053231503798e-08, + "logits/chosen": 2.40625, + "logits/rejected": 2.03125, + "logps/chosen": -446.0, + "logps/rejected": -636.0, + "loss": 0.5539, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0078125, + "rewards/margins": 1.125, + "rewards/rejected": -2.125, + "step": 1217 + }, + { + "epoch": 2.5494505494505493, + "grad_norm": 12.718293190002441, + "learning_rate": 2.882559498790651e-08, + "logits/chosen": 2.03125, + "logits/rejected": 1.3359375, + "logps/chosen": -560.0, + "logps/rejected": -684.0, + "loss": 0.6018, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.25, + "rewards/margins": 1.0234375, + "rewards/rejected": -2.265625, + "step": 1218 + }, + { + "epoch": 2.5515436944008374, + "grad_norm": 11.234786987304688, + "learning_rate": 2.856128145609793e-08, + "logits/chosen": 2.40625, + "logits/rejected": 2.703125, + "logps/chosen": -600.0, + "logps/rejected": -444.0, + "loss": 0.6019, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4453125, + "rewards/margins": 0.0361328125, + "rewards/rejected": -1.484375, + "step": 1219 + }, + { + "epoch": 2.553636839351125, + "grad_norm": 12.245379447937012, + "learning_rate": 2.8298114003906423e-08, + "logits/chosen": 1.3515625, + "logits/rejected": 1.71875, + "logps/chosen": -442.0, + "logps/rejected": -528.0, + "loss": 0.5604, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.796875, + "rewards/rejected": -2.234375, + "step": 1220 + }, + { + "epoch": 2.5557299843014127, + "grad_norm": 11.113319396972656, + "learning_rate": 2.8036093993229405e-08, + "logits/chosen": 2.71875, + "logits/rejected": 3.15625, + "logps/chosen": -612.0, + "logps/rejected": -524.0, + "loss": 0.5613, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9921875, + "rewards/margins": -0.15234375, + "rewards/rejected": -1.84375, + "step": 1221 + }, + { + "epoch": 2.557823129251701, + "grad_norm": 12.352375030517578, + "learning_rate": 2.777522278002615e-08, + "logits/chosen": 2.421875, + "logits/rejected": 2.421875, + "logps/chosen": -480.0, + "logps/rejected": -426.0, + "loss": 0.5971, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.53125, + "rewards/margins": -0.00390625, + "rewards/rejected": -1.5234375, + "step": 1222 + }, + { + "epoch": 2.5599162742019885, + "grad_norm": 11.425958633422852, + "learning_rate": 2.7515501714310855e-08, + "logits/chosen": 1.6015625, + "logits/rejected": 1.28125, + "logps/chosen": -208.0, + "logps/rejected": -354.0, + "loss": 0.5876, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3359375, + "rewards/margins": 0.546875, + "rewards/rejected": -1.8828125, + "step": 1223 + }, + { + "epoch": 2.562009419152276, + "grad_norm": 11.302983283996582, + "learning_rate": 2.7256932140145904e-08, + "logits/chosen": 1.875, + "logits/rejected": 2.625, + "logps/chosen": -600.0, + "logps/rejected": -352.0, + "loss": 0.5834, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6875, + "rewards/margins": 0.5078125, + "rewards/rejected": -2.203125, + "step": 1224 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 13.678489685058594, + "learning_rate": 2.6999515395634473e-08, + "logits/chosen": 2.375, + "logits/rejected": 3.203125, + "logps/chosen": -648.0, + "logps/rejected": -692.0, + "loss": 0.6591, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.48828125, + "rewards/rejected": -1.703125, + "step": 1225 + }, + { + "epoch": 2.566195709052852, + "grad_norm": 11.15230655670166, + "learning_rate": 2.6743252812913822e-08, + "logits/chosen": 2.46875, + "logits/rejected": 2.734375, + "logps/chosen": -508.0, + "logps/rejected": -432.0, + "loss": 0.5963, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3984375, + "rewards/margins": -0.1875, + "rewards/rejected": -1.2109375, + "step": 1226 + }, + { + "epoch": 2.5682888540031397, + "grad_norm": 12.096667289733887, + "learning_rate": 2.6488145718148505e-08, + "logits/chosen": 1.703125, + "logits/rejected": 2.15625, + "logps/chosen": -454.0, + "logps/rejected": -422.0, + "loss": 0.597, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1015625, + "rewards/margins": 0.75, + "rewards/rejected": -1.8515625, + "step": 1227 + }, + { + "epoch": 2.570381998953428, + "grad_norm": 11.339020729064941, + "learning_rate": 2.623419543152337e-08, + "logits/chosen": 1.9375, + "logits/rejected": 2.25, + "logps/chosen": -560.0, + "logps/rejected": -540.0, + "loss": 0.5966, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3984375, + "rewards/margins": 0.314453125, + "rewards/rejected": -1.71875, + "step": 1228 + }, + { + "epoch": 2.5724751439037155, + "grad_norm": 11.454265594482422, + "learning_rate": 2.5981403267236717e-08, + "logits/chosen": 1.171875, + "logits/rejected": 0.69140625, + "logps/chosen": -238.0, + "logps/rejected": -348.0, + "loss": 0.56, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.234375, + "rewards/margins": 0.380859375, + "rewards/rejected": -1.609375, + "step": 1229 + }, + { + "epoch": 2.574568288854003, + "grad_norm": 12.722302436828613, + "learning_rate": 2.572977053349346e-08, + "logits/chosen": 2.3125, + "logits/rejected": 2.53125, + "logps/chosen": -454.0, + "logps/rejected": -368.0, + "loss": 0.5985, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.234375, + "rewards/margins": 0.22265625, + "rewards/rejected": -1.4609375, + "step": 1230 + }, + { + "epoch": 2.576661433804291, + "grad_norm": 11.2119722366333, + "learning_rate": 2.5479298532498732e-08, + "logits/chosen": 1.1328125, + "logits/rejected": 1.5625, + "logps/chosen": -412.0, + "logps/rejected": -418.0, + "loss": 0.5958, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4765625, + "rewards/margins": -0.0478515625, + "rewards/rejected": -1.4296875, + "step": 1231 + }, + { + "epoch": 2.578754578754579, + "grad_norm": 10.447643280029297, + "learning_rate": 2.5229988560450544e-08, + "logits/chosen": 1.0390625, + "logits/rejected": 0.65234375, + "logps/chosen": -294.0, + "logps/rejected": -502.0, + "loss": 0.5653, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.328125, + "rewards/margins": 0.94921875, + "rewards/rejected": -2.28125, + "step": 1232 + }, + { + "epoch": 2.5808477237048666, + "grad_norm": 13.349222183227539, + "learning_rate": 2.498184190753343e-08, + "logits/chosen": 1.2421875, + "logits/rejected": 1.1640625, + "logps/chosen": -394.0, + "logps/rejected": -440.0, + "loss": 0.6531, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2265625, + "rewards/margins": 0.068359375, + "rewards/rejected": -1.296875, + "step": 1233 + }, + { + "epoch": 2.5829408686551543, + "grad_norm": 11.777934074401855, + "learning_rate": 2.4734859857911862e-08, + "logits/chosen": 2.046875, + "logits/rejected": 2.125, + "logps/chosen": -700.0, + "logps/rejected": -632.0, + "loss": 0.6121, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7421875, + "rewards/margins": 0.48046875, + "rewards/rejected": -2.21875, + "step": 1234 + }, + { + "epoch": 2.5850340136054424, + "grad_norm": 12.234633445739746, + "learning_rate": 2.4489043689723397e-08, + "logits/chosen": 1.8515625, + "logits/rejected": 2.578125, + "logps/chosen": -400.0, + "logps/rejected": -278.0, + "loss": 0.6035, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.578125, + "rewards/margins": -0.0146484375, + "rewards/rejected": -1.5625, + "step": 1235 + }, + { + "epoch": 2.58712715855573, + "grad_norm": 11.81685733795166, + "learning_rate": 2.4244394675072046e-08, + "logits/chosen": 1.9921875, + "logits/rejected": 2.203125, + "logps/chosen": -470.0, + "logps/rejected": -456.0, + "loss": 0.6209, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.765625, + "rewards/margins": -0.1513671875, + "rewards/rejected": -1.6171875, + "step": 1236 + }, + { + "epoch": 2.5892203035060177, + "grad_norm": 11.48193359375, + "learning_rate": 2.400091408002187e-08, + "logits/chosen": 2.078125, + "logits/rejected": 1.703125, + "logps/chosen": -458.0, + "logps/rejected": -668.0, + "loss": 0.5771, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6796875, + "rewards/margins": 0.6328125, + "rewards/rejected": -2.3125, + "step": 1237 + }, + { + "epoch": 2.5913134484563054, + "grad_norm": 10.787525177001953, + "learning_rate": 2.3758603164590344e-08, + "logits/chosen": 2.078125, + "logits/rejected": 2.640625, + "logps/chosen": -772.0, + "logps/rejected": -812.0, + "loss": 0.5692, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.94921875, + "rewards/margins": 0.6796875, + "rewards/rejected": -1.625, + "step": 1238 + }, + { + "epoch": 2.5934065934065935, + "grad_norm": 11.714813232421875, + "learning_rate": 2.3517463182741777e-08, + "logits/chosen": 1.46875, + "logits/rejected": 1.7109375, + "logps/chosen": -346.0, + "logps/rejected": -388.0, + "loss": 0.5601, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5859375, + "rewards/margins": 0.2138671875, + "rewards/rejected": -1.796875, + "step": 1239 + }, + { + "epoch": 2.595499738356881, + "grad_norm": 12.473237037658691, + "learning_rate": 2.3277495382380804e-08, + "logits/chosen": 3.3125, + "logits/rejected": 3.03125, + "logps/chosen": -632.0, + "logps/rejected": -552.0, + "loss": 0.6142, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.015625, + "rewards/margins": -0.048828125, + "rewards/rejected": -1.96875, + "step": 1240 + }, + { + "epoch": 2.597592883307169, + "grad_norm": 12.750757217407227, + "learning_rate": 2.3038701005346117e-08, + "logits/chosen": 2.171875, + "logits/rejected": 1.8046875, + "logps/chosen": -432.0, + "logps/rejected": -432.0, + "loss": 0.6186, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.34375, + "rewards/margins": 0.427734375, + "rewards/rejected": -1.765625, + "step": 1241 + }, + { + "epoch": 2.599686028257457, + "grad_norm": 11.450703620910645, + "learning_rate": 2.2801081287403963e-08, + "logits/chosen": 2.734375, + "logits/rejected": 2.53125, + "logps/chosen": -296.0, + "logps/rejected": -472.0, + "loss": 0.5627, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.21875, + "rewards/margins": 0.40234375, + "rewards/rejected": -1.625, + "step": 1242 + }, + { + "epoch": 2.6017791732077447, + "grad_norm": 12.390066146850586, + "learning_rate": 2.2564637458241473e-08, + "logits/chosen": 1.859375, + "logits/rejected": 2.890625, + "logps/chosen": -588.0, + "logps/rejected": -466.0, + "loss": 0.6006, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.203125, + "rewards/margins": 0.50390625, + "rewards/rejected": -1.7109375, + "step": 1243 + }, + { + "epoch": 2.6038723181580323, + "grad_norm": 10.352241516113281, + "learning_rate": 2.2329370741460762e-08, + "logits/chosen": 0.734375, + "logits/rejected": 0.84375, + "logps/chosen": -200.0, + "logps/rejected": -292.0, + "loss": 0.5558, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.5703125, + "rewards/rejected": -1.765625, + "step": 1244 + }, + { + "epoch": 2.60596546310832, + "grad_norm": 12.06119155883789, + "learning_rate": 2.2095282354572198e-08, + "logits/chosen": 2.75, + "logits/rejected": 2.734375, + "logps/chosen": -548.0, + "logps/rejected": -506.0, + "loss": 0.6082, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.609375, + "rewards/margins": 0.330078125, + "rewards/rejected": -1.9453125, + "step": 1245 + }, + { + "epoch": 2.608058608058608, + "grad_norm": 11.703369140625, + "learning_rate": 2.1862373508988392e-08, + "logits/chosen": 1.6328125, + "logits/rejected": 1.953125, + "logps/chosen": -440.0, + "logps/rejected": -384.0, + "loss": 0.5871, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7109375, + "rewards/margins": 0.4921875, + "rewards/rejected": -2.203125, + "step": 1246 + }, + { + "epoch": 2.610151753008896, + "grad_norm": 11.315919876098633, + "learning_rate": 2.1630645410017693e-08, + "logits/chosen": 2.125, + "logits/rejected": 2.9375, + "logps/chosen": -676.0, + "logps/rejected": -334.0, + "loss": 0.5555, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.09375, + "rewards/margins": 0.158203125, + "rewards/rejected": -1.25, + "step": 1247 + }, + { + "epoch": 2.612244897959184, + "grad_norm": 12.547874450683594, + "learning_rate": 2.140009925685815e-08, + "logits/chosen": 1.5234375, + "logits/rejected": 2.09375, + "logps/chosen": -756.0, + "logps/rejected": -506.0, + "loss": 0.6101, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.09375, + "rewards/margins": 0.43359375, + "rewards/rejected": -1.5234375, + "step": 1248 + }, + { + "epoch": 2.6143380429094716, + "grad_norm": 10.966303825378418, + "learning_rate": 2.1170736242591206e-08, + "logits/chosen": 2.1875, + "logits/rejected": 1.9375, + "logps/chosen": -540.0, + "logps/rejected": -720.0, + "loss": 0.5907, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.5625, + "rewards/margins": 0.26171875, + "rewards/rejected": -1.8203125, + "step": 1249 + }, + { + "epoch": 2.6164311878597593, + "grad_norm": 12.1686429977417, + "learning_rate": 2.0942557554175444e-08, + "logits/chosen": 2.71875, + "logits/rejected": 3.09375, + "logps/chosen": -588.0, + "logps/rejected": -608.0, + "loss": 0.5666, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3359375, + "rewards/margins": 0.6796875, + "rewards/rejected": -2.015625, + "step": 1250 + }, + { + "epoch": 2.618524332810047, + "grad_norm": 10.310696601867676, + "learning_rate": 2.0715564372440647e-08, + "logits/chosen": 1.0390625, + "logits/rejected": 0.66015625, + "logps/chosen": -249.0, + "logps/rejected": -456.0, + "loss": 0.5393, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.53125, + "rewards/margins": 1.0390625, + "rewards/rejected": -2.5625, + "step": 1251 + }, + { + "epoch": 2.620617477760335, + "grad_norm": 11.116397857666016, + "learning_rate": 2.0489757872081454e-08, + "logits/chosen": 2.1875, + "logits/rejected": 2.765625, + "logps/chosen": -668.0, + "logps/rejected": -528.0, + "loss": 0.581, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.203125, + "rewards/margins": 0.5078125, + "rewards/rejected": -1.7109375, + "step": 1252 + }, + { + "epoch": 2.6227106227106227, + "grad_norm": 12.374103546142578, + "learning_rate": 2.026513922165159e-08, + "logits/chosen": 0.67578125, + "logits/rejected": 0.76953125, + "logps/chosen": -306.0, + "logps/rejected": -352.0, + "loss": 0.5843, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2890625, + "rewards/margins": 0.375, + "rewards/rejected": -1.6640625, + "step": 1253 + }, + { + "epoch": 2.6248037676609104, + "grad_norm": 12.416691780090332, + "learning_rate": 2.0041709583557405e-08, + "logits/chosen": 2.65625, + "logits/rejected": 2.34375, + "logps/chosen": -576.0, + "logps/rejected": -612.0, + "loss": 0.6033, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5078125, + "rewards/margins": 0.353515625, + "rewards/rejected": -1.859375, + "step": 1254 + }, + { + "epoch": 2.6268969126111985, + "grad_norm": 13.592479705810547, + "learning_rate": 1.981947011405226e-08, + "logits/chosen": 0.6484375, + "logits/rejected": 0.91015625, + "logps/chosen": -290.0, + "logps/rejected": -298.0, + "loss": 0.6248, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.494140625, + "rewards/rejected": -1.6875, + "step": 1255 + }, + { + "epoch": 2.628990057561486, + "grad_norm": 11.766193389892578, + "learning_rate": 1.9598421963230253e-08, + "logits/chosen": 1.96875, + "logits/rejected": 1.5859375, + "logps/chosen": -478.0, + "logps/rejected": -500.0, + "loss": 0.6101, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.4296875, + "rewards/rejected": -1.859375, + "step": 1256 + }, + { + "epoch": 2.631083202511774, + "grad_norm": 13.772175788879395, + "learning_rate": 1.9378566275020433e-08, + "logits/chosen": 1.421875, + "logits/rejected": 1.625, + "logps/chosen": -414.0, + "logps/rejected": -324.0, + "loss": 0.6685, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.09375, + "rewards/margins": 0.28125, + "rewards/rejected": -1.375, + "step": 1257 + }, + { + "epoch": 2.6331763474620615, + "grad_norm": 12.496501922607422, + "learning_rate": 1.915990418718091e-08, + "logits/chosen": 1.1484375, + "logits/rejected": 1.8046875, + "logps/chosen": -434.0, + "logps/rejected": -350.0, + "loss": 0.5714, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.234375, + "rewards/margins": 0.84765625, + "rewards/rejected": -2.078125, + "step": 1258 + }, + { + "epoch": 2.6352694924123496, + "grad_norm": 12.116174697875977, + "learning_rate": 1.8942436831292678e-08, + "logits/chosen": 2.078125, + "logits/rejected": 2.265625, + "logps/chosen": -560.0, + "logps/rejected": -462.0, + "loss": 0.6301, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6484375, + "rewards/margins": 0.15625, + "rewards/rejected": -1.8046875, + "step": 1259 + }, + { + "epoch": 2.6373626373626373, + "grad_norm": 12.080883979797363, + "learning_rate": 1.87261653327542e-08, + "logits/chosen": 2.0, + "logits/rejected": 1.828125, + "logps/chosen": -476.0, + "logps/rejected": -560.0, + "loss": 0.6325, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3515625, + "rewards/margins": 0.349609375, + "rewards/rejected": -1.703125, + "step": 1260 + }, + { + "epoch": 2.6394557823129254, + "grad_norm": 10.854859352111816, + "learning_rate": 1.8511090810775125e-08, + "logits/chosen": 1.71875, + "logits/rejected": 2.359375, + "logps/chosen": -430.0, + "logps/rejected": -272.0, + "loss": 0.5945, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.703125, + "rewards/margins": -0.373046875, + "rewards/rejected": -1.328125, + "step": 1261 + }, + { + "epoch": 2.641548927263213, + "grad_norm": 15.076943397521973, + "learning_rate": 1.829721437837095e-08, + "logits/chosen": 2.140625, + "logits/rejected": 1.96875, + "logps/chosen": -688.0, + "logps/rejected": -506.0, + "loss": 0.5931, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0078125, + "rewards/margins": 0.375, + "rewards/rejected": -1.3828125, + "step": 1262 + }, + { + "epoch": 2.643642072213501, + "grad_norm": 11.498101234436035, + "learning_rate": 1.8084537142356815e-08, + "logits/chosen": 2.15625, + "logits/rejected": 2.265625, + "logps/chosen": -414.0, + "logps/rejected": -450.0, + "loss": 0.5866, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5546875, + "rewards/margins": 0.345703125, + "rewards/rejected": -1.8984375, + "step": 1263 + }, + { + "epoch": 2.6457352171637885, + "grad_norm": 12.064689636230469, + "learning_rate": 1.787306020334216e-08, + "logits/chosen": 1.796875, + "logits/rejected": 1.9609375, + "logps/chosen": -548.0, + "logps/rejected": -468.0, + "loss": 0.5869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8515625, + "rewards/margins": 0.7734375, + "rewards/rejected": -1.625, + "step": 1264 + }, + { + "epoch": 2.647828362114076, + "grad_norm": 11.75654411315918, + "learning_rate": 1.7662784655724857e-08, + "logits/chosen": 1.3828125, + "logits/rejected": 2.71875, + "logps/chosen": -584.0, + "logps/rejected": -400.0, + "loss": 0.583, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.59375, + "rewards/margins": 0.08203125, + "rewards/rejected": -1.671875, + "step": 1265 + }, + { + "epoch": 2.6499215070643642, + "grad_norm": 11.494534492492676, + "learning_rate": 1.745371158768539e-08, + "logits/chosen": 0.625, + "logits/rejected": 0.74609375, + "logps/chosen": -290.0, + "logps/rejected": -344.0, + "loss": 0.5834, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.59375, + "rewards/margins": 0.8359375, + "rewards/rejected": -2.4375, + "step": 1266 + }, + { + "epoch": 2.652014652014652, + "grad_norm": 11.152542114257812, + "learning_rate": 1.7245842081181468e-08, + "logits/chosen": 1.703125, + "logits/rejected": 1.828125, + "logps/chosen": -748.0, + "logps/rejected": -520.0, + "loss": 0.6139, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3203125, + "rewards/margins": 0.63671875, + "rewards/rejected": -1.953125, + "step": 1267 + }, + { + "epoch": 2.65410779696494, + "grad_norm": 11.165491104125977, + "learning_rate": 1.7039177211942455e-08, + "logits/chosen": 2.421875, + "logits/rejected": 2.75, + "logps/chosen": -620.0, + "logps/rejected": -490.0, + "loss": 0.5875, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7109375, + "rewards/margins": -0.408203125, + "rewards/rejected": -1.3046875, + "step": 1268 + }, + { + "epoch": 2.6562009419152277, + "grad_norm": 12.534558296203613, + "learning_rate": 1.6833718049463567e-08, + "logits/chosen": 2.046875, + "logits/rejected": 2.9375, + "logps/chosen": -560.0, + "logps/rejected": -344.0, + "loss": 0.5836, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3203125, + "rewards/margins": 0.279296875, + "rewards/rejected": -1.59375, + "step": 1269 + }, + { + "epoch": 2.6582940868655154, + "grad_norm": 11.990873336791992, + "learning_rate": 1.6629465657000433e-08, + "logits/chosen": 1.3515625, + "logits/rejected": 1.5234375, + "logps/chosen": -402.0, + "logps/rejected": -420.0, + "loss": 0.572, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.28125, + "rewards/margins": 0.5, + "rewards/rejected": -1.78125, + "step": 1270 + }, + { + "epoch": 2.660387231815803, + "grad_norm": 12.490434646606445, + "learning_rate": 1.6426421091563755e-08, + "logits/chosen": 2.1875, + "logits/rejected": 1.9765625, + "logps/chosen": -466.0, + "logps/rejected": -492.0, + "loss": 0.5329, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4921875, + "rewards/margins": 0.166015625, + "rewards/rejected": -1.65625, + "step": 1271 + }, + { + "epoch": 2.662480376766091, + "grad_norm": 11.288036346435547, + "learning_rate": 1.6224585403913625e-08, + "logits/chosen": 3.125, + "logits/rejected": 3.15625, + "logps/chosen": -736.0, + "logps/rejected": -660.0, + "loss": 0.5632, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.78125, + "rewards/rejected": -1.9296875, + "step": 1272 + }, + { + "epoch": 2.664573521716379, + "grad_norm": 11.47856616973877, + "learning_rate": 1.6023959638554143e-08, + "logits/chosen": 1.171875, + "logits/rejected": 1.5546875, + "logps/chosen": -540.0, + "logps/rejected": -528.0, + "loss": 0.5548, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.09375, + "rewards/margins": 0.40625, + "rewards/rejected": -1.5, + "step": 1273 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 11.616024017333984, + "learning_rate": 1.5824544833728e-08, + "logits/chosen": 1.421875, + "logits/rejected": 2.53125, + "logps/chosen": -644.0, + "logps/rejected": -612.0, + "loss": 0.5916, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.953125, + "rewards/margins": 0.083984375, + "rewards/rejected": -2.03125, + "step": 1274 + }, + { + "epoch": 2.6687598116169546, + "grad_norm": 11.60714340209961, + "learning_rate": 1.5626342021411292e-08, + "logits/chosen": 2.75, + "logits/rejected": 3.015625, + "logps/chosen": -680.0, + "logps/rejected": -528.0, + "loss": 0.5459, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.125, + "rewards/margins": 0.77734375, + "rewards/rejected": -1.90625, + "step": 1275 + }, + { + "epoch": 2.6708529565672423, + "grad_norm": 11.576690673828125, + "learning_rate": 1.542935222730791e-08, + "logits/chosen": 2.328125, + "logits/rejected": 2.65625, + "logps/chosen": -600.0, + "logps/rejected": -572.0, + "loss": 0.5515, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.078125, + "rewards/margins": 0.8203125, + "rewards/rejected": -1.8984375, + "step": 1276 + }, + { + "epoch": 2.67294610151753, + "grad_norm": 10.313326835632324, + "learning_rate": 1.5233576470844337e-08, + "logits/chosen": 2.53125, + "logits/rejected": 1.9375, + "logps/chosen": -446.0, + "logps/rejected": -512.0, + "loss": 0.5445, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.515625, + "rewards/margins": 0.4921875, + "rewards/rejected": -2.0, + "step": 1277 + }, + { + "epoch": 2.6750392464678177, + "grad_norm": 11.188737869262695, + "learning_rate": 1.5039015765164458e-08, + "logits/chosen": 1.734375, + "logits/rejected": 2.078125, + "logps/chosen": -760.0, + "logps/rejected": -402.0, + "loss": 0.538, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0, + "rewards/margins": 0.2109375, + "rewards/rejected": -1.2109375, + "step": 1278 + }, + { + "epoch": 2.6771323914181058, + "grad_norm": 12.750375747680664, + "learning_rate": 1.4845671117124229e-08, + "logits/chosen": 1.78125, + "logits/rejected": 1.8125, + "logps/chosen": -406.0, + "logps/rejected": -384.0, + "loss": 0.5843, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3828125, + "rewards/margins": 0.6015625, + "rewards/rejected": -1.984375, + "step": 1279 + }, + { + "epoch": 2.6792255363683934, + "grad_norm": 11.617804527282715, + "learning_rate": 1.4653543527286419e-08, + "logits/chosen": 1.078125, + "logits/rejected": 1.953125, + "logps/chosen": -420.0, + "logps/rejected": -360.0, + "loss": 0.5954, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.15625, + "rewards/margins": -0.17578125, + "rewards/rejected": -1.984375, + "step": 1280 + }, + { + "epoch": 2.6813186813186816, + "grad_norm": 12.743657112121582, + "learning_rate": 1.4462633989915488e-08, + "logits/chosen": 2.5, + "logits/rejected": 3.34375, + "logps/chosen": -952.0, + "logps/rejected": -600.0, + "loss": 0.6118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4921875, + "rewards/margins": 0.447265625, + "rewards/rejected": -1.9375, + "step": 1281 + }, + { + "epoch": 2.6834118262689692, + "grad_norm": 11.60865306854248, + "learning_rate": 1.4272943492972566e-08, + "logits/chosen": 1.4140625, + "logits/rejected": 1.84375, + "logps/chosen": -560.0, + "logps/rejected": -552.0, + "loss": 0.5925, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.53125, + "rewards/margins": 0.80859375, + "rewards/rejected": -2.34375, + "step": 1282 + }, + { + "epoch": 2.685504971219257, + "grad_norm": 13.835541725158691, + "learning_rate": 1.4084473018110164e-08, + "logits/chosen": 1.7421875, + "logits/rejected": 2.28125, + "logps/chosen": -398.0, + "logps/rejected": -394.0, + "loss": 0.585, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5703125, + "rewards/margins": 0.1064453125, + "rewards/rejected": -1.671875, + "step": 1283 + }, + { + "epoch": 2.6875981161695446, + "grad_norm": 12.084956169128418, + "learning_rate": 1.3897223540667076e-08, + "logits/chosen": 2.765625, + "logits/rejected": 3.09375, + "logps/chosen": -588.0, + "logps/rejected": -580.0, + "loss": 0.5902, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3359375, + "rewards/margins": 0.5, + "rewards/rejected": -1.8359375, + "step": 1284 + }, + { + "epoch": 2.6896912611198327, + "grad_norm": 10.881200790405273, + "learning_rate": 1.3711196029663487e-08, + "logits/chosen": 1.9296875, + "logits/rejected": 1.984375, + "logps/chosen": -648.0, + "logps/rejected": -414.0, + "loss": 0.5565, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.78125, + "rewards/margins": 0.1796875, + "rewards/rejected": -1.9609375, + "step": 1285 + }, + { + "epoch": 2.6917844060701204, + "grad_norm": 11.821480751037598, + "learning_rate": 1.3526391447795904e-08, + "logits/chosen": 1.640625, + "logits/rejected": 2.015625, + "logps/chosen": -324.0, + "logps/rejected": -438.0, + "loss": 0.5904, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.328125, + "rewards/margins": 0.166015625, + "rewards/rejected": -1.4921875, + "step": 1286 + }, + { + "epoch": 2.693877551020408, + "grad_norm": 10.388792037963867, + "learning_rate": 1.3342810751432064e-08, + "logits/chosen": 2.203125, + "logits/rejected": 1.0859375, + "logps/chosen": -326.0, + "logps/rejected": -536.0, + "loss": 0.5561, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.515625, + "rewards/margins": 0.34375, + "rewards/rejected": -1.859375, + "step": 1287 + }, + { + "epoch": 2.695970695970696, + "grad_norm": 12.938568115234375, + "learning_rate": 1.3160454890606067e-08, + "logits/chosen": 1.546875, + "logits/rejected": 1.609375, + "logps/chosen": -284.0, + "logps/rejected": -272.0, + "loss": 0.5617, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4765625, + "rewards/margins": 0.265625, + "rewards/rejected": -1.7421875, + "step": 1288 + }, + { + "epoch": 2.698063840920984, + "grad_norm": 11.72751522064209, + "learning_rate": 1.2979324809013578e-08, + "logits/chosen": 1.078125, + "logits/rejected": 1.3359375, + "logps/chosen": -255.0, + "logps/rejected": -250.0, + "loss": 0.5794, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2578125, + "rewards/margins": 0.21484375, + "rewards/rejected": -1.4765625, + "step": 1289 + }, + { + "epoch": 2.7001569858712715, + "grad_norm": 10.565709114074707, + "learning_rate": 1.2799421444006754e-08, + "logits/chosen": 2.203125, + "logits/rejected": 2.453125, + "logps/chosen": -580.0, + "logps/rejected": -600.0, + "loss": 0.5436, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1640625, + "rewards/margins": 1.1875, + "rewards/rejected": -2.34375, + "step": 1290 + }, + { + "epoch": 2.702250130821559, + "grad_norm": 11.568882942199707, + "learning_rate": 1.2620745726589409e-08, + "logits/chosen": 1.3125, + "logits/rejected": 1.96875, + "logps/chosen": -440.0, + "logps/rejected": -430.0, + "loss": 0.5578, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.203125, + "rewards/margins": 0.5390625, + "rewards/rejected": -1.7421875, + "step": 1291 + }, + { + "epoch": 2.7043432757718473, + "grad_norm": 11.32339096069336, + "learning_rate": 1.2443298581412347e-08, + "logits/chosen": 1.421875, + "logits/rejected": 2.28125, + "logps/chosen": -502.0, + "logps/rejected": -372.0, + "loss": 0.5872, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.7265625, + "rewards/margins": -0.0576171875, + "rewards/rejected": -1.6640625, + "step": 1292 + }, + { + "epoch": 2.706436420722135, + "grad_norm": 11.9345703125, + "learning_rate": 1.2267080926768485e-08, + "logits/chosen": 1.46875, + "logits/rejected": 1.5859375, + "logps/chosen": -506.0, + "logps/rejected": -372.0, + "loss": 0.5798, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.640625, + "rewards/margins": -0.33984375, + "rewards/rejected": -1.296875, + "step": 1293 + }, + { + "epoch": 2.708529565672423, + "grad_norm": 11.131210327148438, + "learning_rate": 1.2092093674588059e-08, + "logits/chosen": 1.4921875, + "logits/rejected": 1.7734375, + "logps/chosen": -412.0, + "logps/rejected": -468.0, + "loss": 0.5934, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.953125, + "rewards/margins": -0.06640625, + "rewards/rejected": -1.8828125, + "step": 1294 + }, + { + "epoch": 2.7106227106227108, + "grad_norm": 11.777026176452637, + "learning_rate": 1.1918337730433852e-08, + "logits/chosen": 2.59375, + "logits/rejected": 2.65625, + "logps/chosen": -616.0, + "logps/rejected": -468.0, + "loss": 0.591, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.65625, + "rewards/margins": 0.5859375, + "rewards/rejected": -2.25, + "step": 1295 + }, + { + "epoch": 2.7127158555729984, + "grad_norm": 11.28355598449707, + "learning_rate": 1.1745813993496789e-08, + "logits/chosen": 1.390625, + "logits/rejected": 1.4765625, + "logps/chosen": -364.0, + "logps/rejected": -536.0, + "loss": 0.5969, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4765625, + "rewards/margins": 0.4140625, + "rewards/rejected": -1.890625, + "step": 1296 + }, + { + "epoch": 2.714809000523286, + "grad_norm": 11.327817916870117, + "learning_rate": 1.157452335659099e-08, + "logits/chosen": 1.8984375, + "logits/rejected": 2.421875, + "logps/chosen": -426.0, + "logps/rejected": -436.0, + "loss": 0.5378, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2890625, + "rewards/margins": 0.765625, + "rewards/rejected": -2.046875, + "step": 1297 + }, + { + "epoch": 2.716902145473574, + "grad_norm": 12.087903022766113, + "learning_rate": 1.1404466706149248e-08, + "logits/chosen": 2.5, + "logits/rejected": 2.140625, + "logps/chosen": -556.0, + "logps/rejected": -680.0, + "loss": 0.5745, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7109375, + "rewards/margins": -0.037109375, + "rewards/rejected": -1.671875, + "step": 1298 + }, + { + "epoch": 2.718995290423862, + "grad_norm": 11.75920581817627, + "learning_rate": 1.1235644922218483e-08, + "logits/chosen": 1.7265625, + "logits/rejected": 2.140625, + "logps/chosen": -608.0, + "logps/rejected": -688.0, + "loss": 0.5614, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.40625, + "rewards/margins": 1.0234375, + "rewards/rejected": -2.4375, + "step": 1299 + }, + { + "epoch": 2.7210884353741496, + "grad_norm": 11.290567398071289, + "learning_rate": 1.1068058878455178e-08, + "logits/chosen": 1.2890625, + "logits/rejected": 1.8515625, + "logps/chosen": -362.0, + "logps/rejected": -408.0, + "loss": 0.5772, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9765625, + "rewards/margins": 0.828125, + "rewards/rejected": -1.8046875, + "step": 1300 + }, + { + "epoch": 2.7231815803244377, + "grad_norm": 10.751604080200195, + "learning_rate": 1.0901709442120792e-08, + "logits/chosen": 3.0625, + "logits/rejected": 2.65625, + "logps/chosen": -688.0, + "logps/rejected": -648.0, + "loss": 0.5944, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.453125, + "rewards/margins": -0.1015625, + "rewards/rejected": -1.3515625, + "step": 1301 + }, + { + "epoch": 2.7252747252747254, + "grad_norm": 12.113463401794434, + "learning_rate": 1.0736597474077234e-08, + "logits/chosen": 1.96875, + "logits/rejected": 2.046875, + "logps/chosen": -422.0, + "logps/rejected": -528.0, + "loss": 0.515, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.234375, + "rewards/margins": 1.1328125, + "rewards/rejected": -2.359375, + "step": 1302 + }, + { + "epoch": 2.727367870225013, + "grad_norm": 11.601572036743164, + "learning_rate": 1.0572723828782626e-08, + "logits/chosen": 1.640625, + "logits/rejected": 1.1875, + "logps/chosen": -252.0, + "logps/rejected": -272.0, + "loss": 0.556, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.5, + "rewards/margins": 0.033203125, + "rewards/rejected": -1.5390625, + "step": 1303 + }, + { + "epoch": 2.7294610151753007, + "grad_norm": 11.546135902404785, + "learning_rate": 1.0410089354286747e-08, + "logits/chosen": 2.21875, + "logits/rejected": 3.40625, + "logps/chosen": -520.0, + "logps/rejected": -510.0, + "loss": 0.5902, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.109375, + "rewards/margins": 0.73828125, + "rewards/rejected": -1.8515625, + "step": 1304 + }, + { + "epoch": 2.731554160125589, + "grad_norm": 12.792366027832031, + "learning_rate": 1.0248694892226478e-08, + "logits/chosen": 1.828125, + "logits/rejected": 1.734375, + "logps/chosen": -748.0, + "logps/rejected": -612.0, + "loss": 0.6293, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.375, + "rewards/margins": -0.087890625, + "rewards/rejected": -1.2890625, + "step": 1305 + }, + { + "epoch": 2.7336473050758765, + "grad_norm": 10.632676124572754, + "learning_rate": 1.0088541277821808e-08, + "logits/chosen": 2.25, + "logits/rejected": 2.359375, + "logps/chosen": -536.0, + "logps/rejected": -540.0, + "loss": 0.5731, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9921875, + "rewards/margins": 0.0078125, + "rewards/rejected": -2.0, + "step": 1306 + }, + { + "epoch": 2.735740450026164, + "grad_norm": 11.74323844909668, + "learning_rate": 9.92962933987112e-09, + "logits/chosen": 2.25, + "logits/rejected": 2.5625, + "logps/chosen": -556.0, + "logps/rejected": -544.0, + "loss": 0.583, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0, + "rewards/margins": 0.462890625, + "rewards/rejected": -1.4609375, + "step": 1307 + }, + { + "epoch": 2.7378335949764523, + "grad_norm": 13.4630708694458, + "learning_rate": 9.771959900747297e-09, + "logits/chosen": 1.625, + "logits/rejected": 2.015625, + "logps/chosen": -580.0, + "logps/rejected": -504.0, + "loss": 0.5989, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.453125, + "rewards/margins": 0.150390625, + "rewards/rejected": -1.6015625, + "step": 1308 + }, + { + "epoch": 2.73992673992674, + "grad_norm": 11.922073364257812, + "learning_rate": 9.615533776393041e-09, + "logits/chosen": 1.46875, + "logits/rejected": 2.234375, + "logps/chosen": -548.0, + "logps/rejected": -410.0, + "loss": 0.5334, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9921875, + "rewards/margins": 0.140625, + "rewards/rejected": -2.125, + "step": 1309 + }, + { + "epoch": 2.7420198848770276, + "grad_norm": 13.078544616699219, + "learning_rate": 9.460351776317071e-09, + "logits/chosen": 1.78125, + "logits/rejected": 1.3984375, + "logps/chosen": -312.0, + "logps/rejected": -272.0, + "loss": 0.5964, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.453125, + "rewards/margins": -0.0322265625, + "rewards/rejected": -1.421875, + "step": 1310 + }, + { + "epoch": 2.7441130298273153, + "grad_norm": 12.194268226623535, + "learning_rate": 9.30641470358964e-09, + "logits/chosen": 1.78125, + "logits/rejected": 1.9296875, + "logps/chosen": -532.0, + "logps/rejected": -548.0, + "loss": 0.6033, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.578125, + "rewards/margins": -0.15234375, + "rewards/rejected": -1.4296875, + "step": 1311 + }, + { + "epoch": 2.7462061747776034, + "grad_norm": 12.832474708557129, + "learning_rate": 9.153723354838447e-09, + "logits/chosen": 2.546875, + "logits/rejected": 2.6875, + "logps/chosen": -472.0, + "logps/rejected": -536.0, + "loss": 0.6222, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.625, + "rewards/rejected": -1.75, + "step": 1312 + }, + { + "epoch": 2.748299319727891, + "grad_norm": 11.957188606262207, + "learning_rate": 9.00227852024463e-09, + "logits/chosen": 2.3125, + "logits/rejected": 2.671875, + "logps/chosen": -512.0, + "logps/rejected": -408.0, + "loss": 0.5906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3671875, + "rewards/margins": 0.8203125, + "rewards/rejected": -2.1875, + "step": 1313 + }, + { + "epoch": 2.750392464678179, + "grad_norm": 11.670499801635742, + "learning_rate": 8.852080983538517e-09, + "logits/chosen": 1.7734375, + "logits/rejected": 2.75, + "logps/chosen": -632.0, + "logps/rejected": -388.0, + "loss": 0.5963, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0234375, + "rewards/margins": 1.0, + "rewards/rejected": -2.03125, + "step": 1314 + }, + { + "epoch": 2.752485609628467, + "grad_norm": 11.205465316772461, + "learning_rate": 8.703131521995693e-09, + "logits/chosen": 2.421875, + "logits/rejected": 2.5625, + "logps/chosen": -848.0, + "logps/rejected": -776.0, + "loss": 0.6083, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.4140625, + "rewards/rejected": -1.546875, + "step": 1315 + }, + { + "epoch": 2.7545787545787546, + "grad_norm": 11.558897018432617, + "learning_rate": 8.555430906432838e-09, + "logits/chosen": 1.78125, + "logits/rejected": 2.40625, + "logps/chosen": -480.0, + "logps/rejected": -404.0, + "loss": 0.5502, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2578125, + "rewards/margins": 0.44921875, + "rewards/rejected": -1.7109375, + "step": 1316 + }, + { + "epoch": 2.7566718995290422, + "grad_norm": 11.394280433654785, + "learning_rate": 8.408979901203941e-09, + "logits/chosen": 1.8046875, + "logits/rejected": 2.125, + "logps/chosen": -440.0, + "logps/rejected": -496.0, + "loss": 0.5655, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7109375, + "rewards/margins": -0.091796875, + "rewards/rejected": -1.6171875, + "step": 1317 + }, + { + "epoch": 2.7587650444793304, + "grad_norm": 11.302227973937988, + "learning_rate": 8.263779264196152e-09, + "logits/chosen": 2.15625, + "logits/rejected": 2.59375, + "logps/chosen": -490.0, + "logps/rejected": -394.0, + "loss": 0.5616, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.671875, + "rewards/margins": 0.3671875, + "rewards/rejected": -2.046875, + "step": 1318 + }, + { + "epoch": 2.760858189429618, + "grad_norm": 10.928841590881348, + "learning_rate": 8.119829746825964e-09, + "logits/chosen": 1.640625, + "logits/rejected": 2.15625, + "logps/chosen": -424.0, + "logps/rejected": -520.0, + "loss": 0.577, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3984375, + "rewards/margins": 0.79296875, + "rewards/rejected": -2.1875, + "step": 1319 + }, + { + "epoch": 2.7629513343799057, + "grad_norm": 11.287101745605469, + "learning_rate": 7.977132094035315e-09, + "logits/chosen": 1.6640625, + "logits/rejected": 2.015625, + "logps/chosen": -420.0, + "logps/rejected": -430.0, + "loss": 0.5749, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.671875, + "rewards/margins": 0.412109375, + "rewards/rejected": -2.09375, + "step": 1320 + }, + { + "epoch": 2.765044479330194, + "grad_norm": 13.27101993560791, + "learning_rate": 7.835687044287696e-09, + "logits/chosen": 1.2578125, + "logits/rejected": 1.5859375, + "logps/chosen": -380.0, + "logps/rejected": -460.0, + "loss": 0.5573, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.109375, + "rewards/margins": 0.6328125, + "rewards/rejected": -1.7421875, + "step": 1321 + }, + { + "epoch": 2.7671376242804815, + "grad_norm": 11.39008617401123, + "learning_rate": 7.695495329564341e-09, + "logits/chosen": 2.109375, + "logits/rejected": 3.46875, + "logps/chosen": -720.0, + "logps/rejected": -366.0, + "loss": 0.6035, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4453125, + "rewards/margins": 0.1376953125, + "rewards/rejected": -1.578125, + "step": 1322 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 12.085970878601074, + "learning_rate": 7.556557675360443e-09, + "logits/chosen": 1.8515625, + "logits/rejected": 1.953125, + "logps/chosen": -532.0, + "logps/rejected": -296.0, + "loss": 0.5797, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.21875, + "rewards/margins": 0.33984375, + "rewards/rejected": -1.5625, + "step": 1323 + }, + { + "epoch": 2.771323914181057, + "grad_norm": 10.792349815368652, + "learning_rate": 7.418874800681472e-09, + "logits/chosen": 1.1953125, + "logits/rejected": 1.484375, + "logps/chosen": -328.0, + "logps/rejected": -246.0, + "loss": 0.5747, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.1328125, + "rewards/margins": 0.0703125, + "rewards/rejected": -1.203125, + "step": 1324 + }, + { + "epoch": 2.773417059131345, + "grad_norm": 11.899036407470703, + "learning_rate": 7.2824474180393035e-09, + "logits/chosen": 1.6171875, + "logits/rejected": 1.84375, + "logps/chosen": -620.0, + "logps/rejected": -298.0, + "loss": 0.6034, + "rewards/accuracies": 0.25, + "rewards/chosen": -2.265625, + "rewards/margins": -0.52734375, + "rewards/rejected": -1.734375, + "step": 1325 + }, + { + "epoch": 2.7755102040816326, + "grad_norm": 11.257723808288574, + "learning_rate": 7.1472762334486005e-09, + "logits/chosen": 0.6796875, + "logits/rejected": 0.81640625, + "logps/chosen": -218.0, + "logps/rejected": -316.0, + "loss": 0.5686, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.109375, + "rewards/margins": 0.609375, + "rewards/rejected": -1.71875, + "step": 1326 + }, + { + "epoch": 2.7776033490319203, + "grad_norm": 13.037230491638184, + "learning_rate": 7.013361946423297e-09, + "logits/chosen": 2.046875, + "logits/rejected": 3.15625, + "logps/chosen": -628.0, + "logps/rejected": -510.0, + "loss": 0.551, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4453125, + "rewards/margins": 0.41015625, + "rewards/rejected": -1.859375, + "step": 1327 + }, + { + "epoch": 2.7796964939822084, + "grad_norm": 11.820975303649902, + "learning_rate": 6.880705249972762e-09, + "logits/chosen": 2.671875, + "logits/rejected": 2.90625, + "logps/chosen": -1168.0, + "logps/rejected": -640.0, + "loss": 0.5556, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1875, + "rewards/margins": 0.6171875, + "rewards/rejected": -1.796875, + "step": 1328 + }, + { + "epoch": 2.781789638932496, + "grad_norm": 12.489603042602539, + "learning_rate": 6.749306830598223e-09, + "logits/chosen": 2.375, + "logits/rejected": 2.859375, + "logps/chosen": -936.0, + "logps/rejected": -436.0, + "loss": 0.6013, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5546875, + "rewards/margins": 0.130859375, + "rewards/rejected": -1.6875, + "step": 1329 + }, + { + "epoch": 2.7838827838827838, + "grad_norm": 10.987958908081055, + "learning_rate": 6.619167368289517e-09, + "logits/chosen": 1.7421875, + "logits/rejected": 1.453125, + "logps/chosen": -524.0, + "logps/rejected": -480.0, + "loss": 0.5985, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2578125, + "rewards/margins": 0.1982421875, + "rewards/rejected": -1.453125, + "step": 1330 + }, + { + "epoch": 2.7859759288330714, + "grad_norm": 10.890115737915039, + "learning_rate": 6.490287536521181e-09, + "logits/chosen": 2.328125, + "logits/rejected": 2.671875, + "logps/chosen": -680.0, + "logps/rejected": -652.0, + "loss": 0.5408, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.59375, + "rewards/margins": 0.076171875, + "rewards/rejected": -1.671875, + "step": 1331 + }, + { + "epoch": 2.7880690737833596, + "grad_norm": 12.001659393310547, + "learning_rate": 6.362668002249141e-09, + "logits/chosen": 2.125, + "logits/rejected": 2.59375, + "logps/chosen": -548.0, + "logps/rejected": -434.0, + "loss": 0.5569, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.609375, + "rewards/margins": -0.03515625, + "rewards/rejected": -1.5703125, + "step": 1332 + }, + { + "epoch": 2.7901622187336472, + "grad_norm": 11.607378005981445, + "learning_rate": 6.236309425907337e-09, + "logits/chosen": 2.125, + "logits/rejected": 3.28125, + "logps/chosen": -462.0, + "logps/rejected": -608.0, + "loss": 0.5898, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.78125, + "rewards/margins": 0.390625, + "rewards/rejected": -2.171875, + "step": 1333 + }, + { + "epoch": 2.7922553636839353, + "grad_norm": 13.034173011779785, + "learning_rate": 6.111212461404191e-09, + "logits/chosen": 1.7890625, + "logits/rejected": 1.390625, + "logps/chosen": -532.0, + "logps/rejected": -588.0, + "loss": 0.5415, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3671875, + "rewards/margins": 0.2431640625, + "rewards/rejected": -1.609375, + "step": 1334 + }, + { + "epoch": 2.794348508634223, + "grad_norm": 11.403310775756836, + "learning_rate": 5.987377756119224e-09, + "logits/chosen": 1.3046875, + "logits/rejected": 1.4609375, + "logps/chosen": -332.0, + "logps/rejected": -370.0, + "loss": 0.5802, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.21875, + "rewards/margins": 0.1875, + "rewards/rejected": -1.40625, + "step": 1335 + }, + { + "epoch": 2.7964416535845107, + "grad_norm": 11.478974342346191, + "learning_rate": 5.864805950899722e-09, + "logits/chosen": 3.09375, + "logits/rejected": 2.84375, + "logps/chosen": -564.0, + "logps/rejected": -704.0, + "loss": 0.6002, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6328125, + "rewards/margins": 0.4296875, + "rewards/rejected": -2.0625, + "step": 1336 + }, + { + "epoch": 2.7985347985347984, + "grad_norm": 11.049171447753906, + "learning_rate": 5.743497680057553e-09, + "logits/chosen": 2.8125, + "logits/rejected": 3.046875, + "logps/chosen": -816.0, + "logps/rejected": -784.0, + "loss": 0.5882, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.296875, + "rewards/margins": 0.6328125, + "rewards/rejected": -1.9296875, + "step": 1337 + }, + { + "epoch": 2.8006279434850865, + "grad_norm": 11.94770622253418, + "learning_rate": 5.623453571365659e-09, + "logits/chosen": 2.0625, + "logits/rejected": 2.1875, + "logps/chosen": -536.0, + "logps/rejected": -592.0, + "loss": 0.5683, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.484375, + "rewards/margins": 0.8984375, + "rewards/rejected": -2.375, + "step": 1338 + }, + { + "epoch": 2.802721088435374, + "grad_norm": 11.677643775939941, + "learning_rate": 5.504674246054929e-09, + "logits/chosen": 1.890625, + "logits/rejected": 2.40625, + "logps/chosen": -316.0, + "logps/rejected": -312.0, + "loss": 0.607, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.140625, + "rewards/margins": 0.51953125, + "rewards/rejected": -1.6640625, + "step": 1339 + }, + { + "epoch": 2.804814233385662, + "grad_norm": 12.415672302246094, + "learning_rate": 5.3871603188110015e-09, + "logits/chosen": 2.578125, + "logits/rejected": 2.46875, + "logps/chosen": -624.0, + "logps/rejected": -556.0, + "loss": 0.604, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.171875, + "rewards/margins": -0.025390625, + "rewards/rejected": -2.140625, + "step": 1340 + }, + { + "epoch": 2.80690737833595, + "grad_norm": 11.957086563110352, + "learning_rate": 5.270912397771023e-09, + "logits/chosen": 2.375, + "logits/rejected": 2.21875, + "logps/chosen": -384.0, + "logps/rejected": -592.0, + "loss": 0.5621, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.921875, + "rewards/margins": 0.53125, + "rewards/rejected": -1.453125, + "step": 1341 + }, + { + "epoch": 2.8090005232862376, + "grad_norm": 11.723847389221191, + "learning_rate": 5.1559310845205584e-09, + "logits/chosen": 2.34375, + "logits/rejected": 1.7734375, + "logps/chosen": -326.0, + "logps/rejected": -572.0, + "loss": 0.5695, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.46875, + "rewards/margins": 0.34375, + "rewards/rejected": -1.8125, + "step": 1342 + }, + { + "epoch": 2.8110936682365253, + "grad_norm": 11.346755981445312, + "learning_rate": 5.042216974090385e-09, + "logits/chosen": 2.53125, + "logits/rejected": 2.96875, + "logps/chosen": -418.0, + "logps/rejected": -356.0, + "loss": 0.5903, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.1298828125, + "rewards/rejected": -1.28125, + "step": 1343 + }, + { + "epoch": 2.813186813186813, + "grad_norm": 11.852492332458496, + "learning_rate": 4.9297706549536206e-09, + "logits/chosen": 1.90625, + "logits/rejected": 2.140625, + "logps/chosen": -532.0, + "logps/rejected": -528.0, + "loss": 0.5812, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2890625, + "rewards/margins": 0.33984375, + "rewards/rejected": -1.6328125, + "step": 1344 + }, + { + "epoch": 2.815279958137101, + "grad_norm": 11.91535472869873, + "learning_rate": 4.818592709022374e-09, + "logits/chosen": 1.4296875, + "logits/rejected": 1.5078125, + "logps/chosen": -456.0, + "logps/rejected": -360.0, + "loss": 0.5675, + "rewards/accuracies": 0.25, + "rewards/chosen": -2.0625, + "rewards/margins": -0.359375, + "rewards/rejected": -1.703125, + "step": 1345 + }, + { + "epoch": 2.8173731030873888, + "grad_norm": 10.856095314025879, + "learning_rate": 4.708683711644967e-09, + "logits/chosen": 2.015625, + "logits/rejected": 1.859375, + "logps/chosen": -466.0, + "logps/rejected": -600.0, + "loss": 0.5566, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.25, + "rewards/margins": 0.70703125, + "rewards/rejected": -1.953125, + "step": 1346 + }, + { + "epoch": 2.819466248037677, + "grad_norm": 12.558600425720215, + "learning_rate": 4.600044231602881e-09, + "logits/chosen": 1.5546875, + "logits/rejected": 2.09375, + "logps/chosen": -548.0, + "logps/rejected": -388.0, + "loss": 0.565, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.53125, + "rewards/margins": 0.275390625, + "rewards/rejected": -1.8046875, + "step": 1347 + }, + { + "epoch": 2.8215593929879645, + "grad_norm": 11.158424377441406, + "learning_rate": 4.492674831107842e-09, + "logits/chosen": 1.765625, + "logits/rejected": 1.578125, + "logps/chosen": -272.0, + "logps/rejected": -600.0, + "loss": 0.5686, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3203125, + "rewards/margins": 1.4296875, + "rewards/rejected": -2.75, + "step": 1348 + }, + { + "epoch": 2.823652537938252, + "grad_norm": 10.950251579284668, + "learning_rate": 4.386576065798857e-09, + "logits/chosen": 1.25, + "logits/rejected": 1.296875, + "logps/chosen": -192.0, + "logps/rejected": -230.0, + "loss": 0.5563, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.90625, + "rewards/margins": 0.265625, + "rewards/rejected": -1.171875, + "step": 1349 + }, + { + "epoch": 2.82574568288854, + "grad_norm": 11.691061973571777, + "learning_rate": 4.281748484739318e-09, + "logits/chosen": 2.046875, + "logits/rejected": 1.6953125, + "logps/chosen": -482.0, + "logps/rejected": -556.0, + "loss": 0.5678, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.265625, + "rewards/margins": 0.1474609375, + "rewards/rejected": -1.4140625, + "step": 1350 + }, + { + "epoch": 2.8278388278388276, + "grad_norm": 11.009173393249512, + "learning_rate": 4.178192630414292e-09, + "logits/chosen": 2.328125, + "logits/rejected": 2.71875, + "logps/chosen": -608.0, + "logps/rejected": -368.0, + "loss": 0.5479, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9765625, + "rewards/margins": 0.462890625, + "rewards/rejected": -1.4375, + "step": 1351 + }, + { + "epoch": 2.8299319727891157, + "grad_norm": 11.051095008850098, + "learning_rate": 4.0759090387276545e-09, + "logits/chosen": 2.21875, + "logits/rejected": 2.34375, + "logps/chosen": -490.0, + "logps/rejected": -418.0, + "loss": 0.5898, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4453125, + "rewards/margins": 0.63671875, + "rewards/rejected": -2.078125, + "step": 1352 + }, + { + "epoch": 2.8320251177394034, + "grad_norm": 10.837244987487793, + "learning_rate": 3.974898238999182e-09, + "logits/chosen": 2.25, + "logits/rejected": 2.609375, + "logps/chosen": -544.0, + "logps/rejected": -432.0, + "loss": 0.5494, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.40625, + "rewards/margins": 0.3984375, + "rewards/rejected": -1.8046875, + "step": 1353 + }, + { + "epoch": 2.8341182626896915, + "grad_norm": 11.459234237670898, + "learning_rate": 3.875160753962021e-09, + "logits/chosen": 0.173828125, + "logits/rejected": 0.76953125, + "logps/chosen": -246.0, + "logps/rejected": -229.0, + "loss": 0.5664, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3046875, + "rewards/margins": 0.0859375, + "rewards/rejected": -1.390625, + "step": 1354 + }, + { + "epoch": 2.836211407639979, + "grad_norm": 11.394813537597656, + "learning_rate": 3.776697099759833e-09, + "logits/chosen": 1.515625, + "logits/rejected": 2.0625, + "logps/chosen": -536.0, + "logps/rejected": -466.0, + "loss": 0.5806, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3828125, + "rewards/margins": 0.396484375, + "rewards/rejected": -1.78125, + "step": 1355 + }, + { + "epoch": 2.838304552590267, + "grad_norm": 13.518462181091309, + "learning_rate": 3.679507785944185e-09, + "logits/chosen": 1.015625, + "logits/rejected": 1.2890625, + "logps/chosen": -328.0, + "logps/rejected": -384.0, + "loss": 0.6375, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.71875, + "rewards/margins": 0.6171875, + "rewards/rejected": -2.34375, + "step": 1356 + }, + { + "epoch": 2.8403976975405545, + "grad_norm": 12.291327476501465, + "learning_rate": 3.58359331547194e-09, + "logits/chosen": 2.171875, + "logits/rejected": 1.3671875, + "logps/chosen": -414.0, + "logps/rejected": -600.0, + "loss": 0.6212, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.53125, + "rewards/margins": 0.7890625, + "rewards/rejected": -2.328125, + "step": 1357 + }, + { + "epoch": 2.8424908424908426, + "grad_norm": 12.211554527282715, + "learning_rate": 3.4889541847025653e-09, + "logits/chosen": 1.8515625, + "logits/rejected": 2.25, + "logps/chosen": -484.0, + "logps/rejected": -466.0, + "loss": 0.5661, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3125, + "rewards/margins": 0.44921875, + "rewards/rejected": -1.765625, + "step": 1358 + }, + { + "epoch": 2.8445839874411303, + "grad_norm": 12.278834342956543, + "learning_rate": 3.39559088339569e-09, + "logits/chosen": 2.375, + "logits/rejected": 2.109375, + "logps/chosen": -664.0, + "logps/rejected": -848.0, + "loss": 0.6107, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.421875, + "rewards/margins": 0.1123046875, + "rewards/rejected": -1.53125, + "step": 1359 + }, + { + "epoch": 2.846677132391418, + "grad_norm": 11.289807319641113, + "learning_rate": 3.303503894708414e-09, + "logits/chosen": 2.375, + "logits/rejected": 2.921875, + "logps/chosen": -628.0, + "logps/rejected": -584.0, + "loss": 0.5567, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.703125, + "rewards/margins": 0.404296875, + "rewards/rejected": -2.109375, + "step": 1360 + }, + { + "epoch": 2.848770277341706, + "grad_norm": 11.582511901855469, + "learning_rate": 3.2126936951929205e-09, + "logits/chosen": 1.46875, + "logits/rejected": 1.921875, + "logps/chosen": -456.0, + "logps/rejected": -660.0, + "loss": 0.5281, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0078125, + "rewards/margins": 0.98828125, + "rewards/rejected": -2.0, + "step": 1361 + }, + { + "epoch": 2.8508634222919937, + "grad_norm": 11.814677238464355, + "learning_rate": 3.1231607547940605e-09, + "logits/chosen": 1.65625, + "logits/rejected": 1.8125, + "logps/chosen": -284.0, + "logps/rejected": -620.0, + "loss": 0.5227, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.53125, + "rewards/margins": 0.458984375, + "rewards/rejected": -1.9921875, + "step": 1362 + }, + { + "epoch": 2.8529565672422814, + "grad_norm": 12.034936904907227, + "learning_rate": 3.0349055368466632e-09, + "logits/chosen": 1.65625, + "logits/rejected": 1.484375, + "logps/chosen": -400.0, + "logps/rejected": -416.0, + "loss": 0.5997, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.2890625, + "rewards/margins": 0.123046875, + "rewards/rejected": -1.40625, + "step": 1363 + }, + { + "epoch": 2.855049712192569, + "grad_norm": 12.554017066955566, + "learning_rate": 2.9479284980735085e-09, + "logits/chosen": 2.28125, + "logits/rejected": 2.203125, + "logps/chosen": -392.0, + "logps/rejected": -580.0, + "loss": 0.592, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5625, + "rewards/margins": -0.33984375, + "rewards/rejected": -1.21875, + "step": 1364 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 11.883865356445312, + "learning_rate": 2.862230088582717e-09, + "logits/chosen": 1.953125, + "logits/rejected": 1.421875, + "logps/chosen": -364.0, + "logps/rejected": -644.0, + "loss": 0.6202, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.421875, + "rewards/margins": 0.7734375, + "rewards/rejected": -2.1875, + "step": 1365 + }, + { + "epoch": 2.859236002093145, + "grad_norm": 10.784345626831055, + "learning_rate": 2.7778107518653115e-09, + "logits/chosen": 0.98828125, + "logits/rejected": 1.359375, + "logps/chosen": -398.0, + "logps/rejected": -298.0, + "loss": 0.6086, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.453125, + "rewards/margins": 0.138671875, + "rewards/rejected": -1.59375, + "step": 1366 + }, + { + "epoch": 2.861329147043433, + "grad_norm": 11.65639877319336, + "learning_rate": 2.6946709247933257e-09, + "logits/chosen": 1.28125, + "logits/rejected": 1.5, + "logps/chosen": -322.0, + "logps/rejected": -356.0, + "loss": 0.5634, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5234375, + "rewards/margins": 0.46484375, + "rewards/rejected": -1.9921875, + "step": 1367 + }, + { + "epoch": 2.8634222919937207, + "grad_norm": 12.796161651611328, + "learning_rate": 2.612811037617142e-09, + "logits/chosen": 0.8125, + "logits/rejected": 1.0703125, + "logps/chosen": -462.0, + "logps/rejected": -326.0, + "loss": 0.6258, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4375, + "rewards/margins": 0.400390625, + "rewards/rejected": -1.8359375, + "step": 1368 + }, + { + "epoch": 2.8655154369440083, + "grad_norm": 11.98444652557373, + "learning_rate": 2.5322315139635215e-09, + "logits/chosen": 1.765625, + "logits/rejected": 1.4375, + "logps/chosen": -362.0, + "logps/rejected": -540.0, + "loss": 0.5765, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1015625, + "rewards/margins": 0.8515625, + "rewards/rejected": -1.953125, + "step": 1369 + }, + { + "epoch": 2.867608581894296, + "grad_norm": 11.5888032913208, + "learning_rate": 2.4529327708332437e-09, + "logits/chosen": 1.7890625, + "logits/rejected": 2.40625, + "logps/chosen": -450.0, + "logps/rejected": -456.0, + "loss": 0.5898, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.546875, + "rewards/margins": -0.1015625, + "rewards/rejected": -1.4453125, + "step": 1370 + }, + { + "epoch": 2.869701726844584, + "grad_norm": 11.986724853515625, + "learning_rate": 2.374915218599025e-09, + "logits/chosen": 2.515625, + "logits/rejected": 2.265625, + "logps/chosen": -848.0, + "logps/rejected": -568.0, + "loss": 0.5708, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5390625, + "rewards/margins": 0.291015625, + "rewards/rejected": -1.828125, + "step": 1371 + }, + { + "epoch": 2.871794871794872, + "grad_norm": 12.023932456970215, + "learning_rate": 2.2981792610034677e-09, + "logits/chosen": 1.03125, + "logits/rejected": 1.5078125, + "logps/chosen": -352.0, + "logps/rejected": -382.0, + "loss": 0.6112, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5625, + "rewards/margins": 0.138671875, + "rewards/rejected": -1.703125, + "step": 1372 + }, + { + "epoch": 2.8738880167451595, + "grad_norm": 11.441040992736816, + "learning_rate": 2.222725295156808e-09, + "logits/chosen": 2.1875, + "logits/rejected": 3.359375, + "logps/chosen": -848.0, + "logps/rejected": -510.0, + "loss": 0.5742, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.328125, + "rewards/margins": 0.23828125, + "rewards/rejected": -1.5625, + "step": 1373 + }, + { + "epoch": 2.8759811616954476, + "grad_norm": 11.501876831054688, + "learning_rate": 2.1485537115350034e-09, + "logits/chosen": 2.671875, + "logits/rejected": 3.515625, + "logps/chosen": -652.0, + "logps/rejected": -540.0, + "loss": 0.5568, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.296875, + "rewards/margins": 0.6328125, + "rewards/rejected": -1.9375, + "step": 1374 + }, + { + "epoch": 2.8780743066457353, + "grad_norm": 12.223243713378906, + "learning_rate": 2.075664893977596e-09, + "logits/chosen": 2.125, + "logits/rejected": 2.015625, + "logps/chosen": -728.0, + "logps/rejected": -724.0, + "loss": 0.5875, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.78125, + "rewards/margins": -0.021484375, + "rewards/rejected": -1.7578125, + "step": 1375 + }, + { + "epoch": 2.880167451596023, + "grad_norm": 11.807002067565918, + "learning_rate": 2.004059219685879e-09, + "logits/chosen": 1.453125, + "logits/rejected": 1.875, + "logps/chosen": -450.0, + "logps/rejected": -458.0, + "loss": 0.5473, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.265625, + "rewards/margins": 0.515625, + "rewards/rejected": -1.78125, + "step": 1376 + }, + { + "epoch": 2.8822605965463106, + "grad_norm": 11.525500297546387, + "learning_rate": 1.9337370592207062e-09, + "logits/chosen": 2.28125, + "logits/rejected": 2.359375, + "logps/chosen": -604.0, + "logps/rejected": -424.0, + "loss": 0.587, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.328125, + "rewards/margins": 0.392578125, + "rewards/rejected": -1.71875, + "step": 1377 + }, + { + "epoch": 2.8843537414965987, + "grad_norm": 11.223447799682617, + "learning_rate": 1.8646987765008824e-09, + "logits/chosen": 1.71875, + "logits/rejected": 1.546875, + "logps/chosen": -228.0, + "logps/rejected": -354.0, + "loss": 0.5342, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1796875, + "rewards/margins": 0.640625, + "rewards/rejected": -1.828125, + "step": 1378 + }, + { + "epoch": 2.8864468864468864, + "grad_norm": 12.623505592346191, + "learning_rate": 1.7969447288010238e-09, + "logits/chosen": 1.9296875, + "logits/rejected": 1.765625, + "logps/chosen": -458.0, + "logps/rejected": -398.0, + "loss": 0.5929, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6015625, + "rewards/margins": 0.017578125, + "rewards/rejected": -1.625, + "step": 1379 + }, + { + "epoch": 2.8885400313971745, + "grad_norm": 12.275703430175781, + "learning_rate": 1.7304752667497843e-09, + "logits/chosen": 2.109375, + "logits/rejected": 1.2265625, + "logps/chosen": -246.0, + "logps/rejected": -346.0, + "loss": 0.5909, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.203125, + "rewards/margins": 0.25, + "rewards/rejected": -1.453125, + "step": 1380 + }, + { + "epoch": 2.890633176347462, + "grad_norm": 10.699250221252441, + "learning_rate": 1.6652907343281343e-09, + "logits/chosen": 2.578125, + "logits/rejected": 2.6875, + "logps/chosen": -716.0, + "logps/rejected": -504.0, + "loss": 0.5286, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4765625, + "rewards/margins": 0.6484375, + "rewards/rejected": -2.125, + "step": 1381 + }, + { + "epoch": 2.89272632129775, + "grad_norm": 10.823837280273438, + "learning_rate": 1.6013914688674172e-09, + "logits/chosen": 2.40625, + "logits/rejected": 3.15625, + "logps/chosen": -640.0, + "logps/rejected": -604.0, + "loss": 0.5155, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.234375, + "rewards/margins": 0.38671875, + "rewards/rejected": -1.625, + "step": 1382 + }, + { + "epoch": 2.8948194662480375, + "grad_norm": 12.310914039611816, + "learning_rate": 1.5387778010477968e-09, + "logits/chosen": 1.578125, + "logits/rejected": 2.40625, + "logps/chosen": -524.0, + "logps/rejected": -482.0, + "loss": 0.6252, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6015625, + "rewards/margins": 0.3203125, + "rewards/rejected": -1.921875, + "step": 1383 + }, + { + "epoch": 2.896912611198325, + "grad_norm": 11.84903335571289, + "learning_rate": 1.4774500548963405e-09, + "logits/chosen": 1.75, + "logits/rejected": 1.5625, + "logps/chosen": -304.0, + "logps/rejected": -452.0, + "loss": 0.6086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5625, + "rewards/margins": 0.5703125, + "rewards/rejected": -2.125, + "step": 1384 + }, + { + "epoch": 2.8990057561486133, + "grad_norm": 11.620820999145508, + "learning_rate": 1.4174085477854664e-09, + "logits/chosen": 2.1875, + "logits/rejected": 2.3125, + "logps/chosen": -756.0, + "logps/rejected": -498.0, + "loss": 0.5592, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7578125, + "rewards/margins": -0.0234375, + "rewards/rejected": -1.734375, + "step": 1385 + }, + { + "epoch": 2.901098901098901, + "grad_norm": 11.344687461853027, + "learning_rate": 1.3586535904313612e-09, + "logits/chosen": 1.640625, + "logits/rejected": 2.140625, + "logps/chosen": -476.0, + "logps/rejected": -564.0, + "loss": 0.5802, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.375, + "rewards/margins": 0.248046875, + "rewards/rejected": -1.6171875, + "step": 1386 + }, + { + "epoch": 2.903192046049189, + "grad_norm": 10.856180191040039, + "learning_rate": 1.3011854868921756e-09, + "logits/chosen": 1.859375, + "logits/rejected": 1.8359375, + "logps/chosen": -510.0, + "logps/rejected": -536.0, + "loss": 0.5505, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3046875, + "rewards/margins": 0.60546875, + "rewards/rejected": -1.9140625, + "step": 1387 + }, + { + "epoch": 2.905285190999477, + "grad_norm": 11.679254531860352, + "learning_rate": 1.2450045345665826e-09, + "logits/chosen": 1.90625, + "logits/rejected": 2.671875, + "logps/chosen": -592.0, + "logps/rejected": -344.0, + "loss": 0.5609, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.359375, + "rewards/margins": 0.1689453125, + "rewards/rejected": -1.53125, + "step": 1388 + }, + { + "epoch": 2.9073783359497645, + "grad_norm": 11.584948539733887, + "learning_rate": 1.1901110241923045e-09, + "logits/chosen": 2.1875, + "logits/rejected": 2.5, + "logps/chosen": -500.0, + "logps/rejected": -532.0, + "loss": 0.5921, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.640625, + "rewards/margins": 0.203125, + "rewards/rejected": -1.84375, + "step": 1389 + }, + { + "epoch": 2.909471480900052, + "grad_norm": 12.864279747009277, + "learning_rate": 1.1365052398444774e-09, + "logits/chosen": 0.88671875, + "logits/rejected": 0.62890625, + "logps/chosen": -358.0, + "logps/rejected": -556.0, + "loss": 0.6319, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.71875, + "rewards/margins": 0.3828125, + "rewards/rejected": -2.109375, + "step": 1390 + }, + { + "epoch": 2.9115646258503403, + "grad_norm": 11.117484092712402, + "learning_rate": 1.0841874589341515e-09, + "logits/chosen": 1.5703125, + "logits/rejected": 1.7734375, + "logps/chosen": -360.0, + "logps/rejected": -390.0, + "loss": 0.5577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.93359375, + "rewards/margins": 0.86328125, + "rewards/rejected": -1.796875, + "step": 1391 + }, + { + "epoch": 2.913657770800628, + "grad_norm": 11.658158302307129, + "learning_rate": 1.033157952207015e-09, + "logits/chosen": 2.0, + "logits/rejected": 2.640625, + "logps/chosen": -576.0, + "logps/rejected": -450.0, + "loss": 0.5708, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.796875, + "rewards/margins": 0.12109375, + "rewards/rejected": -1.921875, + "step": 1392 + }, + { + "epoch": 2.9157509157509156, + "grad_norm": 11.494297981262207, + "learning_rate": 9.834169837419226e-10, + "logits/chosen": 1.7890625, + "logits/rejected": 2.671875, + "logps/chosen": -556.0, + "logps/rejected": -434.0, + "loss": 0.5964, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.376953125, + "rewards/rejected": -1.5859375, + "step": 1393 + }, + { + "epoch": 2.9178440607012037, + "grad_norm": 11.906575202941895, + "learning_rate": 9.349648109494255e-10, + "logits/chosen": 1.1171875, + "logits/rejected": 1.1953125, + "logps/chosen": -576.0, + "logps/rejected": -448.0, + "loss": 0.5775, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5859375, + "rewards/margins": 0.087890625, + "rewards/rejected": -1.671875, + "step": 1394 + }, + { + "epoch": 2.9199372056514914, + "grad_norm": 11.835088729858398, + "learning_rate": 8.878016845706324e-10, + "logits/chosen": 1.171875, + "logits/rejected": 1.7578125, + "logps/chosen": -482.0, + "logps/rejected": -340.0, + "loss": 0.5616, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.734375, + "rewards/margins": 0.10546875, + "rewards/rejected": -1.84375, + "step": 1395 + }, + { + "epoch": 2.922030350601779, + "grad_norm": 11.811446189880371, + "learning_rate": 8.419278486757394e-10, + "logits/chosen": 2.1875, + "logits/rejected": 2.234375, + "logps/chosen": -418.0, + "logps/rejected": -500.0, + "loss": 0.5857, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8125, + "rewards/margins": 0.1484375, + "rewards/rejected": -1.9609375, + "step": 1396 + }, + { + "epoch": 2.9241234955520667, + "grad_norm": 12.178156852722168, + "learning_rate": 7.973435406628644e-10, + "logits/chosen": 2.171875, + "logits/rejected": 2.546875, + "logps/chosen": -592.0, + "logps/rejected": -772.0, + "loss": 0.6156, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.171875, + "rewards/margins": 0.9296875, + "rewards/rejected": -2.09375, + "step": 1397 + }, + { + "epoch": 2.926216640502355, + "grad_norm": 10.631257057189941, + "learning_rate": 7.540489912567702e-10, + "logits/chosen": 2.40625, + "logits/rejected": 2.78125, + "logps/chosen": -486.0, + "logps/rejected": -422.0, + "loss": 0.5974, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.25, + "rewards/margins": 0.384765625, + "rewards/rejected": -1.640625, + "step": 1398 + }, + { + "epoch": 2.9283097854526425, + "grad_norm": 10.772899627685547, + "learning_rate": 7.120444245076987e-10, + "logits/chosen": 1.546875, + "logits/rejected": 1.7578125, + "logps/chosen": -608.0, + "logps/rejected": -624.0, + "loss": 0.5379, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.84375, + "rewards/margins": 0.296875, + "rewards/rejected": -2.140625, + "step": 1399 + }, + { + "epoch": 2.9304029304029307, + "grad_norm": 12.620939254760742, + "learning_rate": 6.713300577902336e-10, + "logits/chosen": 1.5703125, + "logits/rejected": 1.875, + "logps/chosen": -482.0, + "logps/rejected": -470.0, + "loss": 0.6188, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4453125, + "rewards/margins": 0.62890625, + "rewards/rejected": -2.078125, + "step": 1400 + }, + { + "epoch": 2.9324960753532183, + "grad_norm": 11.615793228149414, + "learning_rate": 6.319061018021064e-10, + "logits/chosen": 1.7421875, + "logits/rejected": 1.859375, + "logps/chosen": -332.0, + "logps/rejected": -406.0, + "loss": 0.5764, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3515625, + "rewards/margins": 0.2294921875, + "rewards/rejected": -1.578125, + "step": 1401 + }, + { + "epoch": 2.934589220303506, + "grad_norm": 11.255279541015625, + "learning_rate": 5.937727605631422e-10, + "logits/chosen": 1.8671875, + "logits/rejected": 1.9609375, + "logps/chosen": -552.0, + "logps/rejected": -656.0, + "loss": 0.5535, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.359375, + "rewards/margins": 0.349609375, + "rewards/rejected": -1.703125, + "step": 1402 + }, + { + "epoch": 2.9366823652537937, + "grad_norm": 11.204919815063477, + "learning_rate": 5.56930231414233e-10, + "logits/chosen": 1.671875, + "logits/rejected": 2.203125, + "logps/chosen": -436.0, + "logps/rejected": -524.0, + "loss": 0.5956, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.703125, + "rewards/margins": 0.15234375, + "rewards/rejected": -1.859375, + "step": 1403 + }, + { + "epoch": 2.938775510204082, + "grad_norm": 11.915118217468262, + "learning_rate": 5.213787050162823e-10, + "logits/chosen": 1.4296875, + "logits/rejected": 2.046875, + "logps/chosen": -624.0, + "logps/rejected": -608.0, + "loss": 0.6307, + "rewards/accuracies": 0.25, + "rewards/chosen": -2.515625, + "rewards/margins": -0.9140625, + "rewards/rejected": -1.609375, + "step": 1404 + }, + { + "epoch": 2.9408686551543695, + "grad_norm": 12.069853782653809, + "learning_rate": 4.871183653492071e-10, + "logits/chosen": 1.796875, + "logits/rejected": 1.7890625, + "logps/chosen": -374.0, + "logps/rejected": -416.0, + "loss": 0.5981, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.484375, + "rewards/margins": 0.4609375, + "rewards/rejected": -1.9453125, + "step": 1405 + }, + { + "epoch": 2.942961800104657, + "grad_norm": 12.105071067810059, + "learning_rate": 4.5414938971104906e-10, + "logits/chosen": 2.5, + "logits/rejected": 2.09375, + "logps/chosen": -696.0, + "logps/rejected": -502.0, + "loss": 0.5965, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.484375, + "rewards/margins": -0.07421875, + "rewards/rejected": -1.40625, + "step": 1406 + }, + { + "epoch": 2.9450549450549453, + "grad_norm": 12.12260913848877, + "learning_rate": 4.2247194871694753e-10, + "logits/chosen": 2.0625, + "logits/rejected": 1.5703125, + "logps/chosen": -382.0, + "logps/rejected": -516.0, + "loss": 0.5959, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.234375, + "rewards/margins": -0.078125, + "rewards/rejected": -1.15625, + "step": 1407 + }, + { + "epoch": 2.947148090005233, + "grad_norm": 11.283514976501465, + "learning_rate": 3.9208620629839086e-10, + "logits/chosen": 2.40625, + "logits/rejected": 2.671875, + "logps/chosen": -648.0, + "logps/rejected": -648.0, + "loss": 0.5661, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.140625, + "rewards/margins": 0.2578125, + "rewards/rejected": -1.3984375, + "step": 1408 + }, + { + "epoch": 2.9492412349555206, + "grad_norm": 12.450947761535645, + "learning_rate": 3.629923197022169e-10, + "logits/chosen": 2.203125, + "logits/rejected": 2.484375, + "logps/chosen": -864.0, + "logps/rejected": -644.0, + "loss": 0.6292, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.94140625, + "rewards/margins": 0.2158203125, + "rewards/rejected": -1.15625, + "step": 1409 + }, + { + "epoch": 2.9513343799058083, + "grad_norm": 10.849802017211914, + "learning_rate": 3.3519043948997476e-10, + "logits/chosen": 3.0, + "logits/rejected": 2.8125, + "logps/chosen": -688.0, + "logps/rejected": -736.0, + "loss": 0.5535, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.953125, + "rewards/margins": 0.056640625, + "rewards/rejected": -2.0, + "step": 1410 + }, + { + "epoch": 2.9534275248560964, + "grad_norm": 10.051568031311035, + "learning_rate": 3.086807095369811e-10, + "logits/chosen": 2.046875, + "logits/rejected": 1.453125, + "logps/chosen": -390.0, + "logps/rejected": -476.0, + "loss": 0.5342, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0703125, + "rewards/margins": 0.73046875, + "rewards/rejected": -1.796875, + "step": 1411 + }, + { + "epoch": 2.955520669806384, + "grad_norm": 11.31157398223877, + "learning_rate": 2.8346326703168203e-10, + "logits/chosen": 2.21875, + "logits/rejected": 2.34375, + "logps/chosen": -412.0, + "logps/rejected": -472.0, + "loss": 0.5827, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5390625, + "rewards/margins": 0.306640625, + "rewards/rejected": -1.84375, + "step": 1412 + }, + { + "epoch": 2.957613814756672, + "grad_norm": 11.44780158996582, + "learning_rate": 2.5953824247490364e-10, + "logits/chosen": 2.765625, + "logits/rejected": 2.671875, + "logps/chosen": -648.0, + "logps/rejected": -410.0, + "loss": 0.5737, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.5625, + "rewards/margins": -0.2265625, + "rewards/rejected": -1.3359375, + "step": 1413 + }, + { + "epoch": 2.95970695970696, + "grad_norm": 11.903714179992676, + "learning_rate": 2.3690575967915824e-10, + "logits/chosen": 2.328125, + "logits/rejected": 2.71875, + "logps/chosen": -528.0, + "logps/rejected": -544.0, + "loss": 0.6146, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.234375, + "rewards/margins": 0.60546875, + "rewards/rejected": -1.84375, + "step": 1414 + }, + { + "epoch": 2.9618001046572475, + "grad_norm": 12.053587913513184, + "learning_rate": 2.1556593576806152e-10, + "logits/chosen": 2.0625, + "logits/rejected": 2.5, + "logps/chosen": -600.0, + "logps/rejected": -620.0, + "loss": 0.549, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.265625, + "rewards/margins": 0.494140625, + "rewards/rejected": -1.765625, + "step": 1415 + }, + { + "epoch": 2.963893249607535, + "grad_norm": 11.51821517944336, + "learning_rate": 1.9551888117566647e-10, + "logits/chosen": 2.796875, + "logits/rejected": 2.90625, + "logps/chosen": -640.0, + "logps/rejected": -502.0, + "loss": 0.5759, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.125, + "rewards/margins": 0.169921875, + "rewards/rejected": -1.296875, + "step": 1416 + }, + { + "epoch": 2.965986394557823, + "grad_norm": 12.681265830993652, + "learning_rate": 1.7676469964590832e-10, + "logits/chosen": 2.828125, + "logits/rejected": 3.125, + "logps/chosen": -880.0, + "logps/rejected": -684.0, + "loss": 0.6017, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7265625, + "rewards/margins": 0.193359375, + "rewards/rejected": -1.921875, + "step": 1417 + }, + { + "epoch": 2.968079539508111, + "grad_norm": 11.360309600830078, + "learning_rate": 1.5930348823207737e-10, + "logits/chosen": 1.28125, + "logits/rejected": 2.5, + "logps/chosen": -360.0, + "logps/rejected": -278.0, + "loss": 0.5533, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.421875, + "rewards/margins": 0.29296875, + "rewards/rejected": -1.71875, + "step": 1418 + }, + { + "epoch": 2.9701726844583987, + "grad_norm": 11.54595947265625, + "learning_rate": 1.4313533729634691e-10, + "logits/chosen": 2.53125, + "logits/rejected": 2.328125, + "logps/chosen": -556.0, + "logps/rejected": -640.0, + "loss": 0.6055, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4765625, + "rewards/margins": 0.59375, + "rewards/rejected": -2.078125, + "step": 1419 + }, + { + "epoch": 2.9722658294086868, + "grad_norm": 11.778307914733887, + "learning_rate": 1.2826033050927406e-10, + "logits/chosen": 1.6171875, + "logits/rejected": 1.640625, + "logps/chosen": -356.0, + "logps/rejected": -600.0, + "loss": 0.6007, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3984375, + "rewards/margins": 0.4609375, + "rewards/rejected": -1.859375, + "step": 1420 + }, + { + "epoch": 2.9743589743589745, + "grad_norm": 12.374484062194824, + "learning_rate": 1.146785448493276e-10, + "logits/chosen": 1.25, + "logits/rejected": 2.03125, + "logps/chosen": -528.0, + "logps/rejected": -442.0, + "loss": 0.6091, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6875, + "rewards/margins": 0.28515625, + "rewards/rejected": -1.96875, + "step": 1421 + }, + { + "epoch": 2.976452119309262, + "grad_norm": 11.62448787689209, + "learning_rate": 1.0239005060252739e-10, + "logits/chosen": 0.73046875, + "logits/rejected": 1.65625, + "logps/chosen": -338.0, + "logps/rejected": -356.0, + "loss": 0.5605, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3359375, + "rewards/margins": 0.435546875, + "rewards/rejected": -1.7734375, + "step": 1422 + }, + { + "epoch": 2.97854526425955, + "grad_norm": 12.74441146850586, + "learning_rate": 9.1394911362139e-11, + "logits/chosen": 2.109375, + "logits/rejected": 2.25, + "logps/chosen": -728.0, + "logps/rejected": -480.0, + "loss": 0.6134, + "rewards/accuracies": 0.25, + "rewards/chosen": -2.0625, + "rewards/margins": -0.224609375, + "rewards/rejected": -1.828125, + "step": 1423 + }, + { + "epoch": 2.980638409209838, + "grad_norm": 11.858436584472656, + "learning_rate": 8.169318402820202e-11, + "logits/chosen": 1.7734375, + "logits/rejected": 1.6171875, + "logps/chosen": -452.0, + "logps/rejected": -832.0, + "loss": 0.5728, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3125, + "rewards/margins": 0.6875, + "rewards/rejected": -3.0, + "step": 1424 + }, + { + "epoch": 2.9827315541601256, + "grad_norm": 10.899884223937988, + "learning_rate": 7.328491880741893e-11, + "logits/chosen": 2.34375, + "logits/rejected": 2.203125, + "logps/chosen": -640.0, + "logps/rejected": -510.0, + "loss": 0.5538, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3125, + "rewards/margins": 0.54296875, + "rewards/rejected": -1.859375, + "step": 1425 + }, + { + "epoch": 2.9848246991104133, + "grad_norm": 10.890758514404297, + "learning_rate": 6.617015921273888e-11, + "logits/chosen": 1.6953125, + "logits/rejected": 2.03125, + "logps/chosen": -400.0, + "logps/rejected": -348.0, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.515625, + "rewards/margins": 0.302734375, + "rewards/rejected": -1.8125, + "step": 1426 + }, + { + "epoch": 2.9869178440607014, + "grad_norm": 11.535402297973633, + "learning_rate": 6.03489420631634e-11, + "logits/chosen": 1.171875, + "logits/rejected": 1.4375, + "logps/chosen": -368.0, + "logps/rejected": -436.0, + "loss": 0.5985, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.234375, + "rewards/margins": 0.08203125, + "rewards/rejected": -1.3203125, + "step": 1427 + }, + { + "epoch": 2.989010989010989, + "grad_norm": 11.802215576171875, + "learning_rate": 5.5821297483635366e-11, + "logits/chosen": 2.625, + "logits/rejected": 3.46875, + "logps/chosen": -572.0, + "logps/rejected": -416.0, + "loss": 0.5801, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.94921875, + "rewards/margins": 1.265625, + "rewards/rejected": -2.203125, + "step": 1428 + }, + { + "epoch": 2.9911041339612767, + "grad_norm": 12.465570449829102, + "learning_rate": 5.258724890484477e-11, + "logits/chosen": 2.703125, + "logits/rejected": 2.28125, + "logps/chosen": -380.0, + "logps/rejected": -556.0, + "loss": 0.5928, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.453125, + "rewards/margins": 0.0673828125, + "rewards/rejected": -1.5234375, + "step": 1429 + }, + { + "epoch": 2.9931972789115644, + "grad_norm": 12.42818832397461, + "learning_rate": 5.0646813063034436e-11, + "logits/chosen": 1.53125, + "logits/rejected": 0.96484375, + "logps/chosen": -260.0, + "logps/rejected": -446.0, + "loss": 0.5906, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5625, + "rewards/margins": -0.046875, + "rewards/rejected": -1.515625, + "step": 1430 + }, + { + "epoch": 2.9952904238618525, + "grad_norm": 12.25236701965332, + "learning_rate": 5e-11, + "logits/chosen": 1.390625, + "logits/rejected": 1.34375, + "logps/chosen": -436.0, + "logps/rejected": -231.0, + "loss": 0.5959, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.6328125, + "rewards/margins": -0.1943359375, + "rewards/rejected": -1.4375, + "step": 1431 + } + ], + "logging_steps": 1.0, + "max_steps": 1431, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}