{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.208333333333333e-09, "logits/chosen": -2.3438680171966553, "logits/rejected": -2.200690984725952, "logps/chosen": -309.19024658203125, "logps/rejected": -222.5582275390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.3084700107574463, "logits/rejected": -2.1888935565948486, "logps/chosen": -313.1751708984375, "logps/rejected": -276.1279602050781, "loss": 0.6933, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0006199586787261069, "rewards/margins": 0.0007922492804937065, "rewards/rejected": -0.00017229063087143004, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.069239377975464, "logits/rejected": -2.046480655670166, "logps/chosen": -235.33877563476562, "logps/rejected": -250.28262329101562, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.000233887491049245, "rewards/margins": -0.0002934989461209625, "rewards/rejected": 0.0005273864371702075, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.1968424320220947, "logits/rejected": -2.0520730018615723, "logps/chosen": -270.2682189941406, "logps/rejected": -251.5027618408203, "loss": 0.6921, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0023205596953630447, "rewards/margins": 0.0016283988952636719, "rewards/rejected": 0.000692160683684051, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.2101526260375977, "logits/rejected": -2.1605677604675293, "logps/chosen": -279.98785400390625, "logps/rejected": -269.2066650390625, "loss": 0.6908, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00826999731361866, "rewards/margins": 0.004588194657117128, "rewards/rejected": 0.003681803122162819, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.604166666666667e-07, "logits/chosen": -2.202209711074829, "logits/rejected": -2.1607909202575684, "logps/chosen": -260.29913330078125, "logps/rejected": -255.05734252929688, "loss": 0.6863, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.01528189517557621, "rewards/margins": 0.01289013959467411, "rewards/rejected": 0.0023917562793940306, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.155121326446533, "logits/rejected": -2.055530548095703, "logps/chosen": -284.8448181152344, "logps/rejected": -260.4853515625, "loss": 0.6799, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.023422595113515854, "rewards/margins": 0.029890310019254684, "rewards/rejected": -0.006467717699706554, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.645833333333333e-07, "logits/chosen": -2.1490182876586914, "logits/rejected": -2.0071780681610107, "logps/chosen": -291.1217041015625, "logps/rejected": -274.3982238769531, "loss": 0.6654, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.006606037728488445, "rewards/margins": 0.06380081921815872, "rewards/rejected": -0.07040686160326004, "step": 70 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.025543212890625, "logits/rejected": -1.9235265254974365, "logps/chosen": -319.5644836425781, "logps/rejected": -308.890869140625, "loss": 0.6484, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.13454711437225342, "rewards/margins": 0.1107935681939125, "rewards/rejected": -0.24534066021442413, "step": 80 }, { "epoch": 0.09, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -1.9773271083831787, "logits/rejected": -1.933540940284729, "logps/chosen": -332.3374938964844, "logps/rejected": -305.792724609375, "loss": 0.6488, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2452891618013382, "rewards/margins": 0.1369139850139618, "rewards/rejected": -0.3822031617164612, "step": 90 }, { "epoch": 0.1, "learning_rate": 4.999732492681437e-07, "logits/chosen": -1.8573644161224365, "logits/rejected": -1.8359777927398682, "logps/chosen": -306.6253967285156, "logps/rejected": -350.30572509765625, "loss": 0.6339, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.32189178466796875, "rewards/margins": 0.19727933406829834, "rewards/rejected": -0.5191711187362671, "step": 100 }, { "epoch": 0.1, "eval_logits/chosen": -1.854963779449463, "eval_logits/rejected": -1.7265539169311523, "eval_logps/chosen": -322.45428466796875, "eval_logps/rejected": -318.7289123535156, "eval_loss": 0.6366350054740906, "eval_rewards/accuracies": 0.6765872836112976, "eval_rewards/chosen": -0.42509153485298157, "eval_rewards/margins": 0.2029310166835785, "eval_rewards/rejected": -0.6280225515365601, "eval_runtime": 245.1911, "eval_samples_per_second": 8.157, "eval_steps_per_second": 0.257, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.996723692767926e-07, "logits/chosen": -1.9280967712402344, "logits/rejected": -1.7659218311309814, "logps/chosen": -304.70416259765625, "logps/rejected": -303.22015380859375, "loss": 0.6252, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3900960385799408, "rewards/margins": 0.21217259764671326, "rewards/rejected": -0.6022686958312988, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.990375746213598e-07, "logits/chosen": -1.9455022811889648, "logits/rejected": -1.77316415309906, "logps/chosen": -319.1278381347656, "logps/rejected": -316.08233642578125, "loss": 0.6289, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.39358627796173096, "rewards/margins": 0.2644176185131073, "rewards/rejected": -0.6580039262771606, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.980697142834314e-07, "logits/chosen": -2.112504005432129, "logits/rejected": -1.9311736822128296, "logps/chosen": -345.83428955078125, "logps/rejected": -329.4554748535156, "loss": 0.6144, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4626695513725281, "rewards/margins": 0.18274100124835968, "rewards/rejected": -0.6454105973243713, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.967700826904229e-07, "logits/chosen": -2.019052505493164, "logits/rejected": -1.936741828918457, "logps/chosen": -315.3255920410156, "logps/rejected": -319.93902587890625, "loss": 0.595, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6618548631668091, "rewards/margins": 0.29689091444015503, "rewards/rejected": -0.9587456583976746, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.951404179843962e-07, "logits/chosen": -1.9618381261825562, "logits/rejected": -1.9211444854736328, "logps/chosen": -349.3891906738281, "logps/rejected": -336.864013671875, "loss": 0.6143, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4716704487800598, "rewards/margins": 0.22611722350120544, "rewards/rejected": -0.6977876424789429, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.931828996974498e-07, "logits/chosen": -2.0160911083221436, "logits/rejected": -1.9039357900619507, "logps/chosen": -307.0707092285156, "logps/rejected": -351.3368225097656, "loss": 0.5853, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5011919140815735, "rewards/margins": 0.29305344820022583, "rewards/rejected": -0.7942453622817993, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.909001458367866e-07, "logits/chosen": -2.044952869415283, "logits/rejected": -1.9001652002334595, "logps/chosen": -353.18634033203125, "logps/rejected": -322.8375549316406, "loss": 0.5853, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4935056269168854, "rewards/margins": 0.3311043679714203, "rewards/rejected": -0.8246100544929504, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.882952093833627e-07, "logits/chosen": -1.9955902099609375, "logits/rejected": -1.8852081298828125, "logps/chosen": -354.67779541015625, "logps/rejected": -409.36767578125, "loss": 0.564, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9336503744125366, "rewards/margins": 0.49948063492774963, "rewards/rejected": -1.4331310987472534, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.853715742087946e-07, "logits/chosen": -2.175471067428589, "logits/rejected": -1.9974597692489624, "logps/chosen": -358.2491760253906, "logps/rejected": -364.2205505371094, "loss": 0.5624, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6146177053451538, "rewards/margins": 0.5364774465560913, "rewards/rejected": -1.1510951519012451, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.821331504159906e-07, "logits/chosen": -1.9217967987060547, "logits/rejected": -1.864315390586853, "logps/chosen": -347.8321228027344, "logps/rejected": -396.397216796875, "loss": 0.5801, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9297510385513306, "rewards/margins": 0.4349847435951233, "rewards/rejected": -1.3647358417510986, "step": 200 }, { "epoch": 0.21, "eval_logits/chosen": -1.886601448059082, "eval_logits/rejected": -1.7790945768356323, "eval_logps/chosen": -373.3334655761719, "eval_logps/rejected": -405.0862121582031, "eval_loss": 0.5760898590087891, "eval_rewards/accuracies": 0.7242063283920288, "eval_rewards/chosen": -0.9338834285736084, "eval_rewards/margins": 0.5577120184898376, "eval_rewards/rejected": -1.4915955066680908, "eval_runtime": 244.0521, "eval_samples_per_second": 8.195, "eval_steps_per_second": 0.258, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.785842691097342e-07, "logits/chosen": -1.9147882461547852, "logits/rejected": -1.7938772439956665, "logps/chosen": -381.3200378417969, "logps/rejected": -368.9107360839844, "loss": 0.5853, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8299940228462219, "rewards/margins": 0.4266335964202881, "rewards/rejected": -1.2566276788711548, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.7472967660421603e-07, "logits/chosen": -1.8929469585418701, "logits/rejected": -1.888225793838501, "logps/chosen": -338.57611083984375, "logps/rejected": -366.98126220703125, "loss": 0.57, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6195345520973206, "rewards/margins": 0.3649435043334961, "rewards/rejected": -0.9844779968261719, "step": 220 }, { "epoch": 0.24, "learning_rate": 4.705745280752585e-07, "logits/chosen": -1.7569427490234375, "logits/rejected": -1.6396926641464233, "logps/chosen": -387.4503479003906, "logps/rejected": -401.4142150878906, "loss": 0.56, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8091585040092468, "rewards/margins": 0.5702639818191528, "rewards/rejected": -1.379422664642334, "step": 230 }, { "epoch": 0.25, "learning_rate": 4.6612438066572555e-07, "logits/chosen": -1.488446593284607, "logits/rejected": -1.3490570783615112, "logps/chosen": -340.69622802734375, "logps/rejected": -346.3163757324219, "loss": 0.5711, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7310938835144043, "rewards/margins": 0.6003143191337585, "rewards/rejected": -1.3314082622528076, "step": 240 }, { "epoch": 0.26, "learning_rate": 4.6138518605333664e-07, "logits/chosen": -1.5929442644119263, "logits/rejected": -1.5048617124557495, "logps/chosen": -386.0514221191406, "logps/rejected": -422.6902770996094, "loss": 0.5572, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9384471774101257, "rewards/margins": 0.6384426951408386, "rewards/rejected": -1.5768897533416748, "step": 250 }, { "epoch": 0.27, "learning_rate": 4.5636328249082514e-07, "logits/chosen": -1.654510498046875, "logits/rejected": -1.6068557500839233, "logps/chosen": -364.17596435546875, "logps/rejected": -406.23736572265625, "loss": 0.5703, "rewards/accuracies": 0.65625, "rewards/chosen": -1.179985523223877, "rewards/margins": 0.42138758301734924, "rewards/rejected": -1.6013730764389038, "step": 260 }, { "epoch": 0.28, "learning_rate": 4.510653863290871e-07, "logits/chosen": -1.6021859645843506, "logits/rejected": -1.461978554725647, "logps/chosen": -407.7553405761719, "logps/rejected": -452.82501220703125, "loss": 0.5428, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1438275575637817, "rewards/margins": 0.6944350004196167, "rewards/rejected": -1.8382627964019775, "step": 270 }, { "epoch": 0.29, "learning_rate": 4.4549858303465737e-07, "logits/chosen": -1.666479468345642, "logits/rejected": -1.5147731304168701, "logps/chosen": -397.7020568847656, "logps/rejected": -426.46966552734375, "loss": 0.55, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1143194437026978, "rewards/margins": 0.6110407114028931, "rewards/rejected": -1.7253602743148804, "step": 280 }, { "epoch": 0.3, "learning_rate": 4.396703177135261e-07, "logits/chosen": -1.5770277976989746, "logits/rejected": -1.3010271787643433, "logps/chosen": -389.9909362792969, "logps/rejected": -386.58642578125, "loss": 0.5299, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0810632705688477, "rewards/margins": 0.5923460721969604, "rewards/rejected": -1.6734092235565186, "step": 290 }, { "epoch": 0.31, "learning_rate": 4.335883851539693e-07, "logits/chosen": -1.2049105167388916, "logits/rejected": -0.9145506024360657, "logps/chosen": -372.0166320800781, "logps/rejected": -415.6207580566406, "loss": 0.5298, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.15445876121521, "rewards/margins": 0.6882535815238953, "rewards/rejected": -1.8427120447158813, "step": 300 }, { "epoch": 0.31, "eval_logits/chosen": -1.193803071975708, "eval_logits/rejected": -0.9729028940200806, "eval_logps/chosen": -375.13653564453125, "eval_logps/rejected": -417.95367431640625, "eval_loss": 0.550506591796875, "eval_rewards/accuracies": 0.7400793433189392, "eval_rewards/chosen": -0.9519141316413879, "eval_rewards/margins": 0.6683558821678162, "eval_rewards/rejected": -1.620270013809204, "eval_runtime": 244.8113, "eval_samples_per_second": 8.17, "eval_steps_per_second": 0.257, "step": 300 }, { "epoch": 0.32, "learning_rate": 4.272609194017105e-07, "logits/chosen": -1.1300534009933472, "logits/rejected": -1.0067355632781982, "logps/chosen": -343.98272705078125, "logps/rejected": -458.59185791015625, "loss": 0.5017, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.8752749562263489, "rewards/margins": 0.8713703155517578, "rewards/rejected": -1.746645212173462, "step": 310 }, { "epoch": 0.33, "learning_rate": 4.2069638288135547e-07, "logits/chosen": -0.5048955678939819, "logits/rejected": -0.22074377536773682, "logps/chosen": -470.4259338378906, "logps/rejected": -543.0339965820312, "loss": 0.5284, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6318514347076416, "rewards/margins": 0.9790364503860474, "rewards/rejected": -2.6108880043029785, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.139035550786494e-07, "logits/chosen": -0.7544638514518738, "logits/rejected": -0.46497392654418945, "logps/chosen": -424.0389099121094, "logps/rejected": -432.882080078125, "loss": 0.5646, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4958339929580688, "rewards/margins": 0.6142513155937195, "rewards/rejected": -2.1100850105285645, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.0689152079869306e-07, "logits/chosen": -1.2732598781585693, "logits/rejected": -1.1444041728973389, "logps/chosen": -405.2666015625, "logps/rejected": -401.07354736328125, "loss": 0.571, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1170111894607544, "rewards/margins": 0.5194820165634155, "rewards/rejected": -1.6364930868148804, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.99669658015821e-07, "logits/chosen": -1.1732494831085205, "logits/rejected": -1.0453077554702759, "logps/chosen": -429.7718811035156, "logps/rejected": -471.4634704589844, "loss": 0.5373, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.258093237876892, "rewards/margins": 0.7526389956474304, "rewards/rejected": -2.0107321739196777, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.92247625331392e-07, "logits/chosen": -0.8629885911941528, "logits/rejected": -0.6935967803001404, "logps/chosen": -421.68194580078125, "logps/rejected": -475.949462890625, "loss": 0.5317, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5934243202209473, "rewards/margins": 0.631182074546814, "rewards/rejected": -2.224606513977051, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.846353490562664e-07, "logits/chosen": -0.7160054445266724, "logits/rejected": -0.41435980796813965, "logps/chosen": -399.6954345703125, "logps/rejected": -486.19757080078125, "loss": 0.5105, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2889573574066162, "rewards/margins": 0.8785942792892456, "rewards/rejected": -2.1675515174865723, "step": 370 }, { "epoch": 0.4, "learning_rate": 3.768430099352445e-07, "logits/chosen": -1.0206403732299805, "logits/rejected": -0.7304887771606445, "logps/chosen": -433.85137939453125, "logps/rejected": -487.86126708984375, "loss": 0.5303, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4668599367141724, "rewards/margins": 0.7815066576004028, "rewards/rejected": -2.2483668327331543, "step": 380 }, { "epoch": 0.41, "learning_rate": 3.6888102953122304e-07, "logits/chosen": -0.7131624817848206, "logits/rejected": -0.43439167737960815, "logps/chosen": -434.3667907714844, "logps/rejected": -473.67620849609375, "loss": 0.5214, "rewards/accuracies": 0.75, "rewards/chosen": -1.4395004510879517, "rewards/margins": 0.8123389482498169, "rewards/rejected": -2.2518393993377686, "step": 390 }, { "epoch": 0.42, "learning_rate": 3.607600562872785e-07, "logits/chosen": -0.5696905255317688, "logits/rejected": -0.11699406057596207, "logps/chosen": -451.731689453125, "logps/rejected": -470.5762634277344, "loss": 0.5055, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5301005840301514, "rewards/margins": 0.6669865846633911, "rewards/rejected": -2.197086811065674, "step": 400 }, { "epoch": 0.42, "eval_logits/chosen": -0.03764622285962105, "eval_logits/rejected": 0.29014527797698975, "eval_logps/chosen": -418.03948974609375, "eval_logps/rejected": -474.5049743652344, "eval_loss": 0.5331180691719055, "eval_rewards/accuracies": 0.7539682388305664, "eval_rewards/chosen": -1.3809435367584229, "eval_rewards/margins": 0.8048391938209534, "eval_rewards/rejected": -2.1857824325561523, "eval_runtime": 244.4235, "eval_samples_per_second": 8.183, "eval_steps_per_second": 0.258, "step": 400 }, { "epoch": 0.43, "learning_rate": 3.5249095128531856e-07, "logits/chosen": -0.3684214949607849, "logits/rejected": -0.036270398646593094, "logps/chosen": -420.4697265625, "logps/rejected": -425.89837646484375, "loss": 0.5474, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3041114807128906, "rewards/margins": 0.469282865524292, "rewards/rejected": -1.773394227027893, "step": 410 }, { "epoch": 0.44, "learning_rate": 3.4408477372034736e-07, "logits/chosen": -0.5258590579032898, "logits/rejected": -0.11696865409612656, "logps/chosen": -331.28082275390625, "logps/rejected": -395.2735290527344, "loss": 0.5456, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0080857276916504, "rewards/margins": 0.560020387172699, "rewards/rejected": -1.5681060552597046, "step": 420 }, { "epoch": 0.45, "learning_rate": 3.3555276610977276e-07, "logits/chosen": -0.45506519079208374, "logits/rejected": -0.2455427199602127, "logps/chosen": -372.3184509277344, "logps/rejected": -409.4679870605469, "loss": 0.5575, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0017720460891724, "rewards/margins": 0.5574255585670471, "rewards/rejected": -1.5591974258422852, "step": 430 }, { "epoch": 0.46, "learning_rate": 3.269063392575352e-07, "logits/chosen": -0.26316189765930176, "logits/rejected": 0.11150024086236954, "logps/chosen": -400.49072265625, "logps/rejected": -426.67657470703125, "loss": 0.534, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2625601291656494, "rewards/margins": 0.5866714119911194, "rewards/rejected": -1.8492317199707031, "step": 440 }, { "epoch": 0.47, "learning_rate": 3.1815705699316964e-07, "logits/chosen": 0.445736825466156, "logits/rejected": 0.7627506852149963, "logps/chosen": -389.2279968261719, "logps/rejected": -451.03521728515625, "loss": 0.5543, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5224748849868774, "rewards/margins": 0.645032525062561, "rewards/rejected": -2.1675071716308594, "step": 450 }, { "epoch": 0.48, "learning_rate": 3.0931662070620794e-07, "logits/chosen": 0.20439806580543518, "logits/rejected": 0.6606889963150024, "logps/chosen": -455.30035400390625, "logps/rejected": -485.322021484375, "loss": 0.5352, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5770175457000732, "rewards/margins": 0.6814727783203125, "rewards/rejected": -2.2584900856018066, "step": 460 }, { "epoch": 0.49, "learning_rate": 3.003968536966078e-07, "logits/chosen": 0.2584216892719269, "logits/rejected": 0.6647650003433228, "logps/chosen": -412.4117736816406, "logps/rejected": -455.8914489746094, "loss": 0.5332, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4638701677322388, "rewards/margins": 0.5353850722312927, "rewards/rejected": -1.9992549419403076, "step": 470 }, { "epoch": 0.5, "learning_rate": 2.9140968536213693e-07, "logits/chosen": 0.15033851563930511, "logits/rejected": 0.30757248401641846, "logps/chosen": -370.2145690917969, "logps/rejected": -433.5589904785156, "loss": 0.5567, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2743232250213623, "rewards/margins": 0.5514111518859863, "rewards/rejected": -1.8257343769073486, "step": 480 }, { "epoch": 0.51, "learning_rate": 2.823671352438608e-07, "logits/chosen": -0.019240472465753555, "logits/rejected": 0.541126012802124, "logps/chosen": -384.51373291015625, "logps/rejected": -420.33648681640625, "loss": 0.527, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.236613392829895, "rewards/margins": 0.6580051183700562, "rewards/rejected": -1.894618272781372, "step": 490 }, { "epoch": 0.52, "learning_rate": 2.73281296951072e-07, "logits/chosen": 0.26845166087150574, "logits/rejected": 0.9440711140632629, "logps/chosen": -447.5386657714844, "logps/rejected": -504.5428771972656, "loss": 0.5243, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.576322317123413, "rewards/margins": 0.9059449434280396, "rewards/rejected": -2.482267141342163, "step": 500 }, { "epoch": 0.52, "eval_logits/chosen": 0.7244722843170166, "eval_logits/rejected": 1.116690993309021, "eval_logps/chosen": -433.9210205078125, "eval_logps/rejected": -491.7053527832031, "eval_loss": 0.5239810347557068, "eval_rewards/accuracies": 0.77182537317276, "eval_rewards/chosen": -1.5397586822509766, "eval_rewards/margins": 0.8180281519889832, "eval_rewards/rejected": -2.3577868938446045, "eval_runtime": 244.6169, "eval_samples_per_second": 8.176, "eval_steps_per_second": 0.258, "step": 500 }, { "epoch": 0.53, "learning_rate": 2.641643219871597e-07, "logits/chosen": 0.5857471823692322, "logits/rejected": 1.1579170227050781, "logps/chosen": -432.2608337402344, "logps/rejected": -453.0177307128906, "loss": 0.5076, "rewards/accuracies": 0.71875, "rewards/chosen": -1.56296706199646, "rewards/margins": 0.7662748694419861, "rewards/rejected": -2.329241991043091, "step": 510 }, { "epoch": 0.54, "learning_rate": 2.550284034980507e-07, "logits/chosen": 0.5627486705780029, "logits/rejected": 1.0138986110687256, "logps/chosen": -440.6780700683594, "logps/rejected": -490.7980041503906, "loss": 0.5118, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5351483821868896, "rewards/margins": 0.7626742124557495, "rewards/rejected": -2.2978224754333496, "step": 520 }, { "epoch": 0.55, "learning_rate": 2.4588575996495794e-07, "logits/chosen": 0.5618034601211548, "logits/rejected": 1.1389967203140259, "logps/chosen": -458.1664123535156, "logps/rejected": -529.7509765625, "loss": 0.5147, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6149412393569946, "rewards/margins": 0.7950725555419922, "rewards/rejected": -2.4100139141082764, "step": 530 }, { "epoch": 0.57, "learning_rate": 2.367486188632446e-07, "logits/chosen": 0.46996626257896423, "logits/rejected": 1.0377795696258545, "logps/chosen": -429.85540771484375, "logps/rejected": -497.40948486328125, "loss": 0.5133, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4811718463897705, "rewards/margins": 0.8862358927726746, "rewards/rejected": -2.367408037185669, "step": 540 }, { "epoch": 0.58, "learning_rate": 2.276292003092593e-07, "logits/chosen": 0.3401317596435547, "logits/rejected": 0.842154324054718, "logps/chosen": -437.26739501953125, "logps/rejected": -500.60089111328125, "loss": 0.5413, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5497123003005981, "rewards/margins": 0.8867634534835815, "rewards/rejected": -2.4364757537841797, "step": 550 }, { "epoch": 0.59, "learning_rate": 2.185397007170141e-07, "logits/chosen": 0.17786632478237152, "logits/rejected": 0.7487555742263794, "logps/chosen": -402.21453857421875, "logps/rejected": -413.3843688964844, "loss": 0.5198, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3434484004974365, "rewards/margins": 0.6699635982513428, "rewards/rejected": -2.0134117603302, "step": 560 }, { "epoch": 0.6, "learning_rate": 2.094922764865619e-07, "logits/chosen": 0.33579394221305847, "logits/rejected": 0.8120689392089844, "logps/chosen": -394.34857177734375, "logps/rejected": -451.68878173828125, "loss": 0.5383, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.341185450553894, "rewards/margins": 0.6223596930503845, "rewards/rejected": -1.9635450839996338, "step": 570 }, { "epoch": 0.61, "learning_rate": 2.0049902774588797e-07, "logits/chosen": 0.6950188875198364, "logits/rejected": 1.2489306926727295, "logps/chosen": -420.9115295410156, "logps/rejected": -468.81402587890625, "loss": 0.5353, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3789879083633423, "rewards/margins": 0.7200512886047363, "rewards/rejected": -2.099039316177368, "step": 580 }, { "epoch": 0.62, "learning_rate": 1.9157198216806238e-07, "logits/chosen": 0.6935154795646667, "logits/rejected": 0.9982309341430664, "logps/chosen": -409.898193359375, "logps/rejected": -451.1421813964844, "loss": 0.5394, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.388021469116211, "rewards/margins": 0.5696064829826355, "rewards/rejected": -1.9576278924942017, "step": 590 }, { "epoch": 0.63, "learning_rate": 1.8272307888529274e-07, "logits/chosen": 0.2730625867843628, "logits/rejected": 0.6284732818603516, "logps/chosen": -443.95013427734375, "logps/rejected": -519.79541015625, "loss": 0.5024, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3900865316390991, "rewards/margins": 0.8703911900520325, "rewards/rejected": -2.2604775428771973, "step": 600 }, { "epoch": 0.63, "eval_logits/chosen": 0.8469038605690002, "eval_logits/rejected": 1.3223692178726196, "eval_logps/chosen": -446.7126770019531, "eval_logps/rejected": -509.12152099609375, "eval_loss": 0.5212409496307373, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -1.6676758527755737, "eval_rewards/margins": 0.8642725944519043, "eval_rewards/rejected": -2.5319488048553467, "eval_runtime": 243.8512, "eval_samples_per_second": 8.202, "eval_steps_per_second": 0.258, "step": 600 }, { "epoch": 0.64, "learning_rate": 1.7396415252139288e-07, "logits/chosen": 0.7108888626098633, "logits/rejected": 1.4451757669448853, "logps/chosen": -478.5006408691406, "logps/rejected": -478.9517517089844, "loss": 0.5186, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6936050653457642, "rewards/margins": 0.7683889865875244, "rewards/rejected": -2.461993932723999, "step": 610 }, { "epoch": 0.65, "learning_rate": 1.6530691736402316e-07, "logits/chosen": 0.623712420463562, "logits/rejected": 1.2085224390029907, "logps/chosen": -436.6463928222656, "logps/rejected": -483.9029235839844, "loss": 0.5119, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5746710300445557, "rewards/margins": 0.7840819954872131, "rewards/rejected": -2.358752965927124, "step": 620 }, { "epoch": 0.66, "learning_rate": 1.5676295169786864e-07, "logits/chosen": 1.255614995956421, "logits/rejected": 1.7882163524627686, "logps/chosen": -416.83563232421875, "logps/rejected": -496.5362243652344, "loss": 0.5017, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4195477962493896, "rewards/margins": 1.038696527481079, "rewards/rejected": -2.4582440853118896, "step": 630 }, { "epoch": 0.67, "learning_rate": 1.483436823197092e-07, "logits/chosen": 1.1485587358474731, "logits/rejected": 1.5099462270736694, "logps/chosen": -407.3171691894531, "logps/rejected": -482.505615234375, "loss": 0.5076, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5105384588241577, "rewards/margins": 0.8905227780342102, "rewards/rejected": -2.401061534881592, "step": 640 }, { "epoch": 0.68, "learning_rate": 1.4006036925609243e-07, "logits/chosen": 0.8679726719856262, "logits/rejected": 1.5178191661834717, "logps/chosen": -471.46868896484375, "logps/rejected": -529.3899536132812, "loss": 0.5044, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6612011194229126, "rewards/margins": 0.8902603983879089, "rewards/rejected": -2.5514614582061768, "step": 650 }, { "epoch": 0.69, "learning_rate": 1.319240907040458e-07, "logits/chosen": 0.9074466824531555, "logits/rejected": 1.4612318277359009, "logps/chosen": -458.69842529296875, "logps/rejected": -467.8643493652344, "loss": 0.5214, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7074607610702515, "rewards/margins": 0.5886183977127075, "rewards/rejected": -2.296079158782959, "step": 660 }, { "epoch": 0.7, "learning_rate": 1.239457282149695e-07, "logits/chosen": 0.826758086681366, "logits/rejected": 1.512880563735962, "logps/chosen": -418.04949951171875, "logps/rejected": -474.4520568847656, "loss": 0.5169, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4526798725128174, "rewards/margins": 0.9233277440071106, "rewards/rejected": -2.376007556915283, "step": 670 }, { "epoch": 0.71, "learning_rate": 1.1613595214152711e-07, "logits/chosen": 0.6957104802131653, "logits/rejected": 1.0907188653945923, "logps/chosen": -422.57086181640625, "logps/rejected": -443.7021484375, "loss": 0.5358, "rewards/accuracies": 0.625, "rewards/chosen": -1.4303207397460938, "rewards/margins": 0.5159063339233398, "rewards/rejected": -1.9462270736694336, "step": 680 }, { "epoch": 0.72, "learning_rate": 1.0850520736699362e-07, "logits/chosen": 0.5629902482032776, "logits/rejected": 1.4120800495147705, "logps/chosen": -379.07037353515625, "logps/rejected": -417.822021484375, "loss": 0.5065, "rewards/accuracies": 0.78125, "rewards/chosen": -1.299447774887085, "rewards/margins": 0.9052375555038452, "rewards/rejected": -2.2046852111816406, "step": 690 }, { "epoch": 0.73, "learning_rate": 1.0106369933615042e-07, "logits/chosen": 0.5293871164321899, "logits/rejected": 1.561988115310669, "logps/chosen": -415.15570068359375, "logps/rejected": -517.3580322265625, "loss": 0.4855, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4450310468673706, "rewards/margins": 1.3134677410125732, "rewards/rejected": -2.7584986686706543, "step": 700 }, { "epoch": 0.73, "eval_logits/chosen": 1.0176714658737183, "eval_logits/rejected": 1.5164849758148193, "eval_logps/chosen": -432.87799072265625, "eval_logps/rejected": -497.0490417480469, "eval_loss": 0.5155569911003113, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": -1.5293281078338623, "eval_rewards/margins": 0.8818953633308411, "eval_rewards/rejected": -2.4112234115600586, "eval_runtime": 243.7233, "eval_samples_per_second": 8.206, "eval_steps_per_second": 0.258, "step": 700 }, { "epoch": 0.74, "learning_rate": 9.382138040640714e-08, "logits/chosen": 0.7708175182342529, "logits/rejected": 1.4804003238677979, "logps/chosen": -430.60675048828125, "logps/rejected": -460.25506591796875, "loss": 0.5182, "rewards/accuracies": 0.71875, "rewards/chosen": -1.51383376121521, "rewards/margins": 0.781708836555481, "rewards/rejected": -2.2955427169799805, "step": 710 }, { "epoch": 0.75, "learning_rate": 8.678793653740632e-08, "logits/chosen": 0.9884228706359863, "logits/rejected": 1.4830214977264404, "logps/chosen": -411.5726013183594, "logps/rejected": -486.2874450683594, "loss": 0.5148, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.49359929561615, "rewards/margins": 0.8712177276611328, "rewards/rejected": -2.3648171424865723, "step": 720 }, { "epoch": 0.76, "learning_rate": 7.997277433690983e-08, "logits/chosen": 0.7098981738090515, "logits/rejected": 1.4574624300003052, "logps/chosen": -431.80694580078125, "logps/rejected": -440.78448486328125, "loss": 0.533, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4108504056930542, "rewards/margins": 0.6520459651947021, "rewards/rejected": -2.062896490097046, "step": 730 }, { "epoch": 0.77, "learning_rate": 7.338500848029602e-08, "logits/chosen": 0.6488819122314453, "logits/rejected": 1.1337546110153198, "logps/chosen": -441.52020263671875, "logps/rejected": -506.21246337890625, "loss": 0.5276, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4171245098114014, "rewards/margins": 0.8641043901443481, "rewards/rejected": -2.28122878074646, "step": 740 }, { "epoch": 0.79, "learning_rate": 6.70334495204884e-08, "logits/chosen": 0.6074367761611938, "logits/rejected": 1.0385777950286865, "logps/chosen": -400.0601806640625, "logps/rejected": -490.62701416015625, "loss": 0.5015, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4023693799972534, "rewards/margins": 0.8330108523368835, "rewards/rejected": -2.2353804111480713, "step": 750 }, { "epoch": 0.8, "learning_rate": 6.092659210462231e-08, "logits/chosen": 0.4711516499519348, "logits/rejected": 0.9624277353286743, "logps/chosen": -422.23480224609375, "logps/rejected": -472.27325439453125, "loss": 0.4845, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3671127557754517, "rewards/margins": 0.813764214515686, "rewards/rejected": -2.180877208709717, "step": 760 }, { "epoch": 0.81, "learning_rate": 5.507260361320737e-08, "logits/chosen": 0.6579657793045044, "logits/rejected": 1.0575228929519653, "logps/chosen": -438.59564208984375, "logps/rejected": -502.49407958984375, "loss": 0.5179, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4378478527069092, "rewards/margins": 0.8018784523010254, "rewards/rejected": -2.2397265434265137, "step": 770 }, { "epoch": 0.82, "learning_rate": 4.947931323697982e-08, "logits/chosen": 0.7942476868629456, "logits/rejected": 1.3635971546173096, "logps/chosen": -420.66888427734375, "logps/rejected": -464.9019470214844, "loss": 0.5246, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.438265085220337, "rewards/margins": 0.8271247148513794, "rewards/rejected": -2.265389919281006, "step": 780 }, { "epoch": 0.83, "learning_rate": 4.415420150605398e-08, "logits/chosen": 0.7373208999633789, "logits/rejected": 0.9085140228271484, "logps/chosen": -412.26177978515625, "logps/rejected": -484.4659118652344, "loss": 0.5129, "rewards/accuracies": 0.75, "rewards/chosen": -1.5069735050201416, "rewards/margins": 0.7849918603897095, "rewards/rejected": -2.2919652462005615, "step": 790 }, { "epoch": 0.84, "learning_rate": 3.9104390285376374e-08, "logits/chosen": 0.6553566455841064, "logits/rejected": 1.0975841283798218, "logps/chosen": -468.61328125, "logps/rejected": -519.8575439453125, "loss": 0.5048, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4834080934524536, "rewards/margins": 0.8748735189437866, "rewards/rejected": -2.3582816123962402, "step": 800 }, { "epoch": 0.84, "eval_logits/chosen": 0.8796811103820801, "eval_logits/rejected": 1.3869489431381226, "eval_logps/chosen": -427.48309326171875, "eval_logps/rejected": -493.06396484375, "eval_loss": 0.5121396780014038, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -1.4753795862197876, "eval_rewards/margins": 0.8959933519363403, "eval_rewards/rejected": -2.371372699737549, "eval_runtime": 244.0597, "eval_samples_per_second": 8.195, "eval_steps_per_second": 0.258, "step": 800 }, { "epoch": 0.85, "learning_rate": 3.433663324986208e-08, "logits/chosen": 0.6968400478363037, "logits/rejected": 1.4569685459136963, "logps/chosen": -442.61041259765625, "logps/rejected": -456.3666076660156, "loss": 0.5114, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6009132862091064, "rewards/margins": 0.6439798474311829, "rewards/rejected": -2.2448930740356445, "step": 810 }, { "epoch": 0.86, "learning_rate": 2.9857306851953897e-08, "logits/chosen": 0.6378097534179688, "logits/rejected": 1.441167950630188, "logps/chosen": -432.9095764160156, "logps/rejected": -471.34710693359375, "loss": 0.5392, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4161417484283447, "rewards/margins": 0.9558764696121216, "rewards/rejected": -2.372018337249756, "step": 820 }, { "epoch": 0.87, "learning_rate": 2.567240179368185e-08, "logits/chosen": 0.8674399256706238, "logits/rejected": 0.9316266179084778, "logps/chosen": -408.4041442871094, "logps/rejected": -497.81317138671875, "loss": 0.4902, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5210860967636108, "rewards/margins": 0.7341635823249817, "rewards/rejected": -2.2552497386932373, "step": 830 }, { "epoch": 0.88, "learning_rate": 2.1787515014630357e-08, "logits/chosen": 0.8804190754890442, "logits/rejected": 1.1247572898864746, "logps/chosen": -414.587646484375, "logps/rejected": -463.87310791015625, "loss": 0.5324, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.527111291885376, "rewards/margins": 0.6108843088150024, "rewards/rejected": -2.137995481491089, "step": 840 }, { "epoch": 0.89, "learning_rate": 1.820784220652766e-08, "logits/chosen": 0.5601187944412231, "logits/rejected": 1.2288376092910767, "logps/chosen": -427.48760986328125, "logps/rejected": -458.49951171875, "loss": 0.5159, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.374366044998169, "rewards/margins": 0.8709263801574707, "rewards/rejected": -2.2452924251556396, "step": 850 }, { "epoch": 0.9, "learning_rate": 1.4938170864468636e-08, "logits/chosen": 0.8801227807998657, "logits/rejected": 1.2111625671386719, "logps/chosen": -430.5556640625, "logps/rejected": -500.516845703125, "loss": 0.5158, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.518132209777832, "rewards/margins": 0.838158905506134, "rewards/rejected": -2.3562910556793213, "step": 860 }, { "epoch": 0.91, "learning_rate": 1.1982873884064465e-08, "logits/chosen": 0.8533649444580078, "logits/rejected": 1.03411865234375, "logps/chosen": -372.19580078125, "logps/rejected": -463.11114501953125, "loss": 0.5198, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3652175664901733, "rewards/margins": 0.9399977922439575, "rewards/rejected": -2.30521559715271, "step": 870 }, { "epoch": 0.92, "learning_rate": 9.345903713082304e-09, "logits/chosen": 0.5068883895874023, "logits/rejected": 1.1840332746505737, "logps/chosen": -432.7618103027344, "logps/rejected": -479.28369140625, "loss": 0.4962, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4607574939727783, "rewards/margins": 0.7478801012039185, "rewards/rejected": -2.2086377143859863, "step": 880 }, { "epoch": 0.93, "learning_rate": 7.030787065396865e-09, "logits/chosen": 0.3581078350543976, "logits/rejected": 0.8491541743278503, "logps/chosen": -418.20379638671875, "logps/rejected": -493.193115234375, "loss": 0.5266, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.417425274848938, "rewards/margins": 0.8820412755012512, "rewards/rejected": -2.299466609954834, "step": 890 }, { "epoch": 0.94, "learning_rate": 5.04062020432286e-09, "logits/chosen": 0.7375041246414185, "logits/rejected": 1.3644684553146362, "logps/chosen": -425.05047607421875, "logps/rejected": -497.4638671875, "loss": 0.5193, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4368913173675537, "rewards/margins": 0.9173351526260376, "rewards/rejected": -2.3542263507843018, "step": 900 }, { "epoch": 0.94, "eval_logits/chosen": 0.9411238431930542, "eval_logits/rejected": 1.4498772621154785, "eval_logps/chosen": -425.3929748535156, "eval_logps/rejected": -490.2650146484375, "eval_loss": 0.5108779072761536, "eval_rewards/accuracies": 0.773809552192688, "eval_rewards/chosen": -1.4544785022735596, "eval_rewards/margins": 0.888904333114624, "eval_rewards/rejected": -2.3433828353881836, "eval_runtime": 244.5086, "eval_samples_per_second": 8.18, "eval_steps_per_second": 0.258, "step": 900 }, { "epoch": 0.95, "learning_rate": 3.3780648016376866e-09, "logits/chosen": 0.824137806892395, "logits/rejected": 1.4844555854797363, "logps/chosen": -434.61865234375, "logps/rejected": -485.60528564453125, "loss": 0.5072, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.451385736465454, "rewards/margins": 0.8570237159729004, "rewards/rejected": -2.3084096908569336, "step": 910 }, { "epoch": 0.96, "learning_rate": 2.0453443778310766e-09, "logits/chosen": 0.5803619623184204, "logits/rejected": 1.347163200378418, "logps/chosen": -456.23150634765625, "logps/rejected": -499.64178466796875, "loss": 0.5039, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5105242729187012, "rewards/margins": 0.9060274362564087, "rewards/rejected": -2.416551351547241, "step": 920 }, { "epoch": 0.97, "learning_rate": 1.0442413283435758e-09, "logits/chosen": 0.4976336359977722, "logits/rejected": 1.3075406551361084, "logps/chosen": -391.784423828125, "logps/rejected": -485.1461486816406, "loss": 0.4874, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2704774141311646, "rewards/margins": 1.2178288698196411, "rewards/rejected": -2.4883062839508057, "step": 930 }, { "epoch": 0.98, "learning_rate": 3.760945397705828e-10, "logits/chosen": 0.5646733045578003, "logits/rejected": 1.1906431913375854, "logps/chosen": -441.12298583984375, "logps/rejected": -493.2767639160156, "loss": 0.4969, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.583254337310791, "rewards/margins": 0.746857762336731, "rewards/rejected": -2.3301119804382324, "step": 940 }, { "epoch": 0.99, "learning_rate": 4.17975992204056e-11, "logits/chosen": 0.8913224935531616, "logits/rejected": 1.4200079441070557, "logps/chosen": -431.2433166503906, "logps/rejected": -494.87744140625, "loss": 0.5167, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6311872005462646, "rewards/margins": 0.8244245648384094, "rewards/rejected": -2.4556117057800293, "step": 950 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, "train_loss": 0.5487770005670517, "train_runtime": 16595.3532, "train_samples_per_second": 3.684, "train_steps_per_second": 0.058 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }