{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.47426924660354053, "eval_steps": 500, "global_step": 36, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013174145738987238, "grad_norm": 0.538081705570221, "learning_rate": 1.25e-07, "logits/chosen": 10.038352012634277, "logits/rejected": 10.592904090881348, "logps/chosen": -0.6228358745574951, "logps/rejected": -0.6871199011802673, "loss": 1.342, "rewards/accuracies": 0.578125, "rewards/chosen": -1.2456717491149902, "rewards/margins": 0.12856802344322205, "rewards/rejected": -1.3742398023605347, "step": 1 }, { "epoch": 0.026348291477974475, "grad_norm": 0.6521235108375549, "learning_rate": 2.5e-07, "logits/chosen": 10.320584297180176, "logits/rejected": 10.721946716308594, "logps/chosen": -0.7115719318389893, "logps/rejected": -0.788784384727478, "loss": 1.3147, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4231438636779785, "rewards/margins": 0.15442489087581635, "rewards/rejected": -1.577568769454956, "step": 2 }, { "epoch": 0.03952243721696171, "grad_norm": 0.8797138929367065, "learning_rate": 3.75e-07, "logits/chosen": 9.899504661560059, "logits/rejected": 10.505952835083008, "logps/chosen": -0.8225007057189941, "logps/rejected": -0.8832307457923889, "loss": 1.3674, "rewards/accuracies": 0.5078125, "rewards/chosen": -1.6450014114379883, "rewards/margins": 0.12146000564098358, "rewards/rejected": -1.7664614915847778, "step": 3 }, { "epoch": 0.05269658295594895, "grad_norm": 1.9139935970306396, "learning_rate": 5e-07, "logits/chosen": 10.082985877990723, "logits/rejected": 10.576549530029297, "logps/chosen": -0.6892099976539612, "logps/rejected": -0.7180394530296326, "loss": 1.4038, "rewards/accuracies": 0.5, "rewards/chosen": -1.3784199953079224, "rewards/margins": 0.05765870213508606, "rewards/rejected": -1.4360789060592651, "step": 4 }, { "epoch": 0.06587072869493618, "grad_norm": 0.8647859692573547, "learning_rate": 6.249999999999999e-07, "logits/chosen": 10.318564414978027, "logits/rejected": 11.072587966918945, "logps/chosen": -0.6658570766448975, "logps/rejected": -0.6663312911987305, "loss": 1.4062, "rewards/accuracies": 0.5703125, "rewards/chosen": -1.331714153289795, "rewards/margins": 0.0009482596069574356, "rewards/rejected": -1.332662582397461, "step": 5 }, { "epoch": 0.07904487443392343, "grad_norm": 0.7906696796417236, "learning_rate": 7.5e-07, "logits/chosen": 10.802580833435059, "logits/rejected": 11.333773612976074, "logps/chosen": -0.7257988452911377, "logps/rejected": -0.7839725017547607, "loss": 1.3781, "rewards/accuracies": 0.6015625, "rewards/chosen": -1.4515976905822754, "rewards/margins": 0.11634734272956848, "rewards/rejected": -1.5679450035095215, "step": 6 }, { "epoch": 0.09221902017291066, "grad_norm": 0.724219799041748, "learning_rate": 8.75e-07, "logits/chosen": 9.928263664245605, "logits/rejected": 10.422144889831543, "logps/chosen": -0.5926575660705566, "logps/rejected": -0.6688517928123474, "loss": 1.314, "rewards/accuracies": 0.5859375, "rewards/chosen": -1.1853151321411133, "rewards/margins": 0.15238842368125916, "rewards/rejected": -1.3377035856246948, "step": 7 }, { "epoch": 0.1053931659118979, "grad_norm": 0.558660089969635, "learning_rate": 1e-06, "logits/chosen": 10.657012939453125, "logits/rejected": 11.171004295349121, "logps/chosen": -0.6659789681434631, "logps/rejected": -0.7012848258018494, "loss": 1.365, "rewards/accuracies": 0.578125, "rewards/chosen": -1.3319579362869263, "rewards/margins": 0.07061176747083664, "rewards/rejected": -1.4025696516036987, "step": 8 }, { "epoch": 0.11856731165088513, "grad_norm": 0.7675488591194153, "learning_rate": 9.994504457428556e-07, "logits/chosen": 10.544637680053711, "logits/rejected": 10.839460372924805, "logps/chosen": -0.814159095287323, "logps/rejected": -0.7815468907356262, "loss": 1.4888, "rewards/accuracies": 0.515625, "rewards/chosen": -1.628318190574646, "rewards/margins": -0.0652243047952652, "rewards/rejected": -1.5630937814712524, "step": 9 }, { "epoch": 0.13174145738987236, "grad_norm": 1.1157082319259644, "learning_rate": 9.97802991010949e-07, "logits/chosen": 10.10114574432373, "logits/rejected": 10.555818557739258, "logps/chosen": -0.673196017742157, "logps/rejected": -0.6864349246025085, "loss": 1.4279, "rewards/accuracies": 0.5546875, "rewards/chosen": -1.346392035484314, "rewards/margins": 0.02647773176431656, "rewards/rejected": -1.372869849205017, "step": 10 }, { "epoch": 0.14491560312885962, "grad_norm": 1.2121509313583374, "learning_rate": 9.950612572673255e-07, "logits/chosen": 10.158075332641602, "logits/rejected": 10.813385009765625, "logps/chosen": -0.7734582424163818, "logps/rejected": -0.8254096508026123, "loss": 1.3538, "rewards/accuracies": 0.5546875, "rewards/chosen": -1.5469164848327637, "rewards/margins": 0.10390281677246094, "rewards/rejected": -1.6508193016052246, "step": 11 }, { "epoch": 0.15808974886784685, "grad_norm": 0.921929657459259, "learning_rate": 9.912312714377879e-07, "logits/chosen": 10.2570161819458, "logits/rejected": 10.633421897888184, "logps/chosen": -0.7107410430908203, "logps/rejected": -0.7390152812004089, "loss": 1.3791, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4214820861816406, "rewards/margins": 0.056548528373241425, "rewards/rejected": -1.4780305624008179, "step": 12 }, { "epoch": 0.17126389460683408, "grad_norm": 1.0088778734207153, "learning_rate": 9.863214526624063e-07, "logits/chosen": 9.808923721313477, "logits/rejected": 10.603569984436035, "logps/chosen": -0.7246454954147339, "logps/rejected": -0.8062013387680054, "loss": 1.3588, "rewards/accuracies": 0.5234375, "rewards/chosen": -1.4492909908294678, "rewards/margins": 0.1631116420030594, "rewards/rejected": -1.6124026775360107, "step": 13 }, { "epoch": 0.1844380403458213, "grad_norm": 1.2618002891540527, "learning_rate": 9.8034259378842e-07, "logits/chosen": 10.618194580078125, "logits/rejected": 11.330740928649902, "logps/chosen": -0.6871081590652466, "logps/rejected": -0.752004086971283, "loss": 1.3318, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3742163181304932, "rewards/margins": 0.1297919750213623, "rewards/rejected": -1.504008173942566, "step": 14 }, { "epoch": 0.19761218608480857, "grad_norm": 0.3426913917064667, "learning_rate": 9.73307837645217e-07, "logits/chosen": 9.945769309997559, "logits/rejected": 10.648375511169434, "logps/chosen": -0.6394751071929932, "logps/rejected": -0.7090991139411926, "loss": 1.3241, "rewards/accuracies": 0.546875, "rewards/chosen": -1.2789502143859863, "rewards/margins": 0.13924814760684967, "rewards/rejected": -1.4181982278823853, "step": 15 }, { "epoch": 0.2107863318237958, "grad_norm": 1.1439241170883179, "learning_rate": 9.652326481535433e-07, "logits/chosen": 10.895740509033203, "logits/rejected": 11.243146896362305, "logps/chosen": -0.6249470710754395, "logps/rejected": -0.6594743132591248, "loss": 1.3593, "rewards/accuracies": 0.4765625, "rewards/chosen": -1.249894142150879, "rewards/margins": 0.06905444711446762, "rewards/rejected": -1.3189486265182495, "step": 16 }, { "epoch": 0.22396047756278303, "grad_norm": 0.6084972023963928, "learning_rate": 9.561347763324483e-07, "logits/chosen": 10.294602394104004, "logits/rejected": 10.75848388671875, "logps/chosen": -0.6647714972496033, "logps/rejected": -0.6630449891090393, "loss": 1.4144, "rewards/accuracies": 0.5703125, "rewards/chosen": -1.3295429944992065, "rewards/margins": -0.0034531853161752224, "rewards/rejected": -1.3260899782180786, "step": 17 }, { "epoch": 0.23713462330177026, "grad_norm": 1.20784330368042, "learning_rate": 9.460342212786932e-07, "logits/chosen": 10.368552207946777, "logits/rejected": 11.025596618652344, "logps/chosen": -0.8019055724143982, "logps/rejected": -0.7324545979499817, "loss": 1.5474, "rewards/accuracies": 0.53125, "rewards/chosen": -1.6038111448287964, "rewards/margins": -0.13890185952186584, "rewards/rejected": -1.4649091958999634, "step": 18 }, { "epoch": 0.2503087690407575, "grad_norm": 0.6674954891204834, "learning_rate": 9.349531862043951e-07, "logits/chosen": 10.525771141052246, "logits/rejected": 10.864877700805664, "logps/chosen": -0.6636431217193604, "logps/rejected": -0.690964937210083, "loss": 1.3837, "rewards/accuracies": 0.640625, "rewards/chosen": -1.3272862434387207, "rewards/margins": 0.05464361608028412, "rewards/rejected": -1.381929874420166, "step": 19 }, { "epoch": 0.2634829147797447, "grad_norm": 0.9430455565452576, "learning_rate": 9.229160296295487e-07, "logits/chosen": 10.509737014770508, "logits/rejected": 11.218311309814453, "logps/chosen": -0.7572717666625977, "logps/rejected": -0.7445338368415833, "loss": 1.4599, "rewards/accuracies": 0.5, "rewards/chosen": -1.5145435333251953, "rewards/margins": -0.02547581121325493, "rewards/rejected": -1.4890676736831665, "step": 20 }, { "epoch": 0.276657060518732, "grad_norm": 0.5352253317832947, "learning_rate": 9.099492118367122e-07, "logits/chosen": 9.723971366882324, "logits/rejected": 10.554994583129883, "logps/chosen": -0.706066906452179, "logps/rejected": -0.7981133460998535, "loss": 1.3079, "rewards/accuracies": 0.5625, "rewards/chosen": -1.412133812904358, "rewards/margins": 0.18409286439418793, "rewards/rejected": -1.596226692199707, "step": 21 }, { "epoch": 0.28983120625771924, "grad_norm": 1.4943097829818726, "learning_rate": 8.960812367055646e-07, "logits/chosen": 10.025705337524414, "logits/rejected": 10.595457077026367, "logps/chosen": -0.7785383462905884, "logps/rejected": -0.7879722118377686, "loss": 1.4168, "rewards/accuracies": 0.5, "rewards/chosen": -1.5570766925811768, "rewards/margins": 0.01886790432035923, "rewards/rejected": -1.575944423675537, "step": 22 }, { "epoch": 0.3030053519967065, "grad_norm": 0.4453490972518921, "learning_rate": 8.813425890551909e-07, "logits/chosen": 10.01511001586914, "logits/rejected": 10.817825317382812, "logps/chosen": -0.7974975109100342, "logps/rejected": -0.7531540393829346, "loss": 1.5107, "rewards/accuracies": 0.5078125, "rewards/chosen": -1.5949950218200684, "rewards/margins": -0.08868695795536041, "rewards/rejected": -1.5063080787658691, "step": 23 }, { "epoch": 0.3161794977356937, "grad_norm": 0.8292773962020874, "learning_rate": 8.657656676318345e-07, "logits/chosen": 10.253622055053711, "logits/rejected": 10.707040786743164, "logps/chosen": -0.6545951962471008, "logps/rejected": -0.6871030330657959, "loss": 1.3837, "rewards/accuracies": 0.546875, "rewards/chosen": -1.3091903924942017, "rewards/margins": 0.06501554697751999, "rewards/rejected": -1.3742060661315918, "step": 24 }, { "epoch": 0.32935364347468093, "grad_norm": 3.5230870246887207, "learning_rate": 8.493847138894208e-07, "logits/chosen": 10.166427612304688, "logits/rejected": 10.720212936401367, "logps/chosen": -0.6964614391326904, "logps/rejected": -0.7443124055862427, "loss": 1.3755, "rewards/accuracies": 0.5546875, "rewards/chosen": -1.3929228782653809, "rewards/margins": 0.09570197016000748, "rewards/rejected": -1.4886248111724854, "step": 25 }, { "epoch": 0.34252778921366817, "grad_norm": 1.1598762273788452, "learning_rate": 8.322357367194108e-07, "logits/chosen": 10.49393367767334, "logits/rejected": 11.039409637451172, "logps/chosen": -0.7815979719161987, "logps/rejected": -0.8116061687469482, "loss": 1.3981, "rewards/accuracies": 0.5, "rewards/chosen": -1.5631959438323975, "rewards/margins": 0.06001615524291992, "rewards/rejected": -1.6232123374938965, "step": 26 }, { "epoch": 0.3557019349526554, "grad_norm": 2.6781487464904785, "learning_rate": 8.143564332954425e-07, "logits/chosen": 10.239036560058594, "logits/rejected": 11.069831848144531, "logps/chosen": -0.7489575147628784, "logps/rejected": -0.7497468590736389, "loss": 1.4293, "rewards/accuracies": 0.4921875, "rewards/chosen": -1.4979150295257568, "rewards/margins": 0.0015787146985530853, "rewards/rejected": -1.4994937181472778, "step": 27 }, { "epoch": 0.3688760806916426, "grad_norm": 0.4053559899330139, "learning_rate": 7.957861062067612e-07, "logits/chosen": 9.529670715332031, "logits/rejected": 10.38549518585205, "logps/chosen": -0.7035645246505737, "logps/rejected": -0.7123870253562927, "loss": 1.4258, "rewards/accuracies": 0.546875, "rewards/chosen": -1.4071290493011475, "rewards/margins": 0.017645111307501793, "rewards/rejected": -1.4247740507125854, "step": 28 }, { "epoch": 0.3820502264306299, "grad_norm": 0.6468765735626221, "learning_rate": 7.765655770625996e-07, "logits/chosen": 10.515686988830566, "logits/rejected": 10.88726806640625, "logps/chosen": -0.6611747145652771, "logps/rejected": -0.7040727734565735, "loss": 1.368, "rewards/accuracies": 0.6328125, "rewards/chosen": -1.3223494291305542, "rewards/margins": 0.08579609543085098, "rewards/rejected": -1.408145546913147, "step": 29 }, { "epoch": 0.39522437216961714, "grad_norm": 0.6986653208732605, "learning_rate": 7.567370967574209e-07, "logits/chosen": 10.111821174621582, "logits/rejected": 11.31137466430664, "logps/chosen": -0.6936085820198059, "logps/rejected": -0.6943917274475098, "loss": 1.4297, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3872171640396118, "rewards/margins": 0.0015661753714084625, "rewards/rejected": -1.3887834548950195, "step": 30 }, { "epoch": 0.4083985179086044, "grad_norm": 0.4217478334903717, "learning_rate": 7.363442525942826e-07, "logits/chosen": 9.911711692810059, "logits/rejected": 10.704763412475586, "logps/chosen": -0.6641364097595215, "logps/rejected": -0.6581347584724426, "loss": 1.4344, "rewards/accuracies": 0.546875, "rewards/chosen": -1.328272819519043, "rewards/margins": -0.012003323063254356, "rewards/rejected": -1.3162695169448853, "step": 31 }, { "epoch": 0.4215726636475916, "grad_norm": 0.6227706670761108, "learning_rate": 7.154318724704851e-07, "logits/chosen": 9.962979316711426, "logits/rejected": 10.975818634033203, "logps/chosen": -0.6416503190994263, "logps/rejected": -0.6450206637382507, "loss": 1.4274, "rewards/accuracies": 0.4765625, "rewards/chosen": -1.2833006381988525, "rewards/margins": 0.006740846671164036, "rewards/rejected": -1.2900413274765015, "step": 32 }, { "epoch": 0.43474680938657884, "grad_norm": 0.9526137709617615, "learning_rate": 6.940459263361248e-07, "logits/chosen": 10.01764965057373, "logits/rejected": 10.877232551574707, "logps/chosen": -0.7044752836227417, "logps/rejected": -0.6560808420181274, "loss": 1.5103, "rewards/accuracies": 0.4765625, "rewards/chosen": -1.4089505672454834, "rewards/margins": -0.09678899496793747, "rewards/rejected": -1.3121616840362549, "step": 33 }, { "epoch": 0.44792095512556607, "grad_norm": 0.7180893421173096, "learning_rate": 6.722334251421664e-07, "logits/chosen": 9.873922348022461, "logits/rejected": 10.598325729370117, "logps/chosen": -0.7634012699127197, "logps/rejected": -0.7632129192352295, "loss": 1.4281, "rewards/accuracies": 0.5859375, "rewards/chosen": -1.5268025398254395, "rewards/margins": -0.00037665292620658875, "rewards/rejected": -1.526425838470459, "step": 34 }, { "epoch": 0.4610951008645533, "grad_norm": 1.9519481658935547, "learning_rate": 6.500423175001703e-07, "logits/chosen": 10.66737174987793, "logits/rejected": 11.170511245727539, "logps/chosen": -0.7307225465774536, "logps/rejected": -0.7907586097717285, "loss": 1.3918, "rewards/accuracies": 0.5390625, "rewards/chosen": -1.4614450931549072, "rewards/margins": 0.12007206678390503, "rewards/rejected": -1.581517219543457, "step": 35 }, { "epoch": 0.47426924660354053, "grad_norm": 0.7991163730621338, "learning_rate": 6.275213842808382e-07, "logits/chosen": 10.241875648498535, "logits/rejected": 10.639405250549316, "logps/chosen": -0.6178256869316101, "logps/rejected": -0.6322227120399475, "loss": 1.3976, "rewards/accuracies": 0.4921875, "rewards/chosen": -1.2356513738632202, "rewards/margins": 0.028794117271900177, "rewards/rejected": -1.264445424079895, "step": 36 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 12, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }