{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15808974886784685, "eval_steps": 500, "global_step": 12, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013174145738987238, "grad_norm": 0.538081705570221, "learning_rate": 1.25e-07, "logits/chosen": 10.038352012634277, "logits/rejected": 10.592904090881348, "logps/chosen": -0.6228358745574951, "logps/rejected": -0.6871199011802673, "loss": 1.342, "rewards/accuracies": 0.578125, "rewards/chosen": -1.2456717491149902, "rewards/margins": 0.12856802344322205, "rewards/rejected": -1.3742398023605347, "step": 1 }, { "epoch": 0.026348291477974475, "grad_norm": 0.6521235108375549, "learning_rate": 2.5e-07, "logits/chosen": 10.320584297180176, "logits/rejected": 10.721946716308594, "logps/chosen": -0.7115719318389893, "logps/rejected": -0.788784384727478, "loss": 1.3147, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4231438636779785, "rewards/margins": 0.15442489087581635, "rewards/rejected": -1.577568769454956, "step": 2 }, { "epoch": 0.03952243721696171, "grad_norm": 0.8797138929367065, "learning_rate": 3.75e-07, "logits/chosen": 9.899504661560059, "logits/rejected": 10.505952835083008, "logps/chosen": -0.8225007057189941, "logps/rejected": -0.8832307457923889, "loss": 1.3674, "rewards/accuracies": 0.5078125, "rewards/chosen": -1.6450014114379883, "rewards/margins": 0.12146000564098358, "rewards/rejected": -1.7664614915847778, "step": 3 }, { "epoch": 0.05269658295594895, "grad_norm": 1.9139935970306396, "learning_rate": 5e-07, "logits/chosen": 10.082985877990723, "logits/rejected": 10.576549530029297, "logps/chosen": -0.6892099976539612, "logps/rejected": -0.7180394530296326, "loss": 1.4038, "rewards/accuracies": 0.5, "rewards/chosen": -1.3784199953079224, "rewards/margins": 0.05765870213508606, "rewards/rejected": -1.4360789060592651, "step": 4 }, { "epoch": 0.06587072869493618, "grad_norm": 0.8647859692573547, "learning_rate": 6.249999999999999e-07, "logits/chosen": 10.318564414978027, "logits/rejected": 11.072587966918945, "logps/chosen": -0.6658570766448975, "logps/rejected": -0.6663312911987305, "loss": 1.4062, "rewards/accuracies": 0.5703125, "rewards/chosen": -1.331714153289795, "rewards/margins": 0.0009482596069574356, "rewards/rejected": -1.332662582397461, "step": 5 }, { "epoch": 0.07904487443392343, "grad_norm": 0.7906696796417236, "learning_rate": 7.5e-07, "logits/chosen": 10.802580833435059, "logits/rejected": 11.333773612976074, "logps/chosen": -0.7257988452911377, "logps/rejected": -0.7839725017547607, "loss": 1.3781, "rewards/accuracies": 0.6015625, "rewards/chosen": -1.4515976905822754, "rewards/margins": 0.11634734272956848, "rewards/rejected": -1.5679450035095215, "step": 6 }, { "epoch": 0.09221902017291066, "grad_norm": 0.724219799041748, "learning_rate": 8.75e-07, "logits/chosen": 9.928263664245605, "logits/rejected": 10.422144889831543, "logps/chosen": -0.5926575660705566, "logps/rejected": -0.6688517928123474, "loss": 1.314, "rewards/accuracies": 0.5859375, "rewards/chosen": -1.1853151321411133, "rewards/margins": 0.15238842368125916, "rewards/rejected": -1.3377035856246948, "step": 7 }, { "epoch": 0.1053931659118979, "grad_norm": 0.558660089969635, "learning_rate": 1e-06, "logits/chosen": 10.657012939453125, "logits/rejected": 11.171004295349121, "logps/chosen": -0.6659789681434631, "logps/rejected": -0.7012848258018494, "loss": 1.365, "rewards/accuracies": 0.578125, "rewards/chosen": -1.3319579362869263, "rewards/margins": 0.07061176747083664, "rewards/rejected": -1.4025696516036987, "step": 8 }, { "epoch": 0.11856731165088513, "grad_norm": 0.7675488591194153, "learning_rate": 9.994504457428556e-07, "logits/chosen": 10.544637680053711, "logits/rejected": 10.839460372924805, "logps/chosen": -0.814159095287323, "logps/rejected": -0.7815468907356262, "loss": 1.4888, "rewards/accuracies": 0.515625, "rewards/chosen": -1.628318190574646, "rewards/margins": -0.0652243047952652, "rewards/rejected": -1.5630937814712524, "step": 9 }, { "epoch": 0.13174145738987236, "grad_norm": 1.1157082319259644, "learning_rate": 9.97802991010949e-07, "logits/chosen": 10.10114574432373, "logits/rejected": 10.555818557739258, "logps/chosen": -0.673196017742157, "logps/rejected": -0.6864349246025085, "loss": 1.4279, "rewards/accuracies": 0.5546875, "rewards/chosen": -1.346392035484314, "rewards/margins": 0.02647773176431656, "rewards/rejected": -1.372869849205017, "step": 10 }, { "epoch": 0.14491560312885962, "grad_norm": 1.2121509313583374, "learning_rate": 9.950612572673255e-07, "logits/chosen": 10.158075332641602, "logits/rejected": 10.813385009765625, "logps/chosen": -0.7734582424163818, "logps/rejected": -0.8254096508026123, "loss": 1.3538, "rewards/accuracies": 0.5546875, "rewards/chosen": -1.5469164848327637, "rewards/margins": 0.10390281677246094, "rewards/rejected": -1.6508193016052246, "step": 11 }, { "epoch": 0.15808974886784685, "grad_norm": 0.921929657459259, "learning_rate": 9.912312714377879e-07, "logits/chosen": 10.2570161819458, "logits/rejected": 10.633421897888184, "logps/chosen": -0.7107410430908203, "logps/rejected": -0.7390152812004089, "loss": 1.3791, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4214820861816406, "rewards/margins": 0.056548528373241425, "rewards/rejected": -1.4780305624008179, "step": 12 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 12, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }