{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9221902017291066, "eval_steps": 500, "global_step": 10, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09221902017291066, "grad_norm": 0.37403297424316406, "learning_rate": 5e-07, "logits/chosen": 0.10453951358795166, "logits/rejected": 0.2759632468223572, "logps/chosen": -132.67144775390625, "logps/ref_chosen": -133.03013610839844, "logps/ref_rejected": -164.177734375, "logps/rejected": -163.5892333984375, "loss": 0.6943, "rewards/accuracies": 0.3984375, "rewards/chosen": 0.003586653620004654, "rewards/margins": -0.002298325300216675, "rewards/rejected": 0.005884978454560041, "step": 1 }, { "epoch": 0.1844380403458213, "grad_norm": 0.358889102935791, "learning_rate": 4.849231551964771e-07, "logits/chosen": 0.08250421285629272, "logits/rejected": 0.33024948835372925, "logps/chosen": -134.9761962890625, "logps/ref_chosen": -135.27749633789062, "logps/ref_rejected": -188.41795349121094, "logps/rejected": -187.88470458984375, "loss": 0.6943, "rewards/accuracies": 0.3515625, "rewards/chosen": 0.003012962406501174, "rewards/margins": -0.0023194574750959873, "rewards/rejected": 0.005332420114427805, "step": 2 }, { "epoch": 0.276657060518732, "grad_norm": 0.3239206075668335, "learning_rate": 4.415111107797445e-07, "logits/chosen": 0.08983182162046432, "logits/rejected": 0.2510722875595093, "logps/chosen": -134.5572052001953, "logps/ref_chosen": -134.77491760253906, "logps/ref_rejected": -161.10980224609375, "logps/rejected": -160.8002471923828, "loss": 0.6936, "rewards/accuracies": 0.453125, "rewards/chosen": 0.002177180489525199, "rewards/margins": -0.0009184688096866012, "rewards/rejected": 0.003095649415627122, "step": 3 }, { "epoch": 0.3688760806916426, "grad_norm": 0.3113608658313751, "learning_rate": 3.75e-07, "logits/chosen": 0.18616041541099548, "logits/rejected": 0.3378028869628906, "logps/chosen": -141.66685485839844, "logps/ref_chosen": -142.0138702392578, "logps/ref_rejected": -173.76629638671875, "logps/rejected": -173.24481201171875, "loss": 0.694, "rewards/accuracies": 0.390625, "rewards/chosen": 0.0034699777606874704, "rewards/margins": -0.0017446475103497505, "rewards/rejected": 0.005214625503867865, "step": 4 }, { "epoch": 0.4610951008645533, "grad_norm": 0.3619195818901062, "learning_rate": 2.934120444167326e-07, "logits/chosen": 0.1221667155623436, "logits/rejected": 0.268534779548645, "logps/chosen": -134.57679748535156, "logps/ref_chosen": -134.8294677734375, "logps/ref_rejected": -177.42715454101562, "logps/rejected": -177.0855712890625, "loss": 0.6936, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.002526558004319668, "rewards/margins": -0.0008892094483599067, "rewards/rejected": 0.00341576780192554, "step": 5 }, { "epoch": 0.553314121037464, "grad_norm": 0.3865343928337097, "learning_rate": 2.065879555832674e-07, "logits/chosen": 0.19299811124801636, "logits/rejected": 0.30019116401672363, "logps/chosen": -135.14425659179688, "logps/ref_chosen": -135.45623779296875, "logps/ref_rejected": -159.72341918945312, "logps/rejected": -159.44705200195312, "loss": 0.693, "rewards/accuracies": 0.515625, "rewards/chosen": 0.003119847271591425, "rewards/margins": 0.00035619616392068565, "rewards/rejected": 0.002763650845736265, "step": 6 }, { "epoch": 0.6455331412103746, "grad_norm": 0.40479081869125366, "learning_rate": 1.2500000000000005e-07, "logits/chosen": 0.15088775753974915, "logits/rejected": 0.3000352084636688, "logps/chosen": -139.46456909179688, "logps/ref_chosen": -139.45156860351562, "logps/ref_rejected": -172.6890869140625, "logps/rejected": -172.62013244628906, "loss": 0.6936, "rewards/accuracies": 0.4765625, "rewards/chosen": -0.0001300960429944098, "rewards/margins": -0.000819505425170064, "rewards/rejected": 0.0006894093239679933, "step": 7 }, { "epoch": 0.7377521613832853, "grad_norm": 0.3541257679462433, "learning_rate": 5.848888922025552e-08, "logits/chosen": 0.15152569115161896, "logits/rejected": 0.2909863293170929, "logps/chosen": -133.0579376220703, "logps/ref_chosen": -133.19911193847656, "logps/ref_rejected": -167.17926025390625, "logps/rejected": -167.0704803466797, "loss": 0.693, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.0014116661623120308, "rewards/margins": 0.00032381698838435113, "rewards/rejected": 0.0010878491448238492, "step": 8 }, { "epoch": 0.829971181556196, "grad_norm": 0.36632558703422546, "learning_rate": 1.507684480352292e-08, "logits/chosen": 0.0733647495508194, "logits/rejected": 0.21200355887413025, "logps/chosen": -137.79318237304688, "logps/ref_chosen": -137.95736694335938, "logps/ref_rejected": -172.7656707763672, "logps/rejected": -172.64659118652344, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0016417349688708782, "rewards/margins": 0.00045086207683198154, "rewards/rejected": 0.0011908727465197444, "step": 9 }, { "epoch": 0.9221902017291066, "grad_norm": 0.33898451924324036, "learning_rate": 0.0, "logits/chosen": 0.10225574672222137, "logits/rejected": 0.29343536496162415, "logps/chosen": -135.6978759765625, "logps/ref_chosen": -135.66470336914062, "logps/ref_rejected": -178.8431854248047, "logps/rejected": -178.8609619140625, "loss": 0.6932, "rewards/accuracies": 0.4453125, "rewards/chosen": -0.0003318000235594809, "rewards/margins": -0.00015407620230689645, "rewards/rejected": -0.00017772376304492354, "step": 10 }, { "epoch": 0.9221902017291066, "step": 10, "total_flos": 0.0, "train_loss": 0.6935525000095367, "train_runtime": 687.8249, "train_samples_per_second": 2.017, "train_steps_per_second": 0.015 } ], "logging_steps": 1, "max_steps": 10, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }