{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15808974886784685, "eval_steps": 500, "global_step": 12, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013174145738987238, "grad_norm": 0.5102696418762207, "learning_rate": 1.25e-07, "logits/chosen": 10.088521957397461, "logits/rejected": 10.263787269592285, "logps/chosen": -0.9118157029151917, "logps/rejected": -0.9621729850769043, "loss": 1.3897, "rewards/accuracies": 0.5234375, "rewards/chosen": -1.8236314058303833, "rewards/margins": 0.10071463882923126, "rewards/rejected": -1.9243459701538086, "step": 1 }, { "epoch": 0.026348291477974475, "grad_norm": 0.9815747141838074, "learning_rate": 2.5e-07, "logits/chosen": 10.592972755432129, "logits/rejected": 10.720216751098633, "logps/chosen": -0.945902943611145, "logps/rejected": -1.0317902565002441, "loss": 1.3077, "rewards/accuracies": 0.609375, "rewards/chosen": -1.89180588722229, "rewards/margins": 0.1717745065689087, "rewards/rejected": -2.0635805130004883, "step": 2 }, { "epoch": 0.03952243721696171, "grad_norm": 0.9049758315086365, "learning_rate": 3.75e-07, "logits/chosen": 10.041976928710938, "logits/rejected": 10.399316787719727, "logps/chosen": -1.0869810581207275, "logps/rejected": -1.1895216703414917, "loss": 1.346, "rewards/accuracies": 0.578125, "rewards/chosen": -2.173962116241455, "rewards/margins": 0.20508113503456116, "rewards/rejected": -2.3790433406829834, "step": 3 }, { "epoch": 0.05269658295594895, "grad_norm": 1.8911848068237305, "learning_rate": 5e-07, "logits/chosen": 10.243470191955566, "logits/rejected": 10.443375587463379, "logps/chosen": -0.966098427772522, "logps/rejected": -1.0040662288665771, "loss": 1.4032, "rewards/accuracies": 0.546875, "rewards/chosen": -1.932196855545044, "rewards/margins": 0.07593552023172379, "rewards/rejected": -2.0081324577331543, "step": 4 }, { "epoch": 0.06587072869493618, "grad_norm": 0.6135074496269226, "learning_rate": 6.249999999999999e-07, "logits/chosen": 10.439040184020996, "logits/rejected": 10.739177703857422, "logps/chosen": -0.9262609481811523, "logps/rejected": -0.9657196998596191, "loss": 1.3727, "rewards/accuracies": 0.5546875, "rewards/chosen": -1.8525218963623047, "rewards/margins": 0.07891744375228882, "rewards/rejected": -1.9314393997192383, "step": 5 }, { "epoch": 0.07904487443392343, "grad_norm": 0.5990542769432068, "learning_rate": 7.5e-07, "logits/chosen": 10.910269737243652, "logits/rejected": 11.204473495483398, "logps/chosen": -0.9439595341682434, "logps/rejected": -1.0420396327972412, "loss": 1.3491, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8879190683364868, "rewards/margins": 0.196160227060318, "rewards/rejected": -2.0840792655944824, "step": 6 }, { "epoch": 0.09221902017291066, "grad_norm": 1.3676807880401611, "learning_rate": 8.75e-07, "logits/chosen": 9.873465538024902, "logits/rejected": 10.022269248962402, "logps/chosen": -0.8941428661346436, "logps/rejected": -1.0010743141174316, "loss": 1.3507, "rewards/accuracies": 0.625, "rewards/chosen": -1.788285732269287, "rewards/margins": 0.21386288106441498, "rewards/rejected": -2.0021486282348633, "step": 7 }, { "epoch": 0.1053931659118979, "grad_norm": 1.7690002918243408, "learning_rate": 1e-06, "logits/chosen": 10.597719192504883, "logits/rejected": 10.780376434326172, "logps/chosen": -0.9080270528793335, "logps/rejected": -0.9909782409667969, "loss": 1.3305, "rewards/accuracies": 0.640625, "rewards/chosen": -1.816054105758667, "rewards/margins": 0.16590236127376556, "rewards/rejected": -1.9819564819335938, "step": 8 }, { "epoch": 0.11856731165088513, "grad_norm": 1.056247353553772, "learning_rate": 9.994504457428556e-07, "logits/chosen": 10.446786880493164, "logits/rejected": 10.839168548583984, "logps/chosen": -1.1091859340667725, "logps/rejected": -1.0694739818572998, "loss": 1.5127, "rewards/accuracies": 0.5390625, "rewards/chosen": -2.218371868133545, "rewards/margins": -0.07942387461662292, "rewards/rejected": -2.1389479637145996, "step": 9 }, { "epoch": 0.13174145738987236, "grad_norm": 2.076240062713623, "learning_rate": 9.97802991010949e-07, "logits/chosen": 10.343971252441406, "logits/rejected": 10.492179870605469, "logps/chosen": -0.9705042839050293, "logps/rejected": -0.9916192889213562, "loss": 1.4611, "rewards/accuracies": 0.53125, "rewards/chosen": -1.9410085678100586, "rewards/margins": 0.04223020374774933, "rewards/rejected": -1.9832385778427124, "step": 10 }, { "epoch": 0.14491560312885962, "grad_norm": 1.13358736038208, "learning_rate": 9.950612572673255e-07, "logits/chosen": 10.49313735961914, "logits/rejected": 10.680143356323242, "logps/chosen": -1.1081148386001587, "logps/rejected": -1.223841667175293, "loss": 1.3449, "rewards/accuracies": 0.59375, "rewards/chosen": -2.2162296772003174, "rewards/margins": 0.23145350813865662, "rewards/rejected": -2.447683334350586, "step": 11 }, { "epoch": 0.15808974886784685, "grad_norm": 1.1638388633728027, "learning_rate": 9.912312714377879e-07, "logits/chosen": 10.328557014465332, "logits/rejected": 10.365793228149414, "logps/chosen": -0.9249637722969055, "logps/rejected": -0.9842618703842163, "loss": 1.351, "rewards/accuracies": 0.6015625, "rewards/chosen": -1.849927544593811, "rewards/margins": 0.11859625577926636, "rewards/rejected": -1.9685237407684326, "step": 12 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 12, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }