{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15808974886784685, "eval_steps": 500, "global_step": 12, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013174145738987238, "grad_norm": 0.7230468392372131, "learning_rate": 1.25e-07, "logits/chosen": 9.993395805358887, "logits/rejected": 10.444890022277832, "logps/chosen": -0.8203260898590088, "logps/rejected": -0.8579692840576172, "loss": 1.4012, "rewards/accuracies": 0.546875, "rewards/chosen": -1.6406521797180176, "rewards/margins": 0.07528629899024963, "rewards/rejected": -1.7159385681152344, "step": 1 }, { "epoch": 0.026348291477974475, "grad_norm": 1.0001572370529175, "learning_rate": 2.5e-07, "logits/chosen": 10.52372932434082, "logits/rejected": 10.788372993469238, "logps/chosen": -0.8189243078231812, "logps/rejected": -0.9178226590156555, "loss": 1.2834, "rewards/accuracies": 0.625, "rewards/chosen": -1.6378486156463623, "rewards/margins": 0.19779649376869202, "rewards/rejected": -1.835645318031311, "step": 2 }, { "epoch": 0.03952243721696171, "grad_norm": 0.979387640953064, "learning_rate": 3.75e-07, "logits/chosen": 10.042590141296387, "logits/rejected": 10.521858215332031, "logps/chosen": -0.917183518409729, "logps/rejected": -0.9847703576087952, "loss": 1.3664, "rewards/accuracies": 0.5546875, "rewards/chosen": -1.834367036819458, "rewards/margins": 0.13517364859580994, "rewards/rejected": -1.9695407152175903, "step": 3 }, { "epoch": 0.05269658295594895, "grad_norm": 1.867851972579956, "learning_rate": 5e-07, "logits/chosen": 10.141695022583008, "logits/rejected": 10.483091354370117, "logps/chosen": -0.8512520790100098, "logps/rejected": -0.8865377902984619, "loss": 1.4018, "rewards/accuracies": 0.546875, "rewards/chosen": -1.7025041580200195, "rewards/margins": 0.0705714151263237, "rewards/rejected": -1.7730755805969238, "step": 4 }, { "epoch": 0.06587072869493618, "grad_norm": 0.5915809273719788, "learning_rate": 6.249999999999999e-07, "logits/chosen": 10.462024688720703, "logits/rejected": 11.114532470703125, "logps/chosen": -0.7721583247184753, "logps/rejected": -0.7985925078392029, "loss": 1.3829, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5443166494369507, "rewards/margins": 0.052868396043777466, "rewards/rejected": -1.5971850156784058, "step": 5 }, { "epoch": 0.07904487443392343, "grad_norm": 0.6252888441085815, "learning_rate": 7.5e-07, "logits/chosen": 10.718672752380371, "logits/rejected": 11.112823486328125, "logps/chosen": -0.8559192419052124, "logps/rejected": -0.9305768013000488, "loss": 1.3618, "rewards/accuracies": 0.5859375, "rewards/chosen": -1.7118384838104248, "rewards/margins": 0.1493152379989624, "rewards/rejected": -1.8611536026000977, "step": 6 }, { "epoch": 0.09221902017291066, "grad_norm": 0.7208986878395081, "learning_rate": 8.75e-07, "logits/chosen": 9.864439010620117, "logits/rejected": 10.09216022491455, "logps/chosen": -0.7553099393844604, "logps/rejected": -0.8098596930503845, "loss": 1.3613, "rewards/accuracies": 0.5859375, "rewards/chosen": -1.510619878768921, "rewards/margins": 0.10909969359636307, "rewards/rejected": -1.619719386100769, "step": 7 }, { "epoch": 0.1053931659118979, "grad_norm": 1.6331408023834229, "learning_rate": 1e-06, "logits/chosen": 10.65585994720459, "logits/rejected": 11.107338905334473, "logps/chosen": -0.774090051651001, "logps/rejected": -0.8291972279548645, "loss": 1.3462, "rewards/accuracies": 0.625, "rewards/chosen": -1.548180103302002, "rewards/margins": 0.11021438986063004, "rewards/rejected": -1.658394455909729, "step": 8 }, { "epoch": 0.11856731165088513, "grad_norm": 1.0860488414764404, "learning_rate": 9.994504457428556e-07, "logits/chosen": 10.446334838867188, "logits/rejected": 10.82742691040039, "logps/chosen": -0.938445508480072, "logps/rejected": -0.8894600868225098, "loss": 1.5125, "rewards/accuracies": 0.53125, "rewards/chosen": -1.876891016960144, "rewards/margins": -0.09797099232673645, "rewards/rejected": -1.7789201736450195, "step": 9 }, { "epoch": 0.13174145738987236, "grad_norm": 1.3078101873397827, "learning_rate": 9.97802991010949e-07, "logits/chosen": 10.292729377746582, "logits/rejected": 10.629972457885742, "logps/chosen": -0.8660728931427002, "logps/rejected": -0.881781280040741, "loss": 1.4654, "rewards/accuracies": 0.5390625, "rewards/chosen": -1.7321457862854004, "rewards/margins": 0.0314166434109211, "rewards/rejected": -1.763562560081482, "step": 10 }, { "epoch": 0.14491560312885962, "grad_norm": 1.0685713291168213, "learning_rate": 9.950612572673255e-07, "logits/chosen": 10.143664360046387, "logits/rejected": 10.639056205749512, "logps/chosen": -0.8788573145866394, "logps/rejected": -0.9421562552452087, "loss": 1.3638, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7577146291732788, "rewards/margins": 0.12659788131713867, "rewards/rejected": -1.8843125104904175, "step": 11 }, { "epoch": 0.15808974886784685, "grad_norm": 0.9801668524742126, "learning_rate": 9.912312714377879e-07, "logits/chosen": 10.19462776184082, "logits/rejected": 10.269098281860352, "logps/chosen": -0.8250212669372559, "logps/rejected": -0.8715258836746216, "loss": 1.3649, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6500425338745117, "rewards/margins": 0.09300932288169861, "rewards/rejected": -1.7430517673492432, "step": 12 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 12, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }