{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.000605226127611929, "eval_steps": 5, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.026130638059645e-05, "grad_norm": 6.897684097290039, "learning_rate": 2e-05, "loss": 14.1293, "step": 1 }, { "epoch": 3.026130638059645e-05, "eval_loss": 3.439159631729126, "eval_runtime": 842.2168, "eval_samples_per_second": 16.521, "eval_steps_per_second": 8.26, "step": 1 }, { "epoch": 6.05226127611929e-05, "grad_norm": 7.347817897796631, "learning_rate": 4e-05, "loss": 13.2212, "step": 2 }, { "epoch": 9.078391914178935e-05, "grad_norm": 6.220455646514893, "learning_rate": 6e-05, "loss": 13.1675, "step": 3 }, { "epoch": 0.0001210452255223858, "grad_norm": 6.953392505645752, "learning_rate": 8e-05, "loss": 14.161, "step": 4 }, { "epoch": 0.00015130653190298225, "grad_norm": 5.382694721221924, "learning_rate": 0.0001, "loss": 12.7597, "step": 5 }, { "epoch": 0.00015130653190298225, "eval_loss": 3.2645046710968018, "eval_runtime": 841.9675, "eval_samples_per_second": 16.526, "eval_steps_per_second": 8.263, "step": 5 }, { "epoch": 0.0001815678382835787, "grad_norm": 4.9391889572143555, "learning_rate": 0.00012, "loss": 12.6619, "step": 6 }, { "epoch": 0.00021182914466417516, "grad_norm": 4.317287921905518, "learning_rate": 0.00014, "loss": 11.4259, "step": 7 }, { "epoch": 0.0002420904510447716, "grad_norm": 5.104113578796387, "learning_rate": 0.00016, "loss": 12.6299, "step": 8 }, { "epoch": 0.0002723517574253681, "grad_norm": 5.477384567260742, "learning_rate": 0.00018, "loss": 12.2808, "step": 9 }, { "epoch": 0.0003026130638059645, "grad_norm": 5.456517696380615, "learning_rate": 0.0002, "loss": 12.0744, "step": 10 }, { "epoch": 0.0003026130638059645, "eval_loss": 2.7829158306121826, "eval_runtime": 843.9912, "eval_samples_per_second": 16.486, "eval_steps_per_second": 8.243, "step": 10 }, { "epoch": 0.00033287437018656097, "grad_norm": 5.335687160491943, "learning_rate": 0.00019510565162951537, "loss": 12.4119, "step": 11 }, { "epoch": 0.0003631356765671574, "grad_norm": 5.601584434509277, "learning_rate": 0.00018090169943749476, "loss": 11.0707, "step": 12 }, { "epoch": 0.00039339698294775386, "grad_norm": 5.754549503326416, "learning_rate": 0.00015877852522924732, "loss": 9.4772, "step": 13 }, { "epoch": 0.00042365828932835033, "grad_norm": 8.268277168273926, "learning_rate": 0.00013090169943749476, "loss": 10.7824, "step": 14 }, { "epoch": 0.00045391959570894675, "grad_norm": 7.563680648803711, "learning_rate": 0.0001, "loss": 11.0886, "step": 15 }, { "epoch": 0.00045391959570894675, "eval_loss": 2.6860508918762207, "eval_runtime": 847.5438, "eval_samples_per_second": 16.417, "eval_steps_per_second": 8.208, "step": 15 }, { "epoch": 0.0004841809020895432, "grad_norm": 7.714522838592529, "learning_rate": 6.909830056250527e-05, "loss": 9.9248, "step": 16 }, { "epoch": 0.0005144422084701396, "grad_norm": 7.84112024307251, "learning_rate": 4.12214747707527e-05, "loss": 10.7425, "step": 17 }, { "epoch": 0.0005447035148507362, "grad_norm": 6.446829319000244, "learning_rate": 1.9098300562505266e-05, "loss": 10.3292, "step": 18 }, { "epoch": 0.0005749648212313326, "grad_norm": 7.735879421234131, "learning_rate": 4.8943483704846475e-06, "loss": 10.3231, "step": 19 }, { "epoch": 0.000605226127611929, "grad_norm": 6.886676788330078, "learning_rate": 0.0, "loss": 9.9093, "step": 20 }, { "epoch": 0.000605226127611929, "eval_loss": 2.6602911949157715, 
"eval_runtime": 843.1817, "eval_samples_per_second": 16.502, "eval_steps_per_second": 8.251, "step": 20 } ], "logging_steps": 1, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6534646443540480.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }