{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025, "grad_norm": 1.52798593044281, "learning_rate": 9.756378189094549e-06, "loss": 0.1999, "step": 500 }, { "epoch": 0.05, "grad_norm": 2.1572341918945312, "learning_rate": 9.506253126563282e-06, "loss": 0.0279, "step": 1000 }, { "epoch": 0.075, "grad_norm": 1.0278618335723877, "learning_rate": 9.256128064032017e-06, "loss": 0.0234, "step": 1500 }, { "epoch": 0.1, "grad_norm": 1.9378705024719238, "learning_rate": 9.006003001500752e-06, "loss": 0.0203, "step": 2000 }, { "epoch": 0.125, "grad_norm": 1.0253329277038574, "learning_rate": 8.755877938969486e-06, "loss": 0.0179, "step": 2500 }, { "epoch": 0.15, "grad_norm": 1.588853359222412, "learning_rate": 8.50575287643822e-06, "loss": 0.0171, "step": 3000 }, { "epoch": 0.175, "grad_norm": 2.365065336227417, "learning_rate": 8.255627813906954e-06, "loss": 0.0153, "step": 3500 }, { "epoch": 0.2, "grad_norm": 0.9309738874435425, "learning_rate": 8.005502751375689e-06, "loss": 0.015, "step": 4000 }, { "epoch": 0.225, "grad_norm": 0.4729306697845459, "learning_rate": 7.755377688844424e-06, "loss": 0.0143, "step": 4500 }, { "epoch": 0.25, "grad_norm": 1.5990219116210938, "learning_rate": 7.505252626313157e-06, "loss": 0.014, "step": 5000 }, { "epoch": 0.275, "grad_norm": 0.32973700761795044, "learning_rate": 7.255627813906953e-06, "loss": 0.0128, "step": 5500 }, { "epoch": 0.3, "grad_norm": 0.7276681661605835, "learning_rate": 7.0055027513756875e-06, "loss": 0.013, "step": 6000 }, { "epoch": 0.325, "grad_norm": 1.137271761894226, "learning_rate": 6.7553776888444225e-06, "loss": 0.0129, "step": 6500 }, { "epoch": 0.35, "grad_norm": 0.2594708800315857, "learning_rate": 6.505252626313157e-06, "loss": 0.0124, "step": 7000 }, { "epoch": 0.375, "grad_norm": 0.2336331158876419, "learning_rate": 6.255127563781892e-06, "loss": 0.012, "step": 7500 }, { "epoch": 0.4, "grad_norm": 0.19649972021579742, "learning_rate": 6.005002501250627e-06, "loss": 0.0121, "step": 8000 }, { "epoch": 0.425, "grad_norm": 0.5051653981208801, "learning_rate": 5.754877438719361e-06, "loss": 0.0115, "step": 8500 }, { "epoch": 0.45, "grad_norm": 0.3455007076263428, "learning_rate": 5.504752376188095e-06, "loss": 0.0113, "step": 9000 }, { "epoch": 0.475, "grad_norm": 0.4421294331550598, "learning_rate": 5.254627313656829e-06, "loss": 0.0106, "step": 9500 }, { "epoch": 0.5, "grad_norm": 1.2041834592819214, "learning_rate": 5.004502251125563e-06, "loss": 0.0107, "step": 10000 }, { "epoch": 0.525, "grad_norm": 1.119016408920288, "learning_rate": 4.754377188594297e-06, "loss": 0.011, "step": 10500 }, { "epoch": 0.55, "grad_norm": 0.9011972546577454, "learning_rate": 4.5042521260630315e-06, "loss": 0.0108, "step": 11000 }, { "epoch": 0.575, "grad_norm": 0.4205041229724884, "learning_rate": 4.2546273136568285e-06, "loss": 0.0103, "step": 11500 }, { "epoch": 0.6, "grad_norm": 0.22211983799934387, "learning_rate": 4.0045022511255635e-06, "loss": 0.0097, "step": 12000 }, { "epoch": 0.625, "grad_norm": 0.26337993144989014, "learning_rate": 3.7543771885942976e-06, "loss": 0.01, "step": 12500 }, { "epoch": 0.65, "grad_norm": 0.526462197303772, "learning_rate": 3.5042521260630318e-06, "loss": 0.0104, "step": 13000 }, { "epoch": 0.675, "grad_norm": 0.12982240319252014, "learning_rate": 3.2541270635317664e-06, "loss": 0.0096, "step": 13500 }, { "epoch": 0.7, "grad_norm": 0.8299207091331482, "learning_rate": 3.0040020010005005e-06, "loss": 0.0096, "step": 14000 }, { "epoch": 0.725, "grad_norm": 0.18750137090682983, "learning_rate": 2.753876938469235e-06, "loss": 0.0097, "step": 14500 }, { "epoch": 0.75, "grad_norm": 0.37416380643844604, "learning_rate": 2.5037518759379692e-06, "loss": 0.0097, "step": 15000 }, { "epoch": 0.775, "grad_norm": 0.47138354182243347, "learning_rate": 2.2536268134067034e-06, "loss": 0.0089, "step": 15500 }, { "epoch": 0.8, "grad_norm": 0.859859824180603, "learning_rate": 2.003501750875438e-06, "loss": 0.0089, "step": 16000 }, { "epoch": 0.825, "grad_norm": Infinity, "learning_rate": 1.7538769384692347e-06, "loss": 0.0091, "step": 16500 }, { "epoch": 0.85, "grad_norm": 0.4002706706523895, "learning_rate": 1.503751875937969e-06, "loss": 0.0085, "step": 17000 }, { "epoch": 0.875, "grad_norm": 0.29968053102493286, "learning_rate": 1.2536268134067034e-06, "loss": 0.0084, "step": 17500 }, { "epoch": 0.9, "grad_norm": 1.422002911567688, "learning_rate": 1.0035017508754378e-06, "loss": 0.0089, "step": 18000 }, { "epoch": 0.925, "grad_norm": 0.272173672914505, "learning_rate": 7.533766883441721e-07, "loss": 0.0087, "step": 18500 }, { "epoch": 0.95, "grad_norm": 0.2766880691051483, "learning_rate": 5.03751875937969e-07, "loss": 0.0087, "step": 19000 }, { "epoch": 0.975, "grad_norm": 0.17756783962249756, "learning_rate": 2.5362681340670335e-07, "loss": 0.0088, "step": 19500 }, { "epoch": 1.0, "grad_norm": 0.5026212334632874, "learning_rate": 3.501750875437719e-09, "loss": 0.0089, "step": 20000 } ], "logging_steps": 500, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }