{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9955555555555555,
  "eval_steps": 100,
  "global_step": 562,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.035555555555555556,
      "grad_norm": 11.767955780029297,
      "learning_rate": 0.00019679715302491104,
      "loss": 33.6554,
      "step": 10
    },
    {
      "epoch": 0.07111111111111111,
      "grad_norm": 9.438668251037598,
      "learning_rate": 0.0001932384341637011,
      "loss": 33.8787,
      "step": 20
    },
    {
      "epoch": 0.10666666666666667,
      "grad_norm": 10.174617767333984,
      "learning_rate": 0.00018967971530249112,
      "loss": 33.8919,
      "step": 30
    },
    {
      "epoch": 0.14222222222222222,
      "grad_norm": 8.84274673461914,
      "learning_rate": 0.00018612099644128114,
      "loss": 33.7011,
      "step": 40
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 10.169342041015625,
      "learning_rate": 0.0001825622775800712,
      "loss": 33.6306,
      "step": 50
    },
    {
      "epoch": 0.21333333333333335,
      "grad_norm": 9.339362144470215,
      "learning_rate": 0.0001790035587188612,
      "loss": 33.5378,
      "step": 60
    },
    {
      "epoch": 0.24888888888888888,
      "grad_norm": 10.399051666259766,
      "learning_rate": 0.00017544483985765125,
      "loss": 33.1223,
      "step": 70
    },
    {
      "epoch": 0.28444444444444444,
      "grad_norm": 8.772202491760254,
      "learning_rate": 0.00017188612099644127,
      "loss": 34.3864,
      "step": 80
    },
    {
      "epoch": 0.32,
      "grad_norm": 9.338233947753906,
      "learning_rate": 0.00016832740213523133,
      "loss": 33.2955,
      "step": 90
    },
    {
      "epoch": 0.35555555555555557,
      "grad_norm": 9.439739227294922,
      "learning_rate": 0.00016476868327402135,
      "loss": 33.229,
      "step": 100
    },
    {
      "epoch": 0.35555555555555557,
      "eval_loss": 2.133469820022583,
      "eval_runtime": 296.1668,
      "eval_samples_per_second": 3.376,
      "eval_steps_per_second": 0.422,
      "step": 100
    },
    {
      "epoch": 0.39111111111111113,
      "grad_norm": 9.046673774719238,
      "learning_rate": 0.0001612099644128114,
      "loss": 33.3667,
      "step": 110
    },
    {
      "epoch": 0.4266666666666667,
      "grad_norm": 8.99227237701416,
      "learning_rate": 0.00015765124555160143,
      "loss": 32.6701,
      "step": 120
    },
    {
      "epoch": 0.4622222222222222,
      "grad_norm": 7.6904144287109375,
      "learning_rate": 0.00015409252669039148,
      "loss": 33.2927,
      "step": 130
    },
    {
      "epoch": 0.49777777777777776,
      "grad_norm": 8.012206077575684,
      "learning_rate": 0.00015053380782918148,
      "loss": 33.2934,
      "step": 140
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 10.931622505187988,
      "learning_rate": 0.00014697508896797153,
      "loss": 33.3676,
      "step": 150
    },
    {
      "epoch": 0.5688888888888889,
      "grad_norm": 7.606035232543945,
      "learning_rate": 0.00014341637010676156,
      "loss": 34.1758,
      "step": 160
    },
    {
      "epoch": 0.6044444444444445,
      "grad_norm": 9.531214714050293,
      "learning_rate": 0.0001398576512455516,
      "loss": 33.0847,
      "step": 170
    },
    {
      "epoch": 0.64,
      "grad_norm": 8.761300086975098,
      "learning_rate": 0.00013629893238434164,
      "loss": 33.5206,
      "step": 180
    },
    {
      "epoch": 0.6755555555555556,
      "grad_norm": 9.155729293823242,
      "learning_rate": 0.0001327402135231317,
      "loss": 33.2403,
      "step": 190
    },
    {
      "epoch": 0.7111111111111111,
      "grad_norm": 9.354476928710938,
      "learning_rate": 0.00012918149466192172,
      "loss": 33.5548,
      "step": 200
    },
    {
      "epoch": 0.7111111111111111,
      "eval_loss": 2.126850128173828,
      "eval_runtime": 296.1679,
      "eval_samples_per_second": 3.376,
      "eval_steps_per_second": 0.422,
      "step": 200
    },
    {
      "epoch": 0.7466666666666667,
      "grad_norm": 8.922224998474121,
      "learning_rate": 0.00012562277580071177,
      "loss": 33.279,
      "step": 210
    },
    {
      "epoch": 0.7822222222222223,
      "grad_norm": 9.973633766174316,
      "learning_rate": 0.00012206405693950178,
      "loss": 33.5481,
      "step": 220
    },
    {
      "epoch": 0.8177777777777778,
      "grad_norm": 8.771803855895996,
      "learning_rate": 0.00011850533807829183,
      "loss": 33.1058,
      "step": 230
    },
    {
      "epoch": 0.8533333333333334,
      "grad_norm": 10.16543960571289,
      "learning_rate": 0.00011494661921708185,
      "loss": 33.3706,
      "step": 240
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 9.286821365356445,
      "learning_rate": 0.0001113879003558719,
      "loss": 33.3456,
      "step": 250
    },
    {
      "epoch": 0.9244444444444444,
      "grad_norm": 9.520956039428711,
      "learning_rate": 0.00010782918149466192,
      "loss": 33.5781,
      "step": 260
    },
    {
      "epoch": 0.96,
      "grad_norm": 10.376456260681152,
      "learning_rate": 0.00010427046263345198,
      "loss": 32.9687,
      "step": 270
    },
    {
      "epoch": 0.9955555555555555,
      "grad_norm": 8.36178207397461,
      "learning_rate": 0.00010071174377224199,
      "loss": 33.7239,
      "step": 280
    },
    {
      "epoch": 1.0284444444444445,
      "grad_norm": 10.113052368164062,
      "learning_rate": 9.715302491103203e-05,
      "loss": 29.9997,
      "step": 290
    },
    {
      "epoch": 1.064,
      "grad_norm": 11.123631477355957,
      "learning_rate": 9.359430604982207e-05,
      "loss": 32.5004,
      "step": 300
    },
    {
      "epoch": 1.064,
      "eval_loss": 2.122236490249634,
      "eval_runtime": 296.127,
      "eval_samples_per_second": 3.377,
      "eval_steps_per_second": 0.422,
      "step": 300
    },
    {
      "epoch": 1.0995555555555556,
      "grad_norm": 9.897551536560059,
      "learning_rate": 9.00355871886121e-05,
      "loss": 32.5046,
      "step": 310
    },
    {
      "epoch": 1.1351111111111112,
      "grad_norm": 9.53073501586914,
      "learning_rate": 8.647686832740213e-05,
      "loss": 32.2727,
      "step": 320
    },
    {
      "epoch": 1.1706666666666667,
      "grad_norm": 10.394311904907227,
      "learning_rate": 8.291814946619217e-05,
      "loss": 32.688,
      "step": 330
    },
    {
      "epoch": 1.2062222222222223,
      "grad_norm": 9.498970031738281,
      "learning_rate": 7.935943060498221e-05,
      "loss": 33.6316,
      "step": 340
    },
    {
      "epoch": 1.2417777777777779,
      "grad_norm": 10.150975227355957,
      "learning_rate": 7.580071174377225e-05,
      "loss": 33.0713,
      "step": 350
    },
    {
      "epoch": 1.2773333333333334,
      "grad_norm": 9.899177551269531,
      "learning_rate": 7.224199288256229e-05,
      "loss": 32.4769,
      "step": 360
    },
    {
      "epoch": 1.3128888888888888,
      "grad_norm": 9.39831829071045,
      "learning_rate": 6.868327402135231e-05,
      "loss": 32.2654,
      "step": 370
    },
    {
      "epoch": 1.3484444444444446,
      "grad_norm": 10.761151313781738,
      "learning_rate": 6.512455516014235e-05,
      "loss": 32.491,
      "step": 380
    },
    {
      "epoch": 1.384,
      "grad_norm": 9.932414054870605,
      "learning_rate": 6.156583629893239e-05,
      "loss": 33.5308,
      "step": 390
    },
    {
      "epoch": 1.4195555555555557,
      "grad_norm": 11.054327011108398,
      "learning_rate": 5.8007117437722425e-05,
      "loss": 31.7061,
      "step": 400
    },
    {
      "epoch": 1.4195555555555557,
      "eval_loss": 2.120673418045044,
      "eval_runtime": 296.1092,
      "eval_samples_per_second": 3.377,
      "eval_steps_per_second": 0.422,
      "step": 400
    },
    {
      "epoch": 1.455111111111111,
      "grad_norm": 10.89476203918457,
      "learning_rate": 5.4448398576512464e-05,
      "loss": 32.485,
      "step": 410
    },
    {
      "epoch": 1.4906666666666666,
      "grad_norm": 9.823376655578613,
      "learning_rate": 5.0889679715302496e-05,
      "loss": 32.9951,
      "step": 420
    },
    {
      "epoch": 1.5262222222222221,
      "grad_norm": 11.316079139709473,
      "learning_rate": 4.733096085409253e-05,
      "loss": 32.3443,
      "step": 430
    },
    {
      "epoch": 1.561777777777778,
      "grad_norm": 11.608524322509766,
      "learning_rate": 4.377224199288256e-05,
      "loss": 32.2948,
      "step": 440
    },
    {
      "epoch": 1.5973333333333333,
      "grad_norm": 11.020298957824707,
      "learning_rate": 4.02135231316726e-05,
      "loss": 32.6702,
      "step": 450
    },
    {
      "epoch": 1.6328888888888888,
      "grad_norm": 9.804555892944336,
      "learning_rate": 3.665480427046263e-05,
      "loss": 31.6452,
      "step": 460
    },
    {
      "epoch": 1.6684444444444444,
      "grad_norm": 11.037073135375977,
      "learning_rate": 3.309608540925267e-05,
      "loss": 32.479,
      "step": 470
    },
    {
      "epoch": 1.704,
      "grad_norm": 9.837021827697754,
      "learning_rate": 2.9537366548042704e-05,
      "loss": 32.72,
      "step": 480
    },
    {
      "epoch": 1.7395555555555555,
      "grad_norm": 11.720721244812012,
      "learning_rate": 2.597864768683274e-05,
      "loss": 32.6789,
      "step": 490
    },
    {
      "epoch": 1.775111111111111,
      "grad_norm": 11.738125801086426,
      "learning_rate": 2.2419928825622775e-05,
      "loss": 33.3128,
      "step": 500
    },
    {
      "epoch": 1.775111111111111,
      "eval_loss": 2.11881947517395,
      "eval_runtime": 296.1216,
      "eval_samples_per_second": 3.377,
      "eval_steps_per_second": 0.422,
      "step": 500
    },
    {
      "epoch": 1.8106666666666666,
      "grad_norm": 11.249613761901855,
      "learning_rate": 1.8861209964412814e-05,
      "loss": 31.9298,
      "step": 510
    },
    {
      "epoch": 1.8462222222222222,
      "grad_norm": 11.530637741088867,
      "learning_rate": 1.530249110320285e-05,
      "loss": 31.8878,
      "step": 520
    },
    {
      "epoch": 1.8817777777777778,
      "grad_norm": 11.147592544555664,
      "learning_rate": 1.1743772241992882e-05,
      "loss": 32.6852,
      "step": 530
    },
    {
      "epoch": 1.9173333333333333,
      "grad_norm": 9.81916332244873,
      "learning_rate": 8.185053380782918e-06,
      "loss": 32.1578,
      "step": 540
    },
    {
      "epoch": 1.952888888888889,
      "grad_norm": 10.557317733764648,
      "learning_rate": 4.626334519572954e-06,
      "loss": 32.2151,
      "step": 550
    },
    {
      "epoch": 1.9884444444444445,
      "grad_norm": 10.493524551391602,
      "learning_rate": 1.0676156583629894e-06,
      "loss": 31.9549,
      "step": 560
    }
  ],
  "logging_steps": 10,
  "max_steps": 562,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.93400073703424e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}