|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.9809417040358746, |
|
"eval_steps": 500, |
|
"global_step": 712, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.14013452914798205, |
|
"grad_norm": 14.509355545043945, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 7.1534, |
|
"num_input_tokens_seen": 318240, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2802690582959641, |
|
"grad_norm": 6.642310619354248, |
|
"learning_rate": 6.533333333333334e-06, |
|
"loss": 1.9472, |
|
"num_input_tokens_seen": 636760, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4204035874439462, |
|
"grad_norm": 3.3247640132904053, |
|
"learning_rate": 9.866666666666668e-06, |
|
"loss": 1.5046, |
|
"num_input_tokens_seen": 956520, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5605381165919282, |
|
"grad_norm": 2.3405237197875977, |
|
"learning_rate": 1.3200000000000002e-05, |
|
"loss": 1.3971, |
|
"num_input_tokens_seen": 1275920, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7006726457399103, |
|
"grad_norm": 3.028830051422119, |
|
"learning_rate": 1.6533333333333333e-05, |
|
"loss": 1.3768, |
|
"num_input_tokens_seen": 1595280, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8408071748878924, |
|
"grad_norm": 1.428653597831726, |
|
"learning_rate": 1.9866666666666667e-05, |
|
"loss": 1.3278, |
|
"num_input_tokens_seen": 1914720, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9809417040358744, |
|
"grad_norm": 1.9624038934707642, |
|
"learning_rate": 1.9910139651840497e-05, |
|
"loss": 1.2927, |
|
"num_input_tokens_seen": 2233680, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.1177130044843049, |
|
"grad_norm": 1.6478056907653809, |
|
"learning_rate": 1.962720313575358e-05, |
|
"loss": 1.2029, |
|
"num_input_tokens_seen": 2544928, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.257847533632287, |
|
"grad_norm": 2.1640427112579346, |
|
"learning_rate": 1.915655103523529e-05, |
|
"loss": 1.2199, |
|
"num_input_tokens_seen": 2864008, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.397982062780269, |
|
"grad_norm": 2.563964605331421, |
|
"learning_rate": 1.8507360338956896e-05, |
|
"loss": 1.2112, |
|
"num_input_tokens_seen": 3183248, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5381165919282511, |
|
"grad_norm": 1.9103755950927734, |
|
"learning_rate": 1.7692289262315e-05, |
|
"loss": 1.221, |
|
"num_input_tokens_seen": 3502528, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.6782511210762332, |
|
"grad_norm": 2.564152240753174, |
|
"learning_rate": 1.6727230431791816e-05, |
|
"loss": 1.1758, |
|
"num_input_tokens_seen": 3821648, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.8183856502242153, |
|
"grad_norm": 1.57261061668396, |
|
"learning_rate": 1.563100100329731e-05, |
|
"loss": 1.1828, |
|
"num_input_tokens_seen": 4140008, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.9585201793721974, |
|
"grad_norm": 1.8220309019088745, |
|
"learning_rate": 1.442497575670668e-05, |
|
"loss": 1.1948, |
|
"num_input_tokens_seen": 4458448, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.0952914798206277, |
|
"grad_norm": 2.028656005859375, |
|
"learning_rate": 1.313267032068285e-05, |
|
"loss": 1.07, |
|
"num_input_tokens_seen": 4770136, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.2354260089686098, |
|
"grad_norm": 3.713899850845337, |
|
"learning_rate": 1.1779282654255685e-05, |
|
"loss": 1.0581, |
|
"num_input_tokens_seen": 5087896, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.375560538116592, |
|
"grad_norm": 2.000035285949707, |
|
"learning_rate": 1.0391201725558842e-05, |
|
"loss": 1.048, |
|
"num_input_tokens_seen": 5407296, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.515695067264574, |
|
"grad_norm": 1.5742937326431274, |
|
"learning_rate": 8.99549296772945e-06, |
|
"loss": 1.0218, |
|
"num_input_tokens_seen": 5726896, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.655829596412556, |
|
"grad_norm": 1.7964842319488525, |
|
"learning_rate": 7.619370544785608e-06, |
|
"loss": 1.0206, |
|
"num_input_tokens_seen": 6046856, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.795964125560538, |
|
"grad_norm": 1.5667575597763062, |
|
"learning_rate": 6.289666717481497e-06, |
|
"loss": 1.0277, |
|
"num_input_tokens_seen": 6366216, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.93609865470852, |
|
"grad_norm": 1.8910105228424072, |
|
"learning_rate": 5.032308655686011e-06, |
|
"loss": 1.0494, |
|
"num_input_tokens_seen": 6685296, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.0728699551569507, |
|
"grad_norm": 1.1843669414520264, |
|
"learning_rate": 3.8718128986350154e-06, |
|
"loss": 0.9487, |
|
"num_input_tokens_seen": 6996704, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.213004484304933, |
|
"grad_norm": 1.5657708644866943, |
|
"learning_rate": 2.8308073203011667e-06, |
|
"loss": 0.9103, |
|
"num_input_tokens_seen": 7316144, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.353139013452915, |
|
"grad_norm": 1.8331918716430664, |
|
"learning_rate": 1.929589920817806e-06, |
|
"loss": 0.9102, |
|
"num_input_tokens_seen": 7635704, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.493273542600897, |
|
"grad_norm": 1.651666283607483, |
|
"learning_rate": 1.1857330468424466e-06, |
|
"loss": 0.9034, |
|
"num_input_tokens_seen": 7955304, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 3.633408071748879, |
|
"grad_norm": 1.1031330823898315, |
|
"learning_rate": 6.137407579511212e-07, |
|
"loss": 0.8949, |
|
"num_input_tokens_seen": 8274744, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.773542600896861, |
|
"grad_norm": 3.233630895614624, |
|
"learning_rate": 2.2476601988947965e-07, |
|
"loss": 0.9073, |
|
"num_input_tokens_seen": 8594304, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 3.913677130044843, |
|
"grad_norm": 1.2947144508361816, |
|
"learning_rate": 2.639323897518975e-08, |
|
"loss": 0.8967, |
|
"num_input_tokens_seen": 8913944, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.9809417040358746, |
|
"num_input_tokens_seen": 9067184, |
|
"step": 712, |
|
"total_flos": 6.151178707859082e+17, |
|
"train_loss": 1.3522842801019046, |
|
"train_runtime": 6892.301, |
|
"train_samples_per_second": 2.588, |
|
"train_steps_per_second": 0.103 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 712, |
|
"num_input_tokens_seen": 9067184, |
|
"num_train_epochs": 4, |
|
"save_steps": 250, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.151178707859082e+17, |
|
"train_batch_size": 5, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|