|
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.99728014505893,
  "eval_steps": 500,
  "global_step": 880,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11332728921124206,
      "grad_norm": 12.256142616271973,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 6.768,
      "num_input_tokens_seen": 320000,
      "step": 25
    },
    {
      "epoch": 0.22665457842248413,
      "grad_norm": 14.015169143676758,
      "learning_rate": 6.533333333333334e-06,
      "loss": 2.2714,
      "num_input_tokens_seen": 640000,
      "step": 50
    },
    {
      "epoch": 0.3399818676337262,
      "grad_norm": 2.729668140411377,
      "learning_rate": 9.866666666666668e-06,
      "loss": 1.6663,
      "num_input_tokens_seen": 960000,
      "step": 75
    },
    {
      "epoch": 0.45330915684496825,
      "grad_norm": 2.5849289894104004,
      "learning_rate": 1.3200000000000002e-05,
      "loss": 1.5351,
      "num_input_tokens_seen": 1280000,
      "step": 100
    },
    {
      "epoch": 0.5666364460562103,
      "grad_norm": 2.5873868465423584,
      "learning_rate": 1.6533333333333333e-05,
      "loss": 1.4698,
      "num_input_tokens_seen": 1600000,
      "step": 125
    },
    {
      "epoch": 0.6799637352674524,
      "grad_norm": 1.8172498941421509,
      "learning_rate": 1.9866666666666667e-05,
      "loss": 1.4164,
      "num_input_tokens_seen": 1920000,
      "step": 150
    },
    {
      "epoch": 0.7932910244786945,
      "grad_norm": 3.378293037414551,
      "learning_rate": 1.994670819911521e-05,
      "loss": 1.3952,
      "num_input_tokens_seen": 2239840,
      "step": 175
    },
    {
      "epoch": 0.9066183136899365,
      "grad_norm": 1.4730700254440308,
      "learning_rate": 1.977848341505657e-05,
      "loss": 1.3792,
      "num_input_tokens_seen": 2559800,
      "step": 200
    },
    {
      "epoch": 1.0226654578422485,
      "grad_norm": 1.2745240926742554,
      "learning_rate": 1.949717842791432e-05,
      "loss": 1.3537,
      "num_input_tokens_seen": 2884920,
      "step": 225
    },
    {
      "epoch": 1.1359927470534905,
      "grad_norm": 1.7677239179611206,
      "learning_rate": 1.9106046300942165e-05,
      "loss": 1.2956,
      "num_input_tokens_seen": 3204920,
      "step": 250
    },
    {
      "epoch": 1.2493200362647325,
      "grad_norm": 1.9639768600463867,
      "learning_rate": 1.8609610158889943e-05,
      "loss": 1.2642,
      "num_input_tokens_seen": 3524920,
      "step": 275
    },
    {
      "epoch": 1.3626473254759746,
      "grad_norm": 1.738120436668396,
      "learning_rate": 1.8013610881746767e-05,
      "loss": 1.2527,
      "num_input_tokens_seen": 3844920,
      "step": 300
    },
    {
      "epoch": 1.4759746146872166,
      "grad_norm": 1.5475760698318481,
      "learning_rate": 1.732494071613579e-05,
      "loss": 1.2602,
      "num_input_tokens_seen": 4164920,
      "step": 325
    },
    {
      "epoch": 1.5893019038984586,
      "grad_norm": 1.3410508632659912,
      "learning_rate": 1.6551563572090855e-05,
      "loss": 1.2551,
      "num_input_tokens_seen": 4484840,
      "step": 350
    },
    {
      "epoch": 1.7026291931097008,
      "grad_norm": 1.6581236124038696,
      "learning_rate": 1.5702422926917872e-05,
      "loss": 1.2325,
      "num_input_tokens_seen": 4804840,
      "step": 375
    },
    {
      "epoch": 1.8159564823209429,
      "grad_norm": 1.8297406435012817,
      "learning_rate": 1.4787338401157888e-05,
      "loss": 1.2436,
      "num_input_tokens_seen": 5124840,
      "step": 400
    },
    {
      "epoch": 1.929283771532185,
      "grad_norm": 1.9106981754302979,
      "learning_rate": 1.3816892202666591e-05,
      "loss": 1.2319,
      "num_input_tokens_seen": 5444840,
      "step": 425
    },
    {
      "epoch": 2.045330915684497,
      "grad_norm": 1.4120745658874512,
      "learning_rate": 1.2802306751992163e-05,
      "loss": 1.2132,
      "num_input_tokens_seen": 5769800,
      "step": 450
    },
    {
      "epoch": 2.158658204895739,
      "grad_norm": 1.541704535484314,
      "learning_rate": 1.1755314904214284e-05,
      "loss": 1.0794,
      "num_input_tokens_seen": 6089800,
      "step": 475
    },
    {
      "epoch": 2.271985494106981,
      "grad_norm": 2.239482879638672,
      "learning_rate": 1.06880242680232e-05,
      "loss": 1.0779,
      "num_input_tokens_seen": 6409680,
      "step": 500
    },
    {
      "epoch": 2.385312783318223,
      "grad_norm": 2.1483075618743896,
      "learning_rate": 9.612777191078257e-06,
      "loss": 1.0722,
      "num_input_tokens_seen": 6729680,
      "step": 525
    },
    {
      "epoch": 2.498640072529465,
      "grad_norm": 1.631958246231079,
      "learning_rate": 8.542008030801254e-06,
      "loss": 1.0663,
      "num_input_tokens_seen": 7049680,
      "step": 550
    },
    {
      "epoch": 2.611967361740707,
      "grad_norm": 1.948183298110962,
      "learning_rate": 7.4880993611518095e-06,
      "loss": 1.056,
      "num_input_tokens_seen": 7369680,
      "step": 575
    },
    {
      "epoch": 2.725294650951949,
      "grad_norm": 1.8783904314041138,
      "learning_rate": 6.463238778236287e-06,
      "loss": 1.0578,
      "num_input_tokens_seen": 7689680,
      "step": 600
    },
    {
      "epoch": 2.838621940163191,
      "grad_norm": 2.3182311058044434,
      "learning_rate": 5.479277960676959e-06,
      "loss": 1.0531,
      "num_input_tokens_seen": 8009600,
      "step": 625
    },
    {
      "epoch": 2.951949229374433,
      "grad_norm": 2.483482837677002,
      "learning_rate": 4.547595614593489e-06,
      "loss": 1.0523,
      "num_input_tokens_seen": 8329600,
      "step": 650
    },
    {
      "epoch": 3.067996373526745,
      "grad_norm": 1.3066755533218384,
      "learning_rate": 3.6789658881265135e-06,
      "loss": 1.0127,
      "num_input_tokens_seen": 8654720,
      "step": 675
    },
    {
      "epoch": 3.1813236627379875,
      "grad_norm": 1.4423686265945435,
      "learning_rate": 2.883433777182255e-06,
      "loss": 0.9245,
      "num_input_tokens_seen": 8974520,
      "step": 700
    },
    {
      "epoch": 3.2946509519492295,
      "grad_norm": 2.5524277687072754,
      "learning_rate": 2.170198963229372e-06,
      "loss": 0.918,
      "num_input_tokens_seen": 9294520,
      "step": 725
    },
    {
      "epoch": 3.4079782411604715,
      "grad_norm": 1.343592882156372,
      "learning_rate": 1.547509426469368e-06,
      "loss": 0.9192,
      "num_input_tokens_seen": 9614520,
      "step": 750
    },
    {
      "epoch": 3.5213055303717136,
      "grad_norm": 1.9703131914138794,
      "learning_rate": 1.022566064657663e-06,
      "loss": 0.9177,
      "num_input_tokens_seen": 9934520,
      "step": 775
    },
    {
      "epoch": 3.6346328195829556,
      "grad_norm": 2.417506694793701,
      "learning_rate": 6.01439420581047e-07,
      "loss": 0.9152,
      "num_input_tokens_seen": 10254520,
      "step": 800
    },
    {
      "epoch": 3.7479601087941976,
      "grad_norm": 1.23152494430542,
      "learning_rate": 2.889994811704966e-07,
      "loss": 0.917,
      "num_input_tokens_seen": 10574520,
      "step": 825
    },
    {
      "epoch": 3.8612873980054396,
      "grad_norm": 1.5819039344787598,
      "learning_rate": 8.885936006545304e-08,
      "loss": 0.912,
      "num_input_tokens_seen": 10894520,
      "step": 850
    },
    {
      "epoch": 3.9746146872166817,
      "grad_norm": 1.4086824655532837,
      "learning_rate": 3.333514894887646e-09,
      "loss": 0.9239,
      "num_input_tokens_seen": 11214520,
      "step": 875
    },
    {
      "epoch": 3.99728014505893,
      "num_input_tokens_seen": 11278520,
      "step": 880,
      "total_flos": 7.651349314204147e+17,
      "train_loss": 1.351207665421746,
      "train_runtime": 8552.2085,
      "train_samples_per_second": 2.578,
      "train_steps_per_second": 0.103
    }
  ],
  "logging_steps": 25,
  "max_steps": 880,
  "num_input_tokens_seen": 11278520,
  "num_train_epochs": 4,
  "save_steps": 250,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.651349314204147e+17,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null
}