|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9838187702265372, |
|
"eval_steps": 500, |
|
"global_step": 38, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.025889967637540454, |
|
"grad_norm": 14.94955365537486, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6643, |
|
"mean_token_accuracy": 0.8553724065423012, |
|
"num_tokens": 65536.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.05177993527508091, |
|
"grad_norm": 13.315623180611107, |
|
"learning_rate": 1.9473684210526318e-05, |
|
"loss": 0.452, |
|
"mean_token_accuracy": 0.8764078170061111, |
|
"num_tokens": 130612.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.07766990291262135, |
|
"grad_norm": 3.2684775267493738, |
|
"learning_rate": 1.894736842105263e-05, |
|
"loss": 0.3998, |
|
"mean_token_accuracy": 0.8837301507592201, |
|
"num_tokens": 196148.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.10355987055016182, |
|
"grad_norm": 3.648625597879628, |
|
"learning_rate": 1.8421052631578947e-05, |
|
"loss": 0.4149, |
|
"mean_token_accuracy": 0.8796855807304382, |
|
"num_tokens": 261684.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.12944983818770225, |
|
"grad_norm": 2.1735962344366744, |
|
"learning_rate": 1.7894736842105264e-05, |
|
"loss": 0.4029, |
|
"mean_token_accuracy": 0.8861111029982567, |
|
"num_tokens": 327220.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1553398058252427, |
|
"grad_norm": 1.2399813671584698, |
|
"learning_rate": 1.736842105263158e-05, |
|
"loss": 0.3874, |
|
"mean_token_accuracy": 0.8873015865683556, |
|
"num_tokens": 392756.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.18122977346278318, |
|
"grad_norm": 1.2840669625935126, |
|
"learning_rate": 1.6842105263157896e-05, |
|
"loss": 0.3925, |
|
"mean_token_accuracy": 0.8855463936924934, |
|
"num_tokens": 458292.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.20711974110032363, |
|
"grad_norm": 0.9666205702654993, |
|
"learning_rate": 1.6315789473684213e-05, |
|
"loss": 0.4035, |
|
"mean_token_accuracy": 0.8799450471997261, |
|
"num_tokens": 523828.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.23300970873786409, |
|
"grad_norm": 1.0109942942154606, |
|
"learning_rate": 1.578947368421053e-05, |
|
"loss": 0.434, |
|
"mean_token_accuracy": 0.8735195249319077, |
|
"num_tokens": 589364.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.2588996763754045, |
|
"grad_norm": 1.0413113875886446, |
|
"learning_rate": 1.5263157894736846e-05, |
|
"loss": 0.4191, |
|
"mean_token_accuracy": 0.8769078031182289, |
|
"num_tokens": 654900.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.284789644012945, |
|
"grad_norm": 1.0902203237712937, |
|
"learning_rate": 1.4736842105263159e-05, |
|
"loss": 0.4493, |
|
"mean_token_accuracy": 0.8699328452348709, |
|
"num_tokens": 720436.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.3106796116504854, |
|
"grad_norm": 0.8849467554405712, |
|
"learning_rate": 1.4210526315789475e-05, |
|
"loss": 0.3931, |
|
"mean_token_accuracy": 0.8852578699588776, |
|
"num_tokens": 785579.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.3365695792880259, |
|
"grad_norm": 1.035926324924405, |
|
"learning_rate": 1.3684210526315791e-05, |
|
"loss": 0.39, |
|
"mean_token_accuracy": 0.8831959590315819, |
|
"num_tokens": 851115.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.36245954692556637, |
|
"grad_norm": 1.0026322980151723, |
|
"learning_rate": 1.3157894736842108e-05, |
|
"loss": 0.4184, |
|
"mean_token_accuracy": 0.8782203868031502, |
|
"num_tokens": 916651.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.3883495145631068, |
|
"grad_norm": 1.0102817872020193, |
|
"learning_rate": 1.263157894736842e-05, |
|
"loss": 0.3938, |
|
"mean_token_accuracy": 0.8836233168840408, |
|
"num_tokens": 982187.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.41423948220064727, |
|
"grad_norm": 0.9944421404980388, |
|
"learning_rate": 1.2105263157894737e-05, |
|
"loss": 0.4384, |
|
"mean_token_accuracy": 0.8699938952922821, |
|
"num_tokens": 1047723.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.4401294498381877, |
|
"grad_norm": 0.8998162798325435, |
|
"learning_rate": 1.1578947368421053e-05, |
|
"loss": 0.3713, |
|
"mean_token_accuracy": 0.8903498351573944, |
|
"num_tokens": 1112544.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.46601941747572817, |
|
"grad_norm": 0.9087432992292579, |
|
"learning_rate": 1.105263157894737e-05, |
|
"loss": 0.4036, |
|
"mean_token_accuracy": 0.8820064589381218, |
|
"num_tokens": 1174265.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.4919093851132686, |
|
"grad_norm": 0.8697071517631878, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 0.3931, |
|
"mean_token_accuracy": 0.8835927844047546, |
|
"num_tokens": 1239801.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.517799352750809, |
|
"grad_norm": 0.8774544351292555, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4133, |
|
"mean_token_accuracy": 0.8790166154503822, |
|
"num_tokens": 1303377.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5436893203883495, |
|
"grad_norm": 0.994546197670681, |
|
"learning_rate": 9.473684210526315e-06, |
|
"loss": 0.4375, |
|
"mean_token_accuracy": 0.8726342990994453, |
|
"num_tokens": 1368913.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.56957928802589, |
|
"grad_norm": 0.9232673730081883, |
|
"learning_rate": 8.947368421052632e-06, |
|
"loss": 0.4048, |
|
"mean_token_accuracy": 0.8793192803859711, |
|
"num_tokens": 1434449.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.5954692556634305, |
|
"grad_norm": 1.0209861898624548, |
|
"learning_rate": 8.421052631578948e-06, |
|
"loss": 0.397, |
|
"mean_token_accuracy": 0.8836233168840408, |
|
"num_tokens": 1499985.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.6213592233009708, |
|
"grad_norm": 0.9046118944574179, |
|
"learning_rate": 7.894736842105265e-06, |
|
"loss": 0.4138, |
|
"mean_token_accuracy": 0.8774877861142159, |
|
"num_tokens": 1565521.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.6472491909385113, |
|
"grad_norm": 0.9351060177767945, |
|
"learning_rate": 7.368421052631579e-06, |
|
"loss": 0.3922, |
|
"mean_token_accuracy": 0.8843711838126183, |
|
"num_tokens": 1631057.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.6731391585760518, |
|
"grad_norm": 0.9167336580555596, |
|
"learning_rate": 6.842105263157896e-06, |
|
"loss": 0.377, |
|
"mean_token_accuracy": 0.8888125643134117, |
|
"num_tokens": 1696593.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.6990291262135923, |
|
"grad_norm": 0.8827610291370933, |
|
"learning_rate": 6.31578947368421e-06, |
|
"loss": 0.4382, |
|
"mean_token_accuracy": 0.8711233139038086, |
|
"num_tokens": 1762129.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.7249190938511327, |
|
"grad_norm": 0.8216978597200468, |
|
"learning_rate": 5.789473684210527e-06, |
|
"loss": 0.3978, |
|
"mean_token_accuracy": 0.8840201497077942, |
|
"num_tokens": 1827665.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.7508090614886731, |
|
"grad_norm": 0.7861852214672553, |
|
"learning_rate": 5.263157894736842e-06, |
|
"loss": 0.361, |
|
"mean_token_accuracy": 0.8924908339977264, |
|
"num_tokens": 1893201.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.7766990291262136, |
|
"grad_norm": 0.9358327841683868, |
|
"learning_rate": 4.736842105263158e-06, |
|
"loss": 0.3654, |
|
"mean_token_accuracy": 0.89267398416996, |
|
"num_tokens": 1958737.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.8025889967637541, |
|
"grad_norm": 0.7893903886446123, |
|
"learning_rate": 4.210526315789474e-06, |
|
"loss": 0.3375, |
|
"mean_token_accuracy": 0.8997863158583641, |
|
"num_tokens": 2024273.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.8284789644012945, |
|
"grad_norm": 0.8243141967647842, |
|
"learning_rate": 3.6842105263157896e-06, |
|
"loss": 0.3728, |
|
"mean_token_accuracy": 0.8913461416959763, |
|
"num_tokens": 2089809.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.8543689320388349, |
|
"grad_norm": 0.8246709048525032, |
|
"learning_rate": 3.157894736842105e-06, |
|
"loss": 0.3643, |
|
"mean_token_accuracy": 0.8915140256285667, |
|
"num_tokens": 2155345.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.8802588996763754, |
|
"grad_norm": 0.8260415283504307, |
|
"learning_rate": 2.631578947368421e-06, |
|
"loss": 0.3828, |
|
"mean_token_accuracy": 0.8868742287158966, |
|
"num_tokens": 2220881.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.9061488673139159, |
|
"grad_norm": 0.7747154448934308, |
|
"learning_rate": 2.105263157894737e-06, |
|
"loss": 0.3455, |
|
"mean_token_accuracy": 0.897390104830265, |
|
"num_tokens": 2286417.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.9320388349514563, |
|
"grad_norm": 0.8212045075297872, |
|
"learning_rate": 1.5789473684210526e-06, |
|
"loss": 0.4002, |
|
"mean_token_accuracy": 0.8831327557563782, |
|
"num_tokens": 2348817.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.9579288025889967, |
|
"grad_norm": 0.8299799490223582, |
|
"learning_rate": 1.0526315789473685e-06, |
|
"loss": 0.442, |
|
"mean_token_accuracy": 0.8720848485827446, |
|
"num_tokens": 2414353.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.9838187702265372, |
|
"grad_norm": 0.8097572023142755, |
|
"learning_rate": 5.263157894736843e-07, |
|
"loss": 0.3812, |
|
"mean_token_accuracy": 0.8855026215314865, |
|
"num_tokens": 2478526.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.9838187702265372, |
|
"step": 38, |
|
"total_flos": 4940351791104.0, |
|
"train_loss": 0.40646139257832575, |
|
"train_runtime": 867.7069, |
|
"train_samples_per_second": 0.711, |
|
"train_steps_per_second": 0.044 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 38, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4940351791104.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|