| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 125, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 1.0864604711532593, | |
| "learning_rate": 1.5384615384615387e-05, | |
| "loss": 0.4927, | |
| "mean_token_accuracy": 0.8949072062969208, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.8317855596542358, | |
| "learning_rate": 7.692307692307693e-05, | |
| "loss": 0.5321, | |
| "mean_token_accuracy": 0.8870031237602234, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.35360249876976013, | |
| "learning_rate": 0.00015384615384615385, | |
| "loss": 0.4318, | |
| "mean_token_accuracy": 0.8946158409118652, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.26475989818573, | |
| "learning_rate": 0.00019984268150178167, | |
| "loss": 0.3243, | |
| "mean_token_accuracy": 0.9152041494846344, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.21459443867206573, | |
| "learning_rate": 0.00019807852804032305, | |
| "loss": 0.2103, | |
| "mean_token_accuracy": 0.941685950756073, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.15868614614009857, | |
| "learning_rate": 0.00019438833303083678, | |
| "loss": 0.1155, | |
| "mean_token_accuracy": 0.9637313485145569, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.15498687326908112, | |
| "learning_rate": 0.00018884456359788724, | |
| "loss": 0.0816, | |
| "mean_token_accuracy": 0.9711011052131653, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.11740818619728088, | |
| "learning_rate": 0.00018155608689592604, | |
| "loss": 0.0622, | |
| "mean_token_accuracy": 0.9760455787181854, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.10971173644065857, | |
| "learning_rate": 0.0001726660322034027, | |
| "loss": 0.0545, | |
| "mean_token_accuracy": 0.9776363372802734, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.08847042918205261, | |
| "learning_rate": 0.00016234898018587337, | |
| "loss": 0.0519, | |
| "mean_token_accuracy": 0.977908480167389, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.09057971835136414, | |
| "learning_rate": 0.00015080753452465296, | |
| "loss": 0.0497, | |
| "mean_token_accuracy": 0.9789802670478821, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.08262317627668381, | |
| "learning_rate": 0.000138268343236509, | |
| "loss": 0.0434, | |
| "mean_token_accuracy": 0.9816033959388732, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.08489084988832474, | |
| "learning_rate": 0.0001249776478167227, | |
| "loss": 0.043, | |
| "mean_token_accuracy": 0.9817807137966156, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.07146206498146057, | |
| "learning_rate": 0.00011119644761033078, | |
| "loss": 0.0397, | |
| "mean_token_accuracy": 0.9835689246654511, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.08493078500032425, | |
| "learning_rate": 9.719537437241312e-05, | |
| "loss": 0.0432, | |
| "mean_token_accuracy": 0.981769073009491, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.06853792816400528, | |
| "learning_rate": 8.324937766952638e-05, | |
| "loss": 0.0416, | |
| "mean_token_accuracy": 0.9825034320354462, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.07850378751754761, | |
| "learning_rate": 6.963232548903853e-05, | |
| "loss": 0.0416, | |
| "mean_token_accuracy": 0.9826254367828369, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.10064064711332321, | |
| "learning_rate": 5.6611626088244194e-05, | |
| "loss": 0.0411, | |
| "mean_token_accuracy": 0.9827791035175324, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.07430274784564972, | |
| "learning_rate": 4.444297669803981e-05, | |
| "loss": 0.0432, | |
| "mean_token_accuracy": 0.9819111526012421, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.05630122497677803, | |
| "learning_rate": 3.336534220479961e-05, | |
| "loss": 0.0381, | |
| "mean_token_accuracy": 0.984023529291153, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.07422107458114624, | |
| "learning_rate": 2.3596262417839255e-05, | |
| "loss": 0.041, | |
| "mean_token_accuracy": 0.9828759372234345, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.06742699444293976, | |
| "learning_rate": 1.5327580077171587e-05, | |
| "loss": 0.0435, | |
| "mean_token_accuracy": 0.9813436925411224, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.07175164669752121, | |
| "learning_rate": 8.72167349386811e-06, | |
| "loss": 0.0406, | |
| "mean_token_accuracy": 0.9831586062908173, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.06535231322050095, | |
| "learning_rate": 3.908267805490051e-06, | |
| "loss": 0.0411, | |
| "mean_token_accuracy": 0.9826524317264557, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.07355163991451263, | |
| "learning_rate": 9.818874663554357e-07, | |
| "loss": 0.0407, | |
| "mean_token_accuracy": 0.9827761054039001, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.07261276245117188, | |
| "learning_rate": 0.0, | |
| "loss": 0.04, | |
| "mean_token_accuracy": 0.9830402076244354, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.0398690365254879, | |
| "eval_mean_token_accuracy": 0.9831665050983429, | |
| "eval_runtime": 167.2529, | |
| "eval_samples_per_second": 11.958, | |
| "eval_steps_per_second": 1.495, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 125, | |
| "total_flos": 4505964279496704.0, | |
| "train_loss": 0.1011103401184082, | |
| "train_runtime": 450.8333, | |
| "train_samples_per_second": 2.218, | |
| "train_steps_per_second": 0.277 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 125, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4505964279496704.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |