{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2, "global_step": 14, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14545454545454545, "grad_norm": 5.989531517028809, "learning_rate": 0.0001, "loss": 2.8214, "mean_token_accuracy": 0.7949551716446877, "num_tokens": 18590.0, "step": 2 }, { "epoch": 0.14545454545454545, "eval_loss": 1.2452431917190552, "eval_mean_token_accuracy": 0.6979593506881169, "eval_num_tokens": 18590.0, "eval_runtime": 10.9414, "eval_samples_per_second": 5.027, "eval_steps_per_second": 1.28, "step": 2 }, { "epoch": 0.2909090909090909, "grad_norm": 4.987532615661621, "learning_rate": 0.00019983081582712685, "loss": 2.4811, "mean_token_accuracy": 0.8189418539404869, "num_tokens": 37141.0, "step": 4 }, { "epoch": 0.2909090909090909, "eval_loss": 1.3113545179367065, "eval_mean_token_accuracy": 0.6982744761875698, "eval_num_tokens": 37141.0, "eval_runtime": 10.3505, "eval_samples_per_second": 5.314, "eval_steps_per_second": 1.353, "step": 4 }, { "epoch": 0.43636363636363634, "grad_norm": 6.911726474761963, "learning_rate": 0.00019848077530122083, "loss": 2.3469, "mean_token_accuracy": 0.8197802603244781, "num_tokens": 55750.0, "step": 6 }, { "epoch": 0.43636363636363634, "eval_loss": 1.3598123788833618, "eval_mean_token_accuracy": 0.7003392151423863, "eval_num_tokens": 55750.0, "eval_runtime": 10.3223, "eval_samples_per_second": 5.328, "eval_steps_per_second": 1.356, "step": 6 }, { "epoch": 0.5818181818181818, "grad_norm": 7.113568305969238, "learning_rate": 0.0001957989512315489, "loss": 2.5459, "mean_token_accuracy": 0.8071999177336693, "num_tokens": 74464.0, "step": 8 }, { "epoch": 0.5818181818181818, "eval_loss": 1.330183744430542, "eval_mean_token_accuracy": 0.7020843710218158, "eval_num_tokens": 74464.0, "eval_runtime": 10.3053, "eval_samples_per_second": 5.337, "eval_steps_per_second": 1.359, "step": 8 }, { "epoch": 0.7272727272727273, "grad_norm": 6.972678184509277, "learning_rate": 0.00019182161068802741, "loss": 2.7286, "mean_token_accuracy": 0.799242340028286, "num_tokens": 93229.0, "step": 10 }, { "epoch": 0.7272727272727273, "eval_loss": 1.2857917547225952, "eval_mean_token_accuracy": 0.7042312707219806, "eval_num_tokens": 93229.0, "eval_runtime": 10.3146, "eval_samples_per_second": 5.332, "eval_steps_per_second": 1.357, "step": 10 }, { "epoch": 0.8727272727272727, "grad_norm": 7.350575923919678, "learning_rate": 0.00018660254037844388, "loss": 2.6974, "mean_token_accuracy": 0.8019056767225266, "num_tokens": 111785.0, "step": 12 }, { "epoch": 0.8727272727272727, "eval_loss": 1.267033338546753, "eval_mean_token_accuracy": 0.7043451964855194, "eval_num_tokens": 111785.0, "eval_runtime": 10.3056, "eval_samples_per_second": 5.337, "eval_steps_per_second": 1.358, "step": 12 }, { "epoch": 1.0, "grad_norm": 6.314274311065674, "learning_rate": 0.0001802123192755044, "loss": 2.51, "mean_token_accuracy": 0.7953777994428363, "num_tokens": 126994.0, "step": 14 }, { "epoch": 1.0, "eval_loss": 1.2424124479293823, "eval_mean_token_accuracy": 0.7082162286554065, "eval_num_tokens": 126994.0, "eval_runtime": 10.3422, "eval_samples_per_second": 5.318, "eval_steps_per_second": 1.354, "step": 14 } ], "logging_steps": 2, "max_steps": 56, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3387742653033024.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }