{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 20, "global_step": 481, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04162330905306972, "grad_norm": 20.466537475585938, "learning_rate": 4.957081545064378e-05, "loss": 8.8593, "mean_token_accuracy": 0.6828776024281978, "num_tokens": 161548.0, "step": 20 }, { "epoch": 0.04162330905306972, "eval_loss": 1.1315333843231201, "eval_mean_token_accuracy": 0.8237503719329834, "eval_num_tokens": 161548.0, "eval_runtime": 34.7122, "eval_samples_per_second": 2.881, "eval_steps_per_second": 0.72, "step": 20 }, { "epoch": 0.08324661810613944, "grad_norm": 2.744436025619507, "learning_rate": 4.742489270386266e-05, "loss": 1.8553, "mean_token_accuracy": 0.9392111219465733, "num_tokens": 323168.0, "step": 40 }, { "epoch": 0.08324661810613944, "eval_loss": 0.16864576935768127, "eval_mean_token_accuracy": 0.9810383248329163, "eval_num_tokens": 323168.0, "eval_runtime": 34.3084, "eval_samples_per_second": 2.915, "eval_steps_per_second": 0.729, "step": 40 }, { "epoch": 0.12486992715920915, "grad_norm": 1.6022675037384033, "learning_rate": 4.527896995708155e-05, "loss": 0.3903, "mean_token_accuracy": 0.9888123281300067, "num_tokens": 484780.0, "step": 60 }, { "epoch": 0.12486992715920915, "eval_loss": 0.03530249744653702, "eval_mean_token_accuracy": 0.9926152086257934, "eval_num_tokens": 484780.0, "eval_runtime": 34.7758, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.719, "step": 60 }, { "epoch": 0.16649323621227888, "grad_norm": 1.2044650316238403, "learning_rate": 4.313304721030043e-05, "loss": 0.0664, "mean_token_accuracy": 0.9934882044792175, "num_tokens": 646431.0, "step": 80 }, { "epoch": 0.16649323621227888, "eval_loss": 0.012011010199785233, "eval_mean_token_accuracy": 0.9936227130889893, "eval_num_tokens": 646431.0, "eval_runtime": 34.3329, "eval_samples_per_second": 2.913, "eval_steps_per_second": 0.728, "step": 80 }, { "epoch": 0.2081165452653486, "grad_norm": 0.4138229191303253, "learning_rate": 4.098712446351932e-05, "loss": 0.0513, "mean_token_accuracy": 0.994041533768177, "num_tokens": 808054.0, "step": 100 }, { "epoch": 0.2081165452653486, "eval_loss": 0.011702906340360641, "eval_mean_token_accuracy": 0.9942703366279602, "eval_num_tokens": 808054.0, "eval_runtime": 34.3225, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 100 }, { "epoch": 0.2497398543184183, "grad_norm": 0.8489187955856323, "learning_rate": 3.88412017167382e-05, "loss": 0.0416, "mean_token_accuracy": 0.9943524189293385, "num_tokens": 969623.0, "step": 120 }, { "epoch": 0.2497398543184183, "eval_loss": 0.010666043497622013, "eval_mean_token_accuracy": 0.9943506979942321, "eval_num_tokens": 969623.0, "eval_runtime": 34.3761, "eval_samples_per_second": 2.909, "eval_steps_per_second": 0.727, "step": 120 }, { "epoch": 0.29136316337148804, "grad_norm": 0.49626249074935913, "learning_rate": 3.669527896995708e-05, "loss": 0.0384, "mean_token_accuracy": 0.9943184182047844, "num_tokens": 1131222.0, "step": 140 }, { "epoch": 0.29136316337148804, "eval_loss": 0.009807135909795761, "eval_mean_token_accuracy": 0.9946736145019531, "eval_num_tokens": 1131222.0, "eval_runtime": 34.34, "eval_samples_per_second": 2.912, "eval_steps_per_second": 0.728, "step": 140 }, { "epoch": 0.33298647242455776, "grad_norm": 0.5716305375099182, "learning_rate": 3.454935622317597e-05, "loss": 0.0384, "mean_token_accuracy": 0.9946457795798779, "num_tokens": 1292839.0, "step": 160 }, { "epoch": 0.33298647242455776, "eval_loss": 0.009297176264226437, "eval_mean_token_accuracy": 0.9946328043937683, "eval_num_tokens": 1292839.0, "eval_runtime": 34.557, "eval_samples_per_second": 2.894, "eval_steps_per_second": 0.723, "step": 160 }, { "epoch": 0.37460978147762747, "grad_norm": 0.4264802932739258, "learning_rate": 3.240343347639485e-05, "loss": 0.0373, "mean_token_accuracy": 0.9946053452789784, "num_tokens": 1454417.0, "step": 180 }, { "epoch": 0.37460978147762747, "eval_loss": 0.009250417351722717, "eval_mean_token_accuracy": 0.9949553918838501, "eval_num_tokens": 1454417.0, "eval_runtime": 34.3425, "eval_samples_per_second": 2.912, "eval_steps_per_second": 0.728, "step": 180 }, { "epoch": 0.4162330905306972, "grad_norm": 0.3315845727920532, "learning_rate": 3.0257510729613737e-05, "loss": 0.0326, "mean_token_accuracy": 0.9949214711785317, "num_tokens": 1615982.0, "step": 200 }, { "epoch": 0.4162330905306972, "eval_loss": 0.00862209778279066, "eval_mean_token_accuracy": 0.994794466495514, "eval_num_tokens": 1615982.0, "eval_runtime": 34.317, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.729, "step": 200 }, { "epoch": 0.4578563995837669, "grad_norm": 0.3415542542934418, "learning_rate": 2.811158798283262e-05, "loss": 0.0328, "mean_token_accuracy": 0.9948225237429142, "num_tokens": 1777586.0, "step": 220 }, { "epoch": 0.4578563995837669, "eval_loss": 0.008552273735404015, "eval_mean_token_accuracy": 0.9952386736869812, "eval_num_tokens": 1777586.0, "eval_runtime": 34.357, "eval_samples_per_second": 2.911, "eval_steps_per_second": 0.728, "step": 220 }, { "epoch": 0.4994797086368366, "grad_norm": 0.5801106691360474, "learning_rate": 2.59656652360515e-05, "loss": 0.0317, "mean_token_accuracy": 0.9949369013309479, "num_tokens": 1939204.0, "step": 240 }, { "epoch": 0.4994797086368366, "eval_loss": 0.008411003276705742, "eval_mean_token_accuracy": 0.9951176619529725, "eval_num_tokens": 1939204.0, "eval_runtime": 34.3383, "eval_samples_per_second": 2.912, "eval_steps_per_second": 0.728, "step": 240 }, { "epoch": 0.5411030176899063, "grad_norm": 0.3876211941242218, "learning_rate": 2.3819742489270388e-05, "loss": 0.0322, "mean_token_accuracy": 0.9952259331941604, "num_tokens": 2100821.0, "step": 260 }, { "epoch": 0.5411030176899063, "eval_loss": 0.008060808293521404, "eval_mean_token_accuracy": 0.9952384281158447, "eval_num_tokens": 2100821.0, "eval_runtime": 34.5381, "eval_samples_per_second": 2.895, "eval_steps_per_second": 0.724, "step": 260 }, { "epoch": 0.5827263267429761, "grad_norm": 0.3304857015609741, "learning_rate": 2.1673819742489272e-05, "loss": 0.0326, "mean_token_accuracy": 0.9948949187994003, "num_tokens": 2262372.0, "step": 280 }, { "epoch": 0.5827263267429761, "eval_loss": 0.008030685596168041, "eval_mean_token_accuracy": 0.9953594398498535, "eval_num_tokens": 2262372.0, "eval_runtime": 34.2589, "eval_samples_per_second": 2.919, "eval_steps_per_second": 0.73, "step": 280 }, { "epoch": 0.6243496357960457, "grad_norm": 0.262820839881897, "learning_rate": 1.9527896995708157e-05, "loss": 0.0295, "mean_token_accuracy": 0.9954247616231442, "num_tokens": 2423931.0, "step": 300 }, { "epoch": 0.6243496357960457, "eval_loss": 0.0076558589935302734, "eval_mean_token_accuracy": 0.9952384281158447, "eval_num_tokens": 2423931.0, "eval_runtime": 34.2994, "eval_samples_per_second": 2.915, "eval_steps_per_second": 0.729, "step": 300 }, { "epoch": 0.6659729448491155, "grad_norm": 0.4573795199394226, "learning_rate": 1.7381974248927038e-05, "loss": 0.0308, "mean_token_accuracy": 0.9953766994178295, "num_tokens": 2585532.0, "step": 320 }, { "epoch": 0.6659729448491155, "eval_loss": 0.007896814495325089, "eval_mean_token_accuracy": 0.9952785062789917, "eval_num_tokens": 2585532.0, "eval_runtime": 34.2979, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 320 }, { "epoch": 0.7075962539021852, "grad_norm": 0.3572976887226105, "learning_rate": 1.5236051502145923e-05, "loss": 0.0311, "mean_token_accuracy": 0.9952577523887157, "num_tokens": 2747068.0, "step": 340 }, { "epoch": 0.7075962539021852, "eval_loss": 0.007795471698045731, "eval_mean_token_accuracy": 0.9952789568901061, "eval_num_tokens": 2747068.0, "eval_runtime": 34.3024, "eval_samples_per_second": 2.915, "eval_steps_per_second": 0.729, "step": 340 }, { "epoch": 0.7492195629552549, "grad_norm": 0.25351160764694214, "learning_rate": 1.3090128755364809e-05, "loss": 0.0302, "mean_token_accuracy": 0.9953413404524326, "num_tokens": 2908697.0, "step": 360 }, { "epoch": 0.7492195629552549, "eval_loss": 0.007526129484176636, "eval_mean_token_accuracy": 0.9951572942733765, "eval_num_tokens": 2908697.0, "eval_runtime": 34.4802, "eval_samples_per_second": 2.9, "eval_steps_per_second": 0.725, "step": 360 }, { "epoch": 0.7908428720083247, "grad_norm": 0.45476171374320984, "learning_rate": 1.0944206008583692e-05, "loss": 0.0301, "mean_token_accuracy": 0.9954766884446145, "num_tokens": 3070284.0, "step": 380 }, { "epoch": 0.7908428720083247, "eval_loss": 0.007462680339813232, "eval_mean_token_accuracy": 0.9952787923812866, "eval_num_tokens": 3070284.0, "eval_runtime": 34.2958, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 380 }, { "epoch": 0.8324661810613944, "grad_norm": 0.37078818678855896, "learning_rate": 8.798283261802575e-06, "loss": 0.0298, "mean_token_accuracy": 0.9952507764101028, "num_tokens": 3231892.0, "step": 400 }, { "epoch": 0.8324661810613944, "eval_loss": 0.0073690456338226795, "eval_mean_token_accuracy": 0.9953188729286194, "eval_num_tokens": 3231892.0, "eval_runtime": 34.3243, "eval_samples_per_second": 2.913, "eval_steps_per_second": 0.728, "step": 400 }, { "epoch": 0.8740894901144641, "grad_norm": 0.4091401696205139, "learning_rate": 6.65236051502146e-06, "loss": 0.0296, "mean_token_accuracy": 0.9953283965587616, "num_tokens": 3393525.0, "step": 420 }, { "epoch": 0.8740894901144641, "eval_loss": 0.007452231831848621, "eval_mean_token_accuracy": 0.9953588700294494, "eval_num_tokens": 3393525.0, "eval_runtime": 34.2775, "eval_samples_per_second": 2.917, "eval_steps_per_second": 0.729, "step": 420 }, { "epoch": 0.9157127991675338, "grad_norm": 0.2830151915550232, "learning_rate": 4.5064377682403434e-06, "loss": 0.0296, "mean_token_accuracy": 0.9951381236314774, "num_tokens": 3555132.0, "step": 440 }, { "epoch": 0.9157127991675338, "eval_loss": 0.0072263493202626705, "eval_mean_token_accuracy": 0.9954402089118958, "eval_num_tokens": 3555132.0, "eval_runtime": 34.3128, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.729, "step": 440 }, { "epoch": 0.9573361082206036, "grad_norm": 0.23723173141479492, "learning_rate": 2.3605150214592277e-06, "loss": 0.0294, "mean_token_accuracy": 0.9953089639544487, "num_tokens": 3716673.0, "step": 460 }, { "epoch": 0.9573361082206036, "eval_loss": 0.007136988453567028, "eval_mean_token_accuracy": 0.9953193616867065, "eval_num_tokens": 3716673.0, "eval_runtime": 34.5004, "eval_samples_per_second": 2.899, "eval_steps_per_second": 0.725, "step": 460 }, { "epoch": 0.9989594172736732, "grad_norm": 0.34731465578079224, "learning_rate": 2.145922746781116e-07, "loss": 0.0289, "mean_token_accuracy": 0.9953534014523029, "num_tokens": 3878323.0, "step": 480 }, { "epoch": 0.9989594172736732, "eval_loss": 0.007239112630486488, "eval_mean_token_accuracy": 0.99535968542099, "eval_num_tokens": 3878323.0, "eval_runtime": 34.2833, "eval_samples_per_second": 2.917, "eval_steps_per_second": 0.729, "step": 480 } ], "logging_steps": 20, "max_steps": 481, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.013575302866206e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }