{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996968676321994, "eval_steps": 500, "global_step": 2226, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06736274840013473, "grad_norm": 182163.578125, "learning_rate": 1.955974842767296e-05, "loss": 0.9643, "step": 50 }, { "epoch": 0.13472549680026946, "grad_norm": 168414.515625, "learning_rate": 1.9110512129380053e-05, "loss": 0.8565, "step": 100 }, { "epoch": 0.20208824520040417, "grad_norm": 359631.59375, "learning_rate": 1.8661275831087154e-05, "loss": 0.8063, "step": 150 }, { "epoch": 0.2694509936005389, "grad_norm": 221002.4375, "learning_rate": 1.8212039532794252e-05, "loss": 0.7914, "step": 200 }, { "epoch": 0.33681374200067365, "grad_norm": 150741.375, "learning_rate": 1.776280323450135e-05, "loss": 0.7781, "step": 250 }, { "epoch": 0.40417649040080833, "grad_norm": 264707.5, "learning_rate": 1.7313566936208447e-05, "loss": 0.789, "step": 300 }, { "epoch": 0.47153923880094306, "grad_norm": 211475.046875, "learning_rate": 1.6864330637915545e-05, "loss": 0.7845, "step": 350 }, { "epoch": 0.5389019872010778, "grad_norm": 172460.0, "learning_rate": 1.6415094339622643e-05, "loss": 0.7485, "step": 400 }, { "epoch": 0.6062647356012125, "grad_norm": 189772.09375, "learning_rate": 1.596585804132974e-05, "loss": 0.7623, "step": 450 }, { "epoch": 0.6736274840013473, "grad_norm": 182620.875, "learning_rate": 1.5516621743036838e-05, "loss": 0.7543, "step": 500 }, { "epoch": 0.740990232401482, "grad_norm": 177907.078125, "learning_rate": 1.5067385444743936e-05, "loss": 0.7368, "step": 550 }, { "epoch": 0.8083529808016167, "grad_norm": 146371.859375, "learning_rate": 1.4618149146451035e-05, "loss": 0.7309, "step": 600 }, { "epoch": 0.8757157292017514, "grad_norm": 154067.765625, "learning_rate": 1.4168912848158131e-05, "loss": 0.7303, "step": 650 }, { "epoch": 0.9430784776018861, "grad_norm": 166902.390625, "learning_rate": 1.371967654986523e-05, "loss": 0.7491, "step": 700 }, { "epoch": 1.0, "eval_bleu": 31.6726, "eval_gen_len": 37.4648, "eval_loss": 0.6205606460571289, "eval_runtime": 634.4197, "eval_samples_per_second": 3.941, "eval_steps_per_second": 0.247, "step": 743 }, { "epoch": 1.0094307847760189, "grad_norm": 138230.65625, "learning_rate": 1.3270440251572328e-05, "loss": 0.679, "step": 750 }, { "epoch": 1.0767935331761536, "grad_norm": 172958.375, "learning_rate": 1.2821203953279426e-05, "loss": 0.6253, "step": 800 }, { "epoch": 1.1441562815762882, "grad_norm": 153910.15625, "learning_rate": 1.2371967654986523e-05, "loss": 0.6214, "step": 850 }, { "epoch": 1.2115190299764231, "grad_norm": 157281.734375, "learning_rate": 1.1922731356693623e-05, "loss": 0.6509, "step": 900 }, { "epoch": 1.2788817783765578, "grad_norm": 160740.234375, "learning_rate": 1.1473495058400719e-05, "loss": 0.6283, "step": 950 }, { "epoch": 1.3462445267766925, "grad_norm": 172679.90625, "learning_rate": 1.1024258760107818e-05, "loss": 0.6503, "step": 1000 }, { "epoch": 1.4136072751768272, "grad_norm": 154958.9375, "learning_rate": 1.0575022461814914e-05, "loss": 0.6278, "step": 1050 }, { "epoch": 1.4809700235769618, "grad_norm": 207510.6875, "learning_rate": 1.0125786163522013e-05, "loss": 0.6305, "step": 1100 }, { "epoch": 1.5483327719770967, "grad_norm": 174465.65625, "learning_rate": 9.676549865229111e-06, "loss": 0.6234, "step": 1150 }, { "epoch": 1.6156955203772314, "grad_norm": 137803.953125, "learning_rate": 9.227313566936209e-06, "loss": 
0.619, "step": 1200 }, { "epoch": 1.683058268777366, "grad_norm": 155922.09375, "learning_rate": 8.778077268643306e-06, "loss": 0.6271, "step": 1250 }, { "epoch": 1.750421017177501, "grad_norm": 144567.671875, "learning_rate": 8.328840970350404e-06, "loss": 0.616, "step": 1300 }, { "epoch": 1.8177837655776354, "grad_norm": 158599.484375, "learning_rate": 7.879604672057503e-06, "loss": 0.6253, "step": 1350 }, { "epoch": 1.8851465139777703, "grad_norm": 172037.515625, "learning_rate": 7.4303683737646e-06, "loss": 0.6334, "step": 1400 }, { "epoch": 1.952509262377905, "grad_norm": 174420.90625, "learning_rate": 6.981132075471699e-06, "loss": 0.6179, "step": 1450 }, { "epoch": 2.0, "eval_bleu": 29.6195, "eval_gen_len": 40.4748, "eval_loss": 0.598081111907959, "eval_runtime": 684.6129, "eval_samples_per_second": 3.652, "eval_steps_per_second": 0.229, "step": 1486 }, { "epoch": 2.0188615695520378, "grad_norm": 122954.3125, "learning_rate": 6.531895777178796e-06, "loss": 0.6083, "step": 1500 }, { "epoch": 2.0862243179521727, "grad_norm": 142081.5625, "learning_rate": 6.082659478885895e-06, "loss": 0.5866, "step": 1550 }, { "epoch": 2.153587066352307, "grad_norm": 134754.171875, "learning_rate": 5.6334231805929925e-06, "loss": 0.5507, "step": 1600 }, { "epoch": 2.220949814752442, "grad_norm": 174082.390625, "learning_rate": 5.184186882300091e-06, "loss": 0.5608, "step": 1650 }, { "epoch": 2.2883125631525765, "grad_norm": 139355.5, "learning_rate": 4.734950584007188e-06, "loss": 0.567, "step": 1700 }, { "epoch": 2.3556753115527114, "grad_norm": 181866.828125, "learning_rate": 4.2857142857142855e-06, "loss": 0.5493, "step": 1750 }, { "epoch": 2.4230380599528463, "grad_norm": 145802.578125, "learning_rate": 3.836477987421384e-06, "loss": 0.5613, "step": 1800 }, { "epoch": 2.4904008083529807, "grad_norm": 171856.1875, "learning_rate": 3.387241689128482e-06, "loss": 0.5826, "step": 1850 }, { "epoch": 2.5577635567531156, "grad_norm": 159035.875, "learning_rate": 2.9380053908355797e-06, "loss": 0.5721, "step": 1900 }, { "epoch": 2.62512630515325, "grad_norm": 148289.84375, "learning_rate": 2.488769092542678e-06, "loss": 0.5705, "step": 1950 }, { "epoch": 2.692489053553385, "grad_norm": 177999.140625, "learning_rate": 2.0395327942497755e-06, "loss": 0.5776, "step": 2000 }, { "epoch": 2.75985180195352, "grad_norm": 179621.078125, "learning_rate": 1.5902964959568734e-06, "loss": 0.5683, "step": 2050 }, { "epoch": 2.8272145503536543, "grad_norm": 125237.75, "learning_rate": 1.1410601976639714e-06, "loss": 0.5801, "step": 2100 }, { "epoch": 2.894577298753789, "grad_norm": 170392.578125, "learning_rate": 6.918238993710692e-07, "loss": 0.5681, "step": 2150 }, { "epoch": 2.9619400471539237, "grad_norm": 148443.96875, "learning_rate": 2.4258760107816715e-07, "loss": 0.5702, "step": 2200 } ], "logging_steps": 50, "max_steps": 2226, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2577417685696512.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }