{ "best_metric": 1.213085651397705, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.5913660555884093, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011827321111768185, "eval_loss": 1.9876823425292969, "eval_runtime": 21.5168, "eval_samples_per_second": 16.592, "eval_steps_per_second": 4.183, "step": 1 }, { "epoch": 0.011827321111768185, "grad_norm": 0.5498068928718567, "learning_rate": 4.12e-05, "loss": 1.9079, "step": 10 }, { "epoch": 0.02365464222353637, "grad_norm": 0.6487964987754822, "learning_rate": 8.24e-05, "loss": 1.7271, "step": 20 }, { "epoch": 0.03548196333530455, "grad_norm": 0.6142700910568237, "learning_rate": 0.0001236, "loss": 1.5528, "step": 30 }, { "epoch": 0.04730928444707274, "grad_norm": 0.7988840341567993, "learning_rate": 0.0001648, "loss": 1.473, "step": 40 }, { "epoch": 0.05913660555884092, "grad_norm": 0.8337881565093994, "learning_rate": 0.000206, "loss": 1.4293, "step": 50 }, { "epoch": 0.05913660555884092, "eval_loss": 1.4428672790527344, "eval_runtime": 21.352, "eval_samples_per_second": 16.72, "eval_steps_per_second": 4.215, "step": 50 }, { "epoch": 0.0709639266706091, "grad_norm": 0.5612180829048157, "learning_rate": 0.0002057490971767619, "loss": 1.5017, "step": 60 }, { "epoch": 0.08279124778237729, "grad_norm": 0.610313892364502, "learning_rate": 0.00020499761108038175, "loss": 1.409, "step": 70 }, { "epoch": 0.09461856889414548, "grad_norm": 0.5548046827316284, "learning_rate": 0.00020374920287558198, "loss": 1.3791, "step": 80 }, { "epoch": 0.10644589000591366, "grad_norm": 0.5886740684509277, "learning_rate": 0.00020200995468164684, "loss": 1.3314, "step": 90 }, { "epoch": 0.11827321111768184, "grad_norm": 0.8145401477813721, "learning_rate": 0.00019978833994094855, "loss": 1.3111, "step": 100 }, { "epoch": 0.11827321111768184, "eval_loss": 1.3855363130569458, "eval_runtime": 21.2879, "eval_samples_per_second": 16.77, "eval_steps_per_second": 4.228, "step": 100 }, { "epoch": 0.13010053222945003, "grad_norm": 0.5458974242210388, "learning_rate": 0.00019709518213718787, "loss": 1.4332, "step": 110 }, { "epoch": 0.1419278533412182, "grad_norm": 0.6682223677635193, "learning_rate": 0.00019394360206446948, "loss": 1.4034, "step": 120 }, { "epoch": 0.1537551744529864, "grad_norm": 0.601813018321991, "learning_rate": 0.00019034895390411186, "loss": 1.2799, "step": 130 }, { "epoch": 0.16558249556475457, "grad_norm": 0.5965726971626282, "learning_rate": 0.0001863287504206196, "loss": 1.3391, "step": 140 }, { "epoch": 0.17740981667652278, "grad_norm": 0.7983888387680054, "learning_rate": 0.00018190257764125471, "loss": 1.313, "step": 150 }, { "epoch": 0.17740981667652278, "eval_loss": 1.349453330039978, "eval_runtime": 21.445, "eval_samples_per_second": 16.647, "eval_steps_per_second": 4.197, "step": 150 }, { "epoch": 0.18923713778829096, "grad_norm": 0.5388337969779968, "learning_rate": 0.00017709199943488106, "loss": 1.3871, "step": 160 }, { "epoch": 0.20106445890005914, "grad_norm": 0.5469598174095154, "learning_rate": 0.00017192045245496238, "loss": 1.3436, "step": 170 }, { "epoch": 0.21289178001182732, "grad_norm": 0.5442042350769043, "learning_rate": 0.00016641313195854277, "loss": 1.281, "step": 180 }, { "epoch": 0.2247191011235955, "grad_norm": 0.5752059817314148, "learning_rate": 0.0001605968690574869, "loss": 1.2313, "step": 190 }, { "epoch": 0.23654642223536368, "grad_norm": 0.7629579305648804, "learning_rate": 
0.0001545, "loss": 1.2525, "step": 200 }, { "epoch": 0.23654642223536368, "eval_loss": 1.3042761087417603, "eval_runtime": 21.0813, "eval_samples_per_second": 16.934, "eval_steps_per_second": 4.269, "step": 200 }, { "epoch": 0.2483737433471319, "grad_norm": 0.517902672290802, "learning_rate": 0.00014815222811927496, "loss": 1.3525, "step": 210 }, { "epoch": 0.26020106445890007, "grad_norm": 0.5861254334449768, "learning_rate": 0.00014158447912183896, "loss": 1.3479, "step": 220 }, { "epoch": 0.2720283855706682, "grad_norm": 0.5790980458259583, "learning_rate": 0.00013482875042061958, "loss": 1.2344, "step": 230 }, { "epoch": 0.2838557066824364, "grad_norm": 0.5689371228218079, "learning_rate": 0.00012791795524676576, "loss": 1.1846, "step": 240 }, { "epoch": 0.29568302779420463, "grad_norm": 1.021458625793457, "learning_rate": 0.00012088576229969385, "loss": 1.2891, "step": 250 }, { "epoch": 0.29568302779420463, "eval_loss": 1.2808476686477661, "eval_runtime": 21.2719, "eval_samples_per_second": 16.783, "eval_steps_per_second": 4.231, "step": 250 }, { "epoch": 0.3075103489059728, "grad_norm": 0.4944184720516205, "learning_rate": 0.0001137664317165683, "loss": 1.3615, "step": 260 }, { "epoch": 0.319337670017741, "grad_norm": 0.531223475933075, "learning_rate": 0.00010659464816035761, "loss": 1.2744, "step": 270 }, { "epoch": 0.33116499112950915, "grad_norm": 0.4596438705921173, "learning_rate": 9.940535183964242e-05, "loss": 1.232, "step": 280 }, { "epoch": 0.34299231224127735, "grad_norm": 0.5282168984413147, "learning_rate": 9.22335682834317e-05, "loss": 1.243, "step": 290 }, { "epoch": 0.35481963335304556, "grad_norm": 0.773091197013855, "learning_rate": 8.511423770030617e-05, "loss": 1.1971, "step": 300 }, { "epoch": 0.35481963335304556, "eval_loss": 1.2491676807403564, "eval_runtime": 21.4459, "eval_samples_per_second": 16.647, "eval_steps_per_second": 4.197, "step": 300 }, { "epoch": 0.3666469544648137, "grad_norm": 0.47350648045539856, "learning_rate": 7.808204475323423e-05, "loss": 1.309, "step": 310 }, { "epoch": 0.3784742755765819, "grad_norm": 0.5329492688179016, "learning_rate": 7.117124957938042e-05, "loss": 1.2375, "step": 320 }, { "epoch": 0.39030159668835007, "grad_norm": 0.571625828742981, "learning_rate": 6.441552087816105e-05, "loss": 1.2416, "step": 330 }, { "epoch": 0.4021289178001183, "grad_norm": 0.5600525140762329, "learning_rate": 5.784777188072502e-05, "loss": 1.2067, "step": 340 }, { "epoch": 0.41395623891188643, "grad_norm": 0.8236201405525208, "learning_rate": 5.150000000000002e-05, "loss": 1.2022, "step": 350 }, { "epoch": 0.41395623891188643, "eval_loss": 1.2360426187515259, "eval_runtime": 21.2849, "eval_samples_per_second": 16.772, "eval_steps_per_second": 4.228, "step": 350 }, { "epoch": 0.42578356002365464, "grad_norm": 0.6204054355621338, "learning_rate": 4.540313094251309e-05, "loss": 1.2624, "step": 360 }, { "epoch": 0.43761088113542285, "grad_norm": 0.49718907475471497, "learning_rate": 3.958686804145719e-05, "loss": 1.2795, "step": 370 }, { "epoch": 0.449438202247191, "grad_norm": 0.4726181626319885, "learning_rate": 3.4079547545037634e-05, "loss": 1.2434, "step": 380 }, { "epoch": 0.4612655233589592, "grad_norm": 0.5426561236381531, "learning_rate": 2.8908000565118947e-05, "loss": 1.1879, "step": 390 }, { "epoch": 0.47309284447072736, "grad_norm": 0.7943010330200195, "learning_rate": 2.4097422358745275e-05, "loss": 1.2145, "step": 400 }, { "epoch": 0.47309284447072736, "eval_loss": 1.2227905988693237, "eval_runtime": 21.2641, 
"eval_samples_per_second": 16.789, "eval_steps_per_second": 4.232, "step": 400 }, { "epoch": 0.48492016558249557, "grad_norm": 0.5392206311225891, "learning_rate": 1.9671249579380422e-05, "loss": 1.29, "step": 410 }, { "epoch": 0.4967474866942638, "grad_norm": 0.5592563152313232, "learning_rate": 1.5651046095888127e-05, "loss": 1.2821, "step": 420 }, { "epoch": 0.5085748078060319, "grad_norm": 0.5508091449737549, "learning_rate": 1.205639793553052e-05, "loss": 1.1917, "step": 430 }, { "epoch": 0.5204021289178001, "grad_norm": 0.6197274327278137, "learning_rate": 8.904817862812098e-06, "loss": 1.2411, "step": 440 }, { "epoch": 0.5322294500295683, "grad_norm": 0.7078971266746521, "learning_rate": 6.211660059051443e-06, "loss": 1.2405, "step": 450 }, { "epoch": 0.5322294500295683, "eval_loss": 1.2150496244430542, "eval_runtime": 21.4227, "eval_samples_per_second": 16.665, "eval_steps_per_second": 4.201, "step": 450 }, { "epoch": 0.5440567711413364, "grad_norm": 0.6016212701797485, "learning_rate": 3.990045318353154e-06, "loss": 1.3333, "step": 460 }, { "epoch": 0.5558840922531046, "grad_norm": 0.5523563623428345, "learning_rate": 2.250797124418014e-06, "loss": 1.1967, "step": 470 }, { "epoch": 0.5677114133648729, "grad_norm": 0.48314350843429565, "learning_rate": 1.0023889196182526e-06, "loss": 1.1624, "step": 480 }, { "epoch": 0.5795387344766411, "grad_norm": 0.5281088352203369, "learning_rate": 2.5090282323810766e-07, "loss": 1.2216, "step": 490 }, { "epoch": 0.5913660555884093, "grad_norm": 0.6948708295822144, "learning_rate": 0.0, "loss": 1.226, "step": 500 }, { "epoch": 0.5913660555884093, "eval_loss": 1.213085651397705, "eval_runtime": 21.0865, "eval_samples_per_second": 16.93, "eval_steps_per_second": 4.268, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.548518135123149e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }