{ "best_metric": 0.0007766146445646882, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 1.0, "eval_steps": 50, "global_step": 463, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021598272138228943, "eval_loss": 0.11870575696229935, "eval_runtime": 28.391, "eval_samples_per_second": 6.868, "eval_steps_per_second": 1.726, "step": 1 }, { "epoch": 0.02159827213822894, "grad_norm": 0.6378040313720703, "learning_rate": 4.08e-05, "loss": 0.1198, "step": 10 }, { "epoch": 0.04319654427645788, "grad_norm": 0.3366822600364685, "learning_rate": 8.16e-05, "loss": 0.0585, "step": 20 }, { "epoch": 0.06479481641468683, "grad_norm": 1.0418509244918823, "learning_rate": 0.0001224, "loss": 0.0195, "step": 30 }, { "epoch": 0.08639308855291576, "grad_norm": 0.7536942362785339, "learning_rate": 0.0001632, "loss": 0.0173, "step": 40 }, { "epoch": 0.1079913606911447, "grad_norm": 2.1272497177124023, "learning_rate": 0.000204, "loss": 0.0183, "step": 50 }, { "epoch": 0.1079913606911447, "eval_loss": 0.011936341412365437, "eval_runtime": 28.1854, "eval_samples_per_second": 6.918, "eval_steps_per_second": 1.738, "step": 50 }, { "epoch": 0.12958963282937366, "grad_norm": 0.5203682780265808, "learning_rate": 0.00020370504185350093, "loss": 0.0316, "step": 60 }, { "epoch": 0.1511879049676026, "grad_norm": 0.8338865041732788, "learning_rate": 0.00020282187330239947, "loss": 0.0168, "step": 70 }, { "epoch": 0.17278617710583152, "grad_norm": 0.63815838098526, "learning_rate": 0.0002013556021458894, "loss": 0.0222, "step": 80 }, { "epoch": 0.19438444924406048, "grad_norm": 0.11844746768474579, "learning_rate": 0.00019931470855304105, "loss": 0.0154, "step": 90 }, { "epoch": 0.2159827213822894, "grad_norm": 4.035724639892578, "learning_rate": 0.00019671099601780242, "loss": 0.016, "step": 100 }, { "epoch": 0.2159827213822894, "eval_loss": 0.02815873548388481, "eval_runtime": 28.4968, "eval_samples_per_second": 6.843, "eval_steps_per_second": 1.719, "step": 100 }, { "epoch": 0.23758099352051837, "grad_norm": 0.14545699954032898, "learning_rate": 0.00019355952309357377, "loss": 0.0326, "step": 110 }, { "epoch": 0.2591792656587473, "grad_norm": 1.333457350730896, "learning_rate": 0.00018987851630216821, "loss": 0.0127, "step": 120 }, { "epoch": 0.28077753779697623, "grad_norm": 0.19191227853298187, "learning_rate": 0.0001856892647208494, "loss": 0.0164, "step": 130 }, { "epoch": 0.3023758099352052, "grad_norm": 0.6895551085472107, "learning_rate": 0.00018101599685710056, "loss": 0.0143, "step": 140 }, { "epoch": 0.32397408207343414, "grad_norm": 0.06516972184181213, "learning_rate": 0.00017588574052321935, "loss": 0.0108, "step": 150 }, { "epoch": 0.32397408207343414, "eval_loss": 0.007597790565341711, "eval_runtime": 28.5719, "eval_samples_per_second": 6.825, "eval_steps_per_second": 1.715, "step": 150 }, { "epoch": 0.34557235421166305, "grad_norm": 0.01516179833561182, "learning_rate": 0.0001703281665211531, "loss": 0.0107, "step": 160 }, { "epoch": 0.367170626349892, "grad_norm": 3.366034507751465, "learning_rate": 0.00016437541704162093, "loss": 0.0119, "step": 170 }, { "epoch": 0.38876889848812096, "grad_norm": 0.2688906192779541, "learning_rate": 0.00015806191976997693, "loss": 0.0033, "step": 180 }, { "epoch": 0.4103671706263499, "grad_norm": 0.06040094047784805, "learning_rate": 0.00015142418877393032, "loss": 0.0028, "step": 190 }, { "epoch": 0.4319654427645788, "grad_norm": 0.10505006462335587, "learning_rate": 0.00014450061332468804, "loss": 0.0051, "step": 200 }, { "epoch": 0.4319654427645788, "eval_loss": 0.016910606995224953, "eval_runtime": 28.5258, "eval_samples_per_second": 6.836, "eval_steps_per_second": 1.718, "step": 200 }, { "epoch": 0.4535637149028078, "grad_norm": 0.40670064091682434, "learning_rate": 0.0001373312358728708, "loss": 0.0122, "step": 210 }, { "epoch": 0.47516198704103674, "grad_norm": 1.2365318536758423, "learning_rate": 0.00012995752046327736, "loss": 0.0043, "step": 220 }, { "epoch": 0.49676025917926564, "grad_norm": 0.05308518186211586, "learning_rate": 0.0001224221129278691, "loss": 0.0041, "step": 230 }, { "epoch": 0.5183585313174947, "grad_norm": 0.05913418531417847, "learning_rate": 0.00011476859424389611, "loss": 0.0024, "step": 240 }, { "epoch": 0.5399568034557235, "grad_norm": 0.029558856040239334, "learning_rate": 0.00010704122848361768, "loss": 0.0031, "step": 250 }, { "epoch": 0.5399568034557235, "eval_loss": 0.0030939881689846516, "eval_runtime": 28.2074, "eval_samples_per_second": 6.913, "eval_steps_per_second": 1.737, "step": 250 }, { "epoch": 0.5615550755939525, "grad_norm": 0.13922685384750366, "learning_rate": 9.928470681334698e-05, "loss": 0.0058, "step": 260 }, { "epoch": 0.5831533477321814, "grad_norm": 0.06830060482025146, "learning_rate": 9.154388902240052e-05, "loss": 0.0033, "step": 270 }, { "epoch": 0.6047516198704104, "grad_norm": 0.024560509249567986, "learning_rate": 8.386354407681761e-05, "loss": 0.0019, "step": 280 }, { "epoch": 0.6263498920086393, "grad_norm": 0.0010884521761909127, "learning_rate": 7.62880911983556e-05, "loss": 0.0004, "step": 290 }, { "epoch": 0.6479481641468683, "grad_norm": 0.04456557705998421, "learning_rate": 6.886134296622857e-05, "loss": 0.0003, "step": 300 }, { "epoch": 0.6479481641468683, "eval_loss": 0.001991268480196595, "eval_runtime": 28.47, "eval_samples_per_second": 6.849, "eval_steps_per_second": 1.721, "step": 300 }, { "epoch": 0.6695464362850972, "grad_norm": 0.012475444003939629, "learning_rate": 6.16262519273594e-05, "loss": 0.0013, "step": 310 }, { "epoch": 0.6911447084233261, "grad_norm": 0.19239909946918488, "learning_rate": 5.4624662180622114e-05, "loss": 0.0039, "step": 320 }, { "epoch": 0.712742980561555, "grad_norm": 0.025792984291911125, "learning_rate": 4.7897067371786276e-05, "loss": 0.0023, "step": 330 }, { "epoch": 0.734341252699784, "grad_norm": 0.013253681361675262, "learning_rate": 4.148237649879712e-05, "loss": 0.0021, "step": 340 }, { "epoch": 0.755939524838013, "grad_norm": 0.0363602451980114, "learning_rate": 3.541768888185587e-05, "loss": 0.0029, "step": 350 }, { "epoch": 0.755939524838013, "eval_loss": 0.0015392228960990906, "eval_runtime": 28.3248, "eval_samples_per_second": 6.884, "eval_steps_per_second": 1.73, "step": 350 }, { "epoch": 0.7775377969762419, "grad_norm": 0.004180555697530508, "learning_rate": 2.973807959975846e-05, "loss": 0.004, "step": 360 }, { "epoch": 0.7991360691144709, "grad_norm": 0.009106243029236794, "learning_rate": 2.447639663342025e-05, "loss": 0.0021, "step": 370 }, { "epoch": 0.8207343412526998, "grad_norm": 0.011521597392857075, "learning_rate": 1.9663070889806247e-05, "loss": 0.0054, "step": 380 }, { "epoch": 0.8423326133909287, "grad_norm": 0.0030731502920389175, "learning_rate": 1.5325940204991634e-05, "loss": 0.0011, "step": 390 }, { "epoch": 0.8639308855291576, "grad_norm": 0.04495791345834732, "learning_rate": 1.1490088344229713e-05, "loss": 0.0023, "step": 400 }, { "epoch": 0.8639308855291576, "eval_loss": 0.0008633602410554886, "eval_runtime": 28.3009, "eval_samples_per_second": 6.89, "eval_steps_per_second": 1.731, "step": 400 }, { "epoch": 0.8855291576673866, "grad_norm": 0.04856608808040619, "learning_rate": 8.177699930169379e-06, "loss": 0.0023, "step": 410 }, { "epoch": 0.9071274298056156, "grad_norm": 0.004636272322386503, "learning_rate": 5.4079321382439e-06, "loss": 0.0015, "step": 420 }, { "epoch": 0.9287257019438445, "grad_norm": 0.12631860375404358, "learning_rate": 3.1968039012795417e-06, "loss": 0.0005, "step": 430 }, { "epoch": 0.9503239740820735, "grad_norm": 0.0010080524953082204, "learning_rate": 1.5571032641086151e-06, "loss": 0.0015, "step": 440 }, { "epoch": 0.9719222462203023, "grad_norm": 0.02956795133650303, "learning_rate": 4.983134240007501e-07, "loss": 0.0003, "step": 450 }, { "epoch": 0.9719222462203023, "eval_loss": 0.0007766146445646882, "eval_runtime": 28.3411, "eval_samples_per_second": 6.88, "eval_steps_per_second": 1.729, "step": 450 }, { "epoch": 0.9935205183585313, "grad_norm": 0.060769353061914444, "learning_rate": 2.6557884657352427e-08, "loss": 0.0049, "step": 460 } ], "logging_steps": 10, "max_steps": 463, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3904763809103872e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }