{ "best_metric": 0.6936941742897034, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.0357347055460263, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.14694110920526e-05, "eval_loss": 1.1522774696350098, "eval_runtime": 271.4524, "eval_samples_per_second": 21.705, "eval_steps_per_second": 5.426, "step": 1 }, { "epoch": 0.000714694110920526, "grad_norm": 0.6790405511856079, "learning_rate": 4.0600000000000004e-05, "loss": 1.8005, "step": 10 }, { "epoch": 0.001429388221841052, "grad_norm": 0.8249218463897705, "learning_rate": 8.120000000000001e-05, "loss": 1.7599, "step": 20 }, { "epoch": 0.002144082332761578, "grad_norm": 0.9275533556938171, "learning_rate": 0.00012179999999999999, "loss": 1.7436, "step": 30 }, { "epoch": 0.002858776443682104, "grad_norm": 1.2592201232910156, "learning_rate": 0.00016240000000000002, "loss": 1.9385, "step": 40 }, { "epoch": 0.00357347055460263, "grad_norm": 2.3246257305145264, "learning_rate": 0.000203, "loss": 2.1042, "step": 50 }, { "epoch": 0.00357347055460263, "eval_loss": 0.9425861835479736, "eval_runtime": 271.9521, "eval_samples_per_second": 21.666, "eval_steps_per_second": 5.416, "step": 50 }, { "epoch": 0.004288164665523156, "grad_norm": 0.5499325394630432, "learning_rate": 0.00020275275110137215, "loss": 1.6205, "step": 60 }, { "epoch": 0.005002858776443682, "grad_norm": 0.657888650894165, "learning_rate": 0.00020201220897726938, "loss": 1.5361, "step": 70 }, { "epoch": 0.005717552887364208, "grad_norm": 0.7335652112960815, "learning_rate": 0.00020078198147448128, "loss": 1.6822, "step": 80 }, { "epoch": 0.006432246998284734, "grad_norm": 1.0221036672592163, "learning_rate": 0.00019906806213773937, "loss": 1.7223, "step": 90 }, { "epoch": 0.00714694110920526, "grad_norm": 2.625779151916504, "learning_rate": 0.0001968788010097697, "loss": 1.8717, "step": 100 }, { "epoch": 0.00714694110920526, "eval_loss": 0.8531848788261414, "eval_runtime": 271.6576, "eval_samples_per_second": 21.689, "eval_steps_per_second": 5.422, "step": 100 }, { "epoch": 0.007861635220125786, "grad_norm": 0.5337308049201965, "learning_rate": 0.00019422486395072398, "loss": 1.5742, "step": 110 }, { "epoch": 0.008576329331046312, "grad_norm": 0.7009676694869995, "learning_rate": 0.0001911191806751811, "loss": 1.6358, "step": 120 }, { "epoch": 0.009291023441966839, "grad_norm": 0.66788649559021, "learning_rate": 0.00018757688175987723, "loss": 1.6009, "step": 130 }, { "epoch": 0.010005717552887363, "grad_norm": 1.010290265083313, "learning_rate": 0.00018361522492905716, "loss": 1.638, "step": 140 }, { "epoch": 0.01072041166380789, "grad_norm": 2.071993350982666, "learning_rate": 0.00017925351097657625, "loss": 1.6186, "step": 150 }, { "epoch": 0.01072041166380789, "eval_loss": 0.8275316953659058, "eval_runtime": 271.5389, "eval_samples_per_second": 21.699, "eval_steps_per_second": 5.425, "step": 150 }, { "epoch": 0.011435105774728416, "grad_norm": 0.48222827911376953, "learning_rate": 0.00017451298973437308, "loss": 1.3411, "step": 160 }, { "epoch": 0.012149799885648942, "grad_norm": 0.6215487122535706, "learning_rate": 0.0001694167565454241, "loss": 1.4259, "step": 170 }, { "epoch": 0.012864493996569469, "grad_norm": 0.8803397417068481, "learning_rate": 0.0001639896397455543, "loss": 1.4885, "step": 180 }, { "epoch": 0.013579188107489995, "grad_norm": 0.899418294429779, "learning_rate": 0.0001582580797022808, "loss": 1.7306, "step": 190 }, { "epoch": 0.01429388221841052, "grad_norm": 1.875759482383728, "learning_rate": 0.00015225, "loss": 1.7291, "step": 200 }, { "epoch": 0.01429388221841052, "eval_loss": 0.7944059371948242, "eval_runtime": 271.637, "eval_samples_per_second": 21.691, "eval_steps_per_second": 5.423, "step": 200 }, { "epoch": 0.015008576329331046, "grad_norm": 0.5201715230941772, "learning_rate": 0.00014599467139909136, "loss": 1.4731, "step": 210 }, { "epoch": 0.015723270440251572, "grad_norm": 0.6637020707130432, "learning_rate": 0.0001395225692317151, "loss": 1.4816, "step": 220 }, { "epoch": 0.016437964551172097, "grad_norm": 0.7978131175041199, "learning_rate": 0.00013286522492905717, "loss": 1.5038, "step": 230 }, { "epoch": 0.017152658662092625, "grad_norm": 0.9858562350273132, "learning_rate": 0.00012605507240336626, "loss": 1.6433, "step": 240 }, { "epoch": 0.01786735277301315, "grad_norm": 1.3631006479263306, "learning_rate": 0.00011912529003319345, "loss": 1.6774, "step": 250 }, { "epoch": 0.01786735277301315, "eval_loss": 0.7708129286766052, "eval_runtime": 272.7688, "eval_samples_per_second": 21.601, "eval_steps_per_second": 5.4, "step": 250 }, { "epoch": 0.018582046883933678, "grad_norm": 0.5159099698066711, "learning_rate": 0.00011210963902166683, "loss": 1.4165, "step": 260 }, { "epoch": 0.019296740994854202, "grad_norm": 0.6102124452590942, "learning_rate": 0.00010504229891530386, "loss": 1.3505, "step": 270 }, { "epoch": 0.020011435105774727, "grad_norm": 0.8650922179222107, "learning_rate": 9.795770108469618e-05, "loss": 1.513, "step": 280 }, { "epoch": 0.020726129216695255, "grad_norm": 1.0572717189788818, "learning_rate": 9.08903609783332e-05, "loss": 1.6069, "step": 290 }, { "epoch": 0.02144082332761578, "grad_norm": 1.9492979049682617, "learning_rate": 8.387470996680658e-05, "loss": 1.6611, "step": 300 }, { "epoch": 0.02144082332761578, "eval_loss": 0.7363295555114746, "eval_runtime": 271.6884, "eval_samples_per_second": 21.687, "eval_steps_per_second": 5.422, "step": 300 }, { "epoch": 0.022155517438536308, "grad_norm": 0.5079029202461243, "learning_rate": 7.694492759663374e-05, "loss": 1.3944, "step": 310 }, { "epoch": 0.022870211549456832, "grad_norm": 0.6057290434837341, "learning_rate": 7.013477507094284e-05, "loss": 1.3385, "step": 320 }, { "epoch": 0.02358490566037736, "grad_norm": 0.831353485584259, "learning_rate": 6.347743076828492e-05, "loss": 1.4873, "step": 330 }, { "epoch": 0.024299599771297885, "grad_norm": 0.8882721066474915, "learning_rate": 5.700532860090863e-05, "loss": 1.5831, "step": 340 }, { "epoch": 0.02501429388221841, "grad_norm": 1.5294054746627808, "learning_rate": 5.075000000000002e-05, "loss": 1.5363, "step": 350 }, { "epoch": 0.02501429388221841, "eval_loss": 0.7126619219779968, "eval_runtime": 271.693, "eval_samples_per_second": 21.686, "eval_steps_per_second": 5.422, "step": 350 }, { "epoch": 0.025728987993138937, "grad_norm": 0.5537640452384949, "learning_rate": 4.4741920297719214e-05, "loss": 1.3678, "step": 360 }, { "epoch": 0.026443682104059462, "grad_norm": 0.552107572555542, "learning_rate": 3.901036025444568e-05, "loss": 1.383, "step": 370 }, { "epoch": 0.02715837621497999, "grad_norm": 0.8851740956306458, "learning_rate": 3.358324345457592e-05, "loss": 1.4735, "step": 380 }, { "epoch": 0.027873070325900515, "grad_norm": 0.9548201560974121, "learning_rate": 2.8487010265626928e-05, "loss": 1.4667, "step": 390 }, { "epoch": 0.02858776443682104, "grad_norm": 1.7331346273422241, "learning_rate": 2.3746489023423744e-05, "loss": 1.5478, "step": 400 }, { "epoch": 0.02858776443682104, "eval_loss": 0.7001160979270935, "eval_runtime": 271.7977, "eval_samples_per_second": 21.678, "eval_steps_per_second": 5.419, "step": 400 }, { "epoch": 0.029302458547741567, "grad_norm": 0.5737258791923523, "learning_rate": 1.9384775070942844e-05, "loss": 1.3097, "step": 410 }, { "epoch": 0.030017152658662092, "grad_norm": 0.623741626739502, "learning_rate": 1.5423118240122765e-05, "loss": 1.2598, "step": 420 }, { "epoch": 0.03073184676958262, "grad_norm": 0.7869819402694702, "learning_rate": 1.188081932481891e-05, "loss": 1.4332, "step": 430 }, { "epoch": 0.031446540880503145, "grad_norm": 0.976081371307373, "learning_rate": 8.775136049276001e-06, "loss": 1.4363, "step": 440 }, { "epoch": 0.03216123499142367, "grad_norm": 1.7860438823699951, "learning_rate": 6.121198990230306e-06, "loss": 1.555, "step": 450 }, { "epoch": 0.03216123499142367, "eval_loss": 0.6946857571601868, "eval_runtime": 271.7817, "eval_samples_per_second": 21.679, "eval_steps_per_second": 5.42, "step": 450 }, { "epoch": 0.032875929102344194, "grad_norm": 0.5253943204879761, "learning_rate": 3.931937862260632e-06, "loss": 1.1963, "step": 460 }, { "epoch": 0.033590623213264725, "grad_norm": 0.5864456295967102, "learning_rate": 2.2180185255187225e-06, "loss": 1.3404, "step": 470 }, { "epoch": 0.03430531732418525, "grad_norm": 0.8669192790985107, "learning_rate": 9.877910227306082e-07, "loss": 1.4906, "step": 480 }, { "epoch": 0.035020011435105775, "grad_norm": 0.977800726890564, "learning_rate": 2.472488986278439e-07, "loss": 1.4202, "step": 490 }, { "epoch": 0.0357347055460263, "grad_norm": 1.68795907497406, "learning_rate": 0.0, "loss": 1.6401, "step": 500 }, { "epoch": 0.0357347055460263, "eval_loss": 0.6936941742897034, "eval_runtime": 273.0352, "eval_samples_per_second": 21.58, "eval_steps_per_second": 5.395, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.66313582395392e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }