{ "best_metric": 0.1438828855752945, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.09550186228631459, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019100372457262916, "eval_loss": 1.2516456842422485, "eval_runtime": 177.6989, "eval_samples_per_second": 12.409, "eval_steps_per_second": 3.106, "step": 1 }, { "epoch": 0.0019100372457262916, "grad_norm": 1.0476511716842651, "learning_rate": 4.0400000000000006e-05, "loss": 1.153, "step": 10 }, { "epoch": 0.003820074491452583, "grad_norm": 1.070288896560669, "learning_rate": 8.080000000000001e-05, "loss": 1.119, "step": 20 }, { "epoch": 0.005730111737178875, "grad_norm": 1.223846197128296, "learning_rate": 0.00012119999999999999, "loss": 0.6905, "step": 30 }, { "epoch": 0.007640148982905166, "grad_norm": 0.6122403740882874, "learning_rate": 0.00016160000000000002, "loss": 0.4386, "step": 40 }, { "epoch": 0.009550186228631458, "grad_norm": 0.9764545559883118, "learning_rate": 0.000202, "loss": 0.4557, "step": 50 }, { "epoch": 0.009550186228631458, "eval_loss": 0.5318617820739746, "eval_runtime": 177.5974, "eval_samples_per_second": 12.416, "eval_steps_per_second": 3.108, "step": 50 }, { "epoch": 0.01146022347435775, "grad_norm": 0.5438581109046936, "learning_rate": 0.00020175396907624226, "loss": 0.511, "step": 60 }, { "epoch": 0.013370260720084041, "grad_norm": 0.4670880138874054, "learning_rate": 0.0002010170749428986, "loss": 0.3416, "step": 70 }, { "epoch": 0.015280297965810333, "grad_norm": 0.7363126873970032, "learning_rate": 0.00019979290767411438, "loss": 0.2788, "step": 80 }, { "epoch": 0.017190335211536626, "grad_norm": 0.521145224571228, "learning_rate": 0.0001980874312897702, "loss": 0.3133, "step": 90 }, { "epoch": 0.019100372457262916, "grad_norm": 1.2795084714889526, "learning_rate": 0.00019590895469937675, "loss": 0.268, "step": 100 }, { "epoch": 0.019100372457262916, "eval_loss": 0.3358194828033447, "eval_runtime": 177.6482, "eval_samples_per_second": 12.412, "eval_steps_per_second": 3.107, "step": 100 }, { "epoch": 0.02101040970298921, "grad_norm": 0.6757239699363708, "learning_rate": 0.0001932680912219027, "loss": 0.347, "step": 110 }, { "epoch": 0.0229204469487155, "grad_norm": 0.5455737709999084, "learning_rate": 0.00019017770687875164, "loss": 0.281, "step": 120 }, { "epoch": 0.024830484194441792, "grad_norm": 0.5992138981819153, "learning_rate": 0.000186652857711799, "loss": 0.2512, "step": 130 }, { "epoch": 0.026740521440168082, "grad_norm": 0.49668246507644653, "learning_rate": 0.00018271071643186968, "loss": 0.1711, "step": 140 }, { "epoch": 0.028650558685894376, "grad_norm": 0.8957052826881409, "learning_rate": 0.00017837048875501678, "loss": 0.1674, "step": 150 }, { "epoch": 0.028650558685894376, "eval_loss": 0.27206987142562866, "eval_runtime": 177.7253, "eval_samples_per_second": 12.407, "eval_steps_per_second": 3.106, "step": 150 }, { "epoch": 0.030560595931620665, "grad_norm": 0.586548924446106, "learning_rate": 0.00017365331983420376, "loss": 0.3344, "step": 160 }, { "epoch": 0.032470633177346955, "grad_norm": 0.4355241060256958, "learning_rate": 0.0001685821912422447, "loss": 0.2535, "step": 170 }, { "epoch": 0.03438067042307325, "grad_norm": 0.6418226957321167, "learning_rate": 0.00016318180900789148, "loss": 0.2191, "step": 180 }, { "epoch": 0.03629070766879954, "grad_norm": 0.4115605354309082, "learning_rate": 0.00015747848325054544, "loss": 0.2409, "step": 190 }, { "epoch": 0.03820074491452583, "grad_norm": 0.4212948679924011, "learning_rate": 0.0001515, "loss": 0.1163, "step": 200 }, { "epoch": 0.03820074491452583, "eval_loss": 0.2181142419576645, "eval_runtime": 177.4423, "eval_samples_per_second": 12.427, "eval_steps_per_second": 3.111, "step": 200 }, { "epoch": 0.04011078216025212, "grad_norm": 0.6589187383651733, "learning_rate": 0.00014527548582569683, "loss": 0.3187, "step": 210 }, { "epoch": 0.04202081940597842, "grad_norm": 0.40305858850479126, "learning_rate": 0.00013883526593500714, "loss": 0.2596, "step": 220 }, { "epoch": 0.04393085665170471, "grad_norm": 0.44552844762802124, "learning_rate": 0.0001322107164318697, "loss": 0.1996, "step": 230 }, { "epoch": 0.045840893897431, "grad_norm": 0.32739052176475525, "learning_rate": 0.00012543411145556643, "loss": 0.2006, "step": 240 }, { "epoch": 0.047750931143157295, "grad_norm": 0.5908793807029724, "learning_rate": 0.00011853846594435998, "loss": 0.11, "step": 250 }, { "epoch": 0.047750931143157295, "eval_loss": 0.18999893963336945, "eval_runtime": 177.5744, "eval_samples_per_second": 12.417, "eval_steps_per_second": 3.109, "step": 250 }, { "epoch": 0.049660968388883585, "grad_norm": 0.5332140326499939, "learning_rate": 0.00011155737479003301, "loss": 0.2983, "step": 260 }, { "epoch": 0.051571005634609875, "grad_norm": 0.6416130661964417, "learning_rate": 0.00010452484916695262, "loss": 0.2444, "step": 270 }, { "epoch": 0.053481042880336165, "grad_norm": 0.37119925022125244, "learning_rate": 9.747515083304742e-05, "loss": 0.1693, "step": 280 }, { "epoch": 0.05539108012606246, "grad_norm": 0.4137580096721649, "learning_rate": 9.044262520996702e-05, "loss": 0.1027, "step": 290 }, { "epoch": 0.05730111737178875, "grad_norm": 0.8877851366996765, "learning_rate": 8.346153405564004e-05, "loss": 0.092, "step": 300 }, { "epoch": 0.05730111737178875, "eval_loss": 0.17685508728027344, "eval_runtime": 178.4471, "eval_samples_per_second": 12.357, "eval_steps_per_second": 3.093, "step": 300 }, { "epoch": 0.05921115461751504, "grad_norm": 0.4535675644874573, "learning_rate": 7.656588854443357e-05, "loss": 0.2437, "step": 310 }, { "epoch": 0.06112119186324133, "grad_norm": 0.5770543813705444, "learning_rate": 6.978928356813031e-05, "loss": 0.2017, "step": 320 }, { "epoch": 0.06303122910896762, "grad_norm": 0.5218977928161621, "learning_rate": 6.316473406499288e-05, "loss": 0.1578, "step": 330 }, { "epoch": 0.06494126635469391, "grad_norm": 0.30960023403167725, "learning_rate": 5.672451417430317e-05, "loss": 0.0787, "step": 340 }, { "epoch": 0.06685130360042021, "grad_norm": 0.33792850375175476, "learning_rate": 5.050000000000002e-05, "loss": 0.0482, "step": 350 }, { "epoch": 0.06685130360042021, "eval_loss": 0.16439248621463776, "eval_runtime": 177.4636, "eval_samples_per_second": 12.425, "eval_steps_per_second": 3.11, "step": 350 }, { "epoch": 0.0687613408461465, "grad_norm": 0.3826381266117096, "learning_rate": 4.452151674945458e-05, "loss": 0.2723, "step": 360 }, { "epoch": 0.0706713780918728, "grad_norm": 0.741492509841919, "learning_rate": 3.8818190992108515e-05, "loss": 0.2213, "step": 370 }, { "epoch": 0.07258141533759908, "grad_norm": 0.3961448669433594, "learning_rate": 3.3417808757755355e-05, "loss": 0.2094, "step": 380 }, { "epoch": 0.07449145258332537, "grad_norm": 1.2866545915603638, "learning_rate": 2.8346680165796253e-05, "loss": 0.1332, "step": 390 }, { "epoch": 0.07640148982905166, "grad_norm": 0.6757580637931824, "learning_rate": 2.362951124498323e-05, "loss": 0.0435, "step": 400 }, { "epoch": 0.07640148982905166, "eval_loss": 0.15090037882328033, "eval_runtime": 177.674, "eval_samples_per_second": 12.41, "eval_steps_per_second": 3.107, "step": 400 }, { "epoch": 0.07831152707477795, "grad_norm": 0.2972387671470642, "learning_rate": 1.928928356813032e-05, "loss": 0.2427, "step": 410 }, { "epoch": 0.08022156432050424, "grad_norm": 0.5251720547676086, "learning_rate": 1.5347142288200977e-05, "loss": 0.1642, "step": 420 }, { "epoch": 0.08213160156623055, "grad_norm": 0.22827404737472534, "learning_rate": 1.1822293121248375e-05, "loss": 0.1806, "step": 430 }, { "epoch": 0.08404163881195684, "grad_norm": 0.08879908174276352, "learning_rate": 8.731908778097302e-06, "loss": 0.0702, "step": 440 }, { "epoch": 0.08595167605768313, "grad_norm": 0.23056891560554504, "learning_rate": 6.09104530062326e-06, "loss": 0.0481, "step": 450 }, { "epoch": 0.08595167605768313, "eval_loss": 0.1438828855752945, "eval_runtime": 177.4573, "eval_samples_per_second": 12.426, "eval_steps_per_second": 3.111, "step": 450 }, { "epoch": 0.08786171330340942, "grad_norm": 0.43199360370635986, "learning_rate": 3.912568710229791e-06, "loss": 0.2193, "step": 460 }, { "epoch": 0.0897717505491357, "grad_norm": 0.4930032789707184, "learning_rate": 2.2070923258856255e-06, "loss": 0.2111, "step": 470 }, { "epoch": 0.091681787794862, "grad_norm": 0.39143937826156616, "learning_rate": 9.829250571013935e-07, "loss": 0.1228, "step": 480 }, { "epoch": 0.09359182504058829, "grad_norm": 0.1679701954126358, "learning_rate": 2.4603092375775605e-07, "loss": 0.0783, "step": 490 }, { "epoch": 0.09550186228631459, "grad_norm": 0.26929542422294617, "learning_rate": 0.0, "loss": 0.0281, "step": 500 }, { "epoch": 0.09550186228631459, "eval_loss": 0.143937349319458, "eval_runtime": 177.8576, "eval_samples_per_second": 12.398, "eval_steps_per_second": 3.104, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2824243645448192e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }