{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06756756756756757, "grad_norm": 3.4261663240782494, "learning_rate": 1.3157894736842106e-05, "loss": 1.4125, "step": 5 }, { "epoch": 0.13513513513513514, "grad_norm": 2.245735390651891, "learning_rate": 2.6315789473684212e-05, "loss": 1.2188, "step": 10 }, { "epoch": 0.20270270270270271, "grad_norm": 1.6457443250895738, "learning_rate": 3.9473684210526316e-05, "loss": 1.0957, "step": 15 }, { "epoch": 0.2702702702702703, "grad_norm": 1.6865888677076188, "learning_rate": 4.9999098771046674e-05, "loss": 1.0601, "step": 20 }, { "epoch": 0.33783783783783783, "grad_norm": 1.666502580311869, "learning_rate": 4.996756333769319e-05, "loss": 1.0148, "step": 25 }, { "epoch": 0.40540540540540543, "grad_norm": 1.3339531822515116, "learning_rate": 4.989103862744732e-05, "loss": 1.0003, "step": 30 }, { "epoch": 0.47297297297297297, "grad_norm": 1.3184340126355656, "learning_rate": 4.976967787416565e-05, "loss": 2.3477, "step": 35 }, { "epoch": 0.5405405405405406, "grad_norm": 1.1190274982246844, "learning_rate": 4.9603724091852456e-05, "loss": 0.9947, "step": 40 }, { "epoch": 0.6081081081081081, "grad_norm": 1.1511902644387824, "learning_rate": 4.9393509588046036e-05, "loss": 0.9678, "step": 45 }, { "epoch": 0.6756756756756757, "grad_norm": 1.0945331385038577, "learning_rate": 4.9139455298402656e-05, "loss": 0.9578, "step": 50 }, { "epoch": 0.7432432432432432, "grad_norm": 1.1300453480249584, "learning_rate": 4.884206994381078e-05, "loss": 0.9634, "step": 55 }, { "epoch": 0.8108108108108109, "grad_norm": 0.963524297894151, "learning_rate": 4.8501949011723144e-05, "loss": 0.9477, "step": 60 }, { "epoch": 0.8783783783783784, "grad_norm": 1.0636183973323095, "learning_rate": 4.811977356374667e-05, "loss": 0.9586, "step": 65 }, { "epoch": 0.9459459459459459, "grad_norm": 1.066685209936076, "learning_rate": 4.769630887187782e-05, "loss": 0.9477, "step": 70 }, { "epoch": 1.0135135135135136, "grad_norm": 1.291428060818525, "learning_rate": 4.72324028861141e-05, "loss": 0.9049, "step": 75 }, { "epoch": 1.0810810810810811, "grad_norm": 1.148432172425945, "learning_rate": 4.6728984536510454e-05, "loss": 0.782, "step": 80 }, { "epoch": 1.1486486486486487, "grad_norm": 1.2392485810964904, "learning_rate": 4.6187061873080264e-05, "loss": 0.7614, "step": 85 }, { "epoch": 1.2162162162162162, "grad_norm": 1.1289267203176847, "learning_rate": 4.560772004726575e-05, "loss": 0.7484, "step": 90 }, { "epoch": 1.2837837837837838, "grad_norm": 0.9647581297441358, "learning_rate": 4.499211913901974e-05, "loss": 0.7505, "step": 95 }, { "epoch": 1.3513513513513513, "grad_norm": 1.0554783842280795, "learning_rate": 4.434149183384977e-05, "loss": 0.7615, "step": 100 }, { "epoch": 1.4189189189189189, "grad_norm": 1.0046093718247744, "learning_rate": 4.3657140954476165e-05, "loss": 0.7606, "step": 105 }, { "epoch": 1.4864864864864864, "grad_norm": 1.017356383159543, "learning_rate": 4.294043685204651e-05, "loss": 0.7538, "step": 110 }, { "epoch": 1.554054054054054, "grad_norm": 0.957520360195128, "learning_rate": 4.219281466213066e-05, "loss": 0.7649, "step": 115 }, { "epoch": 1.6216216216216215, "grad_norm": 1.0258379858209186, "learning_rate": 4.141577143099074e-05, "loss": 0.7653, "step": 120 }, { "epoch": 1.689189189189189, "grad_norm": 1.0089041760037447, "learning_rate": 4.06108631178804e-05, "loss": 0.7489, "step": 125 }, { "epoch": 1.7567567567567568, "grad_norm": 0.9596931649356711, "learning_rate": 3.977970147937635e-05, "loss": 0.766, "step": 130 }, { "epoch": 1.8243243243243243, "grad_norm": 1.0452079214800263, "learning_rate": 3.892395084198053e-05, "loss": 0.7569, "step": 135 }, { "epoch": 1.8918918918918919, "grad_norm": 2.658413589889494, "learning_rate": 3.8045324769455834e-05, "loss": 0.7516, "step": 140 }, { "epoch": 1.9594594594594594, "grad_norm": 0.961689698659803, "learning_rate": 3.714558263156872e-05, "loss": 0.7534, "step": 145 }, { "epoch": 2.027027027027027, "grad_norm": 1.2391671243602853, "learning_rate": 3.622652608110924e-05, "loss": 0.6757, "step": 150 }, { "epoch": 2.0945945945945947, "grad_norm": 1.6526356733635839, "learning_rate": 3.528999544624324e-05, "loss": 0.5261, "step": 155 }, { "epoch": 2.1621621621621623, "grad_norm": 1.0557027457454706, "learning_rate": 3.433786604542056e-05, "loss": 0.5027, "step": 160 }, { "epoch": 2.22972972972973, "grad_norm": 1.0910599708508169, "learning_rate": 3.337204443221834e-05, "loss": 0.4968, "step": 165 }, { "epoch": 2.2972972972972974, "grad_norm": 1.0721275697200998, "learning_rate": 3.2394464577638765e-05, "loss": 0.4971, "step": 170 }, { "epoch": 2.364864864864865, "grad_norm": 1.1434487748088678, "learning_rate": 3.140708399750594e-05, "loss": 0.4957, "step": 175 }, { "epoch": 2.4324324324324325, "grad_norm": 1.1545114999219812, "learning_rate": 3.0411879832716246e-05, "loss": 0.5029, "step": 180 }, { "epoch": 2.5, "grad_norm": 1.041125870919675, "learning_rate": 2.9410844890191458e-05, "loss": 0.5169, "step": 185 }, { "epoch": 2.5675675675675675, "grad_norm": 1.175924979776691, "learning_rate": 2.840598365246184e-05, "loss": 0.5165, "step": 190 }, { "epoch": 2.635135135135135, "grad_norm": 1.0172037788629136, "learning_rate": 2.7399308263870027e-05, "loss": 0.5104, "step": 195 }, { "epoch": 2.7027027027027026, "grad_norm": 1.0909635619769098, "learning_rate": 2.6392834501432746e-05, "loss": 0.5204, "step": 200 }, { "epoch": 2.77027027027027, "grad_norm": 1.0751601634075874, "learning_rate": 2.538857773842847e-05, "loss": 0.5148, "step": 205 }, { "epoch": 2.8378378378378377, "grad_norm": 1.0538257257155987, "learning_rate": 2.438854890879351e-05, "loss": 0.5193, "step": 210 }, { "epoch": 2.9054054054054053, "grad_norm": 1.1028773915647776, "learning_rate": 2.3394750480407467e-05, "loss": 0.5253, "step": 215 }, { "epoch": 2.972972972972973, "grad_norm": 1.057212593933844, "learning_rate": 2.2409172445331257e-05, "loss": 0.5147, "step": 220 }, { "epoch": 3.0405405405405403, "grad_norm": 1.2686125905161307, "learning_rate": 2.143378833502677e-05, "loss": 0.3906, "step": 225 }, { "epoch": 3.108108108108108, "grad_norm": 2.0819711074809186, "learning_rate": 2.0470551268537457e-05, "loss": 0.278, "step": 230 }, { "epoch": 3.175675675675676, "grad_norm": 1.1899957055285104, "learning_rate": 1.9521390041542946e-05, "loss": 0.279, "step": 235 }, { "epoch": 3.2432432432432434, "grad_norm": 1.0429855830298884, "learning_rate": 1.8588205264118974e-05, "loss": 0.2658, "step": 240 }, { "epoch": 3.310810810810811, "grad_norm": 1.2125097679092012, "learning_rate": 1.7672865554936467e-05, "loss": 0.2567, "step": 245 }, { "epoch": 3.3783783783783785, "grad_norm": 1.057277497432442, "learning_rate": 1.6777203799520573e-05, "loss": 0.257, "step": 250 }, { "epoch": 3.445945945945946, "grad_norm": 1.0711915766982019, "learning_rate": 1.5903013480062085e-05, "loss": 0.2615, "step": 255 }, { "epoch": 3.5135135135135136, "grad_norm": 1.0628794272491933, "learning_rate": 1.5052045084130445e-05, "loss": 0.2614, "step": 260 }, { "epoch": 3.581081081081081, "grad_norm": 1.1662096739632066, "learning_rate": 1.4226002599479743e-05, "loss": 0.2649, "step": 265 }, { "epoch": 3.6486486486486487, "grad_norm": 1.1118655992241262, "learning_rate": 1.3426540101966257e-05, "loss": 0.2604, "step": 270 }, { "epoch": 3.7162162162162162, "grad_norm": 1.0529271525205388, "learning_rate": 1.2655258443410268e-05, "loss": 0.2579, "step": 275 }, { "epoch": 3.7837837837837838, "grad_norm": 1.0548277025378372, "learning_rate": 1.1913702046034016e-05, "loss": 0.2599, "step": 280 }, { "epoch": 3.8513513513513513, "grad_norm": 1.4173329724650015, "learning_rate": 1.1203355809895042e-05, "loss": 0.2656, "step": 285 }, { "epoch": 3.918918918918919, "grad_norm": 1.0269890510409652, "learning_rate": 1.0525642139507222e-05, "loss": 0.2574, "step": 290 }, { "epoch": 3.9864864864864864, "grad_norm": 1.1498277528126646, "learning_rate": 9.881918095603501e-06, "loss": 0.2574, "step": 295 }, { "epoch": 4.054054054054054, "grad_norm": 0.8436946587158761, "learning_rate": 9.273472677743755e-06, "loss": 0.1599, "step": 300 }, { "epoch": 4.121621621621622, "grad_norm": 1.2705698581623819, "learning_rate": 8.701524243208935e-06, "loss": 0.1228, "step": 305 }, { "epoch": 4.1891891891891895, "grad_norm": 1.0968579890950991, "learning_rate": 8.167218067350122e-06, "loss": 0.1248, "step": 310 }, { "epoch": 4.256756756756757, "grad_norm": 0.8038199196704696, "learning_rate": 7.67162405027753e-06, "loss": 0.1151, "step": 315 }, { "epoch": 4.324324324324325, "grad_norm": 0.8103081581659332, "learning_rate": 7.21573457448164e-06, "loss": 0.1128, "step": 320 }, { "epoch": 4.391891891891892, "grad_norm": 0.9357373873541008, "learning_rate": 6.800462517676456e-06, "loss": 0.1133, "step": 325 }, { "epoch": 4.45945945945946, "grad_norm": 0.8885827558287142, "learning_rate": 6.426639424843892e-06, "loss": 0.113, "step": 330 }, { "epoch": 4.527027027027027, "grad_norm": 0.8834804613963296, "learning_rate": 6.095013843139662e-06, "loss": 0.1097, "step": 335 }, { "epoch": 4.594594594594595, "grad_norm": 0.9242949281405863, "learning_rate": 5.806249822994849e-06, "loss": 0.1116, "step": 340 }, { "epoch": 4.662162162162162, "grad_norm": 0.869762515620012, "learning_rate": 5.560925588414595e-06, "loss": 0.1115, "step": 345 }, { "epoch": 4.72972972972973, "grad_norm": 0.9028003791945425, "learning_rate": 5.359532379136472e-06, "loss": 0.1138, "step": 350 }, { "epoch": 4.797297297297297, "grad_norm": 0.89280810525534, "learning_rate": 5.20247346696706e-06, "loss": 0.1125, "step": 355 }, { "epoch": 4.864864864864865, "grad_norm": 0.8976015534415421, "learning_rate": 5.090063348266363e-06, "loss": 0.1118, "step": 360 }, { "epoch": 4.9324324324324325, "grad_norm": 0.8875204874091983, "learning_rate": 5.022527114197126e-06, "loss": 0.1156, "step": 365 }, { "epoch": 5.0, "grad_norm": 0.8600305844700733, "learning_rate": 5e-06, "loss": 0.1105, "step": 370 }, { "epoch": 5.0, "step": 370, "total_flos": 720778578231296.0, "train_loss": 0.5552037896336736, "train_runtime": 9234.6195, "train_samples_per_second": 5.122, "train_steps_per_second": 0.04 } ], "logging_steps": 5, "max_steps": 370, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 720778578231296.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }