{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989913253984265, "eval_steps": 100, "global_step": 619, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016138793625176517, "grad_norm": 3.143124580383301, "learning_rate": 7.258064516129033e-06, "loss": 1.6561, "mean_token_accuracy": 0.7124999972060323, "num_tokens": 40960.0, "step": 10 }, { "epoch": 0.03227758725035303, "grad_norm": 2.3436570167541504, "learning_rate": 1.5322580645161292e-05, "loss": 0.1926, "mean_token_accuracy": 0.966169273853302, "num_tokens": 81920.0, "step": 20 }, { "epoch": 0.048416380875529554, "grad_norm": 3.512237071990967, "learning_rate": 2.338709677419355e-05, "loss": 0.1742, "mean_token_accuracy": 0.9685420736670494, "num_tokens": 122880.0, "step": 30 }, { "epoch": 0.06455517450070607, "grad_norm": 2.72192645072937, "learning_rate": 3.1451612903225806e-05, "loss": 0.1687, "mean_token_accuracy": 0.9700342424213886, "num_tokens": 163840.0, "step": 40 }, { "epoch": 0.08069396812588259, "grad_norm": 2.1354479789733887, "learning_rate": 3.951612903225806e-05, "loss": 0.1726, "mean_token_accuracy": 0.9692270033061504, "num_tokens": 204800.0, "step": 50 }, { "epoch": 0.09683276175105911, "grad_norm": 3.7211921215057373, "learning_rate": 4.7580645161290326e-05, "loss": 0.1884, "mean_token_accuracy": 0.9661448106169701, "num_tokens": 245760.0, "step": 60 }, { "epoch": 0.11297155537623563, "grad_norm": 1.5104479789733887, "learning_rate": 4.937163375224417e-05, "loss": 0.1869, "mean_token_accuracy": 0.9675391376018524, "num_tokens": 286720.0, "step": 70 }, { "epoch": 0.12911034900141213, "grad_norm": 2.113689422607422, "learning_rate": 4.847396768402155e-05, "loss": 0.1815, "mean_token_accuracy": 0.9682729855179787, "num_tokens": 327680.0, "step": 80 }, { "epoch": 0.14524914262658867, "grad_norm": 1.308586597442627, "learning_rate": 4.7576301615798926e-05, "loss": 0.1955, "mean_token_accuracy": 0.9657534204423428, "num_tokens": 368640.0, "step": 90 }, { "epoch": 0.16138793625176517, "grad_norm": 1.7842236757278442, "learning_rate": 4.667863554757631e-05, "loss": 0.1893, "mean_token_accuracy": 0.9666340447962284, "num_tokens": 409600.0, "step": 100 }, { "epoch": 0.1775267298769417, "grad_norm": 1.172074556350708, "learning_rate": 4.578096947935368e-05, "loss": 0.1849, "mean_token_accuracy": 0.9681262239813805, "num_tokens": 450560.0, "step": 110 }, { "epoch": 0.19366552350211821, "grad_norm": 1.2354941368103027, "learning_rate": 4.488330341113106e-05, "loss": 0.1884, "mean_token_accuracy": 0.9675146743655205, "num_tokens": 491520.0, "step": 120 }, { "epoch": 0.20980431712729475, "grad_norm": 1.0571320056915283, "learning_rate": 4.398563734290844e-05, "loss": 0.1968, "mean_token_accuracy": 0.9660469628870487, "num_tokens": 532480.0, "step": 130 }, { "epoch": 0.22594311075247125, "grad_norm": 1.255366563796997, "learning_rate": 4.308797127468582e-05, "loss": 0.1882, "mean_token_accuracy": 0.9669765122234821, "num_tokens": 573440.0, "step": 140 }, { "epoch": 0.24208190437764776, "grad_norm": 0.7935602068901062, "learning_rate": 4.21903052064632e-05, "loss": 0.189, "mean_token_accuracy": 0.9665362000465393, "num_tokens": 614400.0, "step": 150 }, { "epoch": 0.25822069800282427, "grad_norm": 1.0346981287002563, "learning_rate": 4.129263913824057e-05, "loss": 0.1911, "mean_token_accuracy": 0.9670254394412041, "num_tokens": 655360.0, "step": 160 }, { "epoch": 0.27435949162800083, "grad_norm": 1.0733519792556763, "learning_rate": 4.0394973070017954e-05, "loss": 0.1945, "mean_token_accuracy": 0.9651418760418892, "num_tokens": 696320.0, "step": 170 }, { "epoch": 0.29049828525317734, "grad_norm": 1.1686853170394897, "learning_rate": 3.9497307001795335e-05, "loss": 0.1883, "mean_token_accuracy": 0.9680283769965172, "num_tokens": 737280.0, "step": 180 }, { "epoch": 0.30663707887835384, "grad_norm": 1.4509366750717163, "learning_rate": 3.859964093357271e-05, "loss": 0.1815, "mean_token_accuracy": 0.9682240642607212, "num_tokens": 778240.0, "step": 190 }, { "epoch": 0.32277587250353035, "grad_norm": 1.7541567087173462, "learning_rate": 3.770197486535009e-05, "loss": 0.1799, "mean_token_accuracy": 0.9676614426076412, "num_tokens": 819200.0, "step": 200 }, { "epoch": 0.33891466612870685, "grad_norm": 0.8904175162315369, "learning_rate": 3.6804308797127465e-05, "loss": 0.1789, "mean_token_accuracy": 0.9690802305936813, "num_tokens": 860160.0, "step": 210 }, { "epoch": 0.3550534597538834, "grad_norm": 1.066307783126831, "learning_rate": 3.5906642728904846e-05, "loss": 0.1884, "mean_token_accuracy": 0.967025438696146, "num_tokens": 901120.0, "step": 220 }, { "epoch": 0.3711922533790599, "grad_norm": 0.8745315670967102, "learning_rate": 3.500897666068223e-05, "loss": 0.1873, "mean_token_accuracy": 0.9678571395576, "num_tokens": 942080.0, "step": 230 }, { "epoch": 0.38733104700423643, "grad_norm": 1.2400248050689697, "learning_rate": 3.411131059245961e-05, "loss": 0.1843, "mean_token_accuracy": 0.9680772952735424, "num_tokens": 983040.0, "step": 240 }, { "epoch": 0.40346984062941293, "grad_norm": 0.9607365131378174, "learning_rate": 3.321364452423699e-05, "loss": 0.2003, "mean_token_accuracy": 0.9650195680558682, "num_tokens": 1024000.0, "step": 250 }, { "epoch": 0.4196086342545895, "grad_norm": 1.288979172706604, "learning_rate": 3.231597845601436e-05, "loss": 0.1874, "mean_token_accuracy": 0.9666095845401287, "num_tokens": 1064960.0, "step": 260 }, { "epoch": 0.435747427879766, "grad_norm": 1.0444912910461426, "learning_rate": 3.1418312387791744e-05, "loss": 0.2023, "mean_token_accuracy": 0.9641144774854183, "num_tokens": 1105920.0, "step": 270 }, { "epoch": 0.4518862215049425, "grad_norm": 1.1588054895401, "learning_rate": 3.0520646319569125e-05, "loss": 0.1824, "mean_token_accuracy": 0.9674902133643627, "num_tokens": 1146880.0, "step": 280 }, { "epoch": 0.468025015130119, "grad_norm": 1.1021692752838135, "learning_rate": 2.96229802513465e-05, "loss": 0.1845, "mean_token_accuracy": 0.9678815990686417, "num_tokens": 1187840.0, "step": 290 }, { "epoch": 0.4841638087552955, "grad_norm": 1.2420721054077148, "learning_rate": 2.872531418312388e-05, "loss": 0.1975, "mean_token_accuracy": 0.9652152620255947, "num_tokens": 1228800.0, "step": 300 }, { "epoch": 0.5003026023804721, "grad_norm": 1.1611697673797607, "learning_rate": 2.7827648114901255e-05, "loss": 0.1798, "mean_token_accuracy": 0.9679794535040855, "num_tokens": 1269760.0, "step": 310 }, { "epoch": 0.5164413960056485, "grad_norm": 0.9205463528633118, "learning_rate": 2.6929982046678636e-05, "loss": 0.1881, "mean_token_accuracy": 0.9673923663794994, "num_tokens": 1310720.0, "step": 320 }, { "epoch": 0.5325801896308251, "grad_norm": 1.0037810802459717, "learning_rate": 2.6032315978456017e-05, "loss": 0.1788, "mean_token_accuracy": 0.9688356101512909, "num_tokens": 1351680.0, "step": 330 }, { "epoch": 0.5487189832560017, "grad_norm": 1.0601574182510376, "learning_rate": 2.5134649910233395e-05, "loss": 0.1839, "mean_token_accuracy": 0.9674412839114666, "num_tokens": 1392640.0, "step": 340 }, { "epoch": 0.5648577768811781, "grad_norm": 1.083649754524231, "learning_rate": 2.4236983842010776e-05, "loss": 0.1866, "mean_token_accuracy": 0.9668052822351456, "num_tokens": 1433600.0, "step": 350 }, { "epoch": 0.5809965705063547, "grad_norm": 0.9604955315589905, "learning_rate": 2.3339317773788153e-05, "loss": 0.1792, "mean_token_accuracy": 0.9675391390919685, "num_tokens": 1474560.0, "step": 360 }, { "epoch": 0.5971353641315311, "grad_norm": 1.1074978113174438, "learning_rate": 2.244165170556553e-05, "loss": 0.19, "mean_token_accuracy": 0.966560660302639, "num_tokens": 1515520.0, "step": 370 }, { "epoch": 0.6132741577567077, "grad_norm": 1.0041981935501099, "learning_rate": 2.154398563734291e-05, "loss": 0.1958, "mean_token_accuracy": 0.9653864920139312, "num_tokens": 1556480.0, "step": 380 }, { "epoch": 0.6294129513818842, "grad_norm": 0.9702057838439941, "learning_rate": 2.0646319569120286e-05, "loss": 0.1776, "mean_token_accuracy": 0.9691046938300133, "num_tokens": 1597440.0, "step": 390 }, { "epoch": 0.6455517450070607, "grad_norm": 0.767419159412384, "learning_rate": 1.9748653500897668e-05, "loss": 0.1626, "mean_token_accuracy": 0.9722602725028991, "num_tokens": 1638400.0, "step": 400 }, { "epoch": 0.6616905386322373, "grad_norm": 1.132580041885376, "learning_rate": 1.8850987432675045e-05, "loss": 0.1807, "mean_token_accuracy": 0.9676369808614254, "num_tokens": 1679360.0, "step": 410 }, { "epoch": 0.6778293322574137, "grad_norm": 1.1673997640609741, "learning_rate": 1.7953321364452423e-05, "loss": 0.1911, "mean_token_accuracy": 0.9661203488707543, "num_tokens": 1720320.0, "step": 420 }, { "epoch": 0.6939681258825903, "grad_norm": 1.094133734703064, "learning_rate": 1.7055655296229804e-05, "loss": 0.1811, "mean_token_accuracy": 0.9679305233061314, "num_tokens": 1761280.0, "step": 430 }, { "epoch": 0.7101069195077668, "grad_norm": 0.9924063086509705, "learning_rate": 1.615798922800718e-05, "loss": 0.1657, "mean_token_accuracy": 0.9702054776251317, "num_tokens": 1802240.0, "step": 440 }, { "epoch": 0.7262457131329433, "grad_norm": 1.4084646701812744, "learning_rate": 1.5260323159784563e-05, "loss": 0.181, "mean_token_accuracy": 0.9674412921071053, "num_tokens": 1843200.0, "step": 450 }, { "epoch": 0.7423845067581198, "grad_norm": 1.202055811882019, "learning_rate": 1.436265709156194e-05, "loss": 0.1733, "mean_token_accuracy": 0.9695694677531719, "num_tokens": 1884160.0, "step": 460 }, { "epoch": 0.7585233003832963, "grad_norm": 0.8700960278511047, "learning_rate": 1.3464991023339318e-05, "loss": 0.1799, "mean_token_accuracy": 0.9682240679860115, "num_tokens": 1925120.0, "step": 470 }, { "epoch": 0.7746620940084729, "grad_norm": 0.9524367451667786, "learning_rate": 1.2567324955116697e-05, "loss": 0.1843, "mean_token_accuracy": 0.9665117390453816, "num_tokens": 1966080.0, "step": 480 }, { "epoch": 0.7908008876336494, "grad_norm": 0.836246132850647, "learning_rate": 1.1669658886894077e-05, "loss": 0.1835, "mean_token_accuracy": 0.9679794497787952, "num_tokens": 2007040.0, "step": 490 }, { "epoch": 0.8069396812588259, "grad_norm": 0.9284518361091614, "learning_rate": 1.0771992818671454e-05, "loss": 0.1762, "mean_token_accuracy": 0.9691536143422127, "num_tokens": 2048000.0, "step": 500 }, { "epoch": 0.8230784748840024, "grad_norm": 0.9532117247581482, "learning_rate": 9.874326750448834e-06, "loss": 0.1707, "mean_token_accuracy": 0.9692759282886982, "num_tokens": 2088960.0, "step": 510 }, { "epoch": 0.839217268509179, "grad_norm": 1.1545753479003906, "learning_rate": 8.976660682226211e-06, "loss": 0.176, "mean_token_accuracy": 0.9688845381140709, "num_tokens": 2129920.0, "step": 520 }, { "epoch": 0.8553560621343554, "grad_norm": 0.9475525617599487, "learning_rate": 8.07899461400359e-06, "loss": 0.1772, "mean_token_accuracy": 0.9683953016996384, "num_tokens": 2170880.0, "step": 530 }, { "epoch": 0.871494855759532, "grad_norm": 0.9343079924583435, "learning_rate": 7.18132854578097e-06, "loss": 0.1799, "mean_token_accuracy": 0.9672211319208145, "num_tokens": 2211840.0, "step": 540 }, { "epoch": 0.8876336493847085, "grad_norm": 1.152690052986145, "learning_rate": 6.283662477558349e-06, "loss": 0.1676, "mean_token_accuracy": 0.9698140859603882, "num_tokens": 2252800.0, "step": 550 }, { "epoch": 0.903772443009885, "grad_norm": 0.960165798664093, "learning_rate": 5.385996409335727e-06, "loss": 0.1758, "mean_token_accuracy": 0.9693982377648354, "num_tokens": 2293760.0, "step": 560 }, { "epoch": 0.9199112366350616, "grad_norm": 0.8894097208976746, "learning_rate": 4.488330341113106e-06, "loss": 0.1829, "mean_token_accuracy": 0.9673923671245575, "num_tokens": 2334720.0, "step": 570 }, { "epoch": 0.936050030260238, "grad_norm": 1.2608239650726318, "learning_rate": 3.590664272890485e-06, "loss": 0.1762, "mean_token_accuracy": 0.968297453969717, "num_tokens": 2375680.0, "step": 580 }, { "epoch": 0.9521888238854146, "grad_norm": 1.042999029159546, "learning_rate": 2.6929982046678636e-06, "loss": 0.1581, "mean_token_accuracy": 0.9708414882421493, "num_tokens": 2416640.0, "step": 590 }, { "epoch": 0.968327617510591, "grad_norm": 0.7505899667739868, "learning_rate": 1.7953321364452425e-06, "loss": 0.1758, "mean_token_accuracy": 0.9684442266821861, "num_tokens": 2457600.0, "step": 600 }, { "epoch": 0.9844664111357676, "grad_norm": 0.9210404753684998, "learning_rate": 8.976660682226213e-07, "loss": 0.1713, "mean_token_accuracy": 0.969838547706604, "num_tokens": 2498560.0, "step": 610 } ], "logging_steps": 10, "max_steps": 619, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6700630554968064.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }