|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9989913253984265, |
|
"eval_steps": 100, |
|
"global_step": 619, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016138793625176517, |
|
"grad_norm": 3.143124580383301, |
|
"learning_rate": 7.258064516129033e-06, |
|
"loss": 1.6561, |
|
"mean_token_accuracy": 0.7124999972060323, |
|
"num_tokens": 40960.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03227758725035303, |
|
"grad_norm": 2.3436570167541504, |
|
"learning_rate": 1.5322580645161292e-05, |
|
"loss": 0.1926, |
|
"mean_token_accuracy": 0.966169273853302, |
|
"num_tokens": 81920.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.048416380875529554, |
|
"grad_norm": 3.512237071990967, |
|
"learning_rate": 2.338709677419355e-05, |
|
"loss": 0.1742, |
|
"mean_token_accuracy": 0.9685420736670494, |
|
"num_tokens": 122880.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06455517450070607, |
|
"grad_norm": 2.72192645072937, |
|
"learning_rate": 3.1451612903225806e-05, |
|
"loss": 0.1687, |
|
"mean_token_accuracy": 0.9700342424213886, |
|
"num_tokens": 163840.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08069396812588259, |
|
"grad_norm": 2.1354479789733887, |
|
"learning_rate": 3.951612903225806e-05, |
|
"loss": 0.1726, |
|
"mean_token_accuracy": 0.9692270033061504, |
|
"num_tokens": 204800.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09683276175105911, |
|
"grad_norm": 3.7211921215057373, |
|
"learning_rate": 4.7580645161290326e-05, |
|
"loss": 0.1884, |
|
"mean_token_accuracy": 0.9661448106169701, |
|
"num_tokens": 245760.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11297155537623563, |
|
"grad_norm": 1.5104479789733887, |
|
"learning_rate": 4.937163375224417e-05, |
|
"loss": 0.1869, |
|
"mean_token_accuracy": 0.9675391376018524, |
|
"num_tokens": 286720.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12911034900141213, |
|
"grad_norm": 2.113689422607422, |
|
"learning_rate": 4.847396768402155e-05, |
|
"loss": 0.1815, |
|
"mean_token_accuracy": 0.9682729855179787, |
|
"num_tokens": 327680.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14524914262658867, |
|
"grad_norm": 1.308586597442627, |
|
"learning_rate": 4.7576301615798926e-05, |
|
"loss": 0.1955, |
|
"mean_token_accuracy": 0.9657534204423428, |
|
"num_tokens": 368640.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16138793625176517, |
|
"grad_norm": 1.7842236757278442, |
|
"learning_rate": 4.667863554757631e-05, |
|
"loss": 0.1893, |
|
"mean_token_accuracy": 0.9666340447962284, |
|
"num_tokens": 409600.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1775267298769417, |
|
"grad_norm": 1.172074556350708, |
|
"learning_rate": 4.578096947935368e-05, |
|
"loss": 0.1849, |
|
"mean_token_accuracy": 0.9681262239813805, |
|
"num_tokens": 450560.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.19366552350211821, |
|
"grad_norm": 1.2354941368103027, |
|
"learning_rate": 4.488330341113106e-05, |
|
"loss": 0.1884, |
|
"mean_token_accuracy": 0.9675146743655205, |
|
"num_tokens": 491520.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.20980431712729475, |
|
"grad_norm": 1.0571320056915283, |
|
"learning_rate": 4.398563734290844e-05, |
|
"loss": 0.1968, |
|
"mean_token_accuracy": 0.9660469628870487, |
|
"num_tokens": 532480.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.22594311075247125, |
|
"grad_norm": 1.255366563796997, |
|
"learning_rate": 4.308797127468582e-05, |
|
"loss": 0.1882, |
|
"mean_token_accuracy": 0.9669765122234821, |
|
"num_tokens": 573440.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.24208190437764776, |
|
"grad_norm": 0.7935602068901062, |
|
"learning_rate": 4.21903052064632e-05, |
|
"loss": 0.189, |
|
"mean_token_accuracy": 0.9665362000465393, |
|
"num_tokens": 614400.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.25822069800282427, |
|
"grad_norm": 1.0346981287002563, |
|
"learning_rate": 4.129263913824057e-05, |
|
"loss": 0.1911, |
|
"mean_token_accuracy": 0.9670254394412041, |
|
"num_tokens": 655360.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.27435949162800083, |
|
"grad_norm": 1.0733519792556763, |
|
"learning_rate": 4.0394973070017954e-05, |
|
"loss": 0.1945, |
|
"mean_token_accuracy": 0.9651418760418892, |
|
"num_tokens": 696320.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.29049828525317734, |
|
"grad_norm": 1.1686853170394897, |
|
"learning_rate": 3.9497307001795335e-05, |
|
"loss": 0.1883, |
|
"mean_token_accuracy": 0.9680283769965172, |
|
"num_tokens": 737280.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.30663707887835384, |
|
"grad_norm": 1.4509366750717163, |
|
"learning_rate": 3.859964093357271e-05, |
|
"loss": 0.1815, |
|
"mean_token_accuracy": 0.9682240642607212, |
|
"num_tokens": 778240.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.32277587250353035, |
|
"grad_norm": 1.7541567087173462, |
|
"learning_rate": 3.770197486535009e-05, |
|
"loss": 0.1799, |
|
"mean_token_accuracy": 0.9676614426076412, |
|
"num_tokens": 819200.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.33891466612870685, |
|
"grad_norm": 0.8904175162315369, |
|
"learning_rate": 3.6804308797127465e-05, |
|
"loss": 0.1789, |
|
"mean_token_accuracy": 0.9690802305936813, |
|
"num_tokens": 860160.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3550534597538834, |
|
"grad_norm": 1.066307783126831, |
|
"learning_rate": 3.5906642728904846e-05, |
|
"loss": 0.1884, |
|
"mean_token_accuracy": 0.967025438696146, |
|
"num_tokens": 901120.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3711922533790599, |
|
"grad_norm": 0.8745315670967102, |
|
"learning_rate": 3.500897666068223e-05, |
|
"loss": 0.1873, |
|
"mean_token_accuracy": 0.9678571395576, |
|
"num_tokens": 942080.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.38733104700423643, |
|
"grad_norm": 1.2400248050689697, |
|
"learning_rate": 3.411131059245961e-05, |
|
"loss": 0.1843, |
|
"mean_token_accuracy": 0.9680772952735424, |
|
"num_tokens": 983040.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.40346984062941293, |
|
"grad_norm": 0.9607365131378174, |
|
"learning_rate": 3.321364452423699e-05, |
|
"loss": 0.2003, |
|
"mean_token_accuracy": 0.9650195680558682, |
|
"num_tokens": 1024000.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4196086342545895, |
|
"grad_norm": 1.288979172706604, |
|
"learning_rate": 3.231597845601436e-05, |
|
"loss": 0.1874, |
|
"mean_token_accuracy": 0.9666095845401287, |
|
"num_tokens": 1064960.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.435747427879766, |
|
"grad_norm": 1.0444912910461426, |
|
"learning_rate": 3.1418312387791744e-05, |
|
"loss": 0.2023, |
|
"mean_token_accuracy": 0.9641144774854183, |
|
"num_tokens": 1105920.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4518862215049425, |
|
"grad_norm": 1.1588054895401, |
|
"learning_rate": 3.0520646319569125e-05, |
|
"loss": 0.1824, |
|
"mean_token_accuracy": 0.9674902133643627, |
|
"num_tokens": 1146880.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.468025015130119, |
|
"grad_norm": 1.1021692752838135, |
|
"learning_rate": 2.96229802513465e-05, |
|
"loss": 0.1845, |
|
"mean_token_accuracy": 0.9678815990686417, |
|
"num_tokens": 1187840.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4841638087552955, |
|
"grad_norm": 1.2420721054077148, |
|
"learning_rate": 2.872531418312388e-05, |
|
"loss": 0.1975, |
|
"mean_token_accuracy": 0.9652152620255947, |
|
"num_tokens": 1228800.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5003026023804721, |
|
"grad_norm": 1.1611697673797607, |
|
"learning_rate": 2.7827648114901255e-05, |
|
"loss": 0.1798, |
|
"mean_token_accuracy": 0.9679794535040855, |
|
"num_tokens": 1269760.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5164413960056485, |
|
"grad_norm": 0.9205463528633118, |
|
"learning_rate": 2.6929982046678636e-05, |
|
"loss": 0.1881, |
|
"mean_token_accuracy": 0.9673923663794994, |
|
"num_tokens": 1310720.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5325801896308251, |
|
"grad_norm": 1.0037810802459717, |
|
"learning_rate": 2.6032315978456017e-05, |
|
"loss": 0.1788, |
|
"mean_token_accuracy": 0.9688356101512909, |
|
"num_tokens": 1351680.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5487189832560017, |
|
"grad_norm": 1.0601574182510376, |
|
"learning_rate": 2.5134649910233395e-05, |
|
"loss": 0.1839, |
|
"mean_token_accuracy": 0.9674412839114666, |
|
"num_tokens": 1392640.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5648577768811781, |
|
"grad_norm": 1.083649754524231, |
|
"learning_rate": 2.4236983842010776e-05, |
|
"loss": 0.1866, |
|
"mean_token_accuracy": 0.9668052822351456, |
|
"num_tokens": 1433600.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5809965705063547, |
|
"grad_norm": 0.9604955315589905, |
|
"learning_rate": 2.3339317773788153e-05, |
|
"loss": 0.1792, |
|
"mean_token_accuracy": 0.9675391390919685, |
|
"num_tokens": 1474560.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5971353641315311, |
|
"grad_norm": 1.1074978113174438, |
|
"learning_rate": 2.244165170556553e-05, |
|
"loss": 0.19, |
|
"mean_token_accuracy": 0.966560660302639, |
|
"num_tokens": 1515520.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6132741577567077, |
|
"grad_norm": 1.0041981935501099, |
|
"learning_rate": 2.154398563734291e-05, |
|
"loss": 0.1958, |
|
"mean_token_accuracy": 0.9653864920139312, |
|
"num_tokens": 1556480.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6294129513818842, |
|
"grad_norm": 0.9702057838439941, |
|
"learning_rate": 2.0646319569120286e-05, |
|
"loss": 0.1776, |
|
"mean_token_accuracy": 0.9691046938300133, |
|
"num_tokens": 1597440.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6455517450070607, |
|
"grad_norm": 0.767419159412384, |
|
"learning_rate": 1.9748653500897668e-05, |
|
"loss": 0.1626, |
|
"mean_token_accuracy": 0.9722602725028991, |
|
"num_tokens": 1638400.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6616905386322373, |
|
"grad_norm": 1.132580041885376, |
|
"learning_rate": 1.8850987432675045e-05, |
|
"loss": 0.1807, |
|
"mean_token_accuracy": 0.9676369808614254, |
|
"num_tokens": 1679360.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6778293322574137, |
|
"grad_norm": 1.1673997640609741, |
|
"learning_rate": 1.7953321364452423e-05, |
|
"loss": 0.1911, |
|
"mean_token_accuracy": 0.9661203488707543, |
|
"num_tokens": 1720320.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6939681258825903, |
|
"grad_norm": 1.094133734703064, |
|
"learning_rate": 1.7055655296229804e-05, |
|
"loss": 0.1811, |
|
"mean_token_accuracy": 0.9679305233061314, |
|
"num_tokens": 1761280.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7101069195077668, |
|
"grad_norm": 0.9924063086509705, |
|
"learning_rate": 1.615798922800718e-05, |
|
"loss": 0.1657, |
|
"mean_token_accuracy": 0.9702054776251317, |
|
"num_tokens": 1802240.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.7262457131329433, |
|
"grad_norm": 1.4084646701812744, |
|
"learning_rate": 1.5260323159784563e-05, |
|
"loss": 0.181, |
|
"mean_token_accuracy": 0.9674412921071053, |
|
"num_tokens": 1843200.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7423845067581198, |
|
"grad_norm": 1.202055811882019, |
|
"learning_rate": 1.436265709156194e-05, |
|
"loss": 0.1733, |
|
"mean_token_accuracy": 0.9695694677531719, |
|
"num_tokens": 1884160.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7585233003832963, |
|
"grad_norm": 0.8700960278511047, |
|
"learning_rate": 1.3464991023339318e-05, |
|
"loss": 0.1799, |
|
"mean_token_accuracy": 0.9682240679860115, |
|
"num_tokens": 1925120.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7746620940084729, |
|
"grad_norm": 0.9524367451667786, |
|
"learning_rate": 1.2567324955116697e-05, |
|
"loss": 0.1843, |
|
"mean_token_accuracy": 0.9665117390453816, |
|
"num_tokens": 1966080.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7908008876336494, |
|
"grad_norm": 0.836246132850647, |
|
"learning_rate": 1.1669658886894077e-05, |
|
"loss": 0.1835, |
|
"mean_token_accuracy": 0.9679794497787952, |
|
"num_tokens": 2007040.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8069396812588259, |
|
"grad_norm": 0.9284518361091614, |
|
"learning_rate": 1.0771992818671454e-05, |
|
"loss": 0.1762, |
|
"mean_token_accuracy": 0.9691536143422127, |
|
"num_tokens": 2048000.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8230784748840024, |
|
"grad_norm": 0.9532117247581482, |
|
"learning_rate": 9.874326750448834e-06, |
|
"loss": 0.1707, |
|
"mean_token_accuracy": 0.9692759282886982, |
|
"num_tokens": 2088960.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.839217268509179, |
|
"grad_norm": 1.1545753479003906, |
|
"learning_rate": 8.976660682226211e-06, |
|
"loss": 0.176, |
|
"mean_token_accuracy": 0.9688845381140709, |
|
"num_tokens": 2129920.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.8553560621343554, |
|
"grad_norm": 0.9475525617599487, |
|
"learning_rate": 8.07899461400359e-06, |
|
"loss": 0.1772, |
|
"mean_token_accuracy": 0.9683953016996384, |
|
"num_tokens": 2170880.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.871494855759532, |
|
"grad_norm": 0.9343079924583435, |
|
"learning_rate": 7.18132854578097e-06, |
|
"loss": 0.1799, |
|
"mean_token_accuracy": 0.9672211319208145, |
|
"num_tokens": 2211840.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8876336493847085, |
|
"grad_norm": 1.152690052986145, |
|
"learning_rate": 6.283662477558349e-06, |
|
"loss": 0.1676, |
|
"mean_token_accuracy": 0.9698140859603882, |
|
"num_tokens": 2252800.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.903772443009885, |
|
"grad_norm": 0.960165798664093, |
|
"learning_rate": 5.385996409335727e-06, |
|
"loss": 0.1758, |
|
"mean_token_accuracy": 0.9693982377648354, |
|
"num_tokens": 2293760.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.9199112366350616, |
|
"grad_norm": 0.8894097208976746, |
|
"learning_rate": 4.488330341113106e-06, |
|
"loss": 0.1829, |
|
"mean_token_accuracy": 0.9673923671245575, |
|
"num_tokens": 2334720.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.936050030260238, |
|
"grad_norm": 1.2608239650726318, |
|
"learning_rate": 3.590664272890485e-06, |
|
"loss": 0.1762, |
|
"mean_token_accuracy": 0.968297453969717, |
|
"num_tokens": 2375680.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.9521888238854146, |
|
"grad_norm": 1.042999029159546, |
|
"learning_rate": 2.6929982046678636e-06, |
|
"loss": 0.1581, |
|
"mean_token_accuracy": 0.9708414882421493, |
|
"num_tokens": 2416640.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.968327617510591, |
|
"grad_norm": 0.7505899667739868, |
|
"learning_rate": 1.7953321364452425e-06, |
|
"loss": 0.1758, |
|
"mean_token_accuracy": 0.9684442266821861, |
|
"num_tokens": 2457600.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9844664111357676, |
|
"grad_norm": 0.9210404753684998, |
|
"learning_rate": 8.976660682226213e-07, |
|
"loss": 0.1713, |
|
"mean_token_accuracy": 0.969838547706604, |
|
"num_tokens": 2498560.0, |
|
"step": 610 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 619, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6700630554968064.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|