shulijia's picture
Training in progress, step 619, checkpoint
98344e8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9989913253984265,
"eval_steps": 100,
"global_step": 619,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016138793625176517,
"grad_norm": 3.143124580383301,
"learning_rate": 7.258064516129033e-06,
"loss": 1.6561,
"mean_token_accuracy": 0.7124999972060323,
"num_tokens": 40960.0,
"step": 10
},
{
"epoch": 0.03227758725035303,
"grad_norm": 2.3436570167541504,
"learning_rate": 1.5322580645161292e-05,
"loss": 0.1926,
"mean_token_accuracy": 0.966169273853302,
"num_tokens": 81920.0,
"step": 20
},
{
"epoch": 0.048416380875529554,
"grad_norm": 3.512237071990967,
"learning_rate": 2.338709677419355e-05,
"loss": 0.1742,
"mean_token_accuracy": 0.9685420736670494,
"num_tokens": 122880.0,
"step": 30
},
{
"epoch": 0.06455517450070607,
"grad_norm": 2.72192645072937,
"learning_rate": 3.1451612903225806e-05,
"loss": 0.1687,
"mean_token_accuracy": 0.9700342424213886,
"num_tokens": 163840.0,
"step": 40
},
{
"epoch": 0.08069396812588259,
"grad_norm": 2.1354479789733887,
"learning_rate": 3.951612903225806e-05,
"loss": 0.1726,
"mean_token_accuracy": 0.9692270033061504,
"num_tokens": 204800.0,
"step": 50
},
{
"epoch": 0.09683276175105911,
"grad_norm": 3.7211921215057373,
"learning_rate": 4.7580645161290326e-05,
"loss": 0.1884,
"mean_token_accuracy": 0.9661448106169701,
"num_tokens": 245760.0,
"step": 60
},
{
"epoch": 0.11297155537623563,
"grad_norm": 1.5104479789733887,
"learning_rate": 4.937163375224417e-05,
"loss": 0.1869,
"mean_token_accuracy": 0.9675391376018524,
"num_tokens": 286720.0,
"step": 70
},
{
"epoch": 0.12911034900141213,
"grad_norm": 2.113689422607422,
"learning_rate": 4.847396768402155e-05,
"loss": 0.1815,
"mean_token_accuracy": 0.9682729855179787,
"num_tokens": 327680.0,
"step": 80
},
{
"epoch": 0.14524914262658867,
"grad_norm": 1.308586597442627,
"learning_rate": 4.7576301615798926e-05,
"loss": 0.1955,
"mean_token_accuracy": 0.9657534204423428,
"num_tokens": 368640.0,
"step": 90
},
{
"epoch": 0.16138793625176517,
"grad_norm": 1.7842236757278442,
"learning_rate": 4.667863554757631e-05,
"loss": 0.1893,
"mean_token_accuracy": 0.9666340447962284,
"num_tokens": 409600.0,
"step": 100
},
{
"epoch": 0.1775267298769417,
"grad_norm": 1.172074556350708,
"learning_rate": 4.578096947935368e-05,
"loss": 0.1849,
"mean_token_accuracy": 0.9681262239813805,
"num_tokens": 450560.0,
"step": 110
},
{
"epoch": 0.19366552350211821,
"grad_norm": 1.2354941368103027,
"learning_rate": 4.488330341113106e-05,
"loss": 0.1884,
"mean_token_accuracy": 0.9675146743655205,
"num_tokens": 491520.0,
"step": 120
},
{
"epoch": 0.20980431712729475,
"grad_norm": 1.0571320056915283,
"learning_rate": 4.398563734290844e-05,
"loss": 0.1968,
"mean_token_accuracy": 0.9660469628870487,
"num_tokens": 532480.0,
"step": 130
},
{
"epoch": 0.22594311075247125,
"grad_norm": 1.255366563796997,
"learning_rate": 4.308797127468582e-05,
"loss": 0.1882,
"mean_token_accuracy": 0.9669765122234821,
"num_tokens": 573440.0,
"step": 140
},
{
"epoch": 0.24208190437764776,
"grad_norm": 0.7935602068901062,
"learning_rate": 4.21903052064632e-05,
"loss": 0.189,
"mean_token_accuracy": 0.9665362000465393,
"num_tokens": 614400.0,
"step": 150
},
{
"epoch": 0.25822069800282427,
"grad_norm": 1.0346981287002563,
"learning_rate": 4.129263913824057e-05,
"loss": 0.1911,
"mean_token_accuracy": 0.9670254394412041,
"num_tokens": 655360.0,
"step": 160
},
{
"epoch": 0.27435949162800083,
"grad_norm": 1.0733519792556763,
"learning_rate": 4.0394973070017954e-05,
"loss": 0.1945,
"mean_token_accuracy": 0.9651418760418892,
"num_tokens": 696320.0,
"step": 170
},
{
"epoch": 0.29049828525317734,
"grad_norm": 1.1686853170394897,
"learning_rate": 3.9497307001795335e-05,
"loss": 0.1883,
"mean_token_accuracy": 0.9680283769965172,
"num_tokens": 737280.0,
"step": 180
},
{
"epoch": 0.30663707887835384,
"grad_norm": 1.4509366750717163,
"learning_rate": 3.859964093357271e-05,
"loss": 0.1815,
"mean_token_accuracy": 0.9682240642607212,
"num_tokens": 778240.0,
"step": 190
},
{
"epoch": 0.32277587250353035,
"grad_norm": 1.7541567087173462,
"learning_rate": 3.770197486535009e-05,
"loss": 0.1799,
"mean_token_accuracy": 0.9676614426076412,
"num_tokens": 819200.0,
"step": 200
},
{
"epoch": 0.33891466612870685,
"grad_norm": 0.8904175162315369,
"learning_rate": 3.6804308797127465e-05,
"loss": 0.1789,
"mean_token_accuracy": 0.9690802305936813,
"num_tokens": 860160.0,
"step": 210
},
{
"epoch": 0.3550534597538834,
"grad_norm": 1.066307783126831,
"learning_rate": 3.5906642728904846e-05,
"loss": 0.1884,
"mean_token_accuracy": 0.967025438696146,
"num_tokens": 901120.0,
"step": 220
},
{
"epoch": 0.3711922533790599,
"grad_norm": 0.8745315670967102,
"learning_rate": 3.500897666068223e-05,
"loss": 0.1873,
"mean_token_accuracy": 0.9678571395576,
"num_tokens": 942080.0,
"step": 230
},
{
"epoch": 0.38733104700423643,
"grad_norm": 1.2400248050689697,
"learning_rate": 3.411131059245961e-05,
"loss": 0.1843,
"mean_token_accuracy": 0.9680772952735424,
"num_tokens": 983040.0,
"step": 240
},
{
"epoch": 0.40346984062941293,
"grad_norm": 0.9607365131378174,
"learning_rate": 3.321364452423699e-05,
"loss": 0.2003,
"mean_token_accuracy": 0.9650195680558682,
"num_tokens": 1024000.0,
"step": 250
},
{
"epoch": 0.4196086342545895,
"grad_norm": 1.288979172706604,
"learning_rate": 3.231597845601436e-05,
"loss": 0.1874,
"mean_token_accuracy": 0.9666095845401287,
"num_tokens": 1064960.0,
"step": 260
},
{
"epoch": 0.435747427879766,
"grad_norm": 1.0444912910461426,
"learning_rate": 3.1418312387791744e-05,
"loss": 0.2023,
"mean_token_accuracy": 0.9641144774854183,
"num_tokens": 1105920.0,
"step": 270
},
{
"epoch": 0.4518862215049425,
"grad_norm": 1.1588054895401,
"learning_rate": 3.0520646319569125e-05,
"loss": 0.1824,
"mean_token_accuracy": 0.9674902133643627,
"num_tokens": 1146880.0,
"step": 280
},
{
"epoch": 0.468025015130119,
"grad_norm": 1.1021692752838135,
"learning_rate": 2.96229802513465e-05,
"loss": 0.1845,
"mean_token_accuracy": 0.9678815990686417,
"num_tokens": 1187840.0,
"step": 290
},
{
"epoch": 0.4841638087552955,
"grad_norm": 1.2420721054077148,
"learning_rate": 2.872531418312388e-05,
"loss": 0.1975,
"mean_token_accuracy": 0.9652152620255947,
"num_tokens": 1228800.0,
"step": 300
},
{
"epoch": 0.5003026023804721,
"grad_norm": 1.1611697673797607,
"learning_rate": 2.7827648114901255e-05,
"loss": 0.1798,
"mean_token_accuracy": 0.9679794535040855,
"num_tokens": 1269760.0,
"step": 310
},
{
"epoch": 0.5164413960056485,
"grad_norm": 0.9205463528633118,
"learning_rate": 2.6929982046678636e-05,
"loss": 0.1881,
"mean_token_accuracy": 0.9673923663794994,
"num_tokens": 1310720.0,
"step": 320
},
{
"epoch": 0.5325801896308251,
"grad_norm": 1.0037810802459717,
"learning_rate": 2.6032315978456017e-05,
"loss": 0.1788,
"mean_token_accuracy": 0.9688356101512909,
"num_tokens": 1351680.0,
"step": 330
},
{
"epoch": 0.5487189832560017,
"grad_norm": 1.0601574182510376,
"learning_rate": 2.5134649910233395e-05,
"loss": 0.1839,
"mean_token_accuracy": 0.9674412839114666,
"num_tokens": 1392640.0,
"step": 340
},
{
"epoch": 0.5648577768811781,
"grad_norm": 1.083649754524231,
"learning_rate": 2.4236983842010776e-05,
"loss": 0.1866,
"mean_token_accuracy": 0.9668052822351456,
"num_tokens": 1433600.0,
"step": 350
},
{
"epoch": 0.5809965705063547,
"grad_norm": 0.9604955315589905,
"learning_rate": 2.3339317773788153e-05,
"loss": 0.1792,
"mean_token_accuracy": 0.9675391390919685,
"num_tokens": 1474560.0,
"step": 360
},
{
"epoch": 0.5971353641315311,
"grad_norm": 1.1074978113174438,
"learning_rate": 2.244165170556553e-05,
"loss": 0.19,
"mean_token_accuracy": 0.966560660302639,
"num_tokens": 1515520.0,
"step": 370
},
{
"epoch": 0.6132741577567077,
"grad_norm": 1.0041981935501099,
"learning_rate": 2.154398563734291e-05,
"loss": 0.1958,
"mean_token_accuracy": 0.9653864920139312,
"num_tokens": 1556480.0,
"step": 380
},
{
"epoch": 0.6294129513818842,
"grad_norm": 0.9702057838439941,
"learning_rate": 2.0646319569120286e-05,
"loss": 0.1776,
"mean_token_accuracy": 0.9691046938300133,
"num_tokens": 1597440.0,
"step": 390
},
{
"epoch": 0.6455517450070607,
"grad_norm": 0.767419159412384,
"learning_rate": 1.9748653500897668e-05,
"loss": 0.1626,
"mean_token_accuracy": 0.9722602725028991,
"num_tokens": 1638400.0,
"step": 400
},
{
"epoch": 0.6616905386322373,
"grad_norm": 1.132580041885376,
"learning_rate": 1.8850987432675045e-05,
"loss": 0.1807,
"mean_token_accuracy": 0.9676369808614254,
"num_tokens": 1679360.0,
"step": 410
},
{
"epoch": 0.6778293322574137,
"grad_norm": 1.1673997640609741,
"learning_rate": 1.7953321364452423e-05,
"loss": 0.1911,
"mean_token_accuracy": 0.9661203488707543,
"num_tokens": 1720320.0,
"step": 420
},
{
"epoch": 0.6939681258825903,
"grad_norm": 1.094133734703064,
"learning_rate": 1.7055655296229804e-05,
"loss": 0.1811,
"mean_token_accuracy": 0.9679305233061314,
"num_tokens": 1761280.0,
"step": 430
},
{
"epoch": 0.7101069195077668,
"grad_norm": 0.9924063086509705,
"learning_rate": 1.615798922800718e-05,
"loss": 0.1657,
"mean_token_accuracy": 0.9702054776251317,
"num_tokens": 1802240.0,
"step": 440
},
{
"epoch": 0.7262457131329433,
"grad_norm": 1.4084646701812744,
"learning_rate": 1.5260323159784563e-05,
"loss": 0.181,
"mean_token_accuracy": 0.9674412921071053,
"num_tokens": 1843200.0,
"step": 450
},
{
"epoch": 0.7423845067581198,
"grad_norm": 1.202055811882019,
"learning_rate": 1.436265709156194e-05,
"loss": 0.1733,
"mean_token_accuracy": 0.9695694677531719,
"num_tokens": 1884160.0,
"step": 460
},
{
"epoch": 0.7585233003832963,
"grad_norm": 0.8700960278511047,
"learning_rate": 1.3464991023339318e-05,
"loss": 0.1799,
"mean_token_accuracy": 0.9682240679860115,
"num_tokens": 1925120.0,
"step": 470
},
{
"epoch": 0.7746620940084729,
"grad_norm": 0.9524367451667786,
"learning_rate": 1.2567324955116697e-05,
"loss": 0.1843,
"mean_token_accuracy": 0.9665117390453816,
"num_tokens": 1966080.0,
"step": 480
},
{
"epoch": 0.7908008876336494,
"grad_norm": 0.836246132850647,
"learning_rate": 1.1669658886894077e-05,
"loss": 0.1835,
"mean_token_accuracy": 0.9679794497787952,
"num_tokens": 2007040.0,
"step": 490
},
{
"epoch": 0.8069396812588259,
"grad_norm": 0.9284518361091614,
"learning_rate": 1.0771992818671454e-05,
"loss": 0.1762,
"mean_token_accuracy": 0.9691536143422127,
"num_tokens": 2048000.0,
"step": 500
},
{
"epoch": 0.8230784748840024,
"grad_norm": 0.9532117247581482,
"learning_rate": 9.874326750448834e-06,
"loss": 0.1707,
"mean_token_accuracy": 0.9692759282886982,
"num_tokens": 2088960.0,
"step": 510
},
{
"epoch": 0.839217268509179,
"grad_norm": 1.1545753479003906,
"learning_rate": 8.976660682226211e-06,
"loss": 0.176,
"mean_token_accuracy": 0.9688845381140709,
"num_tokens": 2129920.0,
"step": 520
},
{
"epoch": 0.8553560621343554,
"grad_norm": 0.9475525617599487,
"learning_rate": 8.07899461400359e-06,
"loss": 0.1772,
"mean_token_accuracy": 0.9683953016996384,
"num_tokens": 2170880.0,
"step": 530
},
{
"epoch": 0.871494855759532,
"grad_norm": 0.9343079924583435,
"learning_rate": 7.18132854578097e-06,
"loss": 0.1799,
"mean_token_accuracy": 0.9672211319208145,
"num_tokens": 2211840.0,
"step": 540
},
{
"epoch": 0.8876336493847085,
"grad_norm": 1.152690052986145,
"learning_rate": 6.283662477558349e-06,
"loss": 0.1676,
"mean_token_accuracy": 0.9698140859603882,
"num_tokens": 2252800.0,
"step": 550
},
{
"epoch": 0.903772443009885,
"grad_norm": 0.960165798664093,
"learning_rate": 5.385996409335727e-06,
"loss": 0.1758,
"mean_token_accuracy": 0.9693982377648354,
"num_tokens": 2293760.0,
"step": 560
},
{
"epoch": 0.9199112366350616,
"grad_norm": 0.8894097208976746,
"learning_rate": 4.488330341113106e-06,
"loss": 0.1829,
"mean_token_accuracy": 0.9673923671245575,
"num_tokens": 2334720.0,
"step": 570
},
{
"epoch": 0.936050030260238,
"grad_norm": 1.2608239650726318,
"learning_rate": 3.590664272890485e-06,
"loss": 0.1762,
"mean_token_accuracy": 0.968297453969717,
"num_tokens": 2375680.0,
"step": 580
},
{
"epoch": 0.9521888238854146,
"grad_norm": 1.042999029159546,
"learning_rate": 2.6929982046678636e-06,
"loss": 0.1581,
"mean_token_accuracy": 0.9708414882421493,
"num_tokens": 2416640.0,
"step": 590
},
{
"epoch": 0.968327617510591,
"grad_norm": 0.7505899667739868,
"learning_rate": 1.7953321364452425e-06,
"loss": 0.1758,
"mean_token_accuracy": 0.9684442266821861,
"num_tokens": 2457600.0,
"step": 600
},
{
"epoch": 0.9844664111357676,
"grad_norm": 0.9210404753684998,
"learning_rate": 8.976660682226213e-07,
"loss": 0.1713,
"mean_token_accuracy": 0.969838547706604,
"num_tokens": 2498560.0,
"step": 610
}
],
"logging_steps": 10,
"max_steps": 619,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6700630554968064.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}