foamGss-20B-trl / checkpoint-1660 /trainer_state.json
finalform's picture
Upload folder using huggingface_hub
c44d887 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 1660,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.060350030175015085,
"grad_norm": 4.037877082824707,
"learning_rate": 5.76e-05,
"loss": 3.1642,
"mean_token_accuracy": 0.4683700659871101,
"num_tokens": 161483.0,
"step": 25
},
{
"epoch": 0.12070006035003017,
"grad_norm": 1.468239665031433,
"learning_rate": 0.0001176,
"loss": 1.5845,
"mean_token_accuracy": 0.6552255868911743,
"num_tokens": 294759.0,
"step": 50
},
{
"epoch": 0.18105009052504525,
"grad_norm": 0.647505521774292,
"learning_rate": 0.00017759999999999998,
"loss": 0.7171,
"mean_token_accuracy": 0.8258614861965179,
"num_tokens": 456515.0,
"step": 75
},
{
"epoch": 0.24140012070006034,
"grad_norm": 0.8074957728385925,
"learning_rate": 0.0002376,
"loss": 0.5756,
"mean_token_accuracy": 0.8531174421310425,
"num_tokens": 588873.0,
"step": 100
},
{
"epoch": 0.30175015087507545,
"grad_norm": 0.8116180896759033,
"learning_rate": 0.00029759999999999997,
"loss": 0.4766,
"mean_token_accuracy": 0.8744671362638473,
"num_tokens": 750990.0,
"step": 125
},
{
"epoch": 0.3621001810500905,
"grad_norm": 1.6603596210479736,
"learning_rate": 0.00029997368283986634,
"loss": 0.4431,
"mean_token_accuracy": 0.8835952073335648,
"num_tokens": 884144.0,
"step": 150
},
{
"epoch": 0.4224502112251056,
"grad_norm": 0.8336510062217712,
"learning_rate": 0.0002998903096401296,
"loss": 0.3611,
"mean_token_accuracy": 0.9041859984397889,
"num_tokens": 1046790.0,
"step": 175
},
{
"epoch": 0.4828002414001207,
"grad_norm": 0.987082839012146,
"learning_rate": 0.0002997498664538712,
"loss": 0.3448,
"mean_token_accuracy": 0.9090080863237381,
"num_tokens": 1180257.0,
"step": 200
},
{
"epoch": 0.5431502715751357,
"grad_norm": 0.8808548450469971,
"learning_rate": 0.0002995524067541326,
"loss": 0.2672,
"mean_token_accuracy": 0.9280223268270492,
"num_tokens": 1343936.0,
"step": 225
},
{
"epoch": 0.6035003017501509,
"grad_norm": 1.027520775794983,
"learning_rate": 0.00029929800572270793,
"loss": 0.2833,
"mean_token_accuracy": 0.9247983312606811,
"num_tokens": 1477991.0,
"step": 250
},
{
"epoch": 0.663850331925166,
"grad_norm": 0.6596710085868835,
"learning_rate": 0.00029898676022151893,
"loss": 0.2162,
"mean_token_accuracy": 0.9414234220981598,
"num_tokens": 1640162.0,
"step": 275
},
{
"epoch": 0.724200362100181,
"grad_norm": 1.1629737615585327,
"learning_rate": 0.00029861878875573505,
"loss": 0.2106,
"mean_token_accuracy": 0.9432486313581466,
"num_tokens": 1773232.0,
"step": 300
},
{
"epoch": 0.7845503922751962,
"grad_norm": 0.838697612285614,
"learning_rate": 0.0002981942314286536,
"loss": 0.1735,
"mean_token_accuracy": 0.9535212635993957,
"num_tokens": 1935668.0,
"step": 325
},
{
"epoch": 0.8449004224502112,
"grad_norm": 1.079795479774475,
"learning_rate": 0.0002977132498883555,
"loss": 0.1661,
"mean_token_accuracy": 0.9547489368915558,
"num_tokens": 2068608.0,
"step": 350
},
{
"epoch": 0.9052504526252263,
"grad_norm": 0.6005878448486328,
"learning_rate": 0.0002971760272661594,
"loss": 0.1547,
"mean_token_accuracy": 0.959057599902153,
"num_tokens": 2230863.0,
"step": 375
},
{
"epoch": 0.9656004828002414,
"grad_norm": 0.9613809585571289,
"learning_rate": 0.00029658276810689443,
"loss": 0.1472,
"mean_token_accuracy": 0.9600884807109833,
"num_tokens": 2363937.0,
"step": 400
},
{
"epoch": 1.0,
"eval_loss": 0.1443626582622528,
"eval_mean_token_accuracy": 0.960926349098618,
"eval_num_tokens": 2448118.0,
"eval_runtime": 259.3789,
"eval_samples_per_second": 1.423,
"eval_steps_per_second": 0.713,
"step": 415
},
{
"epoch": 1.024140012070006,
"grad_norm": 0.6829183101654053,
"learning_rate": 0.0002959336982910217,
"loss": 0.1492,
"mean_token_accuracy": 0.9617758946320445,
"num_tokens": 2519362.0,
"step": 425
},
{
"epoch": 1.0844900422450212,
"grad_norm": 0.792141854763031,
"learning_rate": 0.0002952290649486306,
"loss": 0.0897,
"mean_token_accuracy": 0.974867417216301,
"num_tokens": 2668621.0,
"step": 450
},
{
"epoch": 1.1448400724200363,
"grad_norm": 0.5825597643852234,
"learning_rate": 0.0002944691363653459,
"loss": 0.1148,
"mean_token_accuracy": 0.9691734218597412,
"num_tokens": 2815906.0,
"step": 475
},
{
"epoch": 1.2051901025950513,
"grad_norm": 0.6245791912078857,
"learning_rate": 0.0002936542018801788,
"loss": 0.0923,
"mean_token_accuracy": 0.9748454135656357,
"num_tokens": 2964114.0,
"step": 500
},
{
"epoch": 1.2655401327700664,
"grad_norm": 0.58636873960495,
"learning_rate": 0.0002927845717753632,
"loss": 0.1154,
"mean_token_accuracy": 0.9687699192762375,
"num_tokens": 3111714.0,
"step": 525
},
{
"epoch": 1.3258901629450814,
"grad_norm": 0.8928817510604858,
"learning_rate": 0.00029186057715821663,
"loss": 0.0886,
"mean_token_accuracy": 0.976014096736908,
"num_tokens": 3258422.0,
"step": 550
},
{
"epoch": 1.3862401931200965,
"grad_norm": 0.5934416055679321,
"learning_rate": 0.0002908825698350731,
"loss": 0.0996,
"mean_token_accuracy": 0.9732701396942138,
"num_tokens": 3405648.0,
"step": 575
},
{
"epoch": 1.4465902232951118,
"grad_norm": 0.4363284707069397,
"learning_rate": 0.00028985092217733463,
"loss": 0.0679,
"mean_token_accuracy": 0.9816953223943711,
"num_tokens": 3553294.0,
"step": 600
},
{
"epoch": 1.5069402534701268,
"grad_norm": 0.6062431335449219,
"learning_rate": 0.0002887660269796928,
"loss": 0.1015,
"mean_token_accuracy": 0.9734577733278275,
"num_tokens": 3699909.0,
"step": 625
},
{
"epoch": 1.567290283645142,
"grad_norm": 0.6455872654914856,
"learning_rate": 0.0002876282973105736,
"loss": 0.0657,
"mean_token_accuracy": 0.9815398609638214,
"num_tokens": 3847893.0,
"step": 650
},
{
"epoch": 1.627640313820157,
"grad_norm": 0.547113835811615,
"learning_rate": 0.0002864381663548645,
"loss": 0.1034,
"mean_token_accuracy": 0.9735846078395843,
"num_tokens": 3994245.0,
"step": 675
},
{
"epoch": 1.687990343995172,
"grad_norm": 0.6483117938041687,
"learning_rate": 0.0002851960872489806,
"loss": 0.0634,
"mean_token_accuracy": 0.9827442276477814,
"num_tokens": 4144165.0,
"step": 700
},
{
"epoch": 1.748340374170187,
"grad_norm": 0.4899543523788452,
"learning_rate": 0.00028390253290833605,
"loss": 0.0942,
"mean_token_accuracy": 0.9755072242021561,
"num_tokens": 4291469.0,
"step": 725
},
{
"epoch": 1.8086904043452021,
"grad_norm": 0.5036582350730896,
"learning_rate": 0.0002825579958472832,
"loss": 0.0639,
"mean_token_accuracy": 0.9823815160989762,
"num_tokens": 4439415.0,
"step": 750
},
{
"epoch": 1.8690404345202172,
"grad_norm": 0.4868822693824768,
"learning_rate": 0.00028116298799159045,
"loss": 0.0787,
"mean_token_accuracy": 0.9785297274589538,
"num_tokens": 4587739.0,
"step": 775
},
{
"epoch": 1.9293904646952322,
"grad_norm": 0.4633966386318207,
"learning_rate": 0.00027971804048352927,
"loss": 0.0618,
"mean_token_accuracy": 0.9832005858421325,
"num_tokens": 4736249.0,
"step": 800
},
{
"epoch": 1.9897404948702473,
"grad_norm": 0.6208447217941284,
"learning_rate": 0.0002782237034796442,
"loss": 0.0665,
"mean_token_accuracy": 0.9821873652935028,
"num_tokens": 4875075.0,
"step": 825
},
{
"epoch": 2.0,
"eval_loss": 0.07273826748132706,
"eval_mean_token_accuracy": 0.9805045517715247,
"eval_num_tokens": 4896236.0,
"eval_runtime": 259.4448,
"eval_samples_per_second": 1.422,
"eval_steps_per_second": 0.713,
"step": 830
},
{
"epoch": 2.048280024140012,
"grad_norm": 0.5759810209274292,
"learning_rate": 0.0002766805459412835,
"loss": 0.0716,
"mean_token_accuracy": 0.9827596778722153,
"num_tokens": 5028174.0,
"step": 850
},
{
"epoch": 2.1086300543150274,
"grad_norm": 0.4573070704936981,
"learning_rate": 0.0002750891554179702,
"loss": 0.0469,
"mean_token_accuracy": 0.9868157178163528,
"num_tokens": 5168210.0,
"step": 875
},
{
"epoch": 2.1689800844900424,
"grad_norm": 0.41777142882347107,
"learning_rate": 0.000273450137823695,
"loss": 0.0771,
"mean_token_accuracy": 0.9796839380264282,
"num_tokens": 5326702.0,
"step": 900
},
{
"epoch": 2.2293301146650575,
"grad_norm": 0.6522343754768372,
"learning_rate": 0.00027176411720621833,
"loss": 0.0467,
"mean_token_accuracy": 0.9872583884000778,
"num_tokens": 5467246.0,
"step": 925
},
{
"epoch": 2.2896801448400725,
"grad_norm": 0.475888729095459,
"learning_rate": 0.00027003173550946665,
"loss": 0.0672,
"mean_token_accuracy": 0.9817777210474015,
"num_tokens": 5620161.0,
"step": 950
},
{
"epoch": 2.3500301750150876,
"grad_norm": 0.5362425446510315,
"learning_rate": 0.00026825365232911573,
"loss": 0.0469,
"mean_token_accuracy": 0.9871275025606155,
"num_tokens": 5757651.0,
"step": 975
},
{
"epoch": 2.4103802051901027,
"grad_norm": 0.3906894326210022,
"learning_rate": 0.00026643054466145294,
"loss": 0.0655,
"mean_token_accuracy": 0.9822571390867233,
"num_tokens": 5912849.0,
"step": 1000
},
{
"epoch": 2.4707302353651177,
"grad_norm": 0.4455728828907013,
"learning_rate": 0.00026456310664561426,
"loss": 0.0456,
"mean_token_accuracy": 0.9875413435697555,
"num_tokens": 6050687.0,
"step": 1025
},
{
"epoch": 2.5310802655401328,
"grad_norm": 0.37768128514289856,
"learning_rate": 0.00026265204929929413,
"loss": 0.0636,
"mean_token_accuracy": 0.9826168727874756,
"num_tokens": 6208352.0,
"step": 1050
},
{
"epoch": 2.591430295715148,
"grad_norm": 0.644298255443573,
"learning_rate": 0.0002606981002480294,
"loss": 0.0468,
"mean_token_accuracy": 0.9868322855234146,
"num_tokens": 6347260.0,
"step": 1075
},
{
"epoch": 2.651780325890163,
"grad_norm": 0.41656455397605896,
"learning_rate": 0.00025870200344815905,
"loss": 0.0626,
"mean_token_accuracy": 0.9830433952808381,
"num_tokens": 6504918.0,
"step": 1100
},
{
"epoch": 2.712130356065178,
"grad_norm": 0.438490092754364,
"learning_rate": 0.00025666451890356674,
"loss": 0.0472,
"mean_token_accuracy": 0.9869848346710205,
"num_tokens": 6643886.0,
"step": 1125
},
{
"epoch": 2.772480386240193,
"grad_norm": 0.5694201588630676,
"learning_rate": 0.0002545864223763134,
"loss": 0.0619,
"mean_token_accuracy": 0.9835321408510208,
"num_tokens": 6799221.0,
"step": 1150
},
{
"epoch": 2.832830416415208,
"grad_norm": 0.3199928402900696,
"learning_rate": 0.00025246850509126926,
"loss": 0.0428,
"mean_token_accuracy": 0.9882478666305542,
"num_tokens": 6937936.0,
"step": 1175
},
{
"epoch": 2.8931804465902236,
"grad_norm": 0.3072766661643982,
"learning_rate": 0.00025031157343485924,
"loss": 0.0616,
"mean_token_accuracy": 0.9832239282131195,
"num_tokens": 7097502.0,
"step": 1200
},
{
"epoch": 2.9535304767652386,
"grad_norm": 0.46456214785575867,
"learning_rate": 0.000248116448648035,
"loss": 0.0413,
"mean_token_accuracy": 0.9884889626502991,
"num_tokens": 7237082.0,
"step": 1225
},
{
"epoch": 3.0,
"eval_loss": 0.05654008686542511,
"eval_mean_token_accuracy": 0.9848334161010949,
"eval_num_tokens": 7344354.0,
"eval_runtime": 260.2642,
"eval_samples_per_second": 1.418,
"eval_steps_per_second": 0.711,
"step": 1245
},
{
"epoch": 3.012070006035003,
"grad_norm": 0.36259251832962036,
"learning_rate": 0.0002458839665135921,
"loss": 0.0558,
"mean_token_accuracy": 0.9843784410928943,
"num_tokens": 7382070.0,
"step": 1250
},
{
"epoch": 3.0724200362100182,
"grad_norm": 0.33617448806762695,
"learning_rate": 0.00024361497703794942,
"loss": 0.0378,
"mean_token_accuracy": 0.9890254312753677,
"num_tokens": 7534586.0,
"step": 1275
},
{
"epoch": 3.1327700663850333,
"grad_norm": 0.25698503851890564,
"learning_rate": 0.0002413103441275136,
"loss": 0.0452,
"mean_token_accuracy": 0.9873906564712525,
"num_tokens": 7677109.0,
"step": 1300
},
{
"epoch": 3.1931200965600484,
"grad_norm": 0.34490227699279785,
"learning_rate": 0.00023897094525975098,
"loss": 0.0411,
"mean_token_accuracy": 0.9885961890220643,
"num_tokens": 7831173.0,
"step": 1325
},
{
"epoch": 3.2534701267350634,
"grad_norm": 0.3397490978240967,
"learning_rate": 0.00023659767114909245,
"loss": 0.0438,
"mean_token_accuracy": 0.9878141868114472,
"num_tokens": 7972828.0,
"step": 1350
},
{
"epoch": 3.3138201569100785,
"grad_norm": 0.5912103056907654,
"learning_rate": 0.0002341914254077985,
"loss": 0.0361,
"mean_token_accuracy": 0.9897723388671875,
"num_tokens": 8126638.0,
"step": 1375
},
{
"epoch": 3.3741701870850935,
"grad_norm": 0.3524228632450104,
"learning_rate": 0.00023175312420191312,
"loss": 0.0458,
"mean_token_accuracy": 0.9871816504001617,
"num_tokens": 8268641.0,
"step": 1400
},
{
"epoch": 3.4345202172601086,
"grad_norm": 0.4089486598968506,
"learning_rate": 0.00022928369590243834,
"loss": 0.0364,
"mean_token_accuracy": 0.9894253730773925,
"num_tokens": 8421996.0,
"step": 1425
},
{
"epoch": 3.4948702474351236,
"grad_norm": 0.32915744185447693,
"learning_rate": 0.00022678408073186163,
"loss": 0.0482,
"mean_token_accuracy": 0.9867742687463761,
"num_tokens": 8563876.0,
"step": 1450
},
{
"epoch": 3.5552202776101387,
"grad_norm": 0.2799423635005951,
"learning_rate": 0.0002242552304061707,
"loss": 0.0391,
"mean_token_accuracy": 0.9891943109035491,
"num_tokens": 8716396.0,
"step": 1475
},
{
"epoch": 3.6155703077851538,
"grad_norm": 0.34400495886802673,
"learning_rate": 0.00022169810777249253,
"loss": 0.0465,
"mean_token_accuracy": 0.9871583390235901,
"num_tokens": 8858385.0,
"step": 1500
},
{
"epoch": 3.675920337960169,
"grad_norm": 0.2578590512275696,
"learning_rate": 0.00021911368644249416,
"loss": 0.0389,
"mean_token_accuracy": 0.9889793866872787,
"num_tokens": 9014977.0,
"step": 1525
},
{
"epoch": 3.736270368135184,
"grad_norm": 0.4556424617767334,
"learning_rate": 0.00021650295042168486,
"loss": 0.0463,
"mean_token_accuracy": 0.987366048693657,
"num_tokens": 9157309.0,
"step": 1550
},
{
"epoch": 3.796620398310199,
"grad_norm": 0.20752571523189545,
"learning_rate": 0.00021386689373476087,
"loss": 0.0357,
"mean_token_accuracy": 0.989637770652771,
"num_tokens": 9310269.0,
"step": 1575
},
{
"epoch": 3.856970428485214,
"grad_norm": 0.5105423927307129,
"learning_rate": 0.0002112065200471357,
"loss": 0.0433,
"mean_token_accuracy": 0.9879990059137345,
"num_tokens": 9451742.0,
"step": 1600
},
{
"epoch": 3.9173204586602295,
"grad_norm": 0.2212332934141159,
"learning_rate": 0.00020852284228279925,
"loss": 0.0364,
"mean_token_accuracy": 0.9894431579113007,
"num_tokens": 9604731.0,
"step": 1625
},
{
"epoch": 3.9776704888352445,
"grad_norm": 0.23504111170768738,
"learning_rate": 0.00020581688223865225,
"loss": 0.0392,
"mean_token_accuracy": 0.9891186225414276,
"num_tokens": 9742102.0,
"step": 1650
},
{
"epoch": 4.0,
"eval_loss": 0.04928135126829147,
"eval_mean_token_accuracy": 0.9872021871644098,
"eval_num_tokens": 9792472.0,
"eval_runtime": 258.8897,
"eval_samples_per_second": 1.425,
"eval_steps_per_second": 0.715,
"step": 1660
}
],
"logging_steps": 25,
"max_steps": 4150,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1969928185525258e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}