{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.012497188132670149,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.4994376265340298e-05,
"grad_norm": 22.25,
"learning_rate": 0.0008,
"loss": 12.4316,
"step": 1
},
{
"epoch": 0.00012497188132670149,
"grad_norm": 4.40625,
"learning_rate": 0.0007999840035991902,
"loss": 13.3525,
"step": 5
},
{
"epoch": 0.00024994376265340297,
"grad_norm": 4.5,
"learning_rate": 0.000799964008098178,
"loss": 14.8491,
"step": 10
},
{
"epoch": 0.00037491564398010446,
"grad_norm": 6.84375,
"learning_rate": 0.0007999440125971656,
"loss": 8.8491,
"step": 15
},
{
"epoch": 0.0004998875253068059,
"grad_norm": 3.75,
"learning_rate": 0.0007999240170961534,
"loss": 7.9633,
"step": 20
},
{
"epoch": 0.0006248594066335074,
"grad_norm": 4.75,
"learning_rate": 0.0007999040215951412,
"loss": 7.787,
"step": 25
},
{
"epoch": 0.0007498312879602089,
"grad_norm": 15.3125,
"learning_rate": 0.0007998840260941289,
"loss": 7.6196,
"step": 30
},
{
"epoch": 0.0008748031692869104,
"grad_norm": 3.03125,
"learning_rate": 0.0007998640305931166,
"loss": 7.453,
"step": 35
},
{
"epoch": 0.0009997750506136119,
"grad_norm": 3.921875,
"learning_rate": 0.0007998440350921043,
"loss": 7.3384,
"step": 40
},
{
"epoch": 0.0011247469319403134,
"grad_norm": 3.203125,
"learning_rate": 0.000799824039591092,
"loss": 7.2074,
"step": 45
},
{
"epoch": 0.0012497188132670149,
"grad_norm": 3.28125,
"learning_rate": 0.0007998040440900798,
"loss": 7.1685,
"step": 50
},
{
"epoch": 0.0013746906945937163,
"grad_norm": 3.578125,
"learning_rate": 0.0007997840485890674,
"loss": 7.2311,
"step": 55
},
{
"epoch": 0.0014996625759204178,
"grad_norm": 7.28125,
"learning_rate": 0.0007997640530880552,
"loss": 7.1694,
"step": 60
},
{
"epoch": 0.0016246344572471193,
"grad_norm": 3.0,
"learning_rate": 0.0007997440575870429,
"loss": 7.1179,
"step": 65
},
{
"epoch": 0.0017496063385738208,
"grad_norm": 2.453125,
"learning_rate": 0.0007997240620860307,
"loss": 7.1143,
"step": 70
},
{
"epoch": 0.0018745782199005223,
"grad_norm": 3.46875,
"learning_rate": 0.0007997040665850185,
"loss": 7.0387,
"step": 75
},
{
"epoch": 0.0019995501012272238,
"grad_norm": 3.484375,
"learning_rate": 0.0007996840710840061,
"loss": 6.825,
"step": 80
},
{
"epoch": 0.0021245219825539252,
"grad_norm": 2.859375,
"learning_rate": 0.0007996640755829939,
"loss": 6.7811,
"step": 85
},
{
"epoch": 0.0022494938638806267,
"grad_norm": 2.640625,
"learning_rate": 0.0007996440800819816,
"loss": 6.821,
"step": 90
},
{
"epoch": 0.0023744657452073282,
"grad_norm": 3.0,
"learning_rate": 0.0007996240845809693,
"loss": 6.7552,
"step": 95
},
{
"epoch": 0.0024994376265340297,
"grad_norm": 3.953125,
"learning_rate": 0.000799604089079957,
"loss": 6.7139,
"step": 100
},
{
"epoch": 0.002624409507860731,
"grad_norm": 2.296875,
"learning_rate": 0.0007995840935789447,
"loss": 6.7687,
"step": 105
},
{
"epoch": 0.0027493813891874327,
"grad_norm": 2.65625,
"learning_rate": 0.0007995640980779325,
"loss": 6.6156,
"step": 110
},
{
"epoch": 0.002874353270514134,
"grad_norm": 4.28125,
"learning_rate": 0.0007995441025769203,
"loss": 6.5183,
"step": 115
},
{
"epoch": 0.0029993251518408356,
"grad_norm": 2.4375,
"learning_rate": 0.000799524107075908,
"loss": 6.5023,
"step": 120
},
{
"epoch": 0.003124297033167537,
"grad_norm": 3.0,
"learning_rate": 0.0007995041115748957,
"loss": 6.3342,
"step": 125
},
{
"epoch": 0.0032492689144942386,
"grad_norm": 3.296875,
"learning_rate": 0.0007994841160738834,
"loss": 6.416,
"step": 130
},
{
"epoch": 0.00337424079582094,
"grad_norm": 3.90625,
"learning_rate": 0.0007994641205728712,
"loss": 6.3348,
"step": 135
},
{
"epoch": 0.0034992126771476416,
"grad_norm": 2.578125,
"learning_rate": 0.0007994441250718589,
"loss": 6.3827,
"step": 140
},
{
"epoch": 0.003624184558474343,
"grad_norm": 2.5,
"learning_rate": 0.0007994241295708465,
"loss": 6.4047,
"step": 145
},
{
"epoch": 0.0037491564398010446,
"grad_norm": 2.78125,
"learning_rate": 0.0007994041340698343,
"loss": 6.1781,
"step": 150
},
{
"epoch": 0.003874128321127746,
"grad_norm": 3.375,
"learning_rate": 0.000799384138568822,
"loss": 6.2532,
"step": 155
},
{
"epoch": 0.0039991002024544475,
"grad_norm": 2.34375,
"learning_rate": 0.0007993641430678099,
"loss": 6.2372,
"step": 160
},
{
"epoch": 0.004124072083781149,
"grad_norm": 2.796875,
"learning_rate": 0.0007993441475667975,
"loss": 6.0455,
"step": 165
},
{
"epoch": 0.0042490439651078505,
"grad_norm": 2.6875,
"learning_rate": 0.0007993241520657852,
"loss": 6.1694,
"step": 170
},
{
"epoch": 0.004374015846434552,
"grad_norm": 2.0625,
"learning_rate": 0.000799304156564773,
"loss": 6.045,
"step": 175
},
{
"epoch": 0.0044989877277612535,
"grad_norm": 2.765625,
"learning_rate": 0.0007992841610637607,
"loss": 6.0104,
"step": 180
},
{
"epoch": 0.004623959609087955,
"grad_norm": 1.796875,
"learning_rate": 0.0007992641655627485,
"loss": 5.9327,
"step": 185
},
{
"epoch": 0.0047489314904146564,
"grad_norm": 2.765625,
"learning_rate": 0.0007992441700617361,
"loss": 5.9164,
"step": 190
},
{
"epoch": 0.004873903371741358,
"grad_norm": 2.265625,
"learning_rate": 0.0007992241745607238,
"loss": 5.9775,
"step": 195
},
{
"epoch": 0.004998875253068059,
"grad_norm": 2.421875,
"learning_rate": 0.0007992041790597116,
"loss": 5.6958,
"step": 200
},
{
"epoch": 0.005123847134394761,
"grad_norm": 2.546875,
"learning_rate": 0.0007991841835586994,
"loss": 5.8286,
"step": 205
},
{
"epoch": 0.005248819015721462,
"grad_norm": 2.359375,
"learning_rate": 0.000799164188057687,
"loss": 5.7053,
"step": 210
},
{
"epoch": 0.005373790897048164,
"grad_norm": 2.28125,
"learning_rate": 0.0007991441925566748,
"loss": 5.725,
"step": 215
},
{
"epoch": 0.005498762778374865,
"grad_norm": 1.875,
"learning_rate": 0.0007991241970556625,
"loss": 5.6576,
"step": 220
},
{
"epoch": 0.005623734659701567,
"grad_norm": 2.3125,
"learning_rate": 0.0007991042015546503,
"loss": 5.7389,
"step": 225
},
{
"epoch": 0.005748706541028268,
"grad_norm": 2.546875,
"learning_rate": 0.000799084206053638,
"loss": 5.4624,
"step": 230
},
{
"epoch": 0.00587367842235497,
"grad_norm": 2.625,
"learning_rate": 0.0007990642105526257,
"loss": 5.6441,
"step": 235
},
{
"epoch": 0.005998650303681671,
"grad_norm": 2.5,
"learning_rate": 0.0007990442150516134,
"loss": 5.552,
"step": 240
},
{
"epoch": 0.006123622185008373,
"grad_norm": 1.640625,
"learning_rate": 0.0007990242195506011,
"loss": 5.3983,
"step": 245
},
{
"epoch": 0.006248594066335074,
"grad_norm": 2.640625,
"learning_rate": 0.000799004224049589,
"loss": 5.3757,
"step": 250
},
{
"epoch": 0.006373565947661776,
"grad_norm": 2.25,
"learning_rate": 0.0007989842285485766,
"loss": 5.4723,
"step": 255
},
{
"epoch": 0.006498537828988477,
"grad_norm": 2.25,
"learning_rate": 0.0007989642330475643,
"loss": 5.301,
"step": 260
},
{
"epoch": 0.006623509710315179,
"grad_norm": 1.90625,
"learning_rate": 0.0007989442375465521,
"loss": 5.4499,
"step": 265
},
{
"epoch": 0.00674848159164188,
"grad_norm": 2.65625,
"learning_rate": 0.0007989242420455398,
"loss": 5.5597,
"step": 270
},
{
"epoch": 0.006873453472968582,
"grad_norm": 2.25,
"learning_rate": 0.0007989042465445275,
"loss": 5.2123,
"step": 275
},
{
"epoch": 0.006998425354295283,
"grad_norm": 1.71875,
"learning_rate": 0.0007988842510435152,
"loss": 5.3393,
"step": 280
},
{
"epoch": 0.007123397235621985,
"grad_norm": 2.390625,
"learning_rate": 0.000798864255542503,
"loss": 5.2593,
"step": 285
},
{
"epoch": 0.007248369116948686,
"grad_norm": 2.125,
"learning_rate": 0.0007988442600414907,
"loss": 5.2776,
"step": 290
},
{
"epoch": 0.007373340998275388,
"grad_norm": 1.84375,
"learning_rate": 0.0007988242645404785,
"loss": 5.2529,
"step": 295
},
{
"epoch": 0.007498312879602089,
"grad_norm": 2.3125,
"learning_rate": 0.0007988042690394662,
"loss": 5.0664,
"step": 300
},
{
"epoch": 0.007623284760928791,
"grad_norm": 1.5234375,
"learning_rate": 0.0007987842735384539,
"loss": 5.3028,
"step": 305
},
{
"epoch": 0.007748256642255492,
"grad_norm": 1.7578125,
"learning_rate": 0.0007987642780374416,
"loss": 5.2152,
"step": 310
},
{
"epoch": 0.007873228523582194,
"grad_norm": 1.765625,
"learning_rate": 0.0007987442825364294,
"loss": 4.97,
"step": 315
},
{
"epoch": 0.007998200404908895,
"grad_norm": 2.203125,
"learning_rate": 0.000798724287035417,
"loss": 5.1328,
"step": 320
},
{
"epoch": 0.008123172286235597,
"grad_norm": 1.765625,
"learning_rate": 0.0007987042915344048,
"loss": 5.0475,
"step": 325
},
{
"epoch": 0.008248144167562298,
"grad_norm": 1.78125,
"learning_rate": 0.0007986842960333925,
"loss": 5.009,
"step": 330
},
{
"epoch": 0.008373116048889,
"grad_norm": 1.90625,
"learning_rate": 0.0007986643005323803,
"loss": 5.0827,
"step": 335
},
{
"epoch": 0.008498087930215701,
"grad_norm": 1.609375,
"learning_rate": 0.000798644305031368,
"loss": 5.0896,
"step": 340
},
{
"epoch": 0.008623059811542403,
"grad_norm": 1.6640625,
"learning_rate": 0.0007986243095303557,
"loss": 4.9185,
"step": 345
},
{
"epoch": 0.008748031692869104,
"grad_norm": 2.25,
"learning_rate": 0.0007986043140293435,
"loss": 5.0279,
"step": 350
},
{
"epoch": 0.008873003574195806,
"grad_norm": 2.140625,
"learning_rate": 0.0007985843185283312,
"loss": 4.8811,
"step": 355
},
{
"epoch": 0.008997975455522507,
"grad_norm": 1.640625,
"learning_rate": 0.0007985643230273188,
"loss": 5.0377,
"step": 360
},
{
"epoch": 0.00912294733684921,
"grad_norm": 2.078125,
"learning_rate": 0.0007985443275263066,
"loss": 4.9903,
"step": 365
},
{
"epoch": 0.00924791921817591,
"grad_norm": 2.15625,
"learning_rate": 0.0007985243320252943,
"loss": 4.8109,
"step": 370
},
{
"epoch": 0.009372891099502612,
"grad_norm": 1.859375,
"learning_rate": 0.0007985043365242821,
"loss": 4.8445,
"step": 375
},
{
"epoch": 0.009497862980829313,
"grad_norm": 2.171875,
"learning_rate": 0.0007984843410232698,
"loss": 4.887,
"step": 380
},
{
"epoch": 0.009622834862156015,
"grad_norm": 1.96875,
"learning_rate": 0.0007984643455222575,
"loss": 4.8373,
"step": 385
},
{
"epoch": 0.009747806743482716,
"grad_norm": 1.7890625,
"learning_rate": 0.0007984443500212453,
"loss": 4.8948,
"step": 390
},
{
"epoch": 0.009872778624809418,
"grad_norm": 2.09375,
"learning_rate": 0.000798424354520233,
"loss": 4.9098,
"step": 395
},
{
"epoch": 0.009997750506136119,
"grad_norm": 1.703125,
"learning_rate": 0.0007984043590192208,
"loss": 4.6948,
"step": 400
},
{
"epoch": 0.010122722387462821,
"grad_norm": 1.9453125,
"learning_rate": 0.0007983843635182084,
"loss": 4.7561,
"step": 405
},
{
"epoch": 0.010247694268789522,
"grad_norm": 2.171875,
"learning_rate": 0.0007983643680171961,
"loss": 4.6634,
"step": 410
},
{
"epoch": 0.010372666150116224,
"grad_norm": 2.140625,
"learning_rate": 0.0007983443725161839,
"loss": 4.6339,
"step": 415
},
{
"epoch": 0.010497638031442925,
"grad_norm": 1.953125,
"learning_rate": 0.0007983243770151716,
"loss": 4.7292,
"step": 420
},
{
"epoch": 0.010622609912769627,
"grad_norm": 1.4453125,
"learning_rate": 0.0007983043815141594,
"loss": 4.6211,
"step": 425
},
{
"epoch": 0.010747581794096328,
"grad_norm": 1.6484375,
"learning_rate": 0.0007982843860131471,
"loss": 4.6857,
"step": 430
},
{
"epoch": 0.01087255367542303,
"grad_norm": 1.8359375,
"learning_rate": 0.0007982643905121348,
"loss": 4.6335,
"step": 435
},
{
"epoch": 0.01099752555674973,
"grad_norm": 1.6484375,
"learning_rate": 0.0007982443950111226,
"loss": 4.7285,
"step": 440
},
{
"epoch": 0.011122497438076433,
"grad_norm": 1.609375,
"learning_rate": 0.0007982243995101103,
"loss": 4.7707,
"step": 445
},
{
"epoch": 0.011247469319403134,
"grad_norm": 1.78125,
"learning_rate": 0.000798204404009098,
"loss": 4.6769,
"step": 450
},
{
"epoch": 0.011372441200729836,
"grad_norm": 1.390625,
"learning_rate": 0.0007981844085080857,
"loss": 4.7311,
"step": 455
},
{
"epoch": 0.011497413082056537,
"grad_norm": 1.84375,
"learning_rate": 0.0007981644130070734,
"loss": 4.6677,
"step": 460
},
{
"epoch": 0.011622384963383239,
"grad_norm": 1.328125,
"learning_rate": 0.0007981444175060612,
"loss": 4.6862,
"step": 465
},
{
"epoch": 0.01174735684470994,
"grad_norm": 2.40625,
"learning_rate": 0.0007981244220050488,
"loss": 4.5674,
"step": 470
},
{
"epoch": 0.011872328726036642,
"grad_norm": 1.8671875,
"learning_rate": 0.0007981044265040366,
"loss": 4.5283,
"step": 475
},
{
"epoch": 0.011997300607363343,
"grad_norm": 1.625,
"learning_rate": 0.0007980844310030244,
"loss": 4.5966,
"step": 480
},
{
"epoch": 0.012122272488690045,
"grad_norm": 1.7421875,
"learning_rate": 0.0007980644355020121,
"loss": 4.4785,
"step": 485
},
{
"epoch": 0.012247244370016746,
"grad_norm": 1.765625,
"learning_rate": 0.0007980444400009999,
"loss": 4.5966,
"step": 490
},
{
"epoch": 0.012372216251343448,
"grad_norm": 1.8671875,
"learning_rate": 0.0007980244444999875,
"loss": 4.6428,
"step": 495
},
{
"epoch": 0.012497188132670149,
"grad_norm": 1.609375,
"learning_rate": 0.0007980044489989753,
"loss": 4.5225,
"step": 500
}
],
"logging_steps": 5,
"max_steps": 200045,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.2266079346688e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}