{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9995764506565015,
  "eval_steps": 500,
  "global_step": 590,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00847098686997035,
      "grad_norm": 13.040608406066895,
      "learning_rate": 1.6949152542372882e-06,
      "loss": 1.8317,
      "step": 5
    },
    {
      "epoch": 0.0169419737399407,
      "grad_norm": 7.28167200088501,
      "learning_rate": 3.3898305084745763e-06,
      "loss": 1.7467,
      "step": 10
    },
    {
      "epoch": 0.025412960609911054,
      "grad_norm": 3.6362063884735107,
      "learning_rate": 5.084745762711865e-06,
      "loss": 1.4691,
      "step": 15
    },
    {
      "epoch": 0.0338839474798814,
      "grad_norm": 2.443824291229248,
      "learning_rate": 6.779661016949153e-06,
      "loss": 1.1816,
      "step": 20
    },
    {
      "epoch": 0.042354934349851756,
      "grad_norm": 1.9287770986557007,
      "learning_rate": 8.47457627118644e-06,
      "loss": 1.0028,
      "step": 25
    },
    {
      "epoch": 0.05082592121982211,
      "grad_norm": 2.287205696105957,
      "learning_rate": 1.016949152542373e-05,
      "loss": 0.8979,
      "step": 30
    },
    {
      "epoch": 0.05929690808979246,
      "grad_norm": 1.8406411409378052,
      "learning_rate": 1.1864406779661018e-05,
      "loss": 0.8245,
      "step": 35
    },
    {
      "epoch": 0.0677678949597628,
      "grad_norm": 1.597153663635254,
      "learning_rate": 1.3559322033898305e-05,
      "loss": 0.801,
      "step": 40
    },
    {
      "epoch": 0.07623888182973317,
      "grad_norm": 1.4034295082092285,
      "learning_rate": 1.5254237288135594e-05,
      "loss": 0.7545,
      "step": 45
    },
    {
      "epoch": 0.08470986869970351,
      "grad_norm": 3.763864517211914,
      "learning_rate": 1.694915254237288e-05,
      "loss": 0.7336,
      "step": 50
    },
    {
      "epoch": 0.09318085556967387,
      "grad_norm": 1.451323390007019,
      "learning_rate": 1.864406779661017e-05,
      "loss": 0.7121,
      "step": 55
    },
    {
      "epoch": 0.10165184243964422,
      "grad_norm": 1.5066395998001099,
      "learning_rate": 1.9999824983320176e-05,
      "loss": 0.6881,
      "step": 60
    },
    {
      "epoch": 0.11012282930961458,
      "grad_norm": 1.4491875171661377,
      "learning_rate": 1.9993700042749937e-05,
      "loss": 0.673,
      "step": 65
    },
    {
      "epoch": 0.11859381617958492,
      "grad_norm": 1.27461838722229,
      "learning_rate": 1.9978830393392338e-05,
      "loss": 0.6507,
      "step": 70
    },
    {
      "epoch": 0.12706480304955528,
      "grad_norm": 1.191027045249939,
      "learning_rate": 1.995522904651977e-05,
      "loss": 0.6384,
      "step": 75
    },
    {
      "epoch": 0.1355357899195256,
      "grad_norm": 1.237403392791748,
      "learning_rate": 1.992291665383325e-05,
      "loss": 0.6637,
      "step": 80
    },
    {
      "epoch": 0.14400677678949597,
      "grad_norm": 1.1272355318069458,
      "learning_rate": 1.9881921489391738e-05,
      "loss": 0.635,
      "step": 85
    },
    {
      "epoch": 0.15247776365946633,
      "grad_norm": 1.168715476989746,
      "learning_rate": 1.983227942487172e-05,
      "loss": 0.6445,
      "step": 90
    },
    {
      "epoch": 0.1609487505294367,
      "grad_norm": 1.050166130065918,
      "learning_rate": 1.9774033898178668e-05,
      "loss": 0.6412,
      "step": 95
    },
    {
      "epoch": 0.16941973739940702,
      "grad_norm": 1.268314003944397,
      "learning_rate": 1.9707235875437932e-05,
      "loss": 0.6233,
      "step": 100
    },
    {
      "epoch": 0.17789072426937738,
      "grad_norm": 1.3509944677352905,
      "learning_rate": 1.963194380639825e-05,
      "loss": 0.6138,
      "step": 105
    },
    {
      "epoch": 0.18636171113934774,
      "grad_norm": 1.1367347240447998,
      "learning_rate": 1.954822357328692e-05,
      "loss": 0.6045,
      "step": 110
    },
    {
      "epoch": 0.19483269800931807,
      "grad_norm": 1.1019567251205444,
      "learning_rate": 1.9456148433161387e-05,
      "loss": 0.6176,
      "step": 115
    },
    {
      "epoch": 0.20330368487928843,
      "grad_norm": 1.0358731746673584,
      "learning_rate": 1.9355798953807715e-05,
      "loss": 0.5925,
      "step": 120
    },
    {
      "epoch": 0.2117746717492588,
      "grad_norm": 1.0830167531967163,
      "learning_rate": 1.924726294324196e-05,
      "loss": 0.6107,
      "step": 125
    },
    {
      "epoch": 0.22024565861922915,
      "grad_norm": 1.0958232879638672,
      "learning_rate": 1.9130635372876245e-05,
      "loss": 0.5953,
      "step": 130
    },
    {
      "epoch": 0.22871664548919948,
      "grad_norm": 1.0959957838058472,
      "learning_rate": 1.9006018294416648e-05,
      "loss": 0.6107,
      "step": 135
    },
    {
      "epoch": 0.23718763235916984,
      "grad_norm": 1.1108900308609009,
      "learning_rate": 1.8873520750565716e-05,
      "loss": 0.5817,
      "step": 140
    },
    {
      "epoch": 0.2456586192291402,
      "grad_norm": 1.057532787322998,
      "learning_rate": 1.8733258679607674e-05,
      "loss": 0.6108,
      "step": 145
    },
    {
      "epoch": 0.25412960609911056,
      "grad_norm": 1.2086259126663208,
      "learning_rate": 1.858535481395986e-05,
      "loss": 0.5875,
      "step": 150
    },
    {
      "epoch": 0.2626005929690809,
      "grad_norm": 1.0949163436889648,
      "learning_rate": 1.8429938572779154e-05,
      "loss": 0.5951,
      "step": 155
    },
    {
      "epoch": 0.2710715798390512,
      "grad_norm": 0.9488250017166138,
      "learning_rate": 1.8267145948717338e-05,
      "loss": 0.5868,
      "step": 160
    },
    {
      "epoch": 0.2795425667090216,
      "grad_norm": 1.0208171606063843,
      "learning_rate": 1.8097119388924524e-05,
      "loss": 0.5835,
      "step": 165
    },
    {
      "epoch": 0.28801355357899194,
      "grad_norm": 1.1816986799240112,
      "learning_rate": 1.7920007670404738e-05,
      "loss": 0.5703,
      "step": 170
    },
    {
      "epoch": 0.29648454044896233,
      "grad_norm": 1.1194366216659546,
      "learning_rate": 1.7735965769832754e-05,
      "loss": 0.5752,
      "step": 175
    },
    {
      "epoch": 0.30495552731893266,
      "grad_norm": 1.1583960056304932,
      "learning_rate": 1.7545154727946065e-05,
      "loss": 0.5967,
      "step": 180
    },
    {
      "epoch": 0.313426514188903,
      "grad_norm": 0.9982605576515198,
      "learning_rate": 1.7347741508630673e-05,
      "loss": 0.5798,
      "step": 185
    },
    {
      "epoch": 0.3218975010588734,
      "grad_norm": 1.17666494846344,
      "learning_rate": 1.7143898852824005e-05,
      "loss": 0.5647,
      "step": 190
    },
    {
      "epoch": 0.3303684879288437,
      "grad_norm": 0.9470816254615784,
      "learning_rate": 1.6933805127362744e-05,
      "loss": 0.546,
      "step": 195
    },
    {
      "epoch": 0.33883947479881404,
      "grad_norm": 0.9323899745941162,
      "learning_rate": 1.671764416890793e-05,
      "loss": 0.5567,
      "step": 200
    },
    {
      "epoch": 0.34731046166878443,
      "grad_norm": 1.0073851346969604,
      "learning_rate": 1.649560512308378e-05,
      "loss": 0.5649,
      "step": 205
    },
    {
      "epoch": 0.35578144853875476,
      "grad_norm": 1.2009499073028564,
      "learning_rate": 1.6267882278971102e-05,
      "loss": 0.5563,
      "step": 210
    },
    {
      "epoch": 0.3642524354087251,
      "grad_norm": 1.1687507629394531,
      "learning_rate": 1.603467489910004e-05,
      "loss": 0.5704,
      "step": 215
    },
    {
      "epoch": 0.3727234222786955,
      "grad_norm": 1.0257385969161987,
      "learning_rate": 1.5796187045090943e-05,
      "loss": 0.5473,
      "step": 220
    },
    {
      "epoch": 0.3811944091486658,
      "grad_norm": 1.0044339895248413,
      "learning_rate": 1.5552627399095943e-05,
      "loss": 0.5557,
      "step": 225
    },
    {
      "epoch": 0.38966539601863615,
      "grad_norm": 1.0678868293762207,
      "learning_rate": 1.5304209081197425e-05,
      "loss": 0.5487,
      "step": 230
    },
    {
      "epoch": 0.39813638288860653,
      "grad_norm": 0.9731626510620117,
      "learning_rate": 1.5051149462923285e-05,
      "loss": 0.5632,
      "step": 235
    },
    {
      "epoch": 0.40660736975857686,
      "grad_norm": 0.9739297032356262,
      "learning_rate": 1.4793669977041978e-05,
      "loss": 0.5604,
      "step": 240
    },
    {
      "epoch": 0.41507835662854725,
      "grad_norm": 1.1298881769180298,
      "learning_rate": 1.4531995923803974e-05,
      "loss": 0.5748,
      "step": 245
    },
    {
      "epoch": 0.4235493434985176,
      "grad_norm": 0.946855902671814,
      "learning_rate": 1.4266356273799044e-05,
      "loss": 0.5386,
      "step": 250
    },
    {
      "epoch": 0.4320203303684879,
      "grad_norm": 0.8729770183563232,
      "learning_rate": 1.3996983467601921e-05,
      "loss": 0.5672,
      "step": 255
    },
    {
      "epoch": 0.4404913172384583,
      "grad_norm": 0.9312366843223572,
      "learning_rate": 1.372411321238166e-05,
      "loss": 0.5441,
      "step": 260
    },
    {
      "epoch": 0.44896230410842863,
      "grad_norm": 0.9651340246200562,
      "learning_rate": 1.3447984275652638e-05,
      "loss": 0.5487,
      "step": 265
    },
    {
      "epoch": 0.45743329097839897,
      "grad_norm": 0.998904287815094,
      "learning_rate": 1.3168838276347691e-05,
      "loss": 0.5397,
      "step": 270
    },
    {
      "epoch": 0.46590427784836935,
      "grad_norm": 1.2853810787200928,
      "learning_rate": 1.2886919473396212e-05,
      "loss": 0.5386,
      "step": 275
    },
    {
      "epoch": 0.4743752647183397,
      "grad_norm": 1.006282925605774,
      "learning_rate": 1.2602474551992165e-05,
      "loss": 0.5501,
      "step": 280
    },
    {
      "epoch": 0.48284625158831,
      "grad_norm": 0.9217785596847534,
      "learning_rate": 1.2315752407739093e-05,
      "loss": 0.529,
      "step": 285
    },
    {
      "epoch": 0.4913172384582804,
      "grad_norm": 0.9664483070373535,
      "learning_rate": 1.2027003928860936e-05,
      "loss": 0.5295,
      "step": 290
    },
    {
      "epoch": 0.49978822532825073,
      "grad_norm": 1.0580369234085083,
      "learning_rate": 1.1736481776669307e-05,
      "loss": 0.5504,
      "step": 295
    },
    {
      "epoch": 0.5082592121982211,
      "grad_norm": 0.938615620136261,
      "learning_rate": 1.1444440164479215e-05,
      "loss": 0.5307,
      "step": 300
    },
    {
      "epoch": 0.5167301990681914,
      "grad_norm": 0.91016685962677,
      "learning_rate": 1.115113463516683e-05,
      "loss": 0.5214,
      "step": 305
    },
    {
      "epoch": 0.5252011859381618,
      "grad_norm": 0.9125617146492004,
      "learning_rate": 1.085682183756377e-05,
      "loss": 0.5407,
      "step": 310
    },
    {
      "epoch": 0.5336721728081322,
      "grad_norm": 0.9857435822486877,
      "learning_rate": 1.0561759301883714e-05,
      "loss": 0.5508,
      "step": 315
    },
    {
      "epoch": 0.5421431596781024,
      "grad_norm": 0.948715090751648,
      "learning_rate": 1.026620521437775e-05,
      "loss": 0.5275,
      "step": 320
    },
    {
      "epoch": 0.5506141465480728,
      "grad_norm": 0.9650415778160095,
      "learning_rate": 9.970418191415703e-06,
      "loss": 0.5143,
      "step": 325
    },
    {
      "epoch": 0.5590851334180432,
      "grad_norm": 0.9797161221504211,
      "learning_rate": 9.674657053191079e-06,
      "loss": 0.529,
      "step": 330
    },
    {
      "epoch": 0.5675561202880135,
      "grad_norm": 0.9044686555862427,
      "learning_rate": 9.379180597247661e-06,
      "loss": 0.5446,
      "step": 335
    },
    {
      "epoch": 0.5760271071579839,
      "grad_norm": 0.9637364149093628,
      "learning_rate": 9.084247372025938e-06,
      "loss": 0.5207,
      "step": 340
    },
    {
      "epoch": 0.5844980940279543,
      "grad_norm": 0.883882462978363,
      "learning_rate": 8.790115450627486e-06,
      "loss": 0.5177,
      "step": 345
    },
    {
      "epoch": 0.5929690808979247,
      "grad_norm": 0.9830591678619385,
      "learning_rate": 8.497042204995299e-06,
      "loss": 0.5386,
      "step": 350
    },
    {
      "epoch": 0.6014400677678949,
      "grad_norm": 0.9278344511985779,
      "learning_rate": 8.205284080707634e-06,
      "loss": 0.5258,
      "step": 355
    },
    {
      "epoch": 0.6099110546378653,
      "grad_norm": 0.8791617155075073,
      "learning_rate": 7.915096372582467e-06,
      "loss": 0.5407,
      "step": 360
    },
    {
      "epoch": 0.6183820415078357,
      "grad_norm": 0.9185119271278381,
      "learning_rate": 7.626733001288852e-06,
      "loss": 0.527,
      "step": 365
    },
    {
      "epoch": 0.626853028377806,
      "grad_norm": 0.8270648717880249,
      "learning_rate": 7.3404462911607325e-06,
      "loss": 0.5312,
      "step": 370
    },
    {
      "epoch": 0.6353240152477764,
      "grad_norm": 0.919711709022522,
      "learning_rate": 7.056486749407552e-06,
      "loss": 0.5254,
      "step": 375
    },
    {
      "epoch": 0.6437950021177468,
      "grad_norm": 0.8501454591751099,
      "learning_rate": 6.775102846914912e-06,
      "loss": 0.5159,
      "step": 380
    },
    {
      "epoch": 0.652265988987717,
      "grad_norm": 0.9181873202323914,
      "learning_rate": 6.4965408008270355e-06,
      "loss": 0.5175,
      "step": 385
    },
    {
      "epoch": 0.6607369758576874,
      "grad_norm": 0.8412730693817139,
      "learning_rate": 6.221044359101317e-06,
      "loss": 0.5249,
      "step": 390
    },
    {
      "epoch": 0.6692079627276578,
      "grad_norm": 0.8827643394470215,
      "learning_rate": 5.948854587223465e-06,
      "loss": 0.5222,
      "step": 395
    },
    {
      "epoch": 0.6776789495976281,
      "grad_norm": 0.7885822653770447,
      "learning_rate": 5.680209657269871e-06,
      "loss": 0.5122,
      "step": 400
    },
    {
      "epoch": 0.6861499364675985,
      "grad_norm": 0.8819693922996521,
      "learning_rate": 5.415344639501754e-06,
      "loss": 0.5287,
      "step": 405
    },
    {
      "epoch": 0.6946209233375689,
      "grad_norm": 0.8051272034645081,
      "learning_rate": 5.1544912966735e-06,
      "loss": 0.5132,
      "step": 410
    },
    {
      "epoch": 0.7030919102075391,
      "grad_norm": 0.831628680229187,
      "learning_rate": 4.897877881235091e-06,
      "loss": 0.5088,
      "step": 415
    },
    {
      "epoch": 0.7115628970775095,
      "grad_norm": 0.8426679968833923,
      "learning_rate": 4.645728935606194e-06,
      "loss": 0.5163,
      "step": 420
    },
    {
      "epoch": 0.7200338839474799,
      "grad_norm": 0.8241559267044067,
      "learning_rate": 4.398265095696539e-06,
      "loss": 0.5174,
      "step": 425
    },
    {
      "epoch": 0.7285048708174502,
      "grad_norm": 0.8578051924705505,
      "learning_rate": 4.1557028978446415e-06,
      "loss": 0.5129,
      "step": 430
    },
    {
      "epoch": 0.7369758576874206,
      "grad_norm": 0.8619440197944641,
      "learning_rate": 3.918254589343683e-06,
      "loss": 0.5102,
      "step": 435
    },
    {
      "epoch": 0.745446844557391,
      "grad_norm": 0.8140995502471924,
      "learning_rate": 3.6861279427204634e-06,
      "loss": 0.5052,
      "step": 440
    },
    {
      "epoch": 0.7539178314273612,
      "grad_norm": 0.7656389474868774,
      "learning_rate": 3.4595260739298174e-06,
      "loss": 0.5247,
      "step": 445
    },
    {
      "epoch": 0.7623888182973316,
      "grad_norm": 0.780764102935791,
      "learning_rate": 3.2386472646236565e-06,
      "loss": 0.5139,
      "step": 450
    },
    {
      "epoch": 0.770859805167302,
      "grad_norm": 0.7912269234657288,
      "learning_rate": 3.023684788650154e-06,
      "loss": 0.5184,
      "step": 455
    },
    {
      "epoch": 0.7793307920372723,
      "grad_norm": 0.7656291127204895,
      "learning_rate": 2.814826742934823e-06,
      "loss": 0.5168,
      "step": 460
    },
    {
      "epoch": 0.7878017789072427,
      "grad_norm": 0.7654049396514893,
      "learning_rate": 2.6122558828915647e-06,
      "loss": 0.5127,
      "step": 465
    },
    {
      "epoch": 0.7962727657772131,
      "grad_norm": 0.8781611919403076,
      "learning_rate": 2.4161494625076164e-06,
      "loss": 0.5068,
      "step": 470
    },
    {
      "epoch": 0.8047437526471835,
      "grad_norm": 0.7922006249427795,
      "learning_rate": 2.2266790792424096e-06,
      "loss": 0.5243,
      "step": 475
    },
    {
      "epoch": 0.8132147395171537,
      "grad_norm": 0.7805562019348145,
      "learning_rate": 2.044010523875969e-06,
      "loss": 0.5114,
      "step": 480
    },
    {
      "epoch": 0.8216857263871241,
      "grad_norm": 0.8500534296035767,
      "learning_rate": 1.868303635438332e-06,
      "loss": 0.4978,
      "step": 485
    },
    {
      "epoch": 0.8301567132570945,
      "grad_norm": 0.7626408934593201,
      "learning_rate": 1.699712161346846e-06,
      "loss": 0.5108,
      "step": 490
    },
    {
      "epoch": 0.8386277001270648,
      "grad_norm": 0.7929341197013855,
      "learning_rate": 1.5383836228737815e-06,
      "loss": 0.5126,
      "step": 495
    },
    {
      "epoch": 0.8470986869970352,
      "grad_norm": 0.7848495244979858,
      "learning_rate": 1.3844591860619382e-06,
      "loss": 0.5037,
      "step": 500
    },
    {
      "epoch": 0.8555696738670056,
      "grad_norm": 0.7474762797355652,
      "learning_rate": 1.2380735382012576e-06,
      "loss": 0.5151,
      "step": 505
    },
    {
      "epoch": 0.8640406607369758,
      "grad_norm": 0.7843493819236755,
      "learning_rate": 1.0993547699744366e-06,
      "loss": 0.5114,
      "step": 510
    },
    {
      "epoch": 0.8725116476069462,
      "grad_norm": 0.7787851095199585,
      "learning_rate": 9.684242633747642e-07,
      "loss": 0.5111,
      "step": 515
    },
    {
      "epoch": 0.8809826344769166,
      "grad_norm": 0.7845005989074707,
      "learning_rate": 8.453965854941748e-07,
      "loss": 0.5046,
      "step": 520
    },
    {
      "epoch": 0.8894536213468869,
      "grad_norm": 0.7967577576637268,
      "learning_rate": 7.303793882745181e-07,
      "loss": 0.5016,
      "step": 525
    },
    {
      "epoch": 0.8979246082168573,
      "grad_norm": 0.7523807883262634,
      "learning_rate": 6.234733143097215e-07,
      "loss": 0.4974,
      "step": 530
    },
    {
      "epoch": 0.9063955950868277,
      "grad_norm": 0.7827950119972229,
      "learning_rate": 5.247719087812897e-07,
      "loss": 0.4984,
      "step": 535
    },
    {
      "epoch": 0.9148665819567979,
      "grad_norm": 0.7315457463264465,
      "learning_rate": 4.343615376042065e-07,
      "loss": 0.5147,
      "step": 540
    },
    {
      "epoch": 0.9233375688267683,
      "grad_norm": 0.8033891916275024,
      "learning_rate": 3.5232131185484075e-07,
      "loss": 0.5116,
      "step": 545
    },
    {
      "epoch": 0.9318085556967387,
      "grad_norm": 0.7409123778343201,
      "learning_rate": 2.78723018547008e-07,
      "loss": 0.4918,
      "step": 550
    },
    {
      "epoch": 0.940279542566709,
      "grad_norm": 0.7420827150344849,
      "learning_rate": 2.1363105781673888e-07,
      "loss": 0.5066,
      "step": 555
    },
    {
      "epoch": 0.9487505294366794,
      "grad_norm": 0.7862848043441772,
      "learning_rate": 1.5710238657074218e-07,
      "loss": 0.5183,
      "step": 560
    },
    {
      "epoch": 0.9572215163066498,
      "grad_norm": 0.7402486205101013,
      "learning_rate": 1.0918646864784166e-07,
      "loss": 0.5182,
      "step": 565
    },
    {
      "epoch": 0.96569250317662,
      "grad_norm": 0.7287072539329529,
      "learning_rate": 6.99252315370269e-08,
      "loss": 0.4976,
      "step": 570
    },
    {
      "epoch": 0.9741634900465904,
      "grad_norm": 0.7665913105010986,
      "learning_rate": 3.9353029689974676e-08,
      "loss": 0.5127,
      "step": 575
    },
    {
      "epoch": 0.9826344769165608,
      "grad_norm": 0.6921188831329346,
      "learning_rate": 1.7496614460135174e-08,
      "loss": 0.5173,
      "step": 580
    },
    {
      "epoch": 0.9911054637865311,
      "grad_norm": 0.7035255432128906,
      "learning_rate": 4.375110694713192e-09,
      "loss": 0.4961,
      "step": 585
    },
    {
      "epoch": 0.9995764506565015,
      "grad_norm": 0.7657430768013,
      "learning_rate": 0.0,
      "loss": 0.5122,
      "step": 590
    },
    {
      "epoch": 0.9995764506565015,
      "step": 590,
      "total_flos": 8.380405809686774e+17,
      "train_loss": 0.5979039826635587,
      "train_runtime": 5676.8996,
      "train_samples_per_second": 6.654,
      "train_steps_per_second": 0.104
    }
  ],
  "logging_steps": 5,
  "max_steps": 590,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.380405809686774e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}