Mistral-Nemo-BD-RP / trainer_state.json
yeyongyu
add: add model weight files
214cf2e
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.001801801801802,
"eval_steps": 500,
"global_step": 833,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010810810810810811,
"grad_norm": 44.04393707727108,
"learning_rate": 2.9999066991504905e-05,
"loss": 2.3711,
"step": 3
},
{
"epoch": 0.021621621621621623,
"grad_norm": 31.04171552882201,
"learning_rate": 2.9996268082086924e-05,
"loss": 4.159,
"step": 6
},
{
"epoch": 0.032432432432432434,
"grad_norm": 11.54576214967521,
"learning_rate": 2.9991603619933566e-05,
"loss": 1.9733,
"step": 9
},
{
"epoch": 0.043243243243243246,
"grad_norm": 7.08841552325599,
"learning_rate": 2.9985074185309204e-05,
"loss": 1.7978,
"step": 12
},
{
"epoch": 0.05405405405405406,
"grad_norm": 5.949508626432288,
"learning_rate": 2.99766805904829e-05,
"loss": 1.7347,
"step": 15
},
{
"epoch": 0.06486486486486487,
"grad_norm": 6.168616775238258,
"learning_rate": 2.9966423879627356e-05,
"loss": 1.6033,
"step": 18
},
{
"epoch": 0.07567567567567568,
"grad_norm": 5.728079542384497,
"learning_rate": 2.9954305328689024e-05,
"loss": 1.7134,
"step": 21
},
{
"epoch": 0.08648648648648649,
"grad_norm": 6.284326561040228,
"learning_rate": 2.9940326445229367e-05,
"loss": 1.6933,
"step": 24
},
{
"epoch": 0.0972972972972973,
"grad_norm": 6.92658975011714,
"learning_rate": 2.9924488968237316e-05,
"loss": 1.5923,
"step": 27
},
{
"epoch": 0.10810810810810811,
"grad_norm": 6.538508694879061,
"learning_rate": 2.9906794867912953e-05,
"loss": 1.6931,
"step": 30
},
{
"epoch": 0.11891891891891893,
"grad_norm": 4.685530306007965,
"learning_rate": 2.98872463454224e-05,
"loss": 1.6559,
"step": 33
},
{
"epoch": 0.12972972972972974,
"grad_norm": 5.65503266442286,
"learning_rate": 2.9865845832623993e-05,
"loss": 1.6982,
"step": 36
},
{
"epoch": 0.14054054054054055,
"grad_norm": 4.886380857119004,
"learning_rate": 2.9842595991765766e-05,
"loss": 1.6503,
"step": 39
},
{
"epoch": 0.15135135135135136,
"grad_norm": 5.026086310034092,
"learning_rate": 2.981749971515426e-05,
"loss": 1.632,
"step": 42
},
{
"epoch": 0.16216216216216217,
"grad_norm": 4.795570770299284,
"learning_rate": 2.9790560124794702e-05,
"loss": 1.6824,
"step": 45
},
{
"epoch": 0.17297297297297298,
"grad_norm": 4.756143563325781,
"learning_rate": 2.976178057200266e-05,
"loss": 1.6694,
"step": 48
},
{
"epoch": 0.1837837837837838,
"grad_norm": 5.364943432581566,
"learning_rate": 2.9731164636987088e-05,
"loss": 1.6659,
"step": 51
},
{
"epoch": 0.1945945945945946,
"grad_norm": 5.181051766279552,
"learning_rate": 2.9698716128404985e-05,
"loss": 1.6443,
"step": 54
},
{
"epoch": 0.20540540540540542,
"grad_norm": 4.828479392346181,
"learning_rate": 2.9664439082887568e-05,
"loss": 1.6519,
"step": 57
},
{
"epoch": 0.21621621621621623,
"grad_norm": 5.824361936201152,
"learning_rate": 2.9628337764538135e-05,
"loss": 1.6532,
"step": 60
},
{
"epoch": 0.22702702702702704,
"grad_norm": 6.393887712988006,
"learning_rate": 2.9590416664401566e-05,
"loss": 1.6409,
"step": 63
},
{
"epoch": 0.23783783783783785,
"grad_norm": 5.8650966692501765,
"learning_rate": 2.955068049990568e-05,
"loss": 1.6105,
"step": 66
},
{
"epoch": 0.24864864864864866,
"grad_norm": 4.755849083042293,
"learning_rate": 2.9509134214274343e-05,
"loss": 1.6618,
"step": 69
},
{
"epoch": 0.2594594594594595,
"grad_norm": 3.9789106042521962,
"learning_rate": 2.9465782975912553e-05,
"loss": 1.6645,
"step": 72
},
{
"epoch": 0.2702702702702703,
"grad_norm": 4.592578050100414,
"learning_rate": 2.942063217776346e-05,
"loss": 1.605,
"step": 75
},
{
"epoch": 0.2810810810810811,
"grad_norm": 4.299733571350802,
"learning_rate": 2.9373687436637492e-05,
"loss": 1.6233,
"step": 78
},
{
"epoch": 0.2918918918918919,
"grad_norm": 4.585181401202116,
"learning_rate": 2.9324954592513626e-05,
"loss": 1.6587,
"step": 81
},
{
"epoch": 0.3027027027027027,
"grad_norm": 4.530135718622418,
"learning_rate": 2.927443970781287e-05,
"loss": 1.6333,
"step": 84
},
{
"epoch": 0.31351351351351353,
"grad_norm": 5.349009947479682,
"learning_rate": 2.9222149066644088e-05,
"loss": 1.6431,
"step": 87
},
{
"epoch": 0.32432432432432434,
"grad_norm": 4.266181060344957,
"learning_rate": 2.916808917402228e-05,
"loss": 1.598,
"step": 90
},
{
"epoch": 0.33513513513513515,
"grad_norm": 4.628674419343668,
"learning_rate": 2.911226675505932e-05,
"loss": 1.6375,
"step": 93
},
{
"epoch": 0.34594594594594597,
"grad_norm": 5.058644550344611,
"learning_rate": 2.905468875412735e-05,
"loss": 1.6427,
"step": 96
},
{
"epoch": 0.3567567567567568,
"grad_norm": 4.417115318512762,
"learning_rate": 2.8995362333994906e-05,
"loss": 1.6333,
"step": 99
},
{
"epoch": 0.3675675675675676,
"grad_norm": 4.0811273644284345,
"learning_rate": 2.8934294874935848e-05,
"loss": 1.5855,
"step": 102
},
{
"epoch": 0.3783783783783784,
"grad_norm": 3.962543949681069,
"learning_rate": 2.887149397381126e-05,
"loss": 1.6171,
"step": 105
},
{
"epoch": 0.3891891891891892,
"grad_norm": 3.934001274285887,
"learning_rate": 2.8806967443124372e-05,
"loss": 1.5538,
"step": 108
},
{
"epoch": 0.4,
"grad_norm": 4.891305018487301,
"learning_rate": 2.8740723310048682e-05,
"loss": 1.6476,
"step": 111
},
{
"epoch": 0.41081081081081083,
"grad_norm": 4.553656766898869,
"learning_rate": 2.8672769815429385e-05,
"loss": 1.5889,
"step": 114
},
{
"epoch": 0.42162162162162165,
"grad_norm": 4.450034079395778,
"learning_rate": 2.860311541275818e-05,
"loss": 1.5896,
"step": 117
},
{
"epoch": 0.43243243243243246,
"grad_norm": 3.9485209950285274,
"learning_rate": 2.8531768767121656e-05,
"loss": 1.6198,
"step": 120
},
{
"epoch": 0.44324324324324327,
"grad_norm": 4.35512448284715,
"learning_rate": 2.845873875412335e-05,
"loss": 1.6443,
"step": 123
},
{
"epoch": 0.4540540540540541,
"grad_norm": 4.651266887881564,
"learning_rate": 2.838403445877958e-05,
"loss": 1.6542,
"step": 126
},
{
"epoch": 0.4648648648648649,
"grad_norm": 4.5294288157069,
"learning_rate": 2.8307665174389323e-05,
"loss": 1.655,
"step": 129
},
{
"epoch": 0.4756756756756757,
"grad_norm": 4.104985239339571,
"learning_rate": 2.822964040137805e-05,
"loss": 1.6827,
"step": 132
},
{
"epoch": 0.4864864864864865,
"grad_norm": 3.7948515475231286,
"learning_rate": 2.8149969846115894e-05,
"loss": 1.6333,
"step": 135
},
{
"epoch": 0.4972972972972973,
"grad_norm": 4.449225329061536,
"learning_rate": 2.8068663419710182e-05,
"loss": 1.6185,
"step": 138
},
{
"epoch": 0.5081081081081081,
"grad_norm": 4.391987231579949,
"learning_rate": 2.7985731236772448e-05,
"loss": 1.6078,
"step": 141
},
{
"epoch": 0.518918918918919,
"grad_norm": 4.982415169833182,
"learning_rate": 2.7901183614160185e-05,
"loss": 1.6529,
"step": 144
},
{
"epoch": 0.5297297297297298,
"grad_norm": 4.176595151214056,
"learning_rate": 2.7815031069693412e-05,
"loss": 1.6073,
"step": 147
},
{
"epoch": 0.5405405405405406,
"grad_norm": 4.3206148554703105,
"learning_rate": 2.7727284320846246e-05,
"loss": 1.5561,
"step": 150
},
{
"epoch": 0.5513513513513514,
"grad_norm": 4.424758608775709,
"learning_rate": 2.7637954283413632e-05,
"loss": 1.6253,
"step": 153
},
{
"epoch": 0.5621621621621622,
"grad_norm": 5.349711813640235,
"learning_rate": 2.75470520701534e-05,
"loss": 1.7059,
"step": 156
},
{
"epoch": 0.572972972972973,
"grad_norm": 4.578654891344146,
"learning_rate": 2.7454588989403858e-05,
"loss": 1.6107,
"step": 159
},
{
"epoch": 0.5837837837837838,
"grad_norm": 40.22880683574773,
"learning_rate": 2.7360576543676972e-05,
"loss": 1.6278,
"step": 162
},
{
"epoch": 0.5945945945945946,
"grad_norm": 4.155039894851776,
"learning_rate": 2.7265026428227476e-05,
"loss": 1.6301,
"step": 165
},
{
"epoch": 0.6054054054054054,
"grad_norm": 4.159866031946415,
"learning_rate": 2.7167950529597963e-05,
"loss": 1.5342,
"step": 168
},
{
"epoch": 0.6162162162162163,
"grad_norm": 4.087141493968059,
"learning_rate": 2.706936092414018e-05,
"loss": 1.6033,
"step": 171
},
{
"epoch": 0.6270270270270271,
"grad_norm": 3.825646270675215,
"learning_rate": 2.696926987651271e-05,
"loss": 1.5288,
"step": 174
},
{
"epoch": 0.6378378378378379,
"grad_norm": 3.917523159059879,
"learning_rate": 2.686768983815526e-05,
"loss": 1.6363,
"step": 177
},
{
"epoch": 0.6486486486486487,
"grad_norm": 4.202629239471907,
"learning_rate": 2.676463344573965e-05,
"loss": 1.6052,
"step": 180
},
{
"epoch": 0.6594594594594595,
"grad_norm": 3.6368092847747304,
"learning_rate": 2.666011351959783e-05,
"loss": 1.6309,
"step": 183
},
{
"epoch": 0.6702702702702703,
"grad_norm": 4.253168243144118,
"learning_rate": 2.6554143062126995e-05,
"loss": 1.5592,
"step": 186
},
{
"epoch": 0.6810810810810811,
"grad_norm": 4.995354779998164,
"learning_rate": 2.6446735256172092e-05,
"loss": 1.6303,
"step": 189
},
{
"epoch": 0.6918918918918919,
"grad_norm": 4.421549411402136,
"learning_rate": 2.6337903463385836e-05,
"loss": 1.5769,
"step": 192
},
{
"epoch": 0.7027027027027027,
"grad_norm": 4.32615026522547,
"learning_rate": 2.6227661222566516e-05,
"loss": 1.613,
"step": 195
},
{
"epoch": 0.7135135135135136,
"grad_norm": 3.8575639988103836,
"learning_rate": 2.6116022247973773e-05,
"loss": 1.5844,
"step": 198
},
{
"epoch": 0.7243243243243244,
"grad_norm": 3.7279832633133028,
"learning_rate": 2.6003000427622484e-05,
"loss": 1.5301,
"step": 201
},
{
"epoch": 0.7351351351351352,
"grad_norm": 4.190711163922663,
"learning_rate": 2.5888609821555127e-05,
"loss": 1.592,
"step": 204
},
{
"epoch": 0.745945945945946,
"grad_norm": 4.733000892445367,
"learning_rate": 2.577286466009266e-05,
"loss": 1.6574,
"step": 207
},
{
"epoch": 0.7567567567567568,
"grad_norm": 4.577219211132897,
"learning_rate": 2.5655779342064276e-05,
"loss": 1.6289,
"step": 210
},
{
"epoch": 0.7675675675675676,
"grad_norm": 4.048131970531039,
"learning_rate": 2.553736843301615e-05,
"loss": 1.6169,
"step": 213
},
{
"epoch": 0.7783783783783784,
"grad_norm": 4.018546715630257,
"learning_rate": 2.5417646663399502e-05,
"loss": 1.5489,
"step": 216
},
{
"epoch": 0.7891891891891892,
"grad_norm": 3.7010313992210992,
"learning_rate": 2.529662892673806e-05,
"loss": 1.5596,
"step": 219
},
{
"epoch": 0.8,
"grad_norm": 4.557965597883243,
"learning_rate": 2.5174330277775354e-05,
"loss": 1.6145,
"step": 222
},
{
"epoch": 0.8108108108108109,
"grad_norm": 4.181549208740728,
"learning_rate": 2.5050765930601836e-05,
"loss": 1.5339,
"step": 225
},
{
"epoch": 0.8216216216216217,
"grad_norm": 3.7892758830012823,
"learning_rate": 2.4925951256762254e-05,
"loss": 1.5862,
"step": 228
},
{
"epoch": 0.8324324324324325,
"grad_norm": 3.6130747678919666,
"learning_rate": 2.4799901783343407e-05,
"loss": 1.4857,
"step": 231
},
{
"epoch": 0.8432432432432433,
"grad_norm": 3.639537345617851,
"learning_rate": 2.467263319104256e-05,
"loss": 1.5902,
"step": 234
},
{
"epoch": 0.8540540540540541,
"grad_norm": 4.0474919753332035,
"learning_rate": 2.4544161312216752e-05,
"loss": 1.5395,
"step": 237
},
{
"epoch": 0.8648648648648649,
"grad_norm": 3.800979434984059,
"learning_rate": 2.441450212891323e-05,
"loss": 1.5284,
"step": 240
},
{
"epoch": 0.8756756756756757,
"grad_norm": 3.3611120493742983,
"learning_rate": 2.4283671770881256e-05,
"loss": 1.515,
"step": 243
},
{
"epoch": 0.8864864864864865,
"grad_norm": 3.459228078638404,
"learning_rate": 2.415168651356556e-05,
"loss": 1.5745,
"step": 246
},
{
"epoch": 0.8972972972972973,
"grad_norm": 3.6185129562881513,
"learning_rate": 2.4018562776081643e-05,
"loss": 1.5989,
"step": 249
},
{
"epoch": 0.9081081081081082,
"grad_norm": 4.499909371969758,
"learning_rate": 2.388431711917324e-05,
"loss": 1.5609,
"step": 252
},
{
"epoch": 0.918918918918919,
"grad_norm": 3.6576864938242832,
"learning_rate": 2.3748966243152127e-05,
"loss": 1.5623,
"step": 255
},
{
"epoch": 0.9297297297297298,
"grad_norm": 4.261199238023545,
"learning_rate": 2.3612526985820586e-05,
"loss": 1.5523,
"step": 258
},
{
"epoch": 0.9405405405405406,
"grad_norm": 4.730374719738293,
"learning_rate": 2.347501632037678e-05,
"loss": 1.5813,
"step": 261
},
{
"epoch": 0.9513513513513514,
"grad_norm": 3.7110704143642503,
"learning_rate": 2.333645135330324e-05,
"loss": 1.4888,
"step": 264
},
{
"epoch": 0.9621621621621622,
"grad_norm": 3.481005791064881,
"learning_rate": 2.3196849322238816e-05,
"loss": 1.6186,
"step": 267
},
{
"epoch": 0.972972972972973,
"grad_norm": 3.9410070667987913,
"learning_rate": 2.3056227593834306e-05,
"loss": 1.5343,
"step": 270
},
{
"epoch": 0.9837837837837838,
"grad_norm": 3.73687483401855,
"learning_rate": 2.291460366159199e-05,
"loss": 1.527,
"step": 273
},
{
"epoch": 0.9945945945945946,
"grad_norm": 3.636935348418019,
"learning_rate": 2.277199514368947e-05,
"loss": 1.5228,
"step": 276
},
{
"epoch": 1.0054054054054054,
"grad_norm": 3.5028224113856457,
"learning_rate": 2.2628419780787887e-05,
"loss": 1.3043,
"step": 279
},
{
"epoch": 1.0162162162162163,
"grad_norm": 3.2714761796276455,
"learning_rate": 2.2483895433825023e-05,
"loss": 1.0507,
"step": 282
},
{
"epoch": 1.027027027027027,
"grad_norm": 3.180825722720309,
"learning_rate": 2.2338440081793332e-05,
"loss": 1.0155,
"step": 285
},
{
"epoch": 1.037837837837838,
"grad_norm": 2.9167211293609894,
"learning_rate": 2.2192071819503365e-05,
"loss": 1.0087,
"step": 288
},
{
"epoch": 1.0486486486486486,
"grad_norm": 3.1930797413555077,
"learning_rate": 2.2044808855332743e-05,
"loss": 0.9847,
"step": 291
},
{
"epoch": 1.0594594594594595,
"grad_norm": 3.0743072086936474,
"learning_rate": 2.1896669508961002e-05,
"loss": 1.0024,
"step": 294
},
{
"epoch": 1.0702702702702702,
"grad_norm": 3.3931402915538613,
"learning_rate": 2.1747672209090627e-05,
"loss": 1.0063,
"step": 297
},
{
"epoch": 1.0810810810810811,
"grad_norm": 3.427840497426894,
"learning_rate": 2.1597835491154495e-05,
"loss": 0.9924,
"step": 300
},
{
"epoch": 1.0918918918918918,
"grad_norm": 3.209752499479298,
"learning_rate": 2.1447177995010024e-05,
"loss": 1.0114,
"step": 303
},
{
"epoch": 1.1027027027027028,
"grad_norm": 2.9188122615255487,
"learning_rate": 2.1295718462620383e-05,
"loss": 0.9348,
"step": 306
},
{
"epoch": 1.1135135135135135,
"grad_norm": 3.2169410708018464,
"learning_rate": 2.1143475735722965e-05,
"loss": 0.9456,
"step": 309
},
{
"epoch": 1.1243243243243244,
"grad_norm": 3.2550857985332815,
"learning_rate": 2.099046875348543e-05,
"loss": 0.9704,
"step": 312
},
{
"epoch": 1.135135135135135,
"grad_norm": 3.200798957813093,
"learning_rate": 2.0836716550149685e-05,
"loss": 1.0187,
"step": 315
},
{
"epoch": 1.145945945945946,
"grad_norm": 3.026699827485341,
"learning_rate": 2.068223825266397e-05,
"loss": 0.9959,
"step": 318
},
{
"epoch": 1.1567567567567567,
"grad_norm": 2.966340597816754,
"learning_rate": 2.0527053078303463e-05,
"loss": 0.9672,
"step": 321
},
{
"epoch": 1.1675675675675676,
"grad_norm": 3.4796215218810578,
"learning_rate": 2.0371180332279642e-05,
"loss": 0.9631,
"step": 324
},
{
"epoch": 1.1783783783783783,
"grad_norm": 2.9446475013457203,
"learning_rate": 2.0214639405338653e-05,
"loss": 0.9922,
"step": 327
},
{
"epoch": 1.1891891891891893,
"grad_norm": 3.0107017661224447,
"learning_rate": 2.0057449771349123e-05,
"loss": 0.9846,
"step": 330
},
{
"epoch": 1.2,
"grad_norm": 3.1589173902147203,
"learning_rate": 1.989963098487957e-05,
"loss": 0.9945,
"step": 333
},
{
"epoch": 1.2108108108108109,
"grad_norm": 3.291095419768011,
"learning_rate": 1.9741202678765785e-05,
"loss": 1.0006,
"step": 336
},
{
"epoch": 1.2216216216216216,
"grad_norm": 3.0439357766975768,
"learning_rate": 1.9582184561668496e-05,
"loss": 1.0247,
"step": 339
},
{
"epoch": 1.2324324324324325,
"grad_norm": 2.7398517472244133,
"learning_rate": 1.942259641562159e-05,
"loss": 1.0129,
"step": 342
},
{
"epoch": 1.2432432432432432,
"grad_norm": 3.0466059717106098,
"learning_rate": 1.9262458093571193e-05,
"loss": 1.0257,
"step": 345
},
{
"epoch": 1.2540540540540541,
"grad_norm": 2.8458132575753714,
"learning_rate": 1.9101789516905953e-05,
"loss": 0.9715,
"step": 348
},
{
"epoch": 1.2648648648648648,
"grad_norm": 2.8328426905654656,
"learning_rate": 1.8940610672978803e-05,
"loss": 0.961,
"step": 351
},
{
"epoch": 1.2756756756756757,
"grad_norm": 3.030835646521939,
"learning_rate": 1.8778941612620482e-05,
"loss": 0.9884,
"step": 354
},
{
"epoch": 1.2864864864864864,
"grad_norm": 2.8633899892085024,
"learning_rate": 1.8616802447645223e-05,
"loss": 0.9937,
"step": 357
},
{
"epoch": 1.2972972972972974,
"grad_norm": 3.338996158976475,
"learning_rate": 1.8454213348348797e-05,
"loss": 0.9809,
"step": 360
},
{
"epoch": 1.308108108108108,
"grad_norm": 2.924814513226331,
"learning_rate": 1.8291194540999322e-05,
"loss": 0.9526,
"step": 363
},
{
"epoch": 1.318918918918919,
"grad_norm": 3.090470952947,
"learning_rate": 1.8127766305321072e-05,
"loss": 0.9912,
"step": 366
},
{
"epoch": 1.3297297297297297,
"grad_norm": 2.9540976533352867,
"learning_rate": 1.7963948971971686e-05,
"loss": 0.9725,
"step": 369
},
{
"epoch": 1.3405405405405406,
"grad_norm": 2.9280101457384986,
"learning_rate": 1.7799762920012982e-05,
"loss": 0.9508,
"step": 372
},
{
"epoch": 1.3513513513513513,
"grad_norm": 3.129222083901634,
"learning_rate": 1.763522857437579e-05,
"loss": 0.9952,
"step": 375
},
{
"epoch": 1.3621621621621622,
"grad_norm": 3.3207813445482315,
"learning_rate": 1.747036640331908e-05,
"loss": 0.9778,
"step": 378
},
{
"epoch": 1.372972972972973,
"grad_norm": 2.941815984953935,
"learning_rate": 1.7305196915883662e-05,
"loss": 0.9922,
"step": 381
},
{
"epoch": 1.3837837837837839,
"grad_norm": 3.1943275224301475,
"learning_rate": 1.713974065934086e-05,
"loss": 0.9738,
"step": 384
},
{
"epoch": 1.3945945945945946,
"grad_norm": 2.9545782873135478,
"learning_rate": 1.6974018216636394e-05,
"loss": 0.9712,
"step": 387
},
{
"epoch": 1.4054054054054055,
"grad_norm": 2.832796057451163,
"learning_rate": 1.6808050203829845e-05,
"loss": 1.0121,
"step": 390
},
{
"epoch": 1.4162162162162162,
"grad_norm": 3.2139763823586196,
"learning_rate": 1.6641857267530003e-05,
"loss": 0.9702,
"step": 393
},
{
"epoch": 1.427027027027027,
"grad_norm": 3.3210065946822827,
"learning_rate": 1.6475460082326377e-05,
"loss": 1.0018,
"step": 396
},
{
"epoch": 1.4378378378378378,
"grad_norm": 3.166940843865695,
"learning_rate": 1.6308879348217293e-05,
"loss": 0.9959,
"step": 399
},
{
"epoch": 1.4486486486486487,
"grad_norm": 3.1486302485385878,
"learning_rate": 1.6142135788034743e-05,
"loss": 0.9477,
"step": 402
},
{
"epoch": 1.4594594594594594,
"grad_norm": 3.1328749208815547,
"learning_rate": 1.5975250144866492e-05,
"loss": 0.9854,
"step": 405
},
{
"epoch": 1.4702702702702704,
"grad_norm": 2.9503463010514035,
"learning_rate": 1.5808243179475568e-05,
"loss": 1.0001,
"step": 408
},
{
"epoch": 1.481081081081081,
"grad_norm": 2.8925905903725355,
"learning_rate": 1.564113566771764e-05,
"loss": 0.9475,
"step": 411
},
{
"epoch": 1.491891891891892,
"grad_norm": 3.2184062381528196,
"learning_rate": 1.547394839795645e-05,
"loss": 0.9862,
"step": 414
},
{
"epoch": 1.5027027027027027,
"grad_norm": 3.0205819182026077,
"learning_rate": 1.530670216847772e-05,
"loss": 0.9689,
"step": 417
},
{
"epoch": 1.5135135135135136,
"grad_norm": 2.886699137488658,
"learning_rate": 1.5139417784901836e-05,
"loss": 0.9578,
"step": 420
},
{
"epoch": 1.5243243243243243,
"grad_norm": 3.0019029659558494,
"learning_rate": 1.4972116057595592e-05,
"loss": 0.9526,
"step": 423
},
{
"epoch": 1.535135135135135,
"grad_norm": 3.141168035086649,
"learning_rate": 1.480481779908337e-05,
"loss": 0.9621,
"step": 426
},
{
"epoch": 1.545945945945946,
"grad_norm": 2.842053920465437,
"learning_rate": 1.463754382145802e-05,
"loss": 0.9821,
"step": 429
},
{
"epoch": 1.5567567567567568,
"grad_norm": 3.220671922556498,
"learning_rate": 1.4470314933791828e-05,
"loss": 0.9547,
"step": 432
},
{
"epoch": 1.5675675675675675,
"grad_norm": 2.9122625506605586,
"learning_rate": 1.430315193954783e-05,
"loss": 0.9678,
"step": 435
},
{
"epoch": 1.5783783783783782,
"grad_norm": 2.6568836209674274,
"learning_rate": 1.4136075633991864e-05,
"loss": 0.9566,
"step": 438
},
{
"epoch": 1.5891891891891892,
"grad_norm": 2.7282858715379077,
"learning_rate": 1.3969106801605577e-05,
"loss": 0.9195,
"step": 441
},
{
"epoch": 1.6,
"grad_norm": 2.602059773028646,
"learning_rate": 1.3802266213500843e-05,
"loss": 0.955,
"step": 444
},
{
"epoch": 1.6108108108108108,
"grad_norm": 3.3786673839231423,
"learning_rate": 1.3635574624835798e-05,
"loss": 0.9645,
"step": 447
},
{
"epoch": 1.6216216216216215,
"grad_norm": 2.6986089589909,
"learning_rate": 1.3469052772232874e-05,
"loss": 0.98,
"step": 450
},
{
"epoch": 1.6324324324324324,
"grad_norm": 2.89235283174837,
"learning_rate": 1.3302721371199165e-05,
"loss": 0.9588,
"step": 453
},
{
"epoch": 1.6432432432432433,
"grad_norm": 2.935829812088402,
"learning_rate": 1.3136601113549349e-05,
"loss": 0.9354,
"step": 456
},
{
"epoch": 1.654054054054054,
"grad_norm": 2.737725384464134,
"learning_rate": 1.2970712664831644e-05,
"loss": 0.9574,
"step": 459
},
{
"epoch": 1.6648648648648647,
"grad_norm": 2.867513411111901,
"learning_rate": 1.2805076661756965e-05,
"loss": 0.9446,
"step": 462
},
{
"epoch": 1.6756756756756757,
"grad_norm": 2.787422977164954,
"learning_rate": 1.2639713709631709e-05,
"loss": 0.9558,
"step": 465
},
{
"epoch": 1.6864864864864866,
"grad_norm": 2.9076119130942026,
"learning_rate": 1.2474644379794421e-05,
"loss": 0.9286,
"step": 468
},
{
"epoch": 1.6972972972972973,
"grad_norm": 2.826910897786021,
"learning_rate": 1.2309889207056708e-05,
"loss": 0.9556,
"step": 471
},
{
"epoch": 1.708108108108108,
"grad_norm": 3.027330313518151,
"learning_rate": 1.2145468687148672e-05,
"loss": 0.9157,
"step": 474
},
{
"epoch": 1.718918918918919,
"grad_norm": 2.870766642781988,
"learning_rate": 1.1981403274169219e-05,
"loss": 0.9708,
"step": 477
},
{
"epoch": 1.7297297297297298,
"grad_norm": 2.745031713488,
"learning_rate": 1.1817713378041568e-05,
"loss": 0.9404,
"step": 480
},
{
"epoch": 1.7405405405405405,
"grad_norm": 2.8171111310049506,
"learning_rate": 1.1654419361974195e-05,
"loss": 0.9423,
"step": 483
},
{
"epoch": 1.7513513513513512,
"grad_norm": 2.8549039042787503,
"learning_rate": 1.1491541539927668e-05,
"loss": 0.951,
"step": 486
},
{
"epoch": 1.7621621621621621,
"grad_norm": 2.664120980356897,
"learning_rate": 1.1329100174087534e-05,
"loss": 0.9287,
"step": 489
},
{
"epoch": 1.772972972972973,
"grad_norm": 2.7039305514008096,
"learning_rate": 1.1167115472343693e-05,
"loss": 0.9584,
"step": 492
},
{
"epoch": 1.7837837837837838,
"grad_norm": 2.6778342659825025,
"learning_rate": 1.1005607585776527e-05,
"loss": 0.9151,
"step": 495
},
{
"epoch": 1.7945945945945945,
"grad_norm": 2.6005910068753857,
"learning_rate": 1.0844596606150055e-05,
"loss": 0.9501,
"step": 498
},
{
"epoch": 1.8054054054054054,
"grad_norm": 2.6765741105098364,
"learning_rate": 1.0684102563412519e-05,
"loss": 0.931,
"step": 501
},
{
"epoch": 1.8162162162162163,
"grad_norm": 2.811327607536862,
"learning_rate": 1.0524145423204623e-05,
"loss": 0.9793,
"step": 504
},
{
"epoch": 1.827027027027027,
"grad_norm": 2.92527323842401,
"learning_rate": 1.036474508437579e-05,
"loss": 0.9776,
"step": 507
},
{
"epoch": 1.8378378378378377,
"grad_norm": 2.789416429817517,
"learning_rate": 1.020592137650872e-05,
"loss": 0.947,
"step": 510
},
{
"epoch": 1.8486486486486486,
"grad_norm": 2.754589393028259,
"learning_rate": 1.004769405745257e-05,
"loss": 0.9685,
"step": 513
},
{
"epoch": 1.8594594594594596,
"grad_norm": 2.593923465827381,
"learning_rate": 9.890082810865046e-06,
"loss": 0.9317,
"step": 516
},
{
"epoch": 1.8702702702702703,
"grad_norm": 3.005765748087634,
"learning_rate": 9.733107243763754e-06,
"loss": 0.9612,
"step": 519
},
{
"epoch": 1.881081081081081,
"grad_norm": 2.6391444135921462,
"learning_rate": 9.576786884087037e-06,
"loss": 0.9431,
"step": 522
},
{
"epoch": 1.8918918918918919,
"grad_norm": 2.7958859686864823,
"learning_rate": 9.421141178264702e-06,
"loss": 0.9473,
"step": 525
},
{
"epoch": 1.9027027027027028,
"grad_norm": 2.8853568858381746,
"learning_rate": 9.266189488798854e-06,
"loss": 0.9404,
"step": 528
},
{
"epoch": 1.9135135135135135,
"grad_norm": 3.011863176958825,
"learning_rate": 9.111951091855164e-06,
"loss": 0.9424,
"step": 531
},
{
"epoch": 1.9243243243243242,
"grad_norm": 2.624513223364359,
"learning_rate": 8.95844517486492e-06,
"loss": 0.9404,
"step": 534
},
{
"epoch": 1.9351351351351351,
"grad_norm": 2.629936136635792,
"learning_rate": 8.805690834138076e-06,
"loss": 0.9588,
"step": 537
},
{
"epoch": 1.945945945945946,
"grad_norm": 2.8522026479916023,
"learning_rate": 8.65370707248763e-06,
"loss": 0.9339,
"step": 540
},
{
"epoch": 1.9567567567567568,
"grad_norm": 3.097766550094928,
"learning_rate": 8.502512796865686e-06,
"loss": 0.9394,
"step": 543
},
{
"epoch": 1.9675675675675675,
"grad_norm": 2.749849929848188,
"learning_rate": 8.352126816011382e-06,
"loss": 0.9402,
"step": 546
},
{
"epoch": 1.9783783783783784,
"grad_norm": 2.792496713680321,
"learning_rate": 8.202567838111078e-06,
"loss": 0.9403,
"step": 549
},
{
"epoch": 1.9891891891891893,
"grad_norm": 2.742908628148625,
"learning_rate": 8.053854468471025e-06,
"loss": 0.9475,
"step": 552
},
{
"epoch": 2.0,
"grad_norm": 2.80461985695162,
"learning_rate": 7.906005207202852e-06,
"loss": 0.9251,
"step": 555
},
{
"epoch": 2.0108108108108107,
"grad_norm": 2.712256627105201,
"learning_rate": 7.75903844692212e-06,
"loss": 0.4979,
"step": 558
},
{
"epoch": 2.0216216216216214,
"grad_norm": 2.2575975329190823,
"learning_rate": 7.61297247046029e-06,
"loss": 0.4357,
"step": 561
},
{
"epoch": 2.0324324324324325,
"grad_norm": 2.8247101377056865,
"learning_rate": 7.4678254485902675e-06,
"loss": 0.4334,
"step": 564
},
{
"epoch": 2.0432432432432432,
"grad_norm": 2.588953106795816,
"learning_rate": 7.3236154377659825e-06,
"loss": 0.4327,
"step": 567
},
{
"epoch": 2.054054054054054,
"grad_norm": 2.2112157456197807,
"learning_rate": 7.180360377876125e-06,
"loss": 0.4301,
"step": 570
},
{
"epoch": 2.064864864864865,
"grad_norm": 2.1916524154888903,
"learning_rate": 7.038078090012406e-06,
"loss": 0.4254,
"step": 573
},
{
"epoch": 2.075675675675676,
"grad_norm": 2.092153236540328,
"learning_rate": 6.896786274252595e-06,
"loss": 0.4066,
"step": 576
},
{
"epoch": 2.0864864864864865,
"grad_norm": 2.1223817949774033,
"learning_rate": 6.7565025074586145e-06,
"loss": 0.4018,
"step": 579
},
{
"epoch": 2.097297297297297,
"grad_norm": 1.975599624746057,
"learning_rate": 6.617244241089947e-06,
"loss": 0.3899,
"step": 582
},
{
"epoch": 2.108108108108108,
"grad_norm": 1.8893909454422486,
"learning_rate": 6.479028799032664e-06,
"loss": 0.397,
"step": 585
},
{
"epoch": 2.118918918918919,
"grad_norm": 1.875678473328706,
"learning_rate": 6.3418733754443136e-06,
"loss": 0.407,
"step": 588
},
{
"epoch": 2.1297297297297297,
"grad_norm": 2.210895295229888,
"learning_rate": 6.205795032614943e-06,
"loss": 0.4039,
"step": 591
},
{
"epoch": 2.1405405405405404,
"grad_norm": 2.2866290593300573,
"learning_rate": 6.07081069884453e-06,
"loss": 0.3975,
"step": 594
},
{
"epoch": 2.1513513513513516,
"grad_norm": 2.169724698947998,
"learning_rate": 5.936937166337093e-06,
"loss": 0.404,
"step": 597
},
{
"epoch": 2.1621621621621623,
"grad_norm": 2.5885052465204503,
"learning_rate": 5.804191089111711e-06,
"loss": 0.4137,
"step": 600
},
{
"epoch": 2.172972972972973,
"grad_norm": 2.1184895283704273,
"learning_rate": 5.6725889809307486e-06,
"loss": 0.4069,
"step": 603
},
{
"epoch": 2.1837837837837837,
"grad_norm": 2.055767847916725,
"learning_rate": 5.5421472132455285e-06,
"loss": 0.4309,
"step": 606
},
{
"epoch": 2.1945945945945944,
"grad_norm": 1.9387007802838037,
"learning_rate": 5.412882013159697e-06,
"loss": 0.3989,
"step": 609
},
{
"epoch": 2.2054054054054055,
"grad_norm": 1.9041479568200537,
"learning_rate": 5.284809461410556e-06,
"loss": 0.4013,
"step": 612
},
{
"epoch": 2.2162162162162162,
"grad_norm": 2.0548881191902018,
"learning_rate": 5.157945490368621e-06,
"loss": 0.4205,
"step": 615
},
{
"epoch": 2.227027027027027,
"grad_norm": 2.0831599061407204,
"learning_rate": 5.03230588205558e-06,
"loss": 0.4122,
"step": 618
},
{
"epoch": 2.237837837837838,
"grad_norm": 1.971310383757786,
"learning_rate": 4.907906266181014e-06,
"loss": 0.3837,
"step": 621
},
{
"epoch": 2.2486486486486488,
"grad_norm": 1.977557211024187,
"learning_rate": 4.784762118198041e-06,
"loss": 0.3981,
"step": 624
},
{
"epoch": 2.2594594594594595,
"grad_norm": 1.9559375507208316,
"learning_rate": 4.66288875737816e-06,
"loss": 0.4094,
"step": 627
},
{
"epoch": 2.27027027027027,
"grad_norm": 1.9123345255397275,
"learning_rate": 4.542301344905496e-06,
"loss": 0.3863,
"step": 630
},
{
"epoch": 2.281081081081081,
"grad_norm": 1.8524912274987262,
"learning_rate": 4.423014881990751e-06,
"loss": 0.3908,
"step": 633
},
{
"epoch": 2.291891891891892,
"grad_norm": 2.0911936019239246,
"learning_rate": 4.305044208005023e-06,
"loss": 0.4167,
"step": 636
},
{
"epoch": 2.3027027027027027,
"grad_norm": 1.9050565892596198,
"learning_rate": 4.188403998633775e-06,
"loss": 0.3955,
"step": 639
},
{
"epoch": 2.3135135135135134,
"grad_norm": 1.8967742593703636,
"learning_rate": 4.0731087640511735e-06,
"loss": 0.4163,
"step": 642
},
{
"epoch": 2.3243243243243246,
"grad_norm": 2.051977219640454,
"learning_rate": 3.959172847114991e-06,
"loss": 0.4024,
"step": 645
},
{
"epoch": 2.3351351351351353,
"grad_norm": 2.0012355554132792,
"learning_rate": 3.846610421582349e-06,
"loss": 0.4157,
"step": 648
},
{
"epoch": 2.345945945945946,
"grad_norm": 2.175698094340077,
"learning_rate": 3.7354354903464793e-06,
"loss": 0.4024,
"step": 651
},
{
"epoch": 2.3567567567567567,
"grad_norm": 2.0526759215687362,
"learning_rate": 3.625661883694753e-06,
"loss": 0.3939,
"step": 654
},
{
"epoch": 2.3675675675675674,
"grad_norm": 1.960306969771245,
"learning_rate": 3.5173032575881768e-06,
"loss": 0.4074,
"step": 657
},
{
"epoch": 2.3783783783783785,
"grad_norm": 2.1570885212061826,
"learning_rate": 3.4103730919625753e-06,
"loss": 0.3976,
"step": 660
},
{
"epoch": 2.389189189189189,
"grad_norm": 1.8949986677811612,
"learning_rate": 3.3048846890516658e-06,
"loss": 0.4,
"step": 663
},
{
"epoch": 2.4,
"grad_norm": 1.982249761308161,
"learning_rate": 3.2008511717322593e-06,
"loss": 0.4133,
"step": 666
},
{
"epoch": 2.410810810810811,
"grad_norm": 1.980047898141974,
"learning_rate": 3.098285481891745e-06,
"loss": 0.3939,
"step": 669
},
{
"epoch": 2.4216216216216218,
"grad_norm": 2.0049995734478965,
"learning_rate": 2.9972003788181146e-06,
"loss": 0.3926,
"step": 672
},
{
"epoch": 2.4324324324324325,
"grad_norm": 1.8400258173639734,
"learning_rate": 2.8976084376126848e-06,
"loss": 0.3936,
"step": 675
},
{
"epoch": 2.443243243243243,
"grad_norm": 1.9448462664129043,
"learning_rate": 2.7995220476257482e-06,
"loss": 0.388,
"step": 678
},
{
"epoch": 2.454054054054054,
"grad_norm": 1.9031160601187072,
"learning_rate": 2.7029534109153186e-06,
"loss": 0.3909,
"step": 681
},
{
"epoch": 2.464864864864865,
"grad_norm": 2.279997846004982,
"learning_rate": 2.6079145407291877e-06,
"loss": 0.3895,
"step": 684
},
{
"epoch": 2.4756756756756757,
"grad_norm": 1.8403404089990134,
"learning_rate": 2.514417260010455e-06,
"loss": 0.3976,
"step": 687
},
{
"epoch": 2.4864864864864864,
"grad_norm": 1.7969451736164892,
"learning_rate": 2.4224731999267425e-06,
"loss": 0.3999,
"step": 690
},
{
"epoch": 2.4972972972972975,
"grad_norm": 1.9253974055183771,
"learning_rate": 2.3320937984232664e-06,
"loss": 0.3939,
"step": 693
},
{
"epoch": 2.5081081081081082,
"grad_norm": 1.913704985114193,
"learning_rate": 2.243290298799945e-06,
"loss": 0.3984,
"step": 696
},
{
"epoch": 2.518918918918919,
"grad_norm": 1.9790173152408796,
"learning_rate": 2.156073748312721e-06,
"loss": 0.3819,
"step": 699
},
{
"epoch": 2.5297297297297296,
"grad_norm": 2.276492968512024,
"learning_rate": 2.070454996799261e-06,
"loss": 0.4039,
"step": 702
},
{
"epoch": 2.5405405405405403,
"grad_norm": 1.7739681476430693,
"learning_rate": 1.9864446953292313e-06,
"loss": 0.3791,
"step": 705
},
{
"epoch": 2.5513513513513515,
"grad_norm": 2.0336913560870196,
"learning_rate": 1.9040532948792934e-06,
"loss": 0.3847,
"step": 708
},
{
"epoch": 2.562162162162162,
"grad_norm": 1.8946193097351467,
"learning_rate": 1.8232910450329832e-06,
"loss": 0.385,
"step": 711
},
{
"epoch": 2.572972972972973,
"grad_norm": 1.9817636629737283,
"learning_rate": 1.744167992705664e-06,
"loss": 0.3914,
"step": 714
},
{
"epoch": 2.583783783783784,
"grad_norm": 1.8202147731376643,
"learning_rate": 1.6666939808946619e-06,
"loss": 0.377,
"step": 717
},
{
"epoch": 2.5945945945945947,
"grad_norm": 1.804624257938459,
"learning_rate": 1.5908786474548004e-06,
"loss": 0.3834,
"step": 720
},
{
"epoch": 2.6054054054054054,
"grad_norm": 1.9478831371089558,
"learning_rate": 1.5167314238994367e-06,
"loss": 0.3802,
"step": 723
},
{
"epoch": 2.616216216216216,
"grad_norm": 1.8418242757562502,
"learning_rate": 1.4442615342271625e-06,
"loss": 0.3742,
"step": 726
},
{
"epoch": 2.627027027027027,
"grad_norm": 1.8235265917309187,
"learning_rate": 1.3734779937743403e-06,
"loss": 0.3763,
"step": 729
},
{
"epoch": 2.637837837837838,
"grad_norm": 1.8148562882498185,
"learning_rate": 1.3043896080935785e-06,
"loss": 0.3764,
"step": 732
},
{
"epoch": 2.6486486486486487,
"grad_norm": 1.9162200026873921,
"learning_rate": 1.237004971858307e-06,
"loss": 0.4009,
"step": 735
},
{
"epoch": 2.6594594594594594,
"grad_norm": 1.971484443435529,
"learning_rate": 1.1713324677936015e-06,
"loss": 0.3894,
"step": 738
},
{
"epoch": 2.6702702702702705,
"grad_norm": 2.6288039366865883,
"learning_rate": 1.1073802656333548e-06,
"loss": 0.3736,
"step": 741
},
{
"epoch": 2.6810810810810812,
"grad_norm": 1.8111148825496188,
"learning_rate": 1.0451563211039494e-06,
"loss": 0.3996,
"step": 744
},
{
"epoch": 2.691891891891892,
"grad_norm": 1.7806298978071708,
"learning_rate": 9.846683749345648e-07,
"loss": 0.383,
"step": 747
},
{
"epoch": 2.7027027027027026,
"grad_norm": 4.0497002081385185,
"learning_rate": 9.25923951894222e-07,
"loss": 0.3965,
"step": 750
},
{
"epoch": 2.7135135135135133,
"grad_norm": 1.8334211425058837,
"learning_rate": 8.68930359855683e-07,
"loss": 0.3989,
"step": 753
},
{
"epoch": 2.7243243243243245,
"grad_norm": 1.7500539556657924,
"learning_rate": 8.136946888863528e-07,
"loss": 0.395,
"step": 756
},
{
"epoch": 2.735135135135135,
"grad_norm": 1.9130969263501059,
"learning_rate": 7.602238103662646e-07,
"loss": 0.3853,
"step": 759
},
{
"epoch": 2.745945945945946,
"grad_norm": 1.8404236110308207,
"learning_rate": 7.085243761332738e-07,
"loss": 0.393,
"step": 762
},
{
"epoch": 2.756756756756757,
"grad_norm": 1.7456858490902225,
"learning_rate": 6.586028176555536e-07,
"loss": 0.3944,
"step": 765
},
{
"epoch": 2.7675675675675677,
"grad_norm": 1.837065449202062,
"learning_rate": 6.104653452315279e-07,
"loss": 0.3798,
"step": 768
},
{
"epoch": 2.7783783783783784,
"grad_norm": 2.3308657348413058,
"learning_rate": 5.641179472172875e-07,
"loss": 0.3798,
"step": 771
},
{
"epoch": 2.789189189189189,
"grad_norm": 1.7969746620946272,
"learning_rate": 5.195663892816432e-07,
"loss": 0.3817,
"step": 774
},
{
"epoch": 2.8,
"grad_norm": 1.8403934823419463,
"learning_rate": 4.768162136888643e-07,
"loss": 0.3791,
"step": 777
},
{
"epoch": 2.810810810810811,
"grad_norm": 1.8084423900988431,
"learning_rate": 4.3587273860921985e-07,
"loss": 0.3613,
"step": 780
},
{
"epoch": 2.8216216216216217,
"grad_norm": 1.8704402298319724,
"learning_rate": 3.9674105745738155e-07,
"loss": 0.3771,
"step": 783
},
{
"epoch": 2.8324324324324324,
"grad_norm": 1.818023687371634,
"learning_rate": 3.594260382588105e-07,
"loss": 0.3888,
"step": 786
},
{
"epoch": 2.8432432432432435,
"grad_norm": 1.8608896733650853,
"learning_rate": 3.239323230441615e-07,
"loss": 0.3888,
"step": 789
},
{
"epoch": 2.854054054054054,
"grad_norm": 1.938515453919976,
"learning_rate": 2.902643272718086e-07,
"loss": 0.4002,
"step": 792
},
{
"epoch": 2.864864864864865,
"grad_norm": 1.94145103701424,
"learning_rate": 2.5842623927856244e-07,
"loss": 0.3858,
"step": 795
},
{
"epoch": 2.8756756756756756,
"grad_norm": 1.7260066637899822,
"learning_rate": 2.28422019758629e-07,
"loss": 0.3905,
"step": 798
},
{
"epoch": 2.8864864864864863,
"grad_norm": 1.8230164360318986,
"learning_rate": 2.0025540127090513e-07,
"loss": 0.3977,
"step": 801
},
{
"epoch": 2.8972972972972975,
"grad_norm": 1.7780456307114303,
"learning_rate": 1.7392988777463202e-07,
"loss": 0.3881,
"step": 804
},
{
"epoch": 2.908108108108108,
"grad_norm": 1.9497490854182644,
"learning_rate": 1.4944875419350855e-07,
"loss": 0.3797,
"step": 807
},
{
"epoch": 2.918918918918919,
"grad_norm": 1.6524262781562993,
"learning_rate": 1.268150460082823e-07,
"loss": 0.3645,
"step": 810
},
{
"epoch": 2.92972972972973,
"grad_norm": 1.8325737906994488,
"learning_rate": 1.0603157887788428e-07,
"loss": 0.3574,
"step": 813
},
{
"epoch": 2.9405405405405407,
"grad_norm": 1.8188448217016162,
"learning_rate": 8.710093828917076e-08,
"loss": 0.3829,
"step": 816
},
{
"epoch": 2.9513513513513514,
"grad_norm": 1.774843520134497,
"learning_rate": 7.002547923527058e-08,
"loss": 0.3945,
"step": 819
},
{
"epoch": 2.962162162162162,
"grad_norm": 1.7417401019158905,
"learning_rate": 5.4807325922632825e-08,
"loss": 0.37,
"step": 822
},
{
"epoch": 2.972972972972973,
"grad_norm": 1.6997699000548114,
"learning_rate": 4.14483715067665e-08,
"loss": 0.3702,
"step": 825
},
{
"epoch": 2.983783783783784,
"grad_norm": 1.7185773019727228,
"learning_rate": 2.995027785673066e-08,
"loss": 0.3829,
"step": 828
},
{
"epoch": 2.9945945945945946,
"grad_norm": 1.7433824271169698,
"learning_rate": 2.0314475348401362e-08,
"loss": 0.3777,
"step": 831
}
],
"logging_steps": 3,
"max_steps": 845,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 833,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 232851391348736.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}