{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.001801801801802, |
|
"eval_steps": 500, |
|
"global_step": 833, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010810810810810811, |
|
"grad_norm": 44.04393707727108, |
|
"learning_rate": 2.9999066991504905e-05, |
|
"loss": 2.3711, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.021621621621621623, |
|
"grad_norm": 31.04171552882201, |
|
"learning_rate": 2.9996268082086924e-05, |
|
"loss": 4.159, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.032432432432432434, |
|
"grad_norm": 11.54576214967521, |
|
"learning_rate": 2.9991603619933566e-05, |
|
"loss": 1.9733, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.043243243243243246, |
|
"grad_norm": 7.08841552325599, |
|
"learning_rate": 2.9985074185309204e-05, |
|
"loss": 1.7978, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.05405405405405406, |
|
"grad_norm": 5.949508626432288, |
|
"learning_rate": 2.99766805904829e-05, |
|
"loss": 1.7347, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06486486486486487, |
|
"grad_norm": 6.168616775238258, |
|
"learning_rate": 2.9966423879627356e-05, |
|
"loss": 1.6033, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.07567567567567568, |
|
"grad_norm": 5.728079542384497, |
|
"learning_rate": 2.9954305328689024e-05, |
|
"loss": 1.7134, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.08648648648648649, |
|
"grad_norm": 6.284326561040228, |
|
"learning_rate": 2.9940326445229367e-05, |
|
"loss": 1.6933, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0972972972972973, |
|
"grad_norm": 6.92658975011714, |
|
"learning_rate": 2.9924488968237316e-05, |
|
"loss": 1.5923, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.10810810810810811, |
|
"grad_norm": 6.538508694879061, |
|
"learning_rate": 2.9906794867912953e-05, |
|
"loss": 1.6931, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11891891891891893, |
|
"grad_norm": 4.685530306007965, |
|
"learning_rate": 2.98872463454224e-05, |
|
"loss": 1.6559, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.12972972972972974, |
|
"grad_norm": 5.65503266442286, |
|
"learning_rate": 2.9865845832623993e-05, |
|
"loss": 1.6982, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.14054054054054055, |
|
"grad_norm": 4.886380857119004, |
|
"learning_rate": 2.9842595991765766e-05, |
|
"loss": 1.6503, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.15135135135135136, |
|
"grad_norm": 5.026086310034092, |
|
"learning_rate": 2.981749971515426e-05, |
|
"loss": 1.632, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.16216216216216217, |
|
"grad_norm": 4.795570770299284, |
|
"learning_rate": 2.9790560124794702e-05, |
|
"loss": 1.6824, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.17297297297297298, |
|
"grad_norm": 4.756143563325781, |
|
"learning_rate": 2.976178057200266e-05, |
|
"loss": 1.6694, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1837837837837838, |
|
"grad_norm": 5.364943432581566, |
|
"learning_rate": 2.9731164636987088e-05, |
|
"loss": 1.6659, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.1945945945945946, |
|
"grad_norm": 5.181051766279552, |
|
"learning_rate": 2.9698716128404985e-05, |
|
"loss": 1.6443, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.20540540540540542, |
|
"grad_norm": 4.828479392346181, |
|
"learning_rate": 2.9664439082887568e-05, |
|
"loss": 1.6519, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.21621621621621623, |
|
"grad_norm": 5.824361936201152, |
|
"learning_rate": 2.9628337764538135e-05, |
|
"loss": 1.6532, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.22702702702702704, |
|
"grad_norm": 6.393887712988006, |
|
"learning_rate": 2.9590416664401566e-05, |
|
"loss": 1.6409, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.23783783783783785, |
|
"grad_norm": 5.8650966692501765, |
|
"learning_rate": 2.955068049990568e-05, |
|
"loss": 1.6105, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.24864864864864866, |
|
"grad_norm": 4.755849083042293, |
|
"learning_rate": 2.9509134214274343e-05, |
|
"loss": 1.6618, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.2594594594594595, |
|
"grad_norm": 3.9789106042521962, |
|
"learning_rate": 2.9465782975912553e-05, |
|
"loss": 1.6645, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.2702702702702703, |
|
"grad_norm": 4.592578050100414, |
|
"learning_rate": 2.942063217776346e-05, |
|
"loss": 1.605, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2810810810810811, |
|
"grad_norm": 4.299733571350802, |
|
"learning_rate": 2.9373687436637492e-05, |
|
"loss": 1.6233, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.2918918918918919, |
|
"grad_norm": 4.585181401202116, |
|
"learning_rate": 2.9324954592513626e-05, |
|
"loss": 1.6587, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3027027027027027, |
|
"grad_norm": 4.530135718622418, |
|
"learning_rate": 2.927443970781287e-05, |
|
"loss": 1.6333, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.31351351351351353, |
|
"grad_norm": 5.349009947479682, |
|
"learning_rate": 2.9222149066644088e-05, |
|
"loss": 1.6431, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.32432432432432434, |
|
"grad_norm": 4.266181060344957, |
|
"learning_rate": 2.916808917402228e-05, |
|
"loss": 1.598, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.33513513513513515, |
|
"grad_norm": 4.628674419343668, |
|
"learning_rate": 2.911226675505932e-05, |
|
"loss": 1.6375, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.34594594594594597, |
|
"grad_norm": 5.058644550344611, |
|
"learning_rate": 2.905468875412735e-05, |
|
"loss": 1.6427, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.3567567567567568, |
|
"grad_norm": 4.417115318512762, |
|
"learning_rate": 2.8995362333994906e-05, |
|
"loss": 1.6333, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.3675675675675676, |
|
"grad_norm": 4.0811273644284345, |
|
"learning_rate": 2.8934294874935848e-05, |
|
"loss": 1.5855, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.3783783783783784, |
|
"grad_norm": 3.962543949681069, |
|
"learning_rate": 2.887149397381126e-05, |
|
"loss": 1.6171, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3891891891891892, |
|
"grad_norm": 3.934001274285887, |
|
"learning_rate": 2.8806967443124372e-05, |
|
"loss": 1.5538, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 4.891305018487301, |
|
"learning_rate": 2.8740723310048682e-05, |
|
"loss": 1.6476, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.41081081081081083, |
|
"grad_norm": 4.553656766898869, |
|
"learning_rate": 2.8672769815429385e-05, |
|
"loss": 1.5889, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.42162162162162165, |
|
"grad_norm": 4.450034079395778, |
|
"learning_rate": 2.860311541275818e-05, |
|
"loss": 1.5896, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.43243243243243246, |
|
"grad_norm": 3.9485209950285274, |
|
"learning_rate": 2.8531768767121656e-05, |
|
"loss": 1.6198, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.44324324324324327, |
|
"grad_norm": 4.35512448284715, |
|
"learning_rate": 2.845873875412335e-05, |
|
"loss": 1.6443, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.4540540540540541, |
|
"grad_norm": 4.651266887881564, |
|
"learning_rate": 2.838403445877958e-05, |
|
"loss": 1.6542, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.4648648648648649, |
|
"grad_norm": 4.5294288157069, |
|
"learning_rate": 2.8307665174389323e-05, |
|
"loss": 1.655, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.4756756756756757, |
|
"grad_norm": 4.104985239339571, |
|
"learning_rate": 2.822964040137805e-05, |
|
"loss": 1.6827, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.4864864864864865, |
|
"grad_norm": 3.7948515475231286, |
|
"learning_rate": 2.8149969846115894e-05, |
|
"loss": 1.6333, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.4972972972972973, |
|
"grad_norm": 4.449225329061536, |
|
"learning_rate": 2.8068663419710182e-05, |
|
"loss": 1.6185, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.5081081081081081, |
|
"grad_norm": 4.391987231579949, |
|
"learning_rate": 2.7985731236772448e-05, |
|
"loss": 1.6078, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.518918918918919, |
|
"grad_norm": 4.982415169833182, |
|
"learning_rate": 2.7901183614160185e-05, |
|
"loss": 1.6529, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.5297297297297298, |
|
"grad_norm": 4.176595151214056, |
|
"learning_rate": 2.7815031069693412e-05, |
|
"loss": 1.6073, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.5405405405405406, |
|
"grad_norm": 4.3206148554703105, |
|
"learning_rate": 2.7727284320846246e-05, |
|
"loss": 1.5561, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5513513513513514, |
|
"grad_norm": 4.424758608775709, |
|
"learning_rate": 2.7637954283413632e-05, |
|
"loss": 1.6253, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.5621621621621622, |
|
"grad_norm": 5.349711813640235, |
|
"learning_rate": 2.75470520701534e-05, |
|
"loss": 1.7059, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.572972972972973, |
|
"grad_norm": 4.578654891344146, |
|
"learning_rate": 2.7454588989403858e-05, |
|
"loss": 1.6107, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.5837837837837838, |
|
"grad_norm": 40.22880683574773, |
|
"learning_rate": 2.7360576543676972e-05, |
|
"loss": 1.6278, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.5945945945945946, |
|
"grad_norm": 4.155039894851776, |
|
"learning_rate": 2.7265026428227476e-05, |
|
"loss": 1.6301, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6054054054054054, |
|
"grad_norm": 4.159866031946415, |
|
"learning_rate": 2.7167950529597963e-05, |
|
"loss": 1.5342, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.6162162162162163, |
|
"grad_norm": 4.087141493968059, |
|
"learning_rate": 2.706936092414018e-05, |
|
"loss": 1.6033, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.6270270270270271, |
|
"grad_norm": 3.825646270675215, |
|
"learning_rate": 2.696926987651271e-05, |
|
"loss": 1.5288, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.6378378378378379, |
|
"grad_norm": 3.917523159059879, |
|
"learning_rate": 2.686768983815526e-05, |
|
"loss": 1.6363, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.6486486486486487, |
|
"grad_norm": 4.202629239471907, |
|
"learning_rate": 2.676463344573965e-05, |
|
"loss": 1.6052, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6594594594594595, |
|
"grad_norm": 3.6368092847747304, |
|
"learning_rate": 2.666011351959783e-05, |
|
"loss": 1.6309, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.6702702702702703, |
|
"grad_norm": 4.253168243144118, |
|
"learning_rate": 2.6554143062126995e-05, |
|
"loss": 1.5592, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.6810810810810811, |
|
"grad_norm": 4.995354779998164, |
|
"learning_rate": 2.6446735256172092e-05, |
|
"loss": 1.6303, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.6918918918918919, |
|
"grad_norm": 4.421549411402136, |
|
"learning_rate": 2.6337903463385836e-05, |
|
"loss": 1.5769, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.7027027027027027, |
|
"grad_norm": 4.32615026522547, |
|
"learning_rate": 2.6227661222566516e-05, |
|
"loss": 1.613, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.7135135135135136, |
|
"grad_norm": 3.8575639988103836, |
|
"learning_rate": 2.6116022247973773e-05, |
|
"loss": 1.5844, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.7243243243243244, |
|
"grad_norm": 3.7279832633133028, |
|
"learning_rate": 2.6003000427622484e-05, |
|
"loss": 1.5301, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.7351351351351352, |
|
"grad_norm": 4.190711163922663, |
|
"learning_rate": 2.5888609821555127e-05, |
|
"loss": 1.592, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.745945945945946, |
|
"grad_norm": 4.733000892445367, |
|
"learning_rate": 2.577286466009266e-05, |
|
"loss": 1.6574, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.7567567567567568, |
|
"grad_norm": 4.577219211132897, |
|
"learning_rate": 2.5655779342064276e-05, |
|
"loss": 1.6289, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7675675675675676, |
|
"grad_norm": 4.048131970531039, |
|
"learning_rate": 2.553736843301615e-05, |
|
"loss": 1.6169, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.7783783783783784, |
|
"grad_norm": 4.018546715630257, |
|
"learning_rate": 2.5417646663399502e-05, |
|
"loss": 1.5489, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.7891891891891892, |
|
"grad_norm": 3.7010313992210992, |
|
"learning_rate": 2.529662892673806e-05, |
|
"loss": 1.5596, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 4.557965597883243, |
|
"learning_rate": 2.5174330277775354e-05, |
|
"loss": 1.6145, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.8108108108108109, |
|
"grad_norm": 4.181549208740728, |
|
"learning_rate": 2.5050765930601836e-05, |
|
"loss": 1.5339, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.8216216216216217, |
|
"grad_norm": 3.7892758830012823, |
|
"learning_rate": 2.4925951256762254e-05, |
|
"loss": 1.5862, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.8324324324324325, |
|
"grad_norm": 3.6130747678919666, |
|
"learning_rate": 2.4799901783343407e-05, |
|
"loss": 1.4857, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.8432432432432433, |
|
"grad_norm": 3.639537345617851, |
|
"learning_rate": 2.467263319104256e-05, |
|
"loss": 1.5902, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.8540540540540541, |
|
"grad_norm": 4.0474919753332035, |
|
"learning_rate": 2.4544161312216752e-05, |
|
"loss": 1.5395, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.8648648648648649, |
|
"grad_norm": 3.800979434984059, |
|
"learning_rate": 2.441450212891323e-05, |
|
"loss": 1.5284, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8756756756756757, |
|
"grad_norm": 3.3611120493742983, |
|
"learning_rate": 2.4283671770881256e-05, |
|
"loss": 1.515, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.8864864864864865, |
|
"grad_norm": 3.459228078638404, |
|
"learning_rate": 2.415168651356556e-05, |
|
"loss": 1.5745, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.8972972972972973, |
|
"grad_norm": 3.6185129562881513, |
|
"learning_rate": 2.4018562776081643e-05, |
|
"loss": 1.5989, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.9081081081081082, |
|
"grad_norm": 4.499909371969758, |
|
"learning_rate": 2.388431711917324e-05, |
|
"loss": 1.5609, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.918918918918919, |
|
"grad_norm": 3.6576864938242832, |
|
"learning_rate": 2.3748966243152127e-05, |
|
"loss": 1.5623, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.9297297297297298, |
|
"grad_norm": 4.261199238023545, |
|
"learning_rate": 2.3612526985820586e-05, |
|
"loss": 1.5523, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.9405405405405406, |
|
"grad_norm": 4.730374719738293, |
|
"learning_rate": 2.347501632037678e-05, |
|
"loss": 1.5813, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.9513513513513514, |
|
"grad_norm": 3.7110704143642503, |
|
"learning_rate": 2.333645135330324e-05, |
|
"loss": 1.4888, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.9621621621621622, |
|
"grad_norm": 3.481005791064881, |
|
"learning_rate": 2.3196849322238816e-05, |
|
"loss": 1.6186, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.972972972972973, |
|
"grad_norm": 3.9410070667987913, |
|
"learning_rate": 2.3056227593834306e-05, |
|
"loss": 1.5343, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9837837837837838, |
|
"grad_norm": 3.73687483401855, |
|
"learning_rate": 2.291460366159199e-05, |
|
"loss": 1.527, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.9945945945945946, |
|
"grad_norm": 3.636935348418019, |
|
"learning_rate": 2.277199514368947e-05, |
|
"loss": 1.5228, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.0054054054054054, |
|
"grad_norm": 3.5028224113856457, |
|
"learning_rate": 2.2628419780787887e-05, |
|
"loss": 1.3043, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.0162162162162163, |
|
"grad_norm": 3.2714761796276455, |
|
"learning_rate": 2.2483895433825023e-05, |
|
"loss": 1.0507, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.027027027027027, |
|
"grad_norm": 3.180825722720309, |
|
"learning_rate": 2.2338440081793332e-05, |
|
"loss": 1.0155, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.037837837837838, |
|
"grad_norm": 2.9167211293609894, |
|
"learning_rate": 2.2192071819503365e-05, |
|
"loss": 1.0087, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.0486486486486486, |
|
"grad_norm": 3.1930797413555077, |
|
"learning_rate": 2.2044808855332743e-05, |
|
"loss": 0.9847, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.0594594594594595, |
|
"grad_norm": 3.0743072086936474, |
|
"learning_rate": 2.1896669508961002e-05, |
|
"loss": 1.0024, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.0702702702702702, |
|
"grad_norm": 3.3931402915538613, |
|
"learning_rate": 2.1747672209090627e-05, |
|
"loss": 1.0063, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.0810810810810811, |
|
"grad_norm": 3.427840497426894, |
|
"learning_rate": 2.1597835491154495e-05, |
|
"loss": 0.9924, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0918918918918918, |
|
"grad_norm": 3.209752499479298, |
|
"learning_rate": 2.1447177995010024e-05, |
|
"loss": 1.0114, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.1027027027027028, |
|
"grad_norm": 2.9188122615255487, |
|
"learning_rate": 2.1295718462620383e-05, |
|
"loss": 0.9348, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.1135135135135135, |
|
"grad_norm": 3.2169410708018464, |
|
"learning_rate": 2.1143475735722965e-05, |
|
"loss": 0.9456, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.1243243243243244, |
|
"grad_norm": 3.2550857985332815, |
|
"learning_rate": 2.099046875348543e-05, |
|
"loss": 0.9704, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.135135135135135, |
|
"grad_norm": 3.200798957813093, |
|
"learning_rate": 2.0836716550149685e-05, |
|
"loss": 1.0187, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.145945945945946, |
|
"grad_norm": 3.026699827485341, |
|
"learning_rate": 2.068223825266397e-05, |
|
"loss": 0.9959, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.1567567567567567, |
|
"grad_norm": 2.966340597816754, |
|
"learning_rate": 2.0527053078303463e-05, |
|
"loss": 0.9672, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.1675675675675676, |
|
"grad_norm": 3.4796215218810578, |
|
"learning_rate": 2.0371180332279642e-05, |
|
"loss": 0.9631, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.1783783783783783, |
|
"grad_norm": 2.9446475013457203, |
|
"learning_rate": 2.0214639405338653e-05, |
|
"loss": 0.9922, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.1891891891891893, |
|
"grad_norm": 3.0107017661224447, |
|
"learning_rate": 2.0057449771349123e-05, |
|
"loss": 0.9846, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 3.1589173902147203, |
|
"learning_rate": 1.989963098487957e-05, |
|
"loss": 0.9945, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.2108108108108109, |
|
"grad_norm": 3.291095419768011, |
|
"learning_rate": 1.9741202678765785e-05, |
|
"loss": 1.0006, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.2216216216216216, |
|
"grad_norm": 3.0439357766975768, |
|
"learning_rate": 1.9582184561668496e-05, |
|
"loss": 1.0247, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.2324324324324325, |
|
"grad_norm": 2.7398517472244133, |
|
"learning_rate": 1.942259641562159e-05, |
|
"loss": 1.0129, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.2432432432432432, |
|
"grad_norm": 3.0466059717106098, |
|
"learning_rate": 1.9262458093571193e-05, |
|
"loss": 1.0257, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.2540540540540541, |
|
"grad_norm": 2.8458132575753714, |
|
"learning_rate": 1.9101789516905953e-05, |
|
"loss": 0.9715, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.2648648648648648, |
|
"grad_norm": 2.8328426905654656, |
|
"learning_rate": 1.8940610672978803e-05, |
|
"loss": 0.961, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.2756756756756757, |
|
"grad_norm": 3.030835646521939, |
|
"learning_rate": 1.8778941612620482e-05, |
|
"loss": 0.9884, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.2864864864864864, |
|
"grad_norm": 2.8633899892085024, |
|
"learning_rate": 1.8616802447645223e-05, |
|
"loss": 0.9937, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.2972972972972974, |
|
"grad_norm": 3.338996158976475, |
|
"learning_rate": 1.8454213348348797e-05, |
|
"loss": 0.9809, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.308108108108108, |
|
"grad_norm": 2.924814513226331, |
|
"learning_rate": 1.8291194540999322e-05, |
|
"loss": 0.9526, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.318918918918919, |
|
"grad_norm": 3.090470952947, |
|
"learning_rate": 1.8127766305321072e-05, |
|
"loss": 0.9912, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.3297297297297297, |
|
"grad_norm": 2.9540976533352867, |
|
"learning_rate": 1.7963948971971686e-05, |
|
"loss": 0.9725, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.3405405405405406, |
|
"grad_norm": 2.9280101457384986, |
|
"learning_rate": 1.7799762920012982e-05, |
|
"loss": 0.9508, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.3513513513513513, |
|
"grad_norm": 3.129222083901634, |
|
"learning_rate": 1.763522857437579e-05, |
|
"loss": 0.9952, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.3621621621621622, |
|
"grad_norm": 3.3207813445482315, |
|
"learning_rate": 1.747036640331908e-05, |
|
"loss": 0.9778, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.372972972972973, |
|
"grad_norm": 2.941815984953935, |
|
"learning_rate": 1.7305196915883662e-05, |
|
"loss": 0.9922, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.3837837837837839, |
|
"grad_norm": 3.1943275224301475, |
|
"learning_rate": 1.713974065934086e-05, |
|
"loss": 0.9738, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.3945945945945946, |
|
"grad_norm": 2.9545782873135478, |
|
"learning_rate": 1.6974018216636394e-05, |
|
"loss": 0.9712, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.4054054054054055, |
|
"grad_norm": 2.832796057451163, |
|
"learning_rate": 1.6808050203829845e-05, |
|
"loss": 1.0121, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.4162162162162162, |
|
"grad_norm": 3.2139763823586196, |
|
"learning_rate": 1.6641857267530003e-05, |
|
"loss": 0.9702, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.427027027027027, |
|
"grad_norm": 3.3210065946822827, |
|
"learning_rate": 1.6475460082326377e-05, |
|
"loss": 1.0018, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.4378378378378378, |
|
"grad_norm": 3.166940843865695, |
|
"learning_rate": 1.6308879348217293e-05, |
|
"loss": 0.9959, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.4486486486486487, |
|
"grad_norm": 3.1486302485385878, |
|
"learning_rate": 1.6142135788034743e-05, |
|
"loss": 0.9477, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.4594594594594594, |
|
"grad_norm": 3.1328749208815547, |
|
"learning_rate": 1.5975250144866492e-05, |
|
"loss": 0.9854, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.4702702702702704, |
|
"grad_norm": 2.9503463010514035, |
|
"learning_rate": 1.5808243179475568e-05, |
|
"loss": 1.0001, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.481081081081081, |
|
"grad_norm": 2.8925905903725355, |
|
"learning_rate": 1.564113566771764e-05, |
|
"loss": 0.9475, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.491891891891892, |
|
"grad_norm": 3.2184062381528196, |
|
"learning_rate": 1.547394839795645e-05, |
|
"loss": 0.9862, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.5027027027027027, |
|
"grad_norm": 3.0205819182026077, |
|
"learning_rate": 1.530670216847772e-05, |
|
"loss": 0.9689, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.5135135135135136, |
|
"grad_norm": 2.886699137488658, |
|
"learning_rate": 1.5139417784901836e-05, |
|
"loss": 0.9578, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.5243243243243243, |
|
"grad_norm": 3.0019029659558494, |
|
"learning_rate": 1.4972116057595592e-05, |
|
"loss": 0.9526, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.535135135135135, |
|
"grad_norm": 3.141168035086649, |
|
"learning_rate": 1.480481779908337e-05, |
|
"loss": 0.9621, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.545945945945946, |
|
"grad_norm": 2.842053920465437, |
|
"learning_rate": 1.463754382145802e-05, |
|
"loss": 0.9821, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.5567567567567568, |
|
"grad_norm": 3.220671922556498, |
|
"learning_rate": 1.4470314933791828e-05, |
|
"loss": 0.9547, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.5675675675675675, |
|
"grad_norm": 2.9122625506605586, |
|
"learning_rate": 1.430315193954783e-05, |
|
"loss": 0.9678, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.5783783783783782, |
|
"grad_norm": 2.6568836209674274, |
|
"learning_rate": 1.4136075633991864e-05, |
|
"loss": 0.9566, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.5891891891891892, |
|
"grad_norm": 2.7282858715379077, |
|
"learning_rate": 1.3969106801605577e-05, |
|
"loss": 0.9195, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.602059773028646, |
|
"learning_rate": 1.3802266213500843e-05, |
|
"loss": 0.955, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.6108108108108108, |
|
"grad_norm": 3.3786673839231423, |
|
"learning_rate": 1.3635574624835798e-05, |
|
"loss": 0.9645, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.6216216216216215, |
|
"grad_norm": 2.6986089589909, |
|
"learning_rate": 1.3469052772232874e-05, |
|
"loss": 0.98, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.6324324324324324, |
|
"grad_norm": 2.89235283174837, |
|
"learning_rate": 1.3302721371199165e-05, |
|
"loss": 0.9588, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.6432432432432433, |
|
"grad_norm": 2.935829812088402, |
|
"learning_rate": 1.3136601113549349e-05, |
|
"loss": 0.9354, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.654054054054054, |
|
"grad_norm": 2.737725384464134, |
|
"learning_rate": 1.2970712664831644e-05, |
|
"loss": 0.9574, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.6648648648648647, |
|
"grad_norm": 2.867513411111901, |
|
"learning_rate": 1.2805076661756965e-05, |
|
"loss": 0.9446, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.6756756756756757, |
|
"grad_norm": 2.787422977164954, |
|
"learning_rate": 1.2639713709631709e-05, |
|
"loss": 0.9558, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.6864864864864866, |
|
"grad_norm": 2.9076119130942026, |
|
"learning_rate": 1.2474644379794421e-05, |
|
"loss": 0.9286, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.6972972972972973, |
|
"grad_norm": 2.826910897786021, |
|
"learning_rate": 1.2309889207056708e-05, |
|
"loss": 0.9556, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.708108108108108, |
|
"grad_norm": 3.027330313518151, |
|
"learning_rate": 1.2145468687148672e-05, |
|
"loss": 0.9157, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.718918918918919, |
|
"grad_norm": 2.870766642781988, |
|
"learning_rate": 1.1981403274169219e-05, |
|
"loss": 0.9708, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.7297297297297298, |
|
"grad_norm": 2.745031713488, |
|
"learning_rate": 1.1817713378041568e-05, |
|
"loss": 0.9404, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.7405405405405405, |
|
"grad_norm": 2.8171111310049506, |
|
"learning_rate": 1.1654419361974195e-05, |
|
"loss": 0.9423, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.7513513513513512, |
|
"grad_norm": 2.8549039042787503, |
|
"learning_rate": 1.1491541539927668e-05, |
|
"loss": 0.951, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.7621621621621621, |
|
"grad_norm": 2.664120980356897, |
|
"learning_rate": 1.1329100174087534e-05, |
|
"loss": 0.9287, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.772972972972973, |
|
"grad_norm": 2.7039305514008096, |
|
"learning_rate": 1.1167115472343693e-05, |
|
"loss": 0.9584, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.7837837837837838, |
|
"grad_norm": 2.6778342659825025, |
|
"learning_rate": 1.1005607585776527e-05, |
|
"loss": 0.9151, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.7945945945945945, |
|
"grad_norm": 2.6005910068753857, |
|
"learning_rate": 1.0844596606150055e-05, |
|
"loss": 0.9501, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.8054054054054054, |
|
"grad_norm": 2.6765741105098364, |
|
"learning_rate": 1.0684102563412519e-05, |
|
"loss": 0.931, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.8162162162162163, |
|
"grad_norm": 2.811327607536862, |
|
"learning_rate": 1.0524145423204623e-05, |
|
"loss": 0.9793, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.827027027027027, |
|
"grad_norm": 2.92527323842401, |
|
"learning_rate": 1.036474508437579e-05, |
|
"loss": 0.9776, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.8378378378378377, |
|
"grad_norm": 2.789416429817517, |
|
"learning_rate": 1.020592137650872e-05, |
|
"loss": 0.947, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.8486486486486486, |
|
"grad_norm": 2.754589393028259, |
|
"learning_rate": 1.004769405745257e-05, |
|
"loss": 0.9685, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.8594594594594596, |
|
"grad_norm": 2.593923465827381, |
|
"learning_rate": 9.890082810865046e-06, |
|
"loss": 0.9317, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.8702702702702703, |
|
"grad_norm": 3.005765748087634, |
|
"learning_rate": 9.733107243763754e-06, |
|
"loss": 0.9612, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.881081081081081, |
|
"grad_norm": 2.6391444135921462, |
|
"learning_rate": 9.576786884087037e-06, |
|
"loss": 0.9431, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.8918918918918919, |
|
"grad_norm": 2.7958859686864823, |
|
"learning_rate": 9.421141178264702e-06, |
|
"loss": 0.9473, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.9027027027027028, |
|
"grad_norm": 2.8853568858381746, |
|
"learning_rate": 9.266189488798854e-06, |
|
"loss": 0.9404, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.9135135135135135, |
|
"grad_norm": 3.011863176958825, |
|
"learning_rate": 9.111951091855164e-06, |
|
"loss": 0.9424, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.9243243243243242, |
|
"grad_norm": 2.624513223364359, |
|
"learning_rate": 8.95844517486492e-06, |
|
"loss": 0.9404, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.9351351351351351, |
|
"grad_norm": 2.629936136635792, |
|
"learning_rate": 8.805690834138076e-06, |
|
"loss": 0.9588, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.945945945945946, |
|
"grad_norm": 2.8522026479916023, |
|
"learning_rate": 8.65370707248763e-06, |
|
"loss": 0.9339, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.9567567567567568, |
|
"grad_norm": 3.097766550094928, |
|
"learning_rate": 8.502512796865686e-06, |
|
"loss": 0.9394, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.9675675675675675, |
|
"grad_norm": 2.749849929848188, |
|
"learning_rate": 8.352126816011382e-06, |
|
"loss": 0.9402, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.9783783783783784, |
|
"grad_norm": 2.792496713680321, |
|
"learning_rate": 8.202567838111078e-06, |
|
"loss": 0.9403, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.9891891891891893, |
|
"grad_norm": 2.742908628148625, |
|
"learning_rate": 8.053854468471025e-06, |
|
"loss": 0.9475, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.80461985695162, |
|
"learning_rate": 7.906005207202852e-06, |
|
"loss": 0.9251, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.0108108108108107, |
|
"grad_norm": 2.712256627105201, |
|
"learning_rate": 7.75903844692212e-06, |
|
"loss": 0.4979, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 2.0216216216216214, |
|
"grad_norm": 2.2575975329190823, |
|
"learning_rate": 7.61297247046029e-06, |
|
"loss": 0.4357, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 2.0324324324324325, |
|
"grad_norm": 2.8247101377056865, |
|
"learning_rate": 7.4678254485902675e-06, |
|
"loss": 0.4334, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 2.0432432432432432, |
|
"grad_norm": 2.588953106795816, |
|
"learning_rate": 7.3236154377659825e-06, |
|
"loss": 0.4327, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 2.054054054054054, |
|
"grad_norm": 2.2112157456197807, |
|
"learning_rate": 7.180360377876125e-06, |
|
"loss": 0.4301, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.064864864864865, |
|
"grad_norm": 2.1916524154888903, |
|
"learning_rate": 7.038078090012406e-06, |
|
"loss": 0.4254, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 2.075675675675676, |
|
"grad_norm": 2.092153236540328, |
|
"learning_rate": 6.896786274252595e-06, |
|
"loss": 0.4066, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 2.0864864864864865, |
|
"grad_norm": 2.1223817949774033, |
|
"learning_rate": 6.7565025074586145e-06, |
|
"loss": 0.4018, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 2.097297297297297, |
|
"grad_norm": 1.975599624746057, |
|
"learning_rate": 6.617244241089947e-06, |
|
"loss": 0.3899, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 2.108108108108108, |
|
"grad_norm": 1.8893909454422486, |
|
"learning_rate": 6.479028799032664e-06, |
|
"loss": 0.397, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.118918918918919, |
|
"grad_norm": 1.875678473328706, |
|
"learning_rate": 6.3418733754443136e-06, |
|
"loss": 0.407, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 2.1297297297297297, |
|
"grad_norm": 2.210895295229888, |
|
"learning_rate": 6.205795032614943e-06, |
|
"loss": 0.4039, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 2.1405405405405404, |
|
"grad_norm": 2.2866290593300573, |
|
"learning_rate": 6.07081069884453e-06, |
|
"loss": 0.3975, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 2.1513513513513516, |
|
"grad_norm": 2.169724698947998, |
|
"learning_rate": 5.936937166337093e-06, |
|
"loss": 0.404, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 2.1621621621621623, |
|
"grad_norm": 2.5885052465204503, |
|
"learning_rate": 5.804191089111711e-06, |
|
"loss": 0.4137, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.172972972972973, |
|
"grad_norm": 2.1184895283704273, |
|
"learning_rate": 5.6725889809307486e-06, |
|
"loss": 0.4069, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 2.1837837837837837, |
|
"grad_norm": 2.055767847916725, |
|
"learning_rate": 5.5421472132455285e-06, |
|
"loss": 0.4309, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 2.1945945945945944, |
|
"grad_norm": 1.9387007802838037, |
|
"learning_rate": 5.412882013159697e-06, |
|
"loss": 0.3989, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 2.2054054054054055, |
|
"grad_norm": 1.9041479568200537, |
|
"learning_rate": 5.284809461410556e-06, |
|
"loss": 0.4013, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 2.2162162162162162, |
|
"grad_norm": 2.0548881191902018, |
|
"learning_rate": 5.157945490368621e-06, |
|
"loss": 0.4205, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.227027027027027, |
|
"grad_norm": 2.0831599061407204, |
|
"learning_rate": 5.03230588205558e-06, |
|
"loss": 0.4122, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 2.237837837837838, |
|
"grad_norm": 1.971310383757786, |
|
"learning_rate": 4.907906266181014e-06, |
|
"loss": 0.3837, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 2.2486486486486488, |
|
"grad_norm": 1.977557211024187, |
|
"learning_rate": 4.784762118198041e-06, |
|
"loss": 0.3981, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 2.2594594594594595, |
|
"grad_norm": 1.9559375507208316, |
|
"learning_rate": 4.66288875737816e-06, |
|
"loss": 0.4094, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 2.27027027027027, |
|
"grad_norm": 1.9123345255397275, |
|
"learning_rate": 4.542301344905496e-06, |
|
"loss": 0.3863, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.281081081081081, |
|
"grad_norm": 1.8524912274987262, |
|
"learning_rate": 4.423014881990751e-06, |
|
"loss": 0.3908, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 2.291891891891892, |
|
"grad_norm": 2.0911936019239246, |
|
"learning_rate": 4.305044208005023e-06, |
|
"loss": 0.4167, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 2.3027027027027027, |
|
"grad_norm": 1.9050565892596198, |
|
"learning_rate": 4.188403998633775e-06, |
|
"loss": 0.3955, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 2.3135135135135134, |
|
"grad_norm": 1.8967742593703636, |
|
"learning_rate": 4.0731087640511735e-06, |
|
"loss": 0.4163, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 2.3243243243243246, |
|
"grad_norm": 2.051977219640454, |
|
"learning_rate": 3.959172847114991e-06, |
|
"loss": 0.4024, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.3351351351351353, |
|
"grad_norm": 2.0012355554132792, |
|
"learning_rate": 3.846610421582349e-06, |
|
"loss": 0.4157, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 2.345945945945946, |
|
"grad_norm": 2.175698094340077, |
|
"learning_rate": 3.7354354903464793e-06, |
|
"loss": 0.4024, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 2.3567567567567567, |
|
"grad_norm": 2.0526759215687362, |
|
"learning_rate": 3.625661883694753e-06, |
|
"loss": 0.3939, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 2.3675675675675674, |
|
"grad_norm": 1.960306969771245, |
|
"learning_rate": 3.5173032575881768e-06, |
|
"loss": 0.4074, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 2.3783783783783785, |
|
"grad_norm": 2.1570885212061826, |
|
"learning_rate": 3.4103730919625753e-06, |
|
"loss": 0.3976, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.389189189189189, |
|
"grad_norm": 1.8949986677811612, |
|
"learning_rate": 3.3048846890516658e-06, |
|
"loss": 0.4, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.982249761308161, |
|
"learning_rate": 3.2008511717322593e-06, |
|
"loss": 0.4133, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 2.410810810810811, |
|
"grad_norm": 1.980047898141974, |
|
"learning_rate": 3.098285481891745e-06, |
|
"loss": 0.3939, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 2.4216216216216218, |
|
"grad_norm": 2.0049995734478965, |
|
"learning_rate": 2.9972003788181146e-06, |
|
"loss": 0.3926, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 2.4324324324324325, |
|
"grad_norm": 1.8400258173639734, |
|
"learning_rate": 2.8976084376126848e-06, |
|
"loss": 0.3936, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.443243243243243, |
|
"grad_norm": 1.9448462664129043, |
|
"learning_rate": 2.7995220476257482e-06, |
|
"loss": 0.388, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 2.454054054054054, |
|
"grad_norm": 1.9031160601187072, |
|
"learning_rate": 2.7029534109153186e-06, |
|
"loss": 0.3909, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 2.464864864864865, |
|
"grad_norm": 2.279997846004982, |
|
"learning_rate": 2.6079145407291877e-06, |
|
"loss": 0.3895, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 2.4756756756756757, |
|
"grad_norm": 1.8403404089990134, |
|
"learning_rate": 2.514417260010455e-06, |
|
"loss": 0.3976, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 2.4864864864864864, |
|
"grad_norm": 1.7969451736164892, |
|
"learning_rate": 2.4224731999267425e-06, |
|
"loss": 0.3999, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.4972972972972975, |
|
"grad_norm": 1.9253974055183771, |
|
"learning_rate": 2.3320937984232664e-06, |
|
"loss": 0.3939, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 2.5081081081081082, |
|
"grad_norm": 1.913704985114193, |
|
"learning_rate": 2.243290298799945e-06, |
|
"loss": 0.3984, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 2.518918918918919, |
|
"grad_norm": 1.9790173152408796, |
|
"learning_rate": 2.156073748312721e-06, |
|
"loss": 0.3819, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 2.5297297297297296, |
|
"grad_norm": 2.276492968512024, |
|
"learning_rate": 2.070454996799261e-06, |
|
"loss": 0.4039, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 2.5405405405405403, |
|
"grad_norm": 1.7739681476430693, |
|
"learning_rate": 1.9864446953292313e-06, |
|
"loss": 0.3791, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.5513513513513515, |
|
"grad_norm": 2.0336913560870196, |
|
"learning_rate": 1.9040532948792934e-06, |
|
"loss": 0.3847, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 2.562162162162162, |
|
"grad_norm": 1.8946193097351467, |
|
"learning_rate": 1.8232910450329832e-06, |
|
"loss": 0.385, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 2.572972972972973, |
|
"grad_norm": 1.9817636629737283, |
|
"learning_rate": 1.744167992705664e-06, |
|
"loss": 0.3914, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 2.583783783783784, |
|
"grad_norm": 1.8202147731376643, |
|
"learning_rate": 1.6666939808946619e-06, |
|
"loss": 0.377, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 2.5945945945945947, |
|
"grad_norm": 1.804624257938459, |
|
"learning_rate": 1.5908786474548004e-06, |
|
"loss": 0.3834, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.6054054054054054, |
|
"grad_norm": 1.9478831371089558, |
|
"learning_rate": 1.5167314238994367e-06, |
|
"loss": 0.3802, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 2.616216216216216, |
|
"grad_norm": 1.8418242757562502, |
|
"learning_rate": 1.4442615342271625e-06, |
|
"loss": 0.3742, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 2.627027027027027, |
|
"grad_norm": 1.8235265917309187, |
|
"learning_rate": 1.3734779937743403e-06, |
|
"loss": 0.3763, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 2.637837837837838, |
|
"grad_norm": 1.8148562882498185, |
|
"learning_rate": 1.3043896080935785e-06, |
|
"loss": 0.3764, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 2.6486486486486487, |
|
"grad_norm": 1.9162200026873921, |
|
"learning_rate": 1.237004971858307e-06, |
|
"loss": 0.4009, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.6594594594594594, |
|
"grad_norm": 1.971484443435529, |
|
"learning_rate": 1.1713324677936015e-06, |
|
"loss": 0.3894, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 2.6702702702702705, |
|
"grad_norm": 2.6288039366865883, |
|
"learning_rate": 1.1073802656333548e-06, |
|
"loss": 0.3736, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 2.6810810810810812, |
|
"grad_norm": 1.8111148825496188, |
|
"learning_rate": 1.0451563211039494e-06, |
|
"loss": 0.3996, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 2.691891891891892, |
|
"grad_norm": 1.7806298978071708, |
|
"learning_rate": 9.846683749345648e-07, |
|
"loss": 0.383, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 2.7027027027027026, |
|
"grad_norm": 4.0497002081385185, |
|
"learning_rate": 9.25923951894222e-07, |
|
"loss": 0.3965, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.7135135135135133, |
|
"grad_norm": 1.8334211425058837, |
|
"learning_rate": 8.68930359855683e-07, |
|
"loss": 0.3989, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 2.7243243243243245, |
|
"grad_norm": 1.7500539556657924, |
|
"learning_rate": 8.136946888863528e-07, |
|
"loss": 0.395, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 2.735135135135135, |
|
"grad_norm": 1.9130969263501059, |
|
"learning_rate": 7.602238103662646e-07, |
|
"loss": 0.3853, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 2.745945945945946, |
|
"grad_norm": 1.8404236110308207, |
|
"learning_rate": 7.085243761332738e-07, |
|
"loss": 0.393, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 2.756756756756757, |
|
"grad_norm": 1.7456858490902225, |
|
"learning_rate": 6.586028176555536e-07, |
|
"loss": 0.3944, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.7675675675675677, |
|
"grad_norm": 1.837065449202062, |
|
"learning_rate": 6.104653452315279e-07, |
|
"loss": 0.3798, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 2.7783783783783784, |
|
"grad_norm": 2.3308657348413058, |
|
"learning_rate": 5.641179472172875e-07, |
|
"loss": 0.3798, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 2.789189189189189, |
|
"grad_norm": 1.7969746620946272, |
|
"learning_rate": 5.195663892816432e-07, |
|
"loss": 0.3817, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.8403934823419463, |
|
"learning_rate": 4.768162136888643e-07, |
|
"loss": 0.3791, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 2.810810810810811, |
|
"grad_norm": 1.8084423900988431, |
|
"learning_rate": 4.3587273860921985e-07, |
|
"loss": 0.3613, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.8216216216216217, |
|
"grad_norm": 1.8704402298319724, |
|
"learning_rate": 3.9674105745738155e-07, |
|
"loss": 0.3771, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 2.8324324324324324, |
|
"grad_norm": 1.818023687371634, |
|
"learning_rate": 3.594260382588105e-07, |
|
"loss": 0.3888, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 2.8432432432432435, |
|
"grad_norm": 1.8608896733650853, |
|
"learning_rate": 3.239323230441615e-07, |
|
"loss": 0.3888, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 2.854054054054054, |
|
"grad_norm": 1.938515453919976, |
|
"learning_rate": 2.902643272718086e-07, |
|
"loss": 0.4002, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 2.864864864864865, |
|
"grad_norm": 1.94145103701424, |
|
"learning_rate": 2.5842623927856244e-07, |
|
"loss": 0.3858, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.8756756756756756, |
|
"grad_norm": 1.7260066637899822, |
|
"learning_rate": 2.28422019758629e-07, |
|
"loss": 0.3905, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 2.8864864864864863, |
|
"grad_norm": 1.8230164360318986, |
|
"learning_rate": 2.0025540127090513e-07, |
|
"loss": 0.3977, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 2.8972972972972975, |
|
"grad_norm": 1.7780456307114303, |
|
"learning_rate": 1.7392988777463202e-07, |
|
"loss": 0.3881, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 2.908108108108108, |
|
"grad_norm": 1.9497490854182644, |
|
"learning_rate": 1.4944875419350855e-07, |
|
"loss": 0.3797, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 2.918918918918919, |
|
"grad_norm": 1.6524262781562993, |
|
"learning_rate": 1.268150460082823e-07, |
|
"loss": 0.3645, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.92972972972973, |
|
"grad_norm": 1.8325737906994488, |
|
"learning_rate": 1.0603157887788428e-07, |
|
"loss": 0.3574, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 2.9405405405405407, |
|
"grad_norm": 1.8188448217016162, |
|
"learning_rate": 8.710093828917076e-08, |
|
"loss": 0.3829, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 2.9513513513513514, |
|
"grad_norm": 1.774843520134497, |
|
"learning_rate": 7.002547923527058e-08, |
|
"loss": 0.3945, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 2.962162162162162, |
|
"grad_norm": 1.7417401019158905, |
|
"learning_rate": 5.4807325922632825e-08, |
|
"loss": 0.37, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 2.972972972972973, |
|
"grad_norm": 1.6997699000548114, |
|
"learning_rate": 4.14483715067665e-08, |
|
"loss": 0.3702, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.983783783783784, |
|
"grad_norm": 1.7185773019727228, |
|
"learning_rate": 2.995027785673066e-08, |
|
"loss": 0.3829, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 2.9945945945945946, |
|
"grad_norm": 1.7433824271169698, |
|
"learning_rate": 2.0314475348401362e-08, |
|
"loss": 0.3777, |
|
"step": 831 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 845, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 833, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 232851391348736.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |