{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9995747929973436,
  "eval_steps": 500,
  "global_step": 1708,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005852311434410677,
      "grad_norm": 0.515625,
      "learning_rate": 1.1695906432748538e-06,
      "loss": 1.737,
      "step": 1
    },
    {
      "epoch": 0.0029261557172053382,
      "grad_norm": 0.5234375,
      "learning_rate": 5.8479532163742686e-06,
      "loss": 1.7512,
      "step": 5
    },
    {
      "epoch": 0.0058523114344106765,
      "grad_norm": 0.48046875,
      "learning_rate": 1.1695906432748537e-05,
      "loss": 1.7378,
      "step": 10
    },
    {
      "epoch": 0.008778467151616015,
      "grad_norm": 0.474609375,
      "learning_rate": 1.7543859649122806e-05,
      "loss": 1.7494,
      "step": 15
    },
    {
      "epoch": 0.011704622868821353,
      "grad_norm": 0.4765625,
      "learning_rate": 2.3391812865497074e-05,
      "loss": 1.7101,
      "step": 20
    },
    {
      "epoch": 0.014630778586026691,
      "grad_norm": 0.482421875,
      "learning_rate": 2.9239766081871346e-05,
      "loss": 1.6696,
      "step": 25
    },
    {
      "epoch": 0.01755693430323203,
      "grad_norm": 0.34765625,
      "learning_rate": 3.508771929824561e-05,
      "loss": 1.6369,
      "step": 30
    },
    {
      "epoch": 0.02048309002043737,
      "grad_norm": 0.287109375,
      "learning_rate": 4.093567251461988e-05,
      "loss": 1.6058,
      "step": 35
    },
    {
      "epoch": 0.023409245737642706,
      "grad_norm": 0.2333984375,
      "learning_rate": 4.678362573099415e-05,
      "loss": 1.5587,
      "step": 40
    },
    {
      "epoch": 0.026335401454848046,
      "grad_norm": 0.1708984375,
      "learning_rate": 5.2631578947368424e-05,
      "loss": 1.5149,
      "step": 45
    },
    {
      "epoch": 0.029261557172053382,
      "grad_norm": 0.140625,
      "learning_rate": 5.847953216374269e-05,
      "loss": 1.4769,
      "step": 50
    },
    {
      "epoch": 0.03218771288925872,
      "grad_norm": 0.12060546875,
      "learning_rate": 6.432748538011695e-05,
      "loss": 1.4573,
      "step": 55
    },
    {
      "epoch": 0.03511386860646406,
      "grad_norm": 0.09228515625,
      "learning_rate": 7.017543859649122e-05,
      "loss": 1.4219,
      "step": 60
    },
    {
      "epoch": 0.0380400243236694,
      "grad_norm": 0.09375,
      "learning_rate": 7.602339181286549e-05,
      "loss": 1.398,
      "step": 65
    },
    {
      "epoch": 0.04096618004087474,
      "grad_norm": 0.06884765625,
      "learning_rate": 8.187134502923976e-05,
      "loss": 1.3912,
      "step": 70
    },
    {
      "epoch": 0.043892335758080075,
      "grad_norm": 0.06787109375,
      "learning_rate": 8.771929824561403e-05,
      "loss": 1.3633,
      "step": 75
    },
    {
      "epoch": 0.04681849147528541,
      "grad_norm": 0.06103515625,
      "learning_rate": 9.35672514619883e-05,
      "loss": 1.3546,
      "step": 80
    },
    {
      "epoch": 0.049744647192490755,
      "grad_norm": 0.05712890625,
      "learning_rate": 9.941520467836257e-05,
      "loss": 1.3309,
      "step": 85
    },
    {
      "epoch": 0.05267080290969609,
      "grad_norm": 0.05517578125,
      "learning_rate": 0.00010526315789473685,
      "loss": 1.3234,
      "step": 90
    },
    {
      "epoch": 0.05559695862690143,
      "grad_norm": 0.048828125,
      "learning_rate": 0.00011111111111111112,
      "loss": 1.3061,
      "step": 95
    },
    {
      "epoch": 0.058523114344106765,
      "grad_norm": 0.048828125,
      "learning_rate": 0.00011695906432748539,
      "loss": 1.2844,
      "step": 100
    },
    {
      "epoch": 0.06144927006131211,
      "grad_norm": 0.04833984375,
      "learning_rate": 0.00012280701754385965,
      "loss": 1.2973,
      "step": 105
    },
    {
      "epoch": 0.06437542577851744,
      "grad_norm": 0.050537109375,
      "learning_rate": 0.0001286549707602339,
      "loss": 1.2836,
      "step": 110
    },
    {
      "epoch": 0.06730158149572278,
      "grad_norm": 0.048095703125,
      "learning_rate": 0.0001345029239766082,
      "loss": 1.2723,
      "step": 115
    },
    {
      "epoch": 0.07022773721292812,
      "grad_norm": 0.046630859375,
      "learning_rate": 0.00014035087719298245,
      "loss": 1.2634,
      "step": 120
    },
    {
      "epoch": 0.07315389293013345,
      "grad_norm": 0.05078125,
      "learning_rate": 0.00014619883040935673,
      "loss": 1.2355,
      "step": 125
    },
    {
      "epoch": 0.0760800486473388,
      "grad_norm": 0.05029296875,
      "learning_rate": 0.00015204678362573098,
      "loss": 1.2494,
      "step": 130
    },
    {
      "epoch": 0.07900620436454414,
      "grad_norm": 0.054931640625,
      "learning_rate": 0.00015789473684210527,
      "loss": 1.253,
      "step": 135
    },
    {
      "epoch": 0.08193236008174948,
      "grad_norm": 0.05322265625,
      "learning_rate": 0.00016374269005847952,
      "loss": 1.2499,
      "step": 140
    },
    {
      "epoch": 0.08485851579895481,
      "grad_norm": 0.04833984375,
      "learning_rate": 0.0001695906432748538,
      "loss": 1.2193,
      "step": 145
    },
    {
      "epoch": 0.08778467151616015,
      "grad_norm": 0.05712890625,
      "learning_rate": 0.00017543859649122806,
      "loss": 1.2339,
      "step": 150
    },
    {
      "epoch": 0.09071082723336549,
      "grad_norm": 0.056884765625,
      "learning_rate": 0.00018128654970760234,
      "loss": 1.2427,
      "step": 155
    },
    {
      "epoch": 0.09363698295057082,
      "grad_norm": 0.052490234375,
      "learning_rate": 0.0001871345029239766,
      "loss": 1.2184,
      "step": 160
    },
    {
      "epoch": 0.09656313866777616,
      "grad_norm": 0.06298828125,
      "learning_rate": 0.00019298245614035088,
      "loss": 1.2306,
      "step": 165
    },
    {
      "epoch": 0.09948929438498151,
      "grad_norm": 0.06298828125,
      "learning_rate": 0.00019883040935672513,
      "loss": 1.214,
      "step": 170
    },
    {
      "epoch": 0.10241545010218685,
      "grad_norm": 0.0673828125,
      "learning_rate": 0.00019999665774502696,
      "loss": 1.2176,
      "step": 175
    },
    {
      "epoch": 0.10534160581939218,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.0001999830802170989,
      "loss": 1.204,
      "step": 180
    },
    {
      "epoch": 0.10826776153659752,
      "grad_norm": 0.06298828125,
      "learning_rate": 0.00019995905994229593,
      "loss": 1.2153,
      "step": 185
    },
    {
      "epoch": 0.11119391725380286,
      "grad_norm": 0.07080078125,
      "learning_rate": 0.00019992459942941906,
      "loss": 1.1936,
      "step": 190
    },
    {
      "epoch": 0.1141200729710082,
      "grad_norm": 0.064453125,
      "learning_rate": 0.00019987970227770135,
      "loss": 1.1987,
      "step": 195
    },
    {
      "epoch": 0.11704622868821353,
      "grad_norm": 0.07177734375,
      "learning_rate": 0.00019982437317643217,
      "loss": 1.2065,
      "step": 200
    },
    {
      "epoch": 0.11997238440541888,
      "grad_norm": 0.0654296875,
      "learning_rate": 0.00019975861790446722,
      "loss": 1.2088,
      "step": 205
    },
    {
      "epoch": 0.12289854012262422,
      "grad_norm": 0.07373046875,
      "learning_rate": 0.0001996824433296252,
      "loss": 1.2082,
      "step": 210
    },
    {
      "epoch": 0.12582469583982955,
      "grad_norm": 0.07958984375,
      "learning_rate": 0.00019959585740797028,
      "loss": 1.2062,
      "step": 215
    },
    {
      "epoch": 0.1287508515570349,
      "grad_norm": 0.0966796875,
      "learning_rate": 0.0001994988691829812,
      "loss": 1.2046,
      "step": 220
    },
    {
      "epoch": 0.13167700727424023,
      "grad_norm": 0.07275390625,
      "learning_rate": 0.00019939148878460677,
      "loss": 1.195,
      "step": 225
    },
    {
      "epoch": 0.13460316299144556,
      "grad_norm": 0.0859375,
      "learning_rate": 0.00019927372742820779,
      "loss": 1.1807,
      "step": 230
    },
    {
      "epoch": 0.1375293187086509,
      "grad_norm": 0.07666015625,
      "learning_rate": 0.0001991455974133857,
      "loss": 1.1887,
      "step": 235
    },
    {
      "epoch": 0.14045547442585624,
      "grad_norm": 0.07373046875,
      "learning_rate": 0.0001990071121226979,
      "loss": 1.189,
      "step": 240
    },
    {
      "epoch": 0.14338163014306157,
      "grad_norm": 0.0751953125,
      "learning_rate": 0.0001988582860202601,
      "loss": 1.172,
      "step": 245
    },
    {
      "epoch": 0.1463077858602669,
      "grad_norm": 0.078125,
      "learning_rate": 0.00019869913465023548,
      "loss": 1.1738,
      "step": 250
    },
    {
      "epoch": 0.14923394157747225,
      "grad_norm": 0.08447265625,
      "learning_rate": 0.00019852967463521124,
      "loss": 1.1947,
      "step": 255
    },
    {
      "epoch": 0.1521600972946776,
      "grad_norm": 0.080078125,
      "learning_rate": 0.0001983499236744625,
      "loss": 1.1789,
      "step": 260
    },
    {
      "epoch": 0.15508625301188295,
      "grad_norm": 0.078125,
      "learning_rate": 0.00019815990054210361,
      "loss": 1.1878,
      "step": 265
    },
    {
      "epoch": 0.15801240872908828,
      "grad_norm": 0.072265625,
      "learning_rate": 0.00019795962508512742,
      "loss": 1.1825,
      "step": 270
    },
    {
      "epoch": 0.16093856444629362,
      "grad_norm": 0.103515625,
      "learning_rate": 0.00019774911822133216,
      "loss": 1.1848,
      "step": 275
    },
    {
      "epoch": 0.16386472016349896,
      "grad_norm": 0.091796875,
      "learning_rate": 0.0001975284019371368,
      "loss": 1.1634,
      "step": 280
    },
    {
      "epoch": 0.1667908758807043,
      "grad_norm": 0.078125,
      "learning_rate": 0.0001972974992852847,
      "loss": 1.1539,
      "step": 285
    },
    {
      "epoch": 0.16971703159790963,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.00019705643438243584,
      "loss": 1.1656,
      "step": 290
    },
    {
      "epoch": 0.17264318731511497,
      "grad_norm": 0.0859375,
      "learning_rate": 0.00019680523240664786,
      "loss": 1.1923,
      "step": 295
    },
    {
      "epoch": 0.1755693430323203,
      "grad_norm": 0.07958984375,
      "learning_rate": 0.00019654391959474647,
      "loss": 1.1651,
      "step": 300
    },
    {
      "epoch": 0.17849549874952564,
      "grad_norm": 0.08056640625,
      "learning_rate": 0.00019627252323958504,
      "loss": 1.1501,
      "step": 305
    },
    {
      "epoch": 0.18142165446673097,
      "grad_norm": 0.0859375,
      "learning_rate": 0.00019599107168719412,
      "loss": 1.1581,
      "step": 310
    },
    {
      "epoch": 0.1843478101839363,
      "grad_norm": 0.08349609375,
      "learning_rate": 0.0001956995943338206,
      "loss": 1.1785,
      "step": 315
    },
    {
      "epoch": 0.18727396590114165,
      "grad_norm": 0.08447265625,
      "learning_rate": 0.00019539812162285767,
      "loss": 1.1691,
      "step": 320
    },
    {
      "epoch": 0.19020012161834698,
      "grad_norm": 0.0751953125,
      "learning_rate": 0.00019508668504166505,
      "loss": 1.1758,
      "step": 325
    },
    {
      "epoch": 0.19312627733555232,
      "grad_norm": 0.0703125,
      "learning_rate": 0.00019476531711828027,
      "loss": 1.1582,
      "step": 330
    },
    {
      "epoch": 0.19605243305275769,
      "grad_norm": 0.08642578125,
      "learning_rate": 0.0001944340514180212,
      "loss": 1.1767,
      "step": 335
    },
    {
      "epoch": 0.19897858876996302,
      "grad_norm": 0.0703125,
      "learning_rate": 0.00019409292253998062,
      "loss": 1.1392,
      "step": 340
    },
    {
      "epoch": 0.20190474448716836,
      "grad_norm": 0.1064453125,
      "learning_rate": 0.0001937419661134121,
      "loss": 1.1626,
      "step": 345
    },
    {
      "epoch": 0.2048309002043737,
      "grad_norm": 0.08203125,
      "learning_rate": 0.00019338121879400896,
      "loss": 1.1551,
      "step": 350
    },
    {
      "epoch": 0.20775705592157903,
      "grad_norm": 0.080078125,
      "learning_rate": 0.00019301071826007576,
      "loss": 1.1495,
      "step": 355
    },
    {
      "epoch": 0.21068321163878437,
      "grad_norm": 0.0810546875,
      "learning_rate": 0.00019263050320859283,
      "loss": 1.1514,
      "step": 360
    },
    {
      "epoch": 0.2136093673559897,
      "grad_norm": 0.08935546875,
      "learning_rate": 0.00019224061335117472,
      "loss": 1.1649,
      "step": 365
    },
    {
      "epoch": 0.21653552307319504,
      "grad_norm": 0.08837890625,
      "learning_rate": 0.0001918410894099224,
      "loss": 1.1433,
      "step": 370
    },
    {
      "epoch": 0.21946167879040038,
      "grad_norm": 0.09423828125,
      "learning_rate": 0.00019143197311317014,
      "loss": 1.1275,
      "step": 375
    },
    {
      "epoch": 0.22238783450760571,
      "grad_norm": 0.06787109375,
      "learning_rate": 0.00019101330719112705,
      "loss": 1.1684,
      "step": 380
    },
    {
      "epoch": 0.22531399022481105,
      "grad_norm": 0.07763671875,
      "learning_rate": 0.00019058513537141428,
      "loss": 1.1606,
      "step": 385
    },
    {
      "epoch": 0.2282401459420164,
      "grad_norm": 0.07421875,
      "learning_rate": 0.0001901475023744977,
      "loss": 1.148,
      "step": 390
    },
    {
      "epoch": 0.23116630165922172,
      "grad_norm": 0.0693359375,
      "learning_rate": 0.00018970045390901728,
      "loss": 1.1626,
      "step": 395
    },
    {
      "epoch": 0.23409245737642706,
      "grad_norm": 0.0849609375,
      "learning_rate": 0.00018924403666701286,
      "loss": 1.1575,
      "step": 400
    },
    {
      "epoch": 0.23701861309363242,
      "grad_norm": 0.07470703125,
      "learning_rate": 0.00018877829831904746,
      "loss": 1.1637,
      "step": 405
    },
    {
      "epoch": 0.23994476881083776,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.0001883032875092283,
      "loss": 1.1441,
      "step": 410
    },
    {
      "epoch": 0.2428709245280431,
      "grad_norm": 0.0771484375,
      "learning_rate": 0.00018781905385012627,
      "loss": 1.1615,
      "step": 415
    },
    {
      "epoch": 0.24579708024524843,
      "grad_norm": 0.06884765625,
      "learning_rate": 0.000187325647917594,
      "loss": 1.1536,
      "step": 420
    },
    {
      "epoch": 0.24872323596245377,
      "grad_norm": 0.08203125,
      "learning_rate": 0.00018682312124548346,
      "loss": 1.1512,
      "step": 425
    },
    {
      "epoch": 0.2516493916796591,
      "grad_norm": 0.07470703125,
      "learning_rate": 0.00018631152632026364,
      "loss": 1.1397,
      "step": 430
    },
    {
      "epoch": 0.25457554739686444,
      "grad_norm": 0.07958984375,
      "learning_rate": 0.00018579091657553844,
      "loss": 1.1585,
      "step": 435
    },
    {
      "epoch": 0.2575017031140698,
      "grad_norm": 0.08154296875,
      "learning_rate": 0.00018526134638646583,
      "loss": 1.1612,
      "step": 440
    },
    {
      "epoch": 0.2604278588312751,
      "grad_norm": 0.07470703125,
      "learning_rate": 0.00018472287106407876,
      "loss": 1.1272,
      "step": 445
    },
    {
      "epoch": 0.26335401454848045,
      "grad_norm": 0.09375,
      "learning_rate": 0.00018417554684950794,
      "loss": 1.1413,
      "step": 450
    },
    {
      "epoch": 0.2662801702656858,
      "grad_norm": 0.0703125,
      "learning_rate": 0.00018361943090810796,
      "loss": 1.1489,
      "step": 455
    },
    {
      "epoch": 0.2692063259828911,
      "grad_norm": 0.0888671875,
      "learning_rate": 0.00018305458132348657,
      "loss": 1.1575,
      "step": 460
    },
    {
      "epoch": 0.27213248170009646,
      "grad_norm": 0.07666015625,
      "learning_rate": 0.00018248105709143799,
      "loss": 1.136,
      "step": 465
    },
    {
      "epoch": 0.2750586374173018,
      "grad_norm": 0.08203125,
      "learning_rate": 0.00018189891811378137,
      "loss": 1.1369,
      "step": 470
    },
    {
      "epoch": 0.27798479313450714,
      "grad_norm": 0.08251953125,
      "learning_rate": 0.0001813082251921041,
      "loss": 1.1255,
      "step": 475
    },
    {
      "epoch": 0.28091094885171247,
      "grad_norm": 0.09375,
      "learning_rate": 0.0001807090400214114,
      "loss": 1.1288,
      "step": 480
    },
    {
      "epoch": 0.2838371045689178,
      "grad_norm": 0.07958984375,
      "learning_rate": 0.00018010142518368278,
      "loss": 1.1233,
      "step": 485
    },
    {
      "epoch": 0.28676326028612315,
      "grad_norm": 0.078125,
      "learning_rate": 0.00017948544414133534,
      "loss": 1.1475,
      "step": 490
    },
    {
      "epoch": 0.2896894160033285,
      "grad_norm": 0.080078125,
      "learning_rate": 0.00017886116123059574,
      "loss": 1.1356,
      "step": 495
    },
    {
      "epoch": 0.2926155717205338,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.00017822864165478034,
      "loss": 1.1553,
      "step": 500
    },
    {
      "epoch": 0.29554172743773915,
      "grad_norm": 0.08154296875,
      "learning_rate": 0.00017758795147748523,
      "loss": 1.1188,
      "step": 505
    },
    {
      "epoch": 0.2984678831549445,
      "grad_norm": 0.07275390625,
      "learning_rate": 0.00017693915761568608,
      "loss": 1.1388,
      "step": 510
    },
    {
      "epoch": 0.3013940388721499,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.000176282327832749,
      "loss": 1.1267,
      "step": 515
    },
    {
      "epoch": 0.3043201945893552,
      "grad_norm": 0.083984375,
      "learning_rate": 0.0001756175307313531,
      "loss": 1.1341,
      "step": 520
    },
    {
      "epoch": 0.30724635030656056,
      "grad_norm": 0.08349609375,
      "learning_rate": 0.00017494483574632513,
      "loss": 1.1365,
      "step": 525
    },
    {
      "epoch": 0.3101725060237659,
      "grad_norm": 0.08447265625,
      "learning_rate": 0.00017426431313738734,
      "loss": 1.1335,
      "step": 530
    },
    {
      "epoch": 0.31309866174097123,
      "grad_norm": 0.0703125,
      "learning_rate": 0.00017357603398181936,
      "loss": 1.1484,
      "step": 535
    },
    {
      "epoch": 0.31602481745817657,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.00017288007016703444,
      "loss": 1.1186,
      "step": 540
    },
    {
      "epoch": 0.3189509731753819,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.00017217649438307106,
      "loss": 1.1442,
      "step": 545
    },
    {
      "epoch": 0.32187712889258724,
      "grad_norm": 0.07080078125,
      "learning_rate": 0.00017146538011500093,
      "loss": 1.1284,
      "step": 550
    },
    {
      "epoch": 0.3248032846097926,
      "grad_norm": 0.0693359375,
      "learning_rate": 0.00017074680163525375,
      "loss": 1.1331,
      "step": 555
    },
    {
      "epoch": 0.3277294403269979,
      "grad_norm": 0.07421875,
      "learning_rate": 0.00017002083399586,
      "loss": 1.1255,
      "step": 560
    },
    {
      "epoch": 0.33065559604420325,
      "grad_norm": 0.07421875,
      "learning_rate": 0.00016928755302061173,
      "loss": 1.1354,
      "step": 565
    },
    {
      "epoch": 0.3335817517614086,
      "grad_norm": 0.078125,
      "learning_rate": 0.0001685470352971437,
      "loss": 1.1333,
      "step": 570
    },
    {
      "epoch": 0.3365079074786139,
      "grad_norm": 0.07470703125,
      "learning_rate": 0.00016779935816893353,
      "loss": 1.1376,
      "step": 575
    },
    {
      "epoch": 0.33943406319581926,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.00016704459972722414,
      "loss": 1.1249,
      "step": 580
    },
    {
      "epoch": 0.3423602189130246,
      "grad_norm": 0.0732421875,
      "learning_rate": 0.00016628283880286703,
      "loss": 1.1451,
      "step": 585
    },
    {
      "epoch": 0.34528637463022993,
      "grad_norm": 0.0693359375,
      "learning_rate": 0.00016551415495808915,
      "loss": 1.1195,
      "step": 590
    },
    {
      "epoch": 0.34821253034743527,
      "grad_norm": 0.0810546875,
      "learning_rate": 0.00016473862847818277,
      "loss": 1.146,
      "step": 595
    },
    {
      "epoch": 0.3511386860646406,
      "grad_norm": 0.07470703125,
      "learning_rate": 0.00016395634036312013,
      "loss": 1.1327,
      "step": 600
    },
    {
      "epoch": 0.35406484178184594,
      "grad_norm": 0.0791015625,
      "learning_rate": 0.00016316737231909342,
      "loss": 1.1176,
      "step": 605
    },
    {
      "epoch": 0.3569909974990513,
      "grad_norm": 0.07470703125,
      "learning_rate": 0.000162371806749981,
      "loss": 1.1208,
      "step": 610
    },
    {
      "epoch": 0.3599171532162566,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.00016156972674874056,
      "loss": 1.1315,
      "step": 615
    },
    {
      "epoch": 0.36284330893346195,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.00016076121608873072,
      "loss": 1.1455,
      "step": 620
    },
    {
      "epoch": 0.3657694646506673,
      "grad_norm": 0.072265625,
      "learning_rate": 0.000159946359214961,
      "loss": 1.1234,
      "step": 625
    },
    {
      "epoch": 0.3686956203678726,
      "grad_norm": 0.07421875,
      "learning_rate": 0.00015912524123527221,
      "loss": 1.1185,
      "step": 630
    },
    {
      "epoch": 0.37162177608507796,
      "grad_norm": 0.0693359375,
      "learning_rate": 0.0001582979479114472,
      "loss": 1.1208,
      "step": 635
    },
    {
      "epoch": 0.3745479318022833,
      "grad_norm": 0.07666015625,
      "learning_rate": 0.0001574645656502536,
      "loss": 1.1257,
      "step": 640
    },
    {
      "epoch": 0.37747408751948863,
      "grad_norm": 0.07421875,
      "learning_rate": 0.0001566251814944188,
      "loss": 1.1317,
      "step": 645
    },
    {
      "epoch": 0.38040024323669397,
      "grad_norm": 0.07373046875,
      "learning_rate": 0.00015577988311353904,
      "loss": 1.1431,
      "step": 650
    },
    {
      "epoch": 0.3833263989538993,
      "grad_norm": 0.0771484375,
      "learning_rate": 0.0001549287587949226,
      "loss": 1.1253,
      "step": 655
    },
    {
      "epoch": 0.38625255467110464,
      "grad_norm": 0.07080078125,
      "learning_rate": 0.00015407189743436864,
      "loss": 1.1314,
      "step": 660
    },
    {
      "epoch": 0.38917871038831003,
      "grad_norm": 0.07275390625,
      "learning_rate": 0.00015320938852688248,
      "loss": 1.1148,
      "step": 665
    },
    {
      "epoch": 0.39210486610551537,
      "grad_norm": 0.07080078125,
      "learning_rate": 0.00015234132215732822,
      "loss": 1.141,
      "step": 670
    },
    {
      "epoch": 0.3950310218227207,
      "grad_norm": 0.072265625,
      "learning_rate": 0.00015146778899102,
      "loss": 1.1222,
      "step": 675
    },
    {
      "epoch": 0.39795717753992604,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.00015058888026425212,
      "loss": 1.1177,
      "step": 680
    },
    {
      "epoch": 0.4008833332571314,
      "grad_norm": 0.0830078125,
      "learning_rate": 0.00014970468777477026,
      "loss": 1.1181,
      "step": 685
    },
    {
      "epoch": 0.4038094889743367,
      "grad_norm": 0.08349609375,
      "learning_rate": 0.00014881530387218325,
      "loss": 1.1417,
      "step": 690
    },
    {
      "epoch": 0.40673564469154205,
      "grad_norm": 0.08154296875,
      "learning_rate": 0.00014792082144831793,
      "loss": 1.1302,
      "step": 695
    },
    {
      "epoch": 0.4096618004087474,
      "grad_norm": 0.06787109375,
      "learning_rate": 0.00014702133392751688,
      "loss": 1.122,
      "step": 700
    },
    {
      "epoch": 0.4125879561259527,
      "grad_norm": 0.07177734375,
      "learning_rate": 0.00014611693525688066,
      "loss": 1.1268,
      "step": 705
    },
    {
      "epoch": 0.41551411184315806,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.00014520771989645563,
      "loss": 1.1238,
      "step": 710
    },
    {
      "epoch": 0.4184402675603634,
      "grad_norm": 0.07177734375,
      "learning_rate": 0.00014429378280936804,
      "loss": 1.119,
      "step": 715
    },
    {
      "epoch": 0.42136642327756874,
      "grad_norm": 0.0703125,
      "learning_rate": 0.0001433752194519054,
      "loss": 1.1187,
      "step": 720
    },
    {
      "epoch": 0.42429257899477407,
      "grad_norm": 0.0712890625,
      "learning_rate": 0.00014245212576354682,
      "loss": 1.122,
      "step": 725
    },
    {
      "epoch": 0.4272187347119794,
      "grad_norm": 0.0751953125,
      "learning_rate": 0.0001415245981569424,
      "loss": 1.1267,
      "step": 730
    },
    {
      "epoch": 0.43014489042918475,
      "grad_norm": 0.06591796875,
      "learning_rate": 0.00014059273350784342,
      "loss": 1.1273,
      "step": 735
    },
    {
      "epoch": 0.4330710461463901,
      "grad_norm": 0.076171875,
      "learning_rate": 0.00013965662914498428,
      "loss": 1.1267,
      "step": 740
    },
    {
      "epoch": 0.4359972018635954,
      "grad_norm": 0.06884765625,
      "learning_rate": 0.00013871638283991677,
      "loss": 1.1175,
      "step": 745
    },
    {
      "epoch": 0.43892335758080075,
      "grad_norm": 0.06787109375,
      "learning_rate": 0.0001377720927967985,
      "loss": 1.1211,
      "step": 750
    },
    {
      "epoch": 0.4418495132980061,
      "grad_norm": 0.1884765625,
      "learning_rate": 0.00013682385764213572,
      "loss": 1.1319,
      "step": 755
    },
    {
      "epoch": 0.44477566901521143,
      "grad_norm": 0.07666015625,
      "learning_rate": 0.00013587177641448265,
      "loss": 1.1233,
      "step": 760
    },
    {
      "epoch": 0.44770182473241676,
      "grad_norm": 0.078125,
      "learning_rate": 0.00013491594855409697,
      "loss": 1.1385,
      "step": 765
    },
    {
      "epoch": 0.4506279804496221,
      "grad_norm": 0.07470703125,
      "learning_rate": 0.00013395647389255396,
      "loss": 1.1189,
      "step": 770
    },
    {
      "epoch": 0.45355413616682744,
      "grad_norm": 0.072265625,
      "learning_rate": 0.00013299345264231957,
      "loss": 1.1157,
      "step": 775
    },
    {
      "epoch": 0.4564802918840328,
      "grad_norm": 0.07275390625,
      "learning_rate": 0.00013202698538628376,
      "loss": 1.1224,
      "step": 780
    },
    {
      "epoch": 0.4594064476012381,
      "grad_norm": 0.07080078125,
      "learning_rate": 0.00013105717306725501,
      "loss": 1.1283,
      "step": 785
    },
    {
      "epoch": 0.46233260331844345,
      "grad_norm": 0.0703125,
      "learning_rate": 0.0001300841169774174,
      "loss": 1.131,
      "step": 790
    },
    {
      "epoch": 0.4652587590356488,
      "grad_norm": 0.0712890625,
      "learning_rate": 0.000129107918747751,
      "loss": 1.1175,
      "step": 795
    },
    {
      "epoch": 0.4681849147528541,
      "grad_norm": 0.07373046875,
      "learning_rate": 0.00012812868033741724,
      "loss": 1.138,
      "step": 800
    },
    {
      "epoch": 0.47111107047005946,
      "grad_norm": 0.07275390625,
      "learning_rate": 0.00012714650402310967,
      "loss": 1.1344,
      "step": 805
    },
    {
      "epoch": 0.47403722618726485,
      "grad_norm": 0.07470703125,
      "learning_rate": 0.00012616149238837146,
      "loss": 1.1195,
      "step": 810
    },
    {
      "epoch": 0.4769633819044702,
      "grad_norm": 0.07958984375,
      "learning_rate": 0.00012517374831288146,
      "loss": 1.1005,
      "step": 815
    },
    {
      "epoch": 0.4798895376216755,
      "grad_norm": 0.0712890625,
      "learning_rate": 0.00012418337496170842,
      "loss": 1.1158,
      "step": 820
    },
    {
      "epoch": 0.48281569333888086,
      "grad_norm": 0.0712890625,
      "learning_rate": 0.00012319047577453638,
      "loss": 1.1181,
      "step": 825
    },
    {
      "epoch": 0.4857418490560862,
      "grad_norm": 0.07177734375,
      "learning_rate": 0.00012219515445486054,
      "loss": 1.1321,
      "step": 830
    },
    {
      "epoch": 0.48866800477329153,
      "grad_norm": 0.0751953125,
      "learning_rate": 0.00012119751495915617,
      "loss": 1.1309,
      "step": 835
    },
    {
      "epoch": 0.49159416049049687,
      "grad_norm": 0.06884765625,
      "learning_rate": 0.00012019766148602062,
      "loss": 1.1276,
      "step": 840
    },
    {
      "epoch": 0.4945203162077022,
      "grad_norm": 0.068359375,
      "learning_rate": 0.00011919569846529057,
      "loss": 1.1173,
      "step": 845
    },
    {
      "epoch": 0.49744647192490754,
      "grad_norm": 0.0703125,
      "learning_rate": 0.00011819173054713466,
      "loss": 1.1111,
      "step": 850
    },
    {
      "epoch": 0.5003726276421129,
      "grad_norm": 0.0751953125,
      "learning_rate": 0.00011718586259112326,
      "loss": 1.1137,
      "step": 855
    },
    {
      "epoch": 0.5032987833593182,
      "grad_norm": 0.0751953125,
      "learning_rate": 0.0001161781996552765,
      "loss": 1.1157,
      "step": 860
    },
    {
      "epoch": 0.5062249390765236,
      "grad_norm": 0.0703125,
      "learning_rate": 0.00011516884698509143,
      "loss": 1.1136,
      "step": 865
    },
    {
      "epoch": 0.5091510947937289,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.00011415791000254964,
      "loss": 1.1217,
      "step": 870
    },
    {
      "epoch": 0.5120772505109342,
      "grad_norm": 0.07275390625,
      "learning_rate": 0.0001131454942951065,
      "loss": 1.119,
      "step": 875
    },
    {
      "epoch": 0.5150034062281396,
      "grad_norm": 0.0693359375,
      "learning_rate": 0.0001121317056046629,
      "loss": 1.1122,
      "step": 880
    },
    {
      "epoch": 0.5179295619453449,
      "grad_norm": 0.072265625,
      "learning_rate": 0.00011111664981652121,
      "loss": 1.1137,
      "step": 885
    },
    {
      "epoch": 0.5208557176625502,
      "grad_norm": 0.0693359375,
      "learning_rate": 0.00011010043294832601,
      "loss": 1.1132,
      "step": 890
    },
    {
      "epoch": 0.5237818733797556,
      "grad_norm": 0.072265625,
      "learning_rate": 0.00010908316113899097,
      "loss": 1.1373,
      "step": 895
    },
    {
      "epoch": 0.5267080290969609,
      "grad_norm": 0.0732421875,
      "learning_rate": 0.00010806494063761335,
      "loss": 1.1165,
      "step": 900
    },
    {
      "epoch": 0.5296341848141662,
      "grad_norm": 0.0703125,
      "learning_rate": 0.00010704587779237654,
      "loss": 1.1149,
      "step": 905
    },
    {
      "epoch": 0.5325603405313716,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.00010602607903944279,
      "loss": 1.1244,
      "step": 910
    },
    {
      "epoch": 0.5354864962485769,
      "grad_norm": 0.06787109375,
      "learning_rate": 0.00010500565089183627,
      "loss": 1.1141,
      "step": 915
    },
    {
      "epoch": 0.5384126519657823,
      "grad_norm": 0.0673828125,
      "learning_rate": 0.00010398469992831832,
      "loss": 1.1031,
      "step": 920
    },
    {
      "epoch": 0.5413388076829876,
      "grad_norm": 0.06787109375,
      "learning_rate": 0.00010296333278225599,
      "loss": 1.1072,
      "step": 925
    },
    {
      "epoch": 0.5442649634001929,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.00010194165613048444,
      "loss": 1.0993,
      "step": 930
    },
    {
      "epoch": 0.5471911191173983,
      "grad_norm": 0.07275390625,
      "learning_rate": 0.00010091977668216524,
      "loss": 1.1089,
      "step": 935
    },
    {
      "epoch": 0.5501172748346036,
      "grad_norm": 0.068359375,
      "learning_rate": 9.989780116764115e-05,
      "loss": 1.1042,
      "step": 940
    },
    {
      "epoch": 0.5530434305518089,
      "grad_norm": 0.0712890625,
      "learning_rate": 9.887583632728845e-05,
      "loss": 1.1062,
      "step": 945
    },
    {
      "epoch": 0.5559695862690143,
      "grad_norm": 0.07080078125,
      "learning_rate": 9.785398890036867e-05,
      "loss": 1.1092,
      "step": 950
    },
    {
      "epoch": 0.5588957419862196,
      "grad_norm": 0.0693359375,
      "learning_rate": 9.683236561388e-05,
      "loss": 1.1173,
      "step": 955
    },
    {
      "epoch": 0.5618218977034249,
      "grad_norm": 0.0703125,
      "learning_rate": 9.581107317141026e-05,
      "loss": 1.1407,
      "step": 960
    },
    {
      "epoch": 0.5647480534206303,
      "grad_norm": 0.07666015625,
      "learning_rate": 9.479021824199229e-05,
      "loss": 1.1365,
      "step": 965
    },
    {
      "epoch": 0.5676742091378356,
      "grad_norm": 0.07763671875,
      "learning_rate": 9.376990744896276e-05,
      "loss": 1.1031,
      "step": 970
    },
    {
      "epoch": 0.570600364855041,
      "grad_norm": 0.0703125,
      "learning_rate": 9.275024735882588e-05,
      "loss": 1.0896,
      "step": 975
    },
    {
      "epoch": 0.5735265205722463,
      "grad_norm": 0.07275390625,
      "learning_rate": 9.173134447012322e-05,
      "loss": 1.1094,
      "step": 980
    },
    {
      "epoch": 0.5764526762894516,
      "grad_norm": 0.0751953125,
      "learning_rate": 9.071330520231033e-05,
      "loss": 1.1127,
      "step": 985
    },
    {
      "epoch": 0.579378832006657,
      "grad_norm": 0.0693359375,
      "learning_rate": 8.969623588464163e-05,
      "loss": 1.1176,
      "step": 990
    },
    {
      "epoch": 0.5823049877238623,
      "grad_norm": 0.0703125,
      "learning_rate": 8.868024274506505e-05,
      "loss": 1.112,
      "step": 995
    },
    {
      "epoch": 0.5852311434410676,
      "grad_norm": 0.0712890625,
      "learning_rate": 8.766543189912705e-05,
      "loss": 1.0846,
      "step": 1000
    },
    {
      "epoch": 0.588157299158273,
      "grad_norm": 0.07177734375,
      "learning_rate": 8.665190933888904e-05,
      "loss": 1.0961,
      "step": 1005
    },
    {
      "epoch": 0.5910834548754783,
      "grad_norm": 0.0712890625,
      "learning_rate": 8.56397809218574e-05,
      "loss": 1.1146,
      "step": 1010
    },
    {
      "epoch": 0.5940096105926836,
      "grad_norm": 0.07568359375,
      "learning_rate": 8.4629152359927e-05,
      "loss": 1.1066,
      "step": 1015
    },
    {
      "epoch": 0.596935766309889,
      "grad_norm": 0.07080078125,
      "learning_rate": 8.362012920834014e-05,
      "loss": 1.1253,
      "step": 1020
    },
    {
      "epoch": 0.5998619220270943,
      "grad_norm": 0.0693359375,
      "learning_rate": 8.261281685466177e-05,
      "loss": 1.1072,
      "step": 1025
    },
    {
      "epoch": 0.6027880777442998,
      "grad_norm": 0.07373046875,
      "learning_rate": 8.160732050777235e-05,
      "loss": 1.1147,
      "step": 1030
    },
    {
      "epoch": 0.6057142334615051,
      "grad_norm": 0.0712890625,
      "learning_rate": 8.060374518687926e-05,
      "loss": 1.11,
      "step": 1035
    },
    {
      "epoch": 0.6086403891787104,
      "grad_norm": 0.07080078125,
      "learning_rate": 7.960219571054799e-05,
      "loss": 1.123,
      "step": 1040
    },
    {
      "epoch": 0.6115665448959158,
      "grad_norm": 0.06787109375,
      "learning_rate": 7.860277668575449e-05,
      "loss": 1.1035,
      "step": 1045
    },
    {
      "epoch": 0.6144927006131211,
      "grad_norm": 0.0712890625,
      "learning_rate": 7.76055924969594e-05,
      "loss": 1.09,
      "step": 1050
    },
    {
      "epoch": 0.6174188563303264,
      "grad_norm": 0.0712890625,
      "learning_rate": 7.661074729520548e-05,
      "loss": 1.1279,
      "step": 1055
    },
    {
      "epoch": 0.6203450120475318,
      "grad_norm": 0.072265625,
      "learning_rate": 7.561834498723974e-05,
      "loss": 1.1141,
      "step": 1060
    },
    {
      "epoch": 0.6232711677647371,
      "grad_norm": 0.068359375,
      "learning_rate": 7.462848922466092e-05,
      "loss": 1.1102,
      "step": 1065
    },
    {
      "epoch": 0.6261973234819425,
      "grad_norm": 0.0673828125,
      "learning_rate": 7.364128339309326e-05,
      "loss": 1.1128,
      "step": 1070
    },
    {
      "epoch": 0.6291234791991478,
      "grad_norm": 0.07275390625,
      "learning_rate": 7.265683060138868e-05,
      "loss": 1.1054,
      "step": 1075
    },
    {
      "epoch": 0.6320496349163531,
      "grad_norm": 0.07666015625,
      "learning_rate": 7.167523367085749e-05,
      "loss": 1.1097,
      "step": 1080
    },
    {
      "epoch": 0.6349757906335585,
      "grad_norm": 0.0673828125,
      "learning_rate": 7.069659512452918e-05,
      "loss": 1.1148,
      "step": 1085
    },
    {
      "epoch": 0.6379019463507638,
      "grad_norm": 0.06982421875,
      "learning_rate": 6.972101717644429e-05,
      "loss": 1.0997,
      "step": 1090
    },
    {
      "epoch": 0.6408281020679691,
      "grad_norm": 0.07080078125,
      "learning_rate": 6.874860172097883e-05,
      "loss": 1.097,
      "step": 1095
    },
    {
      "epoch": 0.6437542577851745,
      "grad_norm": 0.07177734375,
      "learning_rate": 6.777945032220187e-05,
      "loss": 1.1006,
      "step": 1100
    },
    {
      "epoch": 0.6466804135023798,
      "grad_norm": 0.0693359375,
      "learning_rate": 6.681366420326747e-05,
      "loss": 1.1191,
      "step": 1105
    },
    {
      "epoch": 0.6496065692195852,
      "grad_norm": 0.162109375,
      "learning_rate": 6.58513442358427e-05,
      "loss": 1.0901,
      "step": 1110
    },
    {
      "epoch": 0.6525327249367905,
      "grad_norm": 0.0712890625,
      "learning_rate": 6.489259092957193e-05,
      "loss": 1.1113,
      "step": 1115
    },
    {
      "epoch": 0.6554588806539958,
      "grad_norm": 0.0751953125,
      "learning_rate": 6.3937504421579e-05,
      "loss": 1.0945,
      "step": 1120
    },
    {
      "epoch": 0.6583850363712012,
      "grad_norm": 0.06884765625,
      "learning_rate": 6.298618446600856e-05,
      "loss": 1.1073,
      "step": 1125
    },
    {
      "epoch": 0.6613111920884065,
      "grad_norm": 0.0693359375,
      "learning_rate": 6.203873042360722e-05,
      "loss": 1.1178,
      "step": 1130
    },
    {
      "epoch": 0.6642373478056118,
      "grad_norm": 0.072265625,
      "learning_rate": 6.109524125134571e-05,
      "loss": 1.1291,
      "step": 1135
    },
    {
      "epoch": 0.6671635035228172,
      "grad_norm": 0.07080078125,
      "learning_rate": 6.015581549208322e-05,
      "loss": 1.0985,
      "step": 1140
    },
    {
      "epoch": 0.6700896592400225,
      "grad_norm": 0.06884765625,
      "learning_rate": 5.9220551264275356e-05,
      "loss": 1.1294,
      "step": 1145
    },
    {
      "epoch": 0.6730158149572278,
      "grad_norm": 0.06884765625,
      "learning_rate": 5.828954625172597e-05,
      "loss": 1.115,
      "step": 1150
    },
    {
      "epoch": 0.6759419706744332,
      "grad_norm": 0.06689453125,
      "learning_rate": 5.736289769338441e-05,
      "loss": 1.1024,
      "step": 1155
    },
    {
      "epoch": 0.6788681263916385,
      "grad_norm": 0.0703125,
      "learning_rate": 5.644070237318977e-05,
      "loss": 1.0993,
      "step": 1160
    },
    {
      "epoch": 0.6817942821088439,
      "grad_norm": 0.0693359375,
      "learning_rate": 5.552305660996202e-05,
      "loss": 1.1172,
      "step": 1165
    },
    {
      "epoch": 0.6847204378260492,
      "grad_norm": 0.0693359375,
      "learning_rate": 5.4610056247341814e-05,
      "loss": 1.0988,
      "step": 1170
    },
    {
      "epoch": 0.6876465935432545,
      "grad_norm": 0.0712890625,
      "learning_rate": 5.3701796643780524e-05,
      "loss": 1.1142,
      "step": 1175
    },
    {
      "epoch": 0.6905727492604599,
      "grad_norm": 0.0693359375,
      "learning_rate": 5.279837266258016e-05,
      "loss": 1.1271,
      "step": 1180
    },
    {
      "epoch": 0.6934989049776652,
      "grad_norm": 0.06884765625,
      "learning_rate": 5.189987866198548e-05,
      "loss": 1.1055,
      "step": 1185
    },
    {
      "epoch": 0.6964250606948705,
      "grad_norm": 0.06982421875,
      "learning_rate": 5.100640848532878e-05,
      "loss": 1.1277,
      "step": 1190
    },
    {
      "epoch": 0.6993512164120759,
      "grad_norm": 0.06787109375,
      "learning_rate": 5.011805545122826e-05,
      "loss": 1.1093,
      "step": 1195
    },
    {
      "epoch": 0.7022773721292812,
      "grad_norm": 0.06884765625,
      "learning_rate": 4.923491234384158e-05,
      "loss": 1.1055,
      "step": 1200
    },
    {
      "epoch": 0.7052035278464865,
      "grad_norm": 0.0693359375,
      "learning_rate": 4.8357071403174746e-05,
      "loss": 1.098,
      "step": 1205
    },
    {
      "epoch": 0.7081296835636919,
      "grad_norm": 0.06689453125,
      "learning_rate": 4.748462431544826e-05,
      "loss": 1.114,
      "step": 1210
    },
    {
      "epoch": 0.7110558392808972,
      "grad_norm": 0.06884765625,
      "learning_rate": 4.661766220352097e-05,
      "loss": 1.1073,
      "step": 1215
    },
    {
      "epoch": 0.7139819949981026,
      "grad_norm": 0.06982421875,
      "learning_rate": 4.5756275617372465e-05,
      "loss": 1.1121,
      "step": 1220
    },
    {
      "epoch": 0.7169081507153079,
      "grad_norm": 0.0693359375,
      "learning_rate": 4.490055452464594e-05,
      "loss": 1.12,
      "step": 1225
    },
    {
      "epoch": 0.7198343064325132,
      "grad_norm": 0.06787109375,
      "learning_rate": 4.405058830125137e-05,
      "loss": 1.092,
      "step": 1230
    },
    {
      "epoch": 0.7227604621497186,
      "grad_norm": 0.06640625,
      "learning_rate": 4.320646572203033e-05,
      "loss": 1.0998,
      "step": 1235
    },
    {
      "epoch": 0.7256866178669239,
      "grad_norm": 0.06982421875,
      "learning_rate": 4.236827495148443e-05,
      "loss": 1.0993,
      "step": 1240
    },
    {
      "epoch": 0.7286127735841292,
      "grad_norm": 0.068359375,
      "learning_rate": 4.153610353456654e-05,
      "loss": 1.1323,
      "step": 1245
    },
    {
      "epoch": 0.7315389293013346,
      "grad_norm": 0.06982421875,
      "learning_rate": 4.071003838753737e-05,
      "loss": 1.1264,
      "step": 1250
    },
    {
      "epoch": 0.7344650850185399,
      "grad_norm": 0.0693359375,
      "learning_rate": 3.9890165788887365e-05,
      "loss": 1.1057,
      "step": 1255
    },
    {
      "epoch": 0.7373912407357452,
      "grad_norm": 0.06884765625,
      "learning_rate": 3.9076571370325364e-05,
      "loss": 1.1119,
      "step": 1260
    },
    {
      "epoch": 0.7403173964529506,
      "grad_norm": 0.0712890625,
      "learning_rate": 3.82693401078349e-05,
      "loss": 1.0996,
      "step": 1265
    },
    {
      "epoch": 0.7432435521701559,
      "grad_norm": 0.06787109375,
      "learning_rate": 3.7468556312798685e-05,
      "loss": 1.1051,
      "step": 1270
    },
    {
      "epoch": 0.7461697078873613,
      "grad_norm": 0.06787109375,
      "learning_rate": 3.667430362319277e-05,
      "loss": 1.0959,
      "step": 1275
    },
    {
      "epoch": 0.7490958636045666,
      "grad_norm": 0.068359375,
      "learning_rate": 3.588666499485115e-05,
      "loss": 1.1129,
      "step": 1280
    },
    {
      "epoch": 0.7520220193217719,
      "grad_norm": 0.0693359375,
      "learning_rate": 3.510572269280097e-05,
      "loss": 1.1184,
      "step": 1285
    },
    {
      "epoch": 0.7549481750389773,
      "grad_norm": 0.06787109375,
      "learning_rate": 3.433155828267089e-05,
      "loss": 1.1003,
      "step": 1290
    },
    {
      "epoch": 0.7578743307561826,
      "grad_norm": 0.06787109375,
      "learning_rate": 3.356425262217164e-05,
      "loss": 1.106,
      "step": 1295
    },
    {
      "epoch": 0.7608004864733879,
      "grad_norm": 0.06787109375,
      "learning_rate": 3.280388585265075e-05,
      "loss": 1.1066,
      "step": 1300
    },
    {
      "epoch": 0.7637266421905933,
      "grad_norm": 0.06689453125,
      "learning_rate": 3.205053739072248e-05,
      "loss": 1.1026,
      "step": 1305
    },
    {
      "epoch": 0.7666527979077986,
      "grad_norm": 0.06787109375,
      "learning_rate": 3.130428591997282e-05,
      "loss": 1.1256,
      "step": 1310
    },
    {
      "epoch": 0.769578953625004,
      "grad_norm": 0.0673828125,
      "learning_rate": 3.0565209382741664e-05,
      "loss": 1.1018,
      "step": 1315
    },
    {
      "epoch": 0.7725051093422093,
      "grad_norm": 0.068359375,
      "learning_rate": 2.9833384971981838e-05,
      "loss": 1.1099,
      "step": 1320
    },
    {
      "epoch": 0.7754312650594147,
      "grad_norm": 0.0673828125,
      "learning_rate": 2.9108889123196824e-05,
      "loss": 1.0995,
      "step": 1325
    },
    {
      "epoch": 0.7783574207766201,
      "grad_norm": 0.0673828125,
      "learning_rate": 2.839179750645752e-05,
      "loss": 1.1194,
      "step": 1330
    },
    {
      "epoch": 0.7812835764938254,
      "grad_norm": 0.0693359375,
      "learning_rate": 2.768218501849862e-05,
      "loss": 1.0955,
      "step": 1335
    },
    {
      "epoch": 0.7842097322110307,
      "grad_norm": 0.06591796875,
      "learning_rate": 2.6980125774896238e-05,
      "loss": 1.0712,
      "step": 1340
    },
    {
      "epoch": 0.7871358879282361,
      "grad_norm": 0.0693359375,
      "learning_rate": 2.6285693102326868e-05,
      "loss": 1.1019,
      "step": 1345
    },
    {
      "epoch": 0.7900620436454414,
      "grad_norm": 0.0673828125,
      "learning_rate": 2.559895953090856e-05,
      "loss": 1.1022,
      "step": 1350
    },
    {
      "epoch": 0.7929881993626468,
      "grad_norm": 0.06787109375,
      "learning_rate": 2.491999678662582e-05,
      "loss": 1.1027,
      "step": 1355
    },
    {
      "epoch": 0.7959143550798521,
      "grad_norm": 0.06787109375,
      "learning_rate": 2.4248875783837987e-05,
      "loss": 1.1292,
      "step": 1360
    },
    {
      "epoch": 0.7988405107970574,
      "grad_norm": 0.0693359375,
      "learning_rate": 2.358566661787257e-05,
      "loss": 1.1117,
      "step": 1365
    },
    {
      "epoch": 0.8017666665142628,
      "grad_norm": 0.06787109375,
      "learning_rate": 2.293043855770416e-05,
      "loss": 1.1176,
      "step": 1370
    },
    {
      "epoch": 0.8046928222314681,
      "grad_norm": 0.06982421875,
      "learning_rate": 2.2283260038719646e-05,
      "loss": 1.1074,
      "step": 1375
    },
    {
      "epoch": 0.8076189779486734,
      "grad_norm": 0.06689453125,
      "learning_rate": 2.1644198655570504e-05,
      "loss": 1.1123,
      "step": 1380
    },
    {
      "epoch": 0.8105451336658788,
      "grad_norm": 0.068359375,
      "learning_rate": 2.1013321155112754e-05,
      "loss": 1.0979,
      "step": 1385
    },
    {
      "epoch": 0.8134712893830841,
      "grad_norm": 0.068359375,
      "learning_rate": 2.0390693429435627e-05,
      "loss": 1.1102,
      "step": 1390
    },
    {
      "epoch": 0.8163974451002894,
      "grad_norm": 0.06884765625,
      "learning_rate": 1.977638050897954e-05,
      "loss": 1.1133,
      "step": 1395
    },
    {
      "epoch": 0.8193236008174948,
      "grad_norm": 0.0673828125,
      "learning_rate": 1.917044655574387e-05,
      "loss": 1.1045,
      "step": 1400
    },
    {
      "epoch": 0.8222497565347001,
      "grad_norm": 0.0673828125,
      "learning_rate": 1.8572954856585535e-05,
      "loss": 1.0967,
      "step": 1405
    },
    {
      "epoch": 0.8251759122519055,
      "grad_norm": 0.0673828125,
      "learning_rate": 1.798396781660914e-05,
      "loss": 1.1199,
      "step": 1410
    },
    {
      "epoch": 0.8281020679691108,
      "grad_norm": 0.06884765625,
      "learning_rate": 1.7403546952648885e-05,
      "loss": 1.1039,
      "step": 1415
    },
    {
      "epoch": 0.8310282236863161,
      "grad_norm": 0.06787109375,
      "learning_rate": 1.6831752886843512e-05,
      "loss": 1.1106,
      "step": 1420
    },
    {
      "epoch": 0.8339543794035215,
      "grad_norm": 0.06982421875,
      "learning_rate": 1.626864534030469e-05,
      "loss": 1.106,
      "step": 1425
    },
    {
      "epoch": 0.8368805351207268,
      "grad_norm": 0.0673828125,
      "learning_rate": 1.571428312687928e-05,
      "loss": 1.1004,
      "step": 1430
    },
    {
      "epoch": 0.8398066908379321,
      "grad_norm": 0.0693359375,
      "learning_rate": 1.5168724147006652e-05,
      "loss": 1.1244,
      "step": 1435
    },
    {
      "epoch": 0.8427328465551375,
      "grad_norm": 0.0673828125,
      "learning_rate": 1.4632025381671133e-05,
      "loss": 1.1227,
      "step": 1440
    },
    {
      "epoch": 0.8456590022723428,
      "grad_norm": 0.0673828125,
      "learning_rate": 1.4104242886450824e-05,
      "loss": 1.1073,
      "step": 1445
    },
    {
      "epoch": 0.8485851579895481,
      "grad_norm": 0.07568359375,
      "learning_rate": 1.3585431785662627e-05,
      "loss": 1.0903,
      "step": 1450
    },
    {
      "epoch": 0.8515113137067535,
      "grad_norm": 0.06884765625,
      "learning_rate": 1.3075646266604913e-05,
      "loss": 1.1129,
      "step": 1455
    },
    {
      "epoch": 0.8544374694239588,
      "grad_norm": 0.06982421875,
      "learning_rate": 1.257493957389796e-05,
      "loss": 1.1293,
      "step": 1460
    },
    {
      "epoch": 0.8573636251411642,
      "grad_norm": 0.0693359375,
      "learning_rate": 1.208336400392268e-05,
      "loss": 1.0987,
      "step": 1465
    },
    {
      "epoch": 0.8602897808583695,
      "grad_norm": 0.06689453125,
      "learning_rate": 1.1600970899358588e-05,
      "loss": 1.1044,
      "step": 1470
    },
    {
      "epoch": 0.8632159365755748,
      "grad_norm": 0.0693359375,
      "learning_rate": 1.1127810643821401e-05,
      "loss": 1.1182,
      "step": 1475
    },
    {
      "epoch": 0.8661420922927802,
      "grad_norm": 0.0673828125,
      "learning_rate": 1.0663932656600505e-05,
      "loss": 1.0957,
      "step": 1480
    },
    {
      "epoch": 0.8690682480099855,
      "grad_norm": 0.06689453125,
      "learning_rate": 1.0209385387497517e-05,
      "loss": 1.1238,
      "step": 1485
    },
    {
      "epoch": 0.8719944037271908,
      "grad_norm": 0.068359375,
      "learning_rate": 9.764216311765905e-06,
      "loss": 1.1209,
      "step": 1490
    },
    {
      "epoch": 0.8749205594443962,
      "grad_norm": 0.06591796875,
      "learning_rate": 9.328471925152381e-06,
      "loss": 1.1046,
      "step": 1495
    },
    {
      "epoch": 0.8778467151616015,
      "grad_norm": 0.06787109375,
      "learning_rate": 8.902197739040708e-06,
      "loss": 1.1205,
      "step": 1500
    },
    {
      "epoch": 0.8807728708788068,
      "grad_norm": 0.06787109375,
      "learning_rate": 8.485438275698154e-06,
      "loss": 1.1202,
      "step": 1505
    },
    {
      "epoch": 0.8836990265960122,
      "grad_norm": 0.07080078125,
      "learning_rate": 8.078237063625538e-06,
      "loss": 1.1177,
      "step": 1510
    },
    {
      "epoch": 0.8866251823132175,
      "grad_norm": 0.0673828125,
      "learning_rate": 7.680636633010695e-06,
      "loss": 1.1116,
      "step": 1515
    },
    {
      "epoch": 0.8895513380304229,
      "grad_norm": 0.0712890625,
      "learning_rate": 7.292678511286522e-06,
      "loss": 1.1067,
      "step": 1520
    },
    {
      "epoch": 0.8924774937476282,
      "grad_norm": 0.06884765625,
      "learning_rate": 6.914403218793608e-06,
      "loss": 1.12,
      "step": 1525
    },
    {
      "epoch": 0.8954036494648335,
      "grad_norm": 0.0693359375,
      "learning_rate": 6.5458502645480924e-06,
      "loss": 1.1298,
      "step": 1530
    },
    {
      "epoch": 0.8983298051820389,
      "grad_norm": 0.0673828125,
      "learning_rate": 6.187058142115077e-06,
      "loss": 1.1069,
      "step": 1535
    },
    {
      "epoch": 0.9012559608992442,
      "grad_norm": 0.06640625,
      "learning_rate": 5.838064325588288e-06,
      "loss": 1.0941,
      "step": 1540
    },
    {
      "epoch": 0.9041821166164495,
      "grad_norm": 0.06640625,
      "learning_rate": 5.498905265675958e-06,
      "loss": 1.0976,
      "step": 1545
    },
    {
      "epoch": 0.9071082723336549,
      "grad_norm": 0.06787109375,
      "learning_rate": 5.169616385893794e-06,
      "loss": 1.101,
      "step": 1550
    },
    {
      "epoch": 0.9100344280508602,
      "grad_norm": 0.0654296875,
      "learning_rate": 4.850232078865169e-06,
      "loss": 1.121,
      "step": 1555
    },
    {
      "epoch": 0.9129605837680655,
      "grad_norm": 0.068359375,
      "learning_rate": 4.5407857027289555e-06,
      "loss": 1.1013,
      "step": 1560
    },
    {
      "epoch": 0.9158867394852709,
      "grad_norm": 0.0693359375,
      "learning_rate": 4.241309577655406e-06,
      "loss": 1.1464,
      "step": 1565
    },
    {
      "epoch": 0.9188128952024762,
      "grad_norm": 0.0654296875,
      "learning_rate": 3.951834982470526e-06,
      "loss": 1.1111,
      "step": 1570
    },
    {
      "epoch": 0.9217390509196816,
      "grad_norm": 0.06640625,
      "learning_rate": 3.672392151389137e-06,
      "loss": 1.1078,
      "step": 1575
    },
    {
      "epoch": 0.9246652066368869,
      "grad_norm": 0.07080078125,
      "learning_rate": 3.4030102708570212e-06,
      "loss": 1.1195,
      "step": 1580
    },
    {
      "epoch": 0.9275913623540922,
      "grad_norm": 0.0703125,
      "learning_rate": 3.143717476502572e-06,
      "loss": 1.1216,
      "step": 1585
    },
    {
      "epoch": 0.9305175180712976,
      "grad_norm": 0.0673828125,
      "learning_rate": 2.8945408501981906e-06,
      "loss": 1.0936,
      "step": 1590
    },
    {
      "epoch": 0.9334436737885029,
      "grad_norm": 0.06787109375,
      "learning_rate": 2.6555064172316234e-06,
      "loss": 1.106,
      "step": 1595
    },
    {
      "epoch": 0.9363698295057082,
      "grad_norm": 0.068359375,
      "learning_rate": 2.4266391435878387e-06,
      "loss": 1.1061,
      "step": 1600
    },
    {
      "epoch": 0.9392959852229136,
      "grad_norm": 0.068359375,
      "learning_rate": 2.2079629333414453e-06,
      "loss": 1.1119,
      "step": 1605
    },
    {
      "epoch": 0.9422221409401189,
      "grad_norm": 0.06787109375,
      "learning_rate": 1.999500626159967e-06,
      "loss": 1.1094,
      "step": 1610
    },
    {
      "epoch": 0.9451482966573242,
      "grad_norm": 0.06884765625,
      "learning_rate": 1.8012739949183844e-06,
      "loss": 1.1141,
      "step": 1615
    },
    {
      "epoch": 0.9480744523745297,
      "grad_norm": 0.06591796875,
      "learning_rate": 1.6133037434250985e-06,
      "loss": 1.1084,
      "step": 1620
    },
    {
      "epoch": 0.951000608091735,
      "grad_norm": 0.068359375,
      "learning_rate": 1.4356095042594386e-06,
      "loss": 1.1208,
      "step": 1625
    },
    {
      "epoch": 0.9539267638089404,
      "grad_norm": 0.0693359375,
      "learning_rate": 1.2682098367212237e-06,
      "loss": 1.1116,
      "step": 1630
    },
    {
      "epoch": 0.9568529195261457,
      "grad_norm": 0.06640625,
      "learning_rate": 1.1111222248922471e-06,
      "loss": 1.1047,
      "step": 1635
    },
    {
      "epoch": 0.959779075243351,
      "grad_norm": 0.0673828125,
      "learning_rate": 9.643630758102484e-07,
      "loss": 1.0998,
      "step": 1640
    },
    {
      "epoch": 0.9627052309605564,
      "grad_norm": 0.0654296875,
      "learning_rate": 8.279477177551842e-07,
      "loss": 1.1073,
      "step": 1645
    },
    {
      "epoch": 0.9656313866777617,
      "grad_norm": 0.0673828125,
      "learning_rate": 7.018903986483083e-07,
      "loss": 1.1124,
      "step": 1650
    },
    {
      "epoch": 0.968557542394967,
      "grad_norm": 0.0654296875,
      "learning_rate": 5.862042845640403e-07,
      "loss": 1.1023,
      "step": 1655
    },
    {
      "epoch": 0.9714836981121724,
      "grad_norm": 0.06396484375,
      "learning_rate": 4.809014583548432e-07,
      "loss": 1.1234,
      "step": 1660
    },
    {
      "epoch": 0.9744098538293777,
      "grad_norm": 0.06982421875,
      "learning_rate": 3.859929183892108e-07,
      "loss": 1.111,
      "step": 1665
    },
    {
      "epoch": 0.9773360095465831,
      "grad_norm": 0.06787109375,
      "learning_rate": 3.014885774029419e-07,
      "loss": 1.1179,
      "step": 1670
    },
    {
      "epoch": 0.9802621652637884,
      "grad_norm": 0.064453125,
      "learning_rate": 2.2739726146381311e-07,
      "loss": 1.1082,
      "step": 1675
    },
    {
      "epoch": 0.9831883209809937,
      "grad_norm": 0.06982421875,
      "learning_rate": 1.6372670904974963e-07,
      "loss": 1.14,
      "step": 1680
    },
    {
      "epoch": 0.9861144766981991,
      "grad_norm": 0.06591796875,
      "learning_rate": 1.1048357024054934e-07,
      "loss": 1.1126,
      "step": 1685
    },
    {
      "epoch": 0.9890406324154044,
      "grad_norm": 0.06689453125,
      "learning_rate": 6.76734060233275e-08,
      "loss": 1.1087,
      "step": 1690
    },
    {
      "epoch": 0.9919667881326097,
      "grad_norm": 0.06591796875,
      "learning_rate": 3.5300687711703475e-08,
      "loss": 1.1235,
      "step": 1695
    },
    {
      "epoch": 0.9948929438498151,
      "grad_norm": 0.0654296875,
      "learning_rate": 1.3368796478807621e-08,
      "loss": 1.1019,
      "step": 1700
    },
    {
      "epoch": 0.9978190995670204,
      "grad_norm": 0.06787109375,
      "learning_rate": 1.8800230040860733e-09,
      "loss": 1.1143,
      "step": 1705
    },
    {
      "epoch": 0.9995747929973436,
      "eval_loss": 1.278271198272705,
      "eval_runtime": 1249.6014,
      "eval_samples_per_second": 12.937,
      "eval_steps_per_second": 12.937,
      "step": 1708
    },
    {
      "epoch": 0.9995747929973436,
      "step": 1708,
      "total_flos": 2.8130589802160456e+18,
      "train_loss": 0.978924176871637,
      "train_runtime": 53778.1468,
      "train_samples_per_second": 4.067,
      "train_steps_per_second": 0.032
    }
  ],
  "logging_steps": 5,
  "max_steps": 1708,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20,
  "total_flos": 2.8130589802160456e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}