{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1019,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009813542688910697,
      "grad_norm": 511850.5,
      "learning_rate": 3.921568627450981e-07,
      "loss": 3.358,
      "step": 10
    },
    {
      "epoch": 0.019627085377821395,
      "grad_norm": 852402.5625,
      "learning_rate": 7.843137254901962e-07,
      "loss": 4.0004,
      "step": 20
    },
    {
      "epoch": 0.029440628066732092,
      "grad_norm": 41.74958801269531,
      "learning_rate": 1.1764705882352942e-06,
      "loss": 3.1047,
      "step": 30
    },
    {
      "epoch": 0.03925417075564279,
      "grad_norm": 29.402185440063477,
      "learning_rate": 1.5686274509803923e-06,
      "loss": 2.4535,
      "step": 40
    },
    {
      "epoch": 0.04906771344455348,
      "grad_norm": 42.18159866333008,
      "learning_rate": 1.96078431372549e-06,
      "loss": 3.8615,
      "step": 50
    },
    {
      "epoch": 0.058881256133464184,
      "grad_norm": 15.556477546691895,
      "learning_rate": 2.3529411764705885e-06,
      "loss": 2.6701,
      "step": 60
    },
    {
      "epoch": 0.06869479882237488,
      "grad_norm": 1009053.9375,
      "learning_rate": 2.7450980392156867e-06,
      "loss": 2.5866,
      "step": 70
    },
    {
      "epoch": 0.07850834151128558,
      "grad_norm": 3.3355813026428223,
      "learning_rate": 3.1372549019607846e-06,
      "loss": 2.1546,
      "step": 80
    },
    {
      "epoch": 0.08832188420019627,
      "grad_norm": 6364043.5,
      "learning_rate": 3.529411764705883e-06,
      "loss": 1.8961,
      "step": 90
    },
    {
      "epoch": 0.09813542688910697,
      "grad_norm": 10.409449577331543,
      "learning_rate": 3.92156862745098e-06,
      "loss": 3.8516,
      "step": 100
    },
    {
      "epoch": 0.10794896957801767,
      "grad_norm": 3035116.25,
      "learning_rate": 4.313725490196079e-06,
      "loss": 2.656,
      "step": 110
    },
    {
      "epoch": 0.11776251226692837,
      "grad_norm": 1568043.375,
      "learning_rate": 4.705882352941177e-06,
      "loss": 2.5056,
      "step": 120
    },
    {
      "epoch": 0.12757605495583907,
      "grad_norm": 2.928816795349121,
      "learning_rate": 5.098039215686274e-06,
      "loss": 1.4711,
      "step": 130
    },
    {
      "epoch": 0.13738959764474976,
      "grad_norm": 2664501.25,
      "learning_rate": 5.4901960784313735e-06,
      "loss": 2.4145,
      "step": 140
    },
    {
      "epoch": 0.14720314033366044,
      "grad_norm": 5433292.0,
      "learning_rate": 5.882352941176471e-06,
      "loss": 2.5401,
      "step": 150
    },
    {
      "epoch": 0.15701668302257116,
      "grad_norm": 2.9648523330688477,
      "learning_rate": 6.274509803921569e-06,
      "loss": 1.1487,
      "step": 160
    },
    {
      "epoch": 0.16683022571148184,
      "grad_norm": 3384641.25,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.5521,
      "step": 170
    },
    {
      "epoch": 0.17664376840039253,
      "grad_norm": 2175554.5,
      "learning_rate": 7.058823529411766e-06,
      "loss": 2.2009,
      "step": 180
    },
    {
      "epoch": 0.18645731108930325,
      "grad_norm": 4.447290897369385,
      "learning_rate": 7.450980392156863e-06,
      "loss": 1.6716,
      "step": 190
    },
    {
      "epoch": 0.19627085377821393,
      "grad_norm": 2.5263493061065674,
      "learning_rate": 7.84313725490196e-06,
      "loss": 3.2454,
      "step": 200
    },
    {
      "epoch": 0.20608439646712462,
      "grad_norm": 772904.75,
      "learning_rate": 8.23529411764706e-06,
      "loss": 1.622,
      "step": 210
    },
    {
      "epoch": 0.21589793915603533,
      "grad_norm": 3.8818535804748535,
      "learning_rate": 8.627450980392157e-06,
      "loss": 2.1061,
      "step": 220
    },
    {
      "epoch": 0.22571148184494602,
      "grad_norm": 4.158895015716553,
      "learning_rate": 9.019607843137256e-06,
      "loss": 1.7532,
      "step": 230
    },
    {
      "epoch": 0.23552502453385674,
      "grad_norm": 4.066924571990967,
      "learning_rate": 9.411764705882354e-06,
      "loss": 1.6545,
      "step": 240
    },
    {
      "epoch": 0.24533856722276742,
      "grad_norm": 1732917.625,
      "learning_rate": 9.803921568627451e-06,
      "loss": 2.3048,
      "step": 250
    },
    {
      "epoch": 0.25515210991167814,
      "grad_norm": 221879.953125,
      "learning_rate": 1e-05,
      "loss": 1.3068,
      "step": 260
    },
    {
      "epoch": 0.2649656526005888,
      "grad_norm": 859696.125,
      "learning_rate": 1e-05,
      "loss": 1.7565,
      "step": 270
    },
    {
      "epoch": 0.2747791952894995,
      "grad_norm": 3.6436195373535156,
      "learning_rate": 1e-05,
      "loss": 2.1635,
      "step": 280
    },
    {
      "epoch": 0.2845927379784102,
      "grad_norm": 5.748630046844482,
      "learning_rate": 1e-05,
      "loss": 1.678,
      "step": 290
    },
    {
      "epoch": 0.2944062806673209,
      "grad_norm": 493996.46875,
      "learning_rate": 1e-05,
      "loss": 2.1511,
      "step": 300
    },
    {
      "epoch": 0.3042198233562316,
      "grad_norm": 9.792704582214355,
      "learning_rate": 1e-05,
      "loss": 2.6151,
      "step": 310
    },
    {
      "epoch": 0.3140333660451423,
      "grad_norm": 1645629.125,
      "learning_rate": 1e-05,
      "loss": 1.4831,
      "step": 320
    },
    {
      "epoch": 0.323846908734053,
      "grad_norm": 4.4405131340026855,
      "learning_rate": 1e-05,
      "loss": 2.7356,
      "step": 330
    },
    {
      "epoch": 0.3336604514229637,
      "grad_norm": 8.055213928222656,
      "learning_rate": 1e-05,
      "loss": 1.9915,
      "step": 340
    },
    {
      "epoch": 0.3434739941118744,
      "grad_norm": 604711.25,
      "learning_rate": 1e-05,
      "loss": 1.7425,
      "step": 350
    },
    {
      "epoch": 0.35328753680078506,
      "grad_norm": 8.440975189208984,
      "learning_rate": 1e-05,
      "loss": 1.5921,
      "step": 360
    },
    {
      "epoch": 0.3631010794896958,
      "grad_norm": 1266034.375,
      "learning_rate": 1e-05,
      "loss": 1.0832,
      "step": 370
    },
    {
      "epoch": 0.3729146221786065,
      "grad_norm": 3.2711005210876465,
      "learning_rate": 1e-05,
      "loss": 1.0002,
      "step": 380
    },
    {
      "epoch": 0.38272816486751715,
      "grad_norm": 1738446.375,
      "learning_rate": 1e-05,
      "loss": 1.7899,
      "step": 390
    },
    {
      "epoch": 0.39254170755642787,
      "grad_norm": 10.065378189086914,
      "learning_rate": 1e-05,
      "loss": 1.942,
      "step": 400
    },
    {
      "epoch": 0.4023552502453386,
      "grad_norm": 839358.625,
      "learning_rate": 1e-05,
      "loss": 2.2337,
      "step": 410
    },
    {
      "epoch": 0.41216879293424924,
      "grad_norm": 820795.5,
      "learning_rate": 1e-05,
      "loss": 2.4434,
      "step": 420
    },
    {
      "epoch": 0.42198233562315995,
      "grad_norm": 760894.875,
      "learning_rate": 1e-05,
      "loss": 1.332,
      "step": 430
    },
    {
      "epoch": 0.43179587831207067,
      "grad_norm": 465132.90625,
      "learning_rate": 1e-05,
      "loss": 2.1238,
      "step": 440
    },
    {
      "epoch": 0.44160942100098133,
      "grad_norm": 151798.609375,
      "learning_rate": 1e-05,
      "loss": 2.0838,
      "step": 450
    },
    {
      "epoch": 0.45142296368989204,
      "grad_norm": 3.6023194789886475,
      "learning_rate": 1e-05,
      "loss": 1.5318,
      "step": 460
    },
    {
      "epoch": 0.46123650637880276,
      "grad_norm": 3.711779832839966,
      "learning_rate": 1e-05,
      "loss": 1.8415,
      "step": 470
    },
    {
      "epoch": 0.47105004906771347,
      "grad_norm": 3.6837337017059326,
      "learning_rate": 1e-05,
      "loss": 2.2333,
      "step": 480
    },
    {
      "epoch": 0.48086359175662413,
      "grad_norm": 2.7638938426971436,
      "learning_rate": 1e-05,
      "loss": 1.612,
      "step": 490
    },
    {
      "epoch": 0.49067713444553485,
      "grad_norm": 2.2806527614593506,
      "learning_rate": 1e-05,
      "loss": 2.4336,
      "step": 500
    },
    {
      "epoch": 0.5004906771344455,
      "grad_norm": 2.6325523853302,
      "learning_rate": 1e-05,
      "loss": 2.3051,
      "step": 510
    },
    {
      "epoch": 0.5103042198233563,
      "grad_norm": 4.162623882293701,
      "learning_rate": 1e-05,
      "loss": 2.5193,
      "step": 520
    },
    {
      "epoch": 0.5201177625122669,
      "grad_norm": 3.865851879119873,
      "learning_rate": 1e-05,
      "loss": 2.1113,
      "step": 530
    },
    {
      "epoch": 0.5299313052011776,
      "grad_norm": 3.6652672290802,
      "learning_rate": 1e-05,
      "loss": 2.605,
      "step": 540
    },
    {
      "epoch": 0.5397448478900884,
      "grad_norm": 1123418.0,
      "learning_rate": 1e-05,
      "loss": 2.367,
      "step": 550
    },
    {
      "epoch": 0.549558390578999,
      "grad_norm": 3.206057071685791,
      "learning_rate": 1e-05,
      "loss": 0.9706,
      "step": 560
    },
    {
      "epoch": 0.5593719332679097,
      "grad_norm": 3.8300833702087402,
      "learning_rate": 1e-05,
      "loss": 1.6688,
      "step": 570
    },
    {
      "epoch": 0.5691854759568205,
      "grad_norm": 3.4160726070404053,
      "learning_rate": 1e-05,
      "loss": 1.8959,
      "step": 580
    },
    {
      "epoch": 0.5789990186457311,
      "grad_norm": 6.991641044616699,
      "learning_rate": 1e-05,
      "loss": 2.8449,
      "step": 590
    },
    {
      "epoch": 0.5888125613346418,
      "grad_norm": 3.89111065864563,
      "learning_rate": 1e-05,
      "loss": 2.8364,
      "step": 600
    },
    {
      "epoch": 0.5986261040235525,
      "grad_norm": 12.52274227142334,
      "learning_rate": 1e-05,
      "loss": 2.3841,
      "step": 610
    },
    {
      "epoch": 0.6084396467124632,
      "grad_norm": 1124655.25,
      "learning_rate": 1e-05,
      "loss": 2.8931,
      "step": 620
    },
    {
      "epoch": 0.6182531894013739,
      "grad_norm": 2132181.75,
      "learning_rate": 1e-05,
      "loss": 1.8265,
      "step": 630
    },
    {
      "epoch": 0.6280667320902846,
      "grad_norm": 3.21681547164917,
      "learning_rate": 1e-05,
      "loss": 0.8137,
      "step": 640
    },
    {
      "epoch": 0.6378802747791953,
      "grad_norm": 1385230.375,
      "learning_rate": 1e-05,
      "loss": 1.2742,
      "step": 650
    },
    {
      "epoch": 0.647693817468106,
      "grad_norm": 10.80539321899414,
      "learning_rate": 1e-05,
      "loss": 3.0502,
      "step": 660
    },
    {
      "epoch": 0.6575073601570167,
      "grad_norm": 1592570.0,
      "learning_rate": 1e-05,
      "loss": 1.9121,
      "step": 670
    },
    {
      "epoch": 0.6673209028459274,
      "grad_norm": 985591.5625,
      "learning_rate": 1e-05,
      "loss": 1.8159,
      "step": 680
    },
    {
      "epoch": 0.677134445534838,
      "grad_norm": 1119573.375,
      "learning_rate": 1e-05,
      "loss": 1.9695,
      "step": 690
    },
    {
      "epoch": 0.6869479882237488,
      "grad_norm": 3.928929090499878,
      "learning_rate": 1e-05,
      "loss": 2.1545,
      "step": 700
    },
    {
      "epoch": 0.6967615309126595,
      "grad_norm": 998297.4375,
      "learning_rate": 1e-05,
      "loss": 1.2963,
      "step": 710
    },
    {
      "epoch": 0.7065750736015701,
      "grad_norm": 3.8201591968536377,
      "learning_rate": 1e-05,
      "loss": 0.9735,
      "step": 720
    },
    {
      "epoch": 0.7163886162904809,
      "grad_norm": 3.7799386978149414,
      "learning_rate": 1e-05,
      "loss": 1.5274,
      "step": 730
    },
    {
      "epoch": 0.7262021589793916,
      "grad_norm": 3.718870162963867,
      "learning_rate": 1e-05,
      "loss": 2.9676,
      "step": 740
    },
    {
      "epoch": 0.7360157016683022,
      "grad_norm": 4.023947715759277,
      "learning_rate": 1e-05,
      "loss": 1.3345,
      "step": 750
    },
    {
      "epoch": 0.745829244357213,
      "grad_norm": 14.283628463745117,
      "learning_rate": 1e-05,
      "loss": 2.7141,
      "step": 760
    },
    {
      "epoch": 0.7556427870461236,
      "grad_norm": 3178157.25,
      "learning_rate": 1e-05,
      "loss": 0.9265,
      "step": 770
    },
    {
      "epoch": 0.7654563297350343,
      "grad_norm": 3.6791253089904785,
      "learning_rate": 1e-05,
      "loss": 1.8104,
      "step": 780
    },
    {
      "epoch": 0.7752698724239451,
      "grad_norm": 4302724.5,
      "learning_rate": 1e-05,
      "loss": 1.4787,
      "step": 790
    },
    {
      "epoch": 0.7850834151128557,
      "grad_norm": 1720963.75,
      "learning_rate": 1e-05,
      "loss": 2.1176,
      "step": 800
    },
    {
      "epoch": 0.7948969578017664,
      "grad_norm": 1612358.875,
      "learning_rate": 1e-05,
      "loss": 1.2736,
      "step": 810
    },
    {
      "epoch": 0.8047105004906772,
      "grad_norm": 1152146.25,
      "learning_rate": 1e-05,
      "loss": 1.5657,
      "step": 820
    },
    {
      "epoch": 0.8145240431795878,
      "grad_norm": 3.5905027389526367,
      "learning_rate": 1e-05,
      "loss": 2.6198,
      "step": 830
    },
    {
      "epoch": 0.8243375858684985,
      "grad_norm": 736680.8125,
      "learning_rate": 1e-05,
      "loss": 0.9112,
      "step": 840
    },
    {
      "epoch": 0.8341511285574092,
      "grad_norm": 2.9653732776641846,
      "learning_rate": 1e-05,
      "loss": 2.3842,
      "step": 850
    },
    {
      "epoch": 0.8439646712463199,
      "grad_norm": 12.001425743103027,
      "learning_rate": 1e-05,
      "loss": 2.3966,
      "step": 860
    },
    {
      "epoch": 0.8537782139352306,
      "grad_norm": 2124122.25,
      "learning_rate": 1e-05,
      "loss": 1.3734,
      "step": 870
    },
    {
      "epoch": 0.8635917566241413,
      "grad_norm": 6534144.0,
      "learning_rate": 1e-05,
      "loss": 1.3486,
      "step": 880
    },
    {
      "epoch": 0.873405299313052,
      "grad_norm": 3.6779091358184814,
      "learning_rate": 1e-05,
      "loss": 0.949,
      "step": 890
    },
    {
      "epoch": 0.8832188420019627,
      "grad_norm": 1221940.0,
      "learning_rate": 1e-05,
      "loss": 2.6138,
      "step": 900
    },
    {
      "epoch": 0.8930323846908734,
      "grad_norm": 1095478.5,
      "learning_rate": 1e-05,
      "loss": 1.4675,
      "step": 910
    },
    {
      "epoch": 0.9028459273797841,
      "grad_norm": 548933.875,
      "learning_rate": 1e-05,
      "loss": 2.8343,
      "step": 920
    },
    {
      "epoch": 0.9126594700686947,
      "grad_norm": 13.783559799194336,
      "learning_rate": 1e-05,
      "loss": 2.1122,
      "step": 930
    },
    {
      "epoch": 0.9224730127576055,
      "grad_norm": 13.174997329711914,
      "learning_rate": 1e-05,
      "loss": 2.4962,
      "step": 940
    },
    {
      "epoch": 0.9322865554465162,
      "grad_norm": 10.191123962402344,
      "learning_rate": 1e-05,
      "loss": 2.2086,
      "step": 950
    },
    {
      "epoch": 0.9421000981354269,
      "grad_norm": 3.606752872467041,
      "learning_rate": 1e-05,
      "loss": 1.323,
      "step": 960
    },
    {
      "epoch": 0.9519136408243376,
      "grad_norm": 2473294.0,
      "learning_rate": 1e-05,
      "loss": 1.0528,
      "step": 970
    },
    {
      "epoch": 0.9617271835132483,
      "grad_norm": 2.848081588745117,
      "learning_rate": 1e-05,
      "loss": 1.5576,
      "step": 980
    },
    {
      "epoch": 0.971540726202159,
      "grad_norm": 3.5542256832122803,
      "learning_rate": 1e-05,
      "loss": 1.8997,
      "step": 990
    },
    {
      "epoch": 0.9813542688910697,
      "grad_norm": 1991637.375,
      "learning_rate": 1e-05,
      "loss": 2.5923,
      "step": 1000
    },
    {
      "epoch": 0.9911678115799804,
      "grad_norm": 21.8354434967041,
      "learning_rate": 1e-05,
      "loss": 2.0656,
      "step": 1010
    }
  ],
  "logging_steps": 10,
  "max_steps": 1019,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 255,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}