|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 900, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0011111111111111111, |
|
"grad_norm": 5.2032976150512695, |
|
"learning_rate": 4.999984769144476e-05, |
|
"loss": 5.3058, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0022222222222222222, |
|
"grad_norm": 6.879838466644287, |
|
"learning_rate": 4.999939076763487e-05, |
|
"loss": 5.0848, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0033333333333333335, |
|
"grad_norm": 6.035982131958008, |
|
"learning_rate": 4.999862923413781e-05, |
|
"loss": 5.5976, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0044444444444444444, |
|
"grad_norm": 7.264829635620117, |
|
"learning_rate": 4.999756310023261e-05, |
|
"loss": 5.007, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.005555555555555556, |
|
"grad_norm": 4.736705303192139, |
|
"learning_rate": 4.9996192378909786e-05, |
|
"loss": 4.6123, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.006666666666666667, |
|
"grad_norm": 6.610605239868164, |
|
"learning_rate": 4.999451708687114e-05, |
|
"loss": 4.7884, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0077777777777777776, |
|
"grad_norm": 6.082452774047852, |
|
"learning_rate": 4.999253724452958e-05, |
|
"loss": 4.913, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.008888888888888889, |
|
"grad_norm": 4.39306116104126, |
|
"learning_rate": 4.999025287600886e-05, |
|
"loss": 5.2733, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.614330291748047, |
|
"learning_rate": 4.998766400914329e-05, |
|
"loss": 4.6074, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.011111111111111112, |
|
"grad_norm": 5.944769859313965, |
|
"learning_rate": 4.99847706754774e-05, |
|
"loss": 4.9337, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.012222222222222223, |
|
"grad_norm": 7.276998519897461, |
|
"learning_rate": 4.998157291026553e-05, |
|
"loss": 5.0143, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.013333333333333334, |
|
"grad_norm": 5.569228172302246, |
|
"learning_rate": 4.997807075247146e-05, |
|
"loss": 5.1253, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.014444444444444444, |
|
"grad_norm": 5.123626232147217, |
|
"learning_rate": 4.997426424476787e-05, |
|
"loss": 4.4759, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.015555555555555555, |
|
"grad_norm": 4.314916133880615, |
|
"learning_rate": 4.997015343353585e-05, |
|
"loss": 5.193, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.016666666666666666, |
|
"grad_norm": 4.61911153793335, |
|
"learning_rate": 4.996573836886435e-05, |
|
"loss": 4.3899, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.017777777777777778, |
|
"grad_norm": 6.143403053283691, |
|
"learning_rate": 4.996101910454953e-05, |
|
"loss": 4.2176, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.01888888888888889, |
|
"grad_norm": 6.195430278778076, |
|
"learning_rate": 4.995599569809414e-05, |
|
"loss": 4.1796, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.779390335083008, |
|
"learning_rate": 4.995066821070679e-05, |
|
"loss": 5.0214, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.021111111111111112, |
|
"grad_norm": 5.847035884857178, |
|
"learning_rate": 4.994503670730125e-05, |
|
"loss": 4.5121, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.022222222222222223, |
|
"grad_norm": 5.528200626373291, |
|
"learning_rate": 4.993910125649561e-05, |
|
"loss": 4.2415, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.023333333333333334, |
|
"grad_norm": 5.237406253814697, |
|
"learning_rate": 4.9932861930611454e-05, |
|
"loss": 5.0282, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.024444444444444446, |
|
"grad_norm": 5.065497875213623, |
|
"learning_rate": 4.992631880567301e-05, |
|
"loss": 4.525, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.025555555555555557, |
|
"grad_norm": 5.5612688064575195, |
|
"learning_rate": 4.991947196140618e-05, |
|
"loss": 4.9982, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 5.090909481048584, |
|
"learning_rate": 4.991232148123761e-05, |
|
"loss": 4.5534, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.027777777777777776, |
|
"grad_norm": 5.165072441101074, |
|
"learning_rate": 4.990486745229364e-05, |
|
"loss": 4.6862, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.028888888888888888, |
|
"grad_norm": 4.630911827087402, |
|
"learning_rate": 4.989710996539926e-05, |
|
"loss": 4.8492, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.68540358543396, |
|
"learning_rate": 4.9889049115077005e-05, |
|
"loss": 5.1254, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.03111111111111111, |
|
"grad_norm": 5.599917888641357, |
|
"learning_rate": 4.988068499954578e-05, |
|
"loss": 4.9527, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03222222222222222, |
|
"grad_norm": 5.534451007843018, |
|
"learning_rate": 4.987201772071971e-05, |
|
"loss": 4.912, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.03333333333333333, |
|
"grad_norm": 4.299800395965576, |
|
"learning_rate": 4.9863047384206835e-05, |
|
"loss": 5.1243, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.034444444444444444, |
|
"grad_norm": 3.687239646911621, |
|
"learning_rate": 4.985377409930789e-05, |
|
"loss": 4.5257, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.035555555555555556, |
|
"grad_norm": 5.489537239074707, |
|
"learning_rate": 4.984419797901491e-05, |
|
"loss": 4.9116, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.03666666666666667, |
|
"grad_norm": 4.619030475616455, |
|
"learning_rate": 4.983431914000991e-05, |
|
"loss": 4.718, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.03777777777777778, |
|
"grad_norm": 5.1001200675964355, |
|
"learning_rate": 4.982413770266342e-05, |
|
"loss": 5.0285, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.03888888888888889, |
|
"grad_norm": 4.231574058532715, |
|
"learning_rate": 4.9813653791033057e-05, |
|
"loss": 4.7938, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.560554027557373, |
|
"learning_rate": 4.980286753286195e-05, |
|
"loss": 4.962, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.04111111111111111, |
|
"grad_norm": 3.8664653301239014, |
|
"learning_rate": 4.979177905957726e-05, |
|
"loss": 4.9856, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.042222222222222223, |
|
"grad_norm": 4.1073784828186035, |
|
"learning_rate": 4.978038850628854e-05, |
|
"loss": 5.2019, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.043333333333333335, |
|
"grad_norm": 4.941130638122559, |
|
"learning_rate": 4.976869601178609e-05, |
|
"loss": 4.6499, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.044444444444444446, |
|
"grad_norm": 5.749270915985107, |
|
"learning_rate": 4.975670171853926e-05, |
|
"loss": 4.1511, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04555555555555556, |
|
"grad_norm": 3.7464685440063477, |
|
"learning_rate": 4.9744405772694725e-05, |
|
"loss": 4.9937, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.04666666666666667, |
|
"grad_norm": 4.391846656799316, |
|
"learning_rate": 4.9731808324074717e-05, |
|
"loss": 4.9573, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.04777777777777778, |
|
"grad_norm": 4.163111209869385, |
|
"learning_rate": 4.971890952617515e-05, |
|
"loss": 4.8546, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.04888888888888889, |
|
"grad_norm": 3.859717607498169, |
|
"learning_rate": 4.9705709536163824e-05, |
|
"loss": 4.8448, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.045307636260986, |
|
"learning_rate": 4.9692208514878444e-05, |
|
"loss": 4.8979, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.051111111111111114, |
|
"grad_norm": 3.083608627319336, |
|
"learning_rate": 4.96784066268247e-05, |
|
"loss": 4.6191, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.052222222222222225, |
|
"grad_norm": 3.6996843814849854, |
|
"learning_rate": 4.966430404017424e-05, |
|
"loss": 4.1142, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 5.001142501831055, |
|
"learning_rate": 4.964990092676263e-05, |
|
"loss": 4.673, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.05444444444444444, |
|
"grad_norm": 4.148028373718262, |
|
"learning_rate": 4.963519746208726e-05, |
|
"loss": 4.4178, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.05555555555555555, |
|
"grad_norm": 3.529871940612793, |
|
"learning_rate": 4.962019382530521e-05, |
|
"loss": 5.134, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.056666666666666664, |
|
"grad_norm": 3.791576385498047, |
|
"learning_rate": 4.960489019923105e-05, |
|
"loss": 4.5824, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.057777777777777775, |
|
"grad_norm": 3.236461877822876, |
|
"learning_rate": 4.9589286770334654e-05, |
|
"loss": 4.9126, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.058888888888888886, |
|
"grad_norm": 3.4092698097229004, |
|
"learning_rate": 4.957338372873886e-05, |
|
"loss": 4.9913, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.24392557144165, |
|
"learning_rate": 4.9557181268217227e-05, |
|
"loss": 4.3492, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.06111111111111111, |
|
"grad_norm": 3.5253679752349854, |
|
"learning_rate": 4.9540679586191605e-05, |
|
"loss": 4.5665, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.06222222222222222, |
|
"grad_norm": 4.3137688636779785, |
|
"learning_rate": 4.952387888372979e-05, |
|
"loss": 4.3782, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.06333333333333334, |
|
"grad_norm": 3.4922027587890625, |
|
"learning_rate": 4.9506779365543046e-05, |
|
"loss": 4.4069, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.06444444444444444, |
|
"grad_norm": 3.7192225456237793, |
|
"learning_rate": 4.94893812399836e-05, |
|
"loss": 4.6152, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.06555555555555556, |
|
"grad_norm": 3.398974895477295, |
|
"learning_rate": 4.947168471904213e-05, |
|
"loss": 4.8951, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.06666666666666667, |
|
"grad_norm": 2.9628076553344727, |
|
"learning_rate": 4.9453690018345144e-05, |
|
"loss": 4.5419, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06777777777777778, |
|
"grad_norm": 2.703320026397705, |
|
"learning_rate": 4.94353973571524e-05, |
|
"loss": 4.9154, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.06888888888888889, |
|
"grad_norm": 2.9003796577453613, |
|
"learning_rate": 4.94168069583542e-05, |
|
"loss": 4.8565, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.6896684169769287, |
|
"learning_rate": 4.939791904846869e-05, |
|
"loss": 4.6401, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.07111111111111111, |
|
"grad_norm": 3.679429292678833, |
|
"learning_rate": 4.937873385763908e-05, |
|
"loss": 4.5216, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.07222222222222222, |
|
"grad_norm": 3.837848424911499, |
|
"learning_rate": 4.9359251619630886e-05, |
|
"loss": 4.7913, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.07333333333333333, |
|
"grad_norm": 4.7550368309021, |
|
"learning_rate": 4.933947257182901e-05, |
|
"loss": 4.6804, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.07444444444444444, |
|
"grad_norm": 3.387397289276123, |
|
"learning_rate": 4.931939695523492e-05, |
|
"loss": 5.1575, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.07555555555555556, |
|
"grad_norm": 2.715179204940796, |
|
"learning_rate": 4.929902501446366e-05, |
|
"loss": 4.8116, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.07666666666666666, |
|
"grad_norm": 3.598045587539673, |
|
"learning_rate": 4.9278356997740904e-05, |
|
"loss": 4.8033, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.07777777777777778, |
|
"grad_norm": 3.2445831298828125, |
|
"learning_rate": 4.925739315689991e-05, |
|
"loss": 5.0033, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.07888888888888888, |
|
"grad_norm": 3.411445379257202, |
|
"learning_rate": 4.9236133747378475e-05, |
|
"loss": 4.7147, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 7.331087589263916, |
|
"learning_rate": 4.9214579028215776e-05, |
|
"loss": 4.3199, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.0811111111111111, |
|
"grad_norm": 5.33408784866333, |
|
"learning_rate": 4.919272926204929e-05, |
|
"loss": 4.882, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.08222222222222222, |
|
"grad_norm": 2.8994922637939453, |
|
"learning_rate": 4.917058471511149e-05, |
|
"loss": 4.678, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.08333333333333333, |
|
"grad_norm": 2.394202709197998, |
|
"learning_rate": 4.914814565722671e-05, |
|
"loss": 4.7618, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.08444444444444445, |
|
"grad_norm": 3.3278257846832275, |
|
"learning_rate": 4.912541236180779e-05, |
|
"loss": 4.5066, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.08555555555555555, |
|
"grad_norm": 5.1034836769104, |
|
"learning_rate": 4.910238510585276e-05, |
|
"loss": 4.9339, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.08666666666666667, |
|
"grad_norm": 3.776923179626465, |
|
"learning_rate": 4.907906416994146e-05, |
|
"loss": 4.264, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.08777777777777777, |
|
"grad_norm": 3.5819032192230225, |
|
"learning_rate": 4.905544983823214e-05, |
|
"loss": 4.6317, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.08888888888888889, |
|
"grad_norm": 4.019664764404297, |
|
"learning_rate": 4.9031542398457974e-05, |
|
"loss": 4.2868, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.2063353061676025, |
|
"learning_rate": 4.900734214192358e-05, |
|
"loss": 4.7617, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.09111111111111111, |
|
"grad_norm": 3.4615073204040527, |
|
"learning_rate": 4.898284936350144e-05, |
|
"loss": 4.6781, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.09222222222222222, |
|
"grad_norm": 3.8503334522247314, |
|
"learning_rate": 4.895806436162833e-05, |
|
"loss": 5.0211, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.09333333333333334, |
|
"grad_norm": 3.9291231632232666, |
|
"learning_rate": 4.893298743830168e-05, |
|
"loss": 4.8865, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.09444444444444444, |
|
"grad_norm": 3.537541389465332, |
|
"learning_rate": 4.890761889907589e-05, |
|
"loss": 4.5888, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.09555555555555556, |
|
"grad_norm": 2.426281690597534, |
|
"learning_rate": 4.888195905305859e-05, |
|
"loss": 4.3387, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.09666666666666666, |
|
"grad_norm": 3.3084747791290283, |
|
"learning_rate": 4.8856008212906925e-05, |
|
"loss": 4.9159, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.09777777777777778, |
|
"grad_norm": 4.331256866455078, |
|
"learning_rate": 4.882976669482367e-05, |
|
"loss": 4.7531, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.09888888888888889, |
|
"grad_norm": 3.6446895599365234, |
|
"learning_rate": 4.880323481855347e-05, |
|
"loss": 4.2317, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.512236595153809, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 4.5889, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.10111111111111111, |
|
"grad_norm": 4.778031349182129, |
|
"learning_rate": 4.874930128811631e-05, |
|
"loss": 4.7279, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.10222222222222223, |
|
"grad_norm": 2.602832794189453, |
|
"learning_rate": 4.8721900291112415e-05, |
|
"loss": 4.9481, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.10333333333333333, |
|
"grad_norm": 2.8278868198394775, |
|
"learning_rate": 4.869421025023965e-05, |
|
"loss": 4.5763, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.10444444444444445, |
|
"grad_norm": 3.5263729095458984, |
|
"learning_rate": 4.8666231502892415e-05, |
|
"loss": 4.3702, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.10555555555555556, |
|
"grad_norm": 3.6424851417541504, |
|
"learning_rate": 4.8637964389982926e-05, |
|
"loss": 4.0502, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 3.5338454246520996, |
|
"learning_rate": 4.860940925593703e-05, |
|
"loss": 4.7823, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.10777777777777778, |
|
"grad_norm": 3.6265504360198975, |
|
"learning_rate": 4.858056644869002e-05, |
|
"loss": 4.7303, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.10888888888888888, |
|
"grad_norm": 2.4503519535064697, |
|
"learning_rate": 4.855143631968242e-05, |
|
"loss": 4.4291, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.9208950996398926, |
|
"learning_rate": 4.852201922385564e-05, |
|
"loss": 4.876, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.1111111111111111, |
|
"grad_norm": 2.9791765213012695, |
|
"learning_rate": 4.849231551964771e-05, |
|
"loss": 4.8165, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11222222222222222, |
|
"grad_norm": 3.589217185974121, |
|
"learning_rate": 4.84623255689889e-05, |
|
"loss": 4.7452, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.11333333333333333, |
|
"grad_norm": 3.037071943283081, |
|
"learning_rate": 4.843204973729729e-05, |
|
"loss": 4.7103, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.11444444444444445, |
|
"grad_norm": 2.5937793254852295, |
|
"learning_rate": 4.840148839347434e-05, |
|
"loss": 4.4314, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.11555555555555555, |
|
"grad_norm": 2.879254102706909, |
|
"learning_rate": 4.837064190990036e-05, |
|
"loss": 4.4885, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.11666666666666667, |
|
"grad_norm": 3.404500722885132, |
|
"learning_rate": 4.8339510662430046e-05, |
|
"loss": 4.4227, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.11777777777777777, |
|
"grad_norm": 3.4791483879089355, |
|
"learning_rate": 4.830809503038781e-05, |
|
"loss": 4.8543, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.11888888888888889, |
|
"grad_norm": 3.072810649871826, |
|
"learning_rate": 4.827639539656321e-05, |
|
"loss": 4.7271, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.365445375442505, |
|
"learning_rate": 4.8244412147206284e-05, |
|
"loss": 4.8491, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.12111111111111111, |
|
"grad_norm": 3.6025092601776123, |
|
"learning_rate": 4.8212145672022844e-05, |
|
"loss": 4.7209, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.12222222222222222, |
|
"grad_norm": 4.458660125732422, |
|
"learning_rate": 4.817959636416969e-05, |
|
"loss": 4.1645, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12333333333333334, |
|
"grad_norm": 3.9988503456115723, |
|
"learning_rate": 4.814676462024988e-05, |
|
"loss": 4.6813, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.12444444444444444, |
|
"grad_norm": 2.8452532291412354, |
|
"learning_rate": 4.8113650840307834e-05, |
|
"loss": 5.0675, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.12555555555555556, |
|
"grad_norm": 2.9468061923980713, |
|
"learning_rate": 4.808025542782453e-05, |
|
"loss": 4.8562, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.12666666666666668, |
|
"grad_norm": 3.0511226654052734, |
|
"learning_rate": 4.8046578789712515e-05, |
|
"loss": 4.5727, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.12777777777777777, |
|
"grad_norm": 2.0922510623931885, |
|
"learning_rate": 4.8012621336311016e-05, |
|
"loss": 4.7914, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.1288888888888889, |
|
"grad_norm": 2.8942031860351562, |
|
"learning_rate": 4.797838348138086e-05, |
|
"loss": 4.6763, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.84708571434021, |
|
"learning_rate": 4.794386564209953e-05, |
|
"loss": 4.2561, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.13111111111111112, |
|
"grad_norm": 2.471663236618042, |
|
"learning_rate": 4.790906823905599e-05, |
|
"loss": 4.4677, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.1322222222222222, |
|
"grad_norm": 2.5082037448883057, |
|
"learning_rate": 4.7873991696245624e-05, |
|
"loss": 4.56, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 2.900052309036255, |
|
"learning_rate": 4.783863644106502e-05, |
|
"loss": 4.8909, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13444444444444445, |
|
"grad_norm": 3.5951879024505615, |
|
"learning_rate": 4.780300290430682e-05, |
|
"loss": 4.6476, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.13555555555555557, |
|
"grad_norm": 4.468568325042725, |
|
"learning_rate": 4.776709152015443e-05, |
|
"loss": 4.3256, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.13666666666666666, |
|
"grad_norm": 3.0081839561462402, |
|
"learning_rate": 4.773090272617672e-05, |
|
"loss": 4.7223, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.13777777777777778, |
|
"grad_norm": 3.8555331230163574, |
|
"learning_rate": 4.769443696332272e-05, |
|
"loss": 4.4773, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.1388888888888889, |
|
"grad_norm": 3.729095697402954, |
|
"learning_rate": 4.765769467591625e-05, |
|
"loss": 4.7924, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.2823543548583984, |
|
"learning_rate": 4.762067631165049e-05, |
|
"loss": 4.5892, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.1411111111111111, |
|
"grad_norm": 3.335906982421875, |
|
"learning_rate": 4.758338232158252e-05, |
|
"loss": 4.8221, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.14222222222222222, |
|
"grad_norm": 5.226222038269043, |
|
"learning_rate": 4.754581316012785e-05, |
|
"loss": 4.5129, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.14333333333333334, |
|
"grad_norm": 3.1001462936401367, |
|
"learning_rate": 4.7507969285054845e-05, |
|
"loss": 4.4719, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.14444444444444443, |
|
"grad_norm": 3.3555104732513428, |
|
"learning_rate": 4.7469851157479177e-05, |
|
"loss": 4.5403, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14555555555555555, |
|
"grad_norm": 2.935755968093872, |
|
"learning_rate": 4.743145924185821e-05, |
|
"loss": 4.8928, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.14666666666666667, |
|
"grad_norm": 2.488250970840454, |
|
"learning_rate": 4.7392794005985326e-05, |
|
"loss": 4.2008, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.14777777777777779, |
|
"grad_norm": 3.4012887477874756, |
|
"learning_rate": 4.73538559209842e-05, |
|
"loss": 4.6079, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.14888888888888888, |
|
"grad_norm": 2.7918901443481445, |
|
"learning_rate": 4.731464546130314e-05, |
|
"loss": 4.7116, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.9989566802978516, |
|
"learning_rate": 4.72751631047092e-05, |
|
"loss": 4.3616, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.1511111111111111, |
|
"grad_norm": 3.592566967010498, |
|
"learning_rate": 4.723540933228244e-05, |
|
"loss": 4.7092, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.15222222222222223, |
|
"grad_norm": 2.825819730758667, |
|
"learning_rate": 4.719538462841003e-05, |
|
"loss": 4.8076, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.15333333333333332, |
|
"grad_norm": 3.5768320560455322, |
|
"learning_rate": 4.715508948078037e-05, |
|
"loss": 4.2689, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.15444444444444444, |
|
"grad_norm": 2.7928998470306396, |
|
"learning_rate": 4.71145243803771e-05, |
|
"loss": 4.5123, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.15555555555555556, |
|
"grad_norm": 3.065845251083374, |
|
"learning_rate": 4.707368982147318e-05, |
|
"loss": 4.5658, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.15666666666666668, |
|
"grad_norm": 3.1111562252044678, |
|
"learning_rate": 4.70325863016248e-05, |
|
"loss": 4.6722, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.15777777777777777, |
|
"grad_norm": 3.132770299911499, |
|
"learning_rate": 4.6991214321665414e-05, |
|
"loss": 4.3566, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.15888888888888889, |
|
"grad_norm": 3.0841097831726074, |
|
"learning_rate": 4.694957438569951e-05, |
|
"loss": 4.9723, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.105175018310547, |
|
"learning_rate": 4.690766700109659e-05, |
|
"loss": 4.4099, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.16111111111111112, |
|
"grad_norm": 4.112144470214844, |
|
"learning_rate": 4.6865492678484895e-05, |
|
"loss": 4.2418, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.1622222222222222, |
|
"grad_norm": 2.671475648880005, |
|
"learning_rate": 4.682305193174524e-05, |
|
"loss": 4.823, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.16333333333333333, |
|
"grad_norm": 3.42596697807312, |
|
"learning_rate": 4.678034527800474e-05, |
|
"loss": 4.6529, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.16444444444444445, |
|
"grad_norm": 3.2327771186828613, |
|
"learning_rate": 4.6737373237630476e-05, |
|
"loss": 4.6662, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.16555555555555557, |
|
"grad_norm": 3.2889630794525146, |
|
"learning_rate": 4.669413633422322e-05, |
|
"loss": 4.3048, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 2.366293430328369, |
|
"learning_rate": 4.665063509461097e-05, |
|
"loss": 4.7887, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16777777777777778, |
|
"grad_norm": 2.6844308376312256, |
|
"learning_rate": 4.6606870048842624e-05, |
|
"loss": 4.954, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.1688888888888889, |
|
"grad_norm": 3.2190423011779785, |
|
"learning_rate": 4.656284173018144e-05, |
|
"loss": 5.189, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.640512466430664, |
|
"learning_rate": 4.65185506750986e-05, |
|
"loss": 4.4657, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.1711111111111111, |
|
"grad_norm": 2.7704906463623047, |
|
"learning_rate": 4.6473997423266614e-05, |
|
"loss": 4.6634, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.17222222222222222, |
|
"grad_norm": 2.7830865383148193, |
|
"learning_rate": 4.642918251755281e-05, |
|
"loss": 4.5943, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.17333333333333334, |
|
"grad_norm": 2.327153444290161, |
|
"learning_rate": 4.638410650401267e-05, |
|
"loss": 4.8456, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.17444444444444446, |
|
"grad_norm": 3.3280811309814453, |
|
"learning_rate": 4.6338769931883185e-05, |
|
"loss": 4.6068, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.17555555555555555, |
|
"grad_norm": 3.1970295906066895, |
|
"learning_rate": 4.629317335357619e-05, |
|
"loss": 4.6854, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.17666666666666667, |
|
"grad_norm": 2.481355667114258, |
|
"learning_rate": 4.6247317324671605e-05, |
|
"loss": 4.5796, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.17777777777777778, |
|
"grad_norm": 2.445061683654785, |
|
"learning_rate": 4.620120240391065e-05, |
|
"loss": 4.3907, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.17888888888888888, |
|
"grad_norm": 3.381376028060913, |
|
"learning_rate": 4.615482915318911e-05, |
|
"loss": 4.6822, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.4204893112182617, |
|
"learning_rate": 4.610819813755038e-05, |
|
"loss": 4.436, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.1811111111111111, |
|
"grad_norm": 2.725168228149414, |
|
"learning_rate": 4.606130992517869e-05, |
|
"loss": 5.0643, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.18222222222222223, |
|
"grad_norm": 3.7455644607543945, |
|
"learning_rate": 4.601416508739211e-05, |
|
"loss": 4.7741, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.18333333333333332, |
|
"grad_norm": 2.5998661518096924, |
|
"learning_rate": 4.5966764198635606e-05, |
|
"loss": 4.8321, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.18444444444444444, |
|
"grad_norm": 4.380634784698486, |
|
"learning_rate": 4.591910783647404e-05, |
|
"loss": 4.6678, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.18555555555555556, |
|
"grad_norm": 2.3288722038269043, |
|
"learning_rate": 4.5871196581585166e-05, |
|
"loss": 4.8369, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.18666666666666668, |
|
"grad_norm": 2.959716320037842, |
|
"learning_rate": 4.5823031017752485e-05, |
|
"loss": 4.0906, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.18777777777777777, |
|
"grad_norm": 2.6955947875976562, |
|
"learning_rate": 4.577461173185821e-05, |
|
"loss": 4.6623, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.18888888888888888, |
|
"grad_norm": 4.677174091339111, |
|
"learning_rate": 4.572593931387604e-05, |
|
"loss": 4.9871, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.5706987380981445, |
|
"learning_rate": 4.567701435686404e-05, |
|
"loss": 4.8683, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.19111111111111112, |
|
"grad_norm": 3.341663122177124, |
|
"learning_rate": 4.562783745695738e-05, |
|
"loss": 4.6751, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.1922222222222222, |
|
"grad_norm": 2.941930055618286, |
|
"learning_rate": 4.557840921336105e-05, |
|
"loss": 4.8538, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.19333333333333333, |
|
"grad_norm": 2.8567423820495605, |
|
"learning_rate": 4.5528730228342605e-05, |
|
"loss": 4.5703, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.19444444444444445, |
|
"grad_norm": 2.6613831520080566, |
|
"learning_rate": 4.54788011072248e-05, |
|
"loss": 4.4107, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.19555555555555557, |
|
"grad_norm": 2.5689127445220947, |
|
"learning_rate": 4.542862245837821e-05, |
|
"loss": 4.7408, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.19666666666666666, |
|
"grad_norm": 3.576414108276367, |
|
"learning_rate": 4.537819489321386e-05, |
|
"loss": 4.5211, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.19777777777777777, |
|
"grad_norm": 3.1265640258789062, |
|
"learning_rate": 4.532751902617569e-05, |
|
"loss": 4.0729, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.1988888888888889, |
|
"grad_norm": 3.3458447456359863, |
|
"learning_rate": 4.527659547473317e-05, |
|
"loss": 5.1058, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.459259033203125, |
|
"learning_rate": 4.522542485937369e-05, |
|
"loss": 4.3809, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2011111111111111, |
|
"grad_norm": 2.7210464477539062, |
|
"learning_rate": 4.5174007803595055e-05, |
|
"loss": 4.5236, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.20222222222222222, |
|
"grad_norm": 3.285710334777832, |
|
"learning_rate": 4.512234493389785e-05, |
|
"loss": 4.6732, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.20333333333333334, |
|
"grad_norm": 3.063709020614624, |
|
"learning_rate": 4.5070436879777865e-05, |
|
"loss": 4.2399, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.20444444444444446, |
|
"grad_norm": 2.4527218341827393, |
|
"learning_rate": 4.5018284273718336e-05, |
|
"loss": 4.8007, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.20555555555555555, |
|
"grad_norm": 3.8102920055389404, |
|
"learning_rate": 4.496588775118232e-05, |
|
"loss": 4.8862, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.20666666666666667, |
|
"grad_norm": 4.2287139892578125, |
|
"learning_rate": 4.491324795060491e-05, |
|
"loss": 4.3253, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.20777777777777778, |
|
"grad_norm": 3.0381033420562744, |
|
"learning_rate": 4.4860365513385456e-05, |
|
"loss": 4.5634, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.2088888888888889, |
|
"grad_norm": 3.7136149406433105, |
|
"learning_rate": 4.480724108387977e-05, |
|
"loss": 5.318, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.9353251457214355, |
|
"learning_rate": 4.4753875309392266e-05, |
|
"loss": 4.6681, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.2111111111111111, |
|
"grad_norm": 2.4687249660491943, |
|
"learning_rate": 4.4700268840168045e-05, |
|
"loss": 4.7589, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.21222222222222223, |
|
"grad_norm": 2.1315102577209473, |
|
"learning_rate": 4.464642232938505e-05, |
|
"loss": 4.737, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 3.893827438354492, |
|
"learning_rate": 4.4592336433146e-05, |
|
"loss": 4.1764, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.21444444444444444, |
|
"grad_norm": 2.8332619667053223, |
|
"learning_rate": 4.453801181047047e-05, |
|
"loss": 4.7298, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.21555555555555556, |
|
"grad_norm": 2.2660417556762695, |
|
"learning_rate": 4.448344912328686e-05, |
|
"loss": 4.6808, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.21666666666666667, |
|
"grad_norm": 3.361409902572632, |
|
"learning_rate": 4.442864903642428e-05, |
|
"loss": 4.6071, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.21777777777777776, |
|
"grad_norm": 2.6645731925964355, |
|
"learning_rate": 4.4373612217604496e-05, |
|
"loss": 4.552, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.21888888888888888, |
|
"grad_norm": 3.0278093814849854, |
|
"learning_rate": 4.431833933743378e-05, |
|
"loss": 4.9144, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.6761045455932617, |
|
"learning_rate": 4.426283106939474e-05, |
|
"loss": 4.6239, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.22111111111111112, |
|
"grad_norm": 3.8143444061279297, |
|
"learning_rate": 4.420708808983809e-05, |
|
"loss": 4.5675, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 3.473196506500244, |
|
"learning_rate": 4.415111107797445e-05, |
|
"loss": 4.6748, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.22333333333333333, |
|
"grad_norm": 3.977616310119629, |
|
"learning_rate": 4.4094900715866064e-05, |
|
"loss": 4.3232, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.22444444444444445, |
|
"grad_norm": 3.6012306213378906, |
|
"learning_rate": 4.403845768841842e-05, |
|
"loss": 4.975, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.22555555555555556, |
|
"grad_norm": 3.368455171585083, |
|
"learning_rate": 4.3981782683372016e-05, |
|
"loss": 4.5316, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.22666666666666666, |
|
"grad_norm": 4.002109050750732, |
|
"learning_rate": 4.3924876391293915e-05, |
|
"loss": 4.2947, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.22777777777777777, |
|
"grad_norm": 2.9084315299987793, |
|
"learning_rate": 4.386773950556931e-05, |
|
"loss": 4.4741, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.2288888888888889, |
|
"grad_norm": 2.937263011932373, |
|
"learning_rate": 4.381037272239311e-05, |
|
"loss": 4.1499, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.508908987045288, |
|
"learning_rate": 4.375277674076149e-05, |
|
"loss": 4.856, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.2311111111111111, |
|
"grad_norm": 3.1218552589416504, |
|
"learning_rate": 4.36949522624633e-05, |
|
"loss": 4.7484, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.23222222222222222, |
|
"grad_norm": 3.868100881576538, |
|
"learning_rate": 4.363689999207156e-05, |
|
"loss": 4.4354, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.23333333333333334, |
|
"grad_norm": 2.4734623432159424, |
|
"learning_rate": 4.357862063693486e-05, |
|
"loss": 4.978, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.23444444444444446, |
|
"grad_norm": 3.2189548015594482, |
|
"learning_rate": 4.352011490716875e-05, |
|
"loss": 4.9206, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.23555555555555555, |
|
"grad_norm": 3.757636308670044, |
|
"learning_rate": 4.3461383515647106e-05, |
|
"loss": 4.5211, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.23666666666666666, |
|
"grad_norm": 4.024762153625488, |
|
"learning_rate": 4.3402427177993366e-05, |
|
"loss": 4.5448, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.23777777777777778, |
|
"grad_norm": 3.536659002304077, |
|
"learning_rate": 4.334324661257191e-05, |
|
"loss": 5.0555, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.2388888888888889, |
|
"grad_norm": 2.2506678104400635, |
|
"learning_rate": 4.3283842540479264e-05, |
|
"loss": 4.8103, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.8261499404907227, |
|
"learning_rate": 4.3224215685535294e-05, |
|
"loss": 4.449, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.2411111111111111, |
|
"grad_norm": 3.2074854373931885, |
|
"learning_rate": 4.31643667742744e-05, |
|
"loss": 4.3977, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.24222222222222223, |
|
"grad_norm": 2.743082284927368, |
|
"learning_rate": 4.3104296535936695e-05, |
|
"loss": 4.2452, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.24333333333333335, |
|
"grad_norm": 2.7638344764709473, |
|
"learning_rate": 4.304400570245906e-05, |
|
"loss": 4.6636, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.24444444444444444, |
|
"grad_norm": 3.1931586265563965, |
|
"learning_rate": 4.2983495008466276e-05, |
|
"loss": 4.3444, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.24555555555555555, |
|
"grad_norm": 3.946772575378418, |
|
"learning_rate": 4.292276519126207e-05, |
|
"loss": 4.4841, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.24666666666666667, |
|
"grad_norm": 2.5195651054382324, |
|
"learning_rate": 4.2861816990820084e-05, |
|
"loss": 4.7453, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.2477777777777778, |
|
"grad_norm": 2.1805219650268555, |
|
"learning_rate": 4.280065114977492e-05, |
|
"loss": 4.7288, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.24888888888888888, |
|
"grad_norm": 2.361443519592285, |
|
"learning_rate": 4.273926841341302e-05, |
|
"loss": 4.9004, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.45947265625, |
|
"learning_rate": 4.267766952966369e-05, |
|
"loss": 4.3567, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.2511111111111111, |
|
"grad_norm": 3.4783213138580322, |
|
"learning_rate": 4.261585524908987e-05, |
|
"loss": 4.9095, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.25222222222222224, |
|
"grad_norm": 2.565812110900879, |
|
"learning_rate": 4.2553826324879064e-05, |
|
"loss": 4.5359, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.25333333333333335, |
|
"grad_norm": 3.349132776260376, |
|
"learning_rate": 4.249158351283414e-05, |
|
"loss": 4.3791, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.2544444444444444, |
|
"grad_norm": 2.278238534927368, |
|
"learning_rate": 4.242912757136412e-05, |
|
"loss": 4.5528, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.25555555555555554, |
|
"grad_norm": 2.851348400115967, |
|
"learning_rate": 4.2366459261474933e-05, |
|
"loss": 4.1264, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.25666666666666665, |
|
"grad_norm": 2.4230828285217285, |
|
"learning_rate": 4.230357934676017e-05, |
|
"loss": 4.8431, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.2577777777777778, |
|
"grad_norm": 3.563849687576294, |
|
"learning_rate": 4.224048859339175e-05, |
|
"loss": 4.2379, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.2588888888888889, |
|
"grad_norm": 3.419377088546753, |
|
"learning_rate": 4.2177187770110576e-05, |
|
"loss": 4.5906, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 3.992064952850342, |
|
"learning_rate": 4.211367764821722e-05, |
|
"loss": 4.1902, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.2611111111111111, |
|
"grad_norm": 2.2617876529693604, |
|
"learning_rate": 4.2049959001562464e-05, |
|
"loss": 4.6505, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.26222222222222225, |
|
"grad_norm": 2.8081510066986084, |
|
"learning_rate": 4.198603260653792e-05, |
|
"loss": 4.9376, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.2633333333333333, |
|
"grad_norm": 3.759847402572632, |
|
"learning_rate": 4.192189924206652e-05, |
|
"loss": 4.4958, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.2644444444444444, |
|
"grad_norm": 3.2556324005126953, |
|
"learning_rate": 4.185755968959308e-05, |
|
"loss": 4.3312, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.26555555555555554, |
|
"grad_norm": 2.438190221786499, |
|
"learning_rate": 4.179301473307476e-05, |
|
"loss": 4.8255, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 3.00315523147583, |
|
"learning_rate": 4.172826515897146e-05, |
|
"loss": 4.788, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2677777777777778, |
|
"grad_norm": 3.5244944095611572, |
|
"learning_rate": 4.166331175623631e-05, |
|
"loss": 4.5552, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.2688888888888889, |
|
"grad_norm": 2.436034917831421, |
|
"learning_rate": 4.1598155316306044e-05, |
|
"loss": 4.3463, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 3.473583698272705, |
|
"learning_rate": 4.1532796633091296e-05, |
|
"loss": 4.6629, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.27111111111111114, |
|
"grad_norm": 2.3865156173706055, |
|
"learning_rate": 4.146723650296701e-05, |
|
"loss": 4.5326, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.2722222222222222, |
|
"grad_norm": 3.0051960945129395, |
|
"learning_rate": 4.140147572476268e-05, |
|
"loss": 4.6408, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.2733333333333333, |
|
"grad_norm": 2.631802797317505, |
|
"learning_rate": 4.133551509975264e-05, |
|
"loss": 4.6096, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.27444444444444444, |
|
"grad_norm": 3.8499889373779297, |
|
"learning_rate": 4.1269355431646274e-05, |
|
"loss": 4.3807, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.27555555555555555, |
|
"grad_norm": 4.838550090789795, |
|
"learning_rate": 4.1202997526578276e-05, |
|
"loss": 4.1733, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.27666666666666667, |
|
"grad_norm": 3.210563898086548, |
|
"learning_rate": 4.113644219309877e-05, |
|
"loss": 4.633, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"grad_norm": 3.254894256591797, |
|
"learning_rate": 4.1069690242163484e-05, |
|
"loss": 4.3214, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2788888888888889, |
|
"grad_norm": 2.7834694385528564, |
|
"learning_rate": 4.100274248712389e-05, |
|
"loss": 4.3556, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.8591508865356445, |
|
"learning_rate": 4.093559974371725e-05, |
|
"loss": 4.4838, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.2811111111111111, |
|
"grad_norm": 3.7769737243652344, |
|
"learning_rate": 4.086826283005669e-05, |
|
"loss": 5.1812, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.2822222222222222, |
|
"grad_norm": 4.0656914710998535, |
|
"learning_rate": 4.080073256662127e-05, |
|
"loss": 4.4083, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.2833333333333333, |
|
"grad_norm": 3.192784547805786, |
|
"learning_rate": 4.073300977624594e-05, |
|
"loss": 4.7642, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.28444444444444444, |
|
"grad_norm": 3.2855887413024902, |
|
"learning_rate": 4.066509528411152e-05, |
|
"loss": 4.2253, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.28555555555555556, |
|
"grad_norm": 4.624244213104248, |
|
"learning_rate": 4.059698991773466e-05, |
|
"loss": 4.4538, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.2866666666666667, |
|
"grad_norm": 3.160623073577881, |
|
"learning_rate": 4.052869450695776e-05, |
|
"loss": 4.1262, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.2877777777777778, |
|
"grad_norm": 3.2087790966033936, |
|
"learning_rate": 4.046020988393885e-05, |
|
"loss": 4.3326, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.28888888888888886, |
|
"grad_norm": 3.1688692569732666, |
|
"learning_rate": 4.039153688314145e-05, |
|
"loss": 4.7679, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.6120312213897705, |
|
"learning_rate": 4.0322676341324415e-05, |
|
"loss": 4.9559, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.2911111111111111, |
|
"grad_norm": 3.8062994480133057, |
|
"learning_rate": 4.02536290975317e-05, |
|
"loss": 4.3511, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.2922222222222222, |
|
"grad_norm": 3.093778610229492, |
|
"learning_rate": 4.018439599308217e-05, |
|
"loss": 4.3003, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.29333333333333333, |
|
"grad_norm": 2.905430316925049, |
|
"learning_rate": 4.011497787155938e-05, |
|
"loss": 4.4313, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.29444444444444445, |
|
"grad_norm": 3.0712385177612305, |
|
"learning_rate": 4.0045375578801214e-05, |
|
"loss": 4.4432, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.29555555555555557, |
|
"grad_norm": 3.544624090194702, |
|
"learning_rate": 3.997558996288965e-05, |
|
"loss": 4.5969, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.2966666666666667, |
|
"grad_norm": 3.2956557273864746, |
|
"learning_rate": 3.99056218741404e-05, |
|
"loss": 4.5313, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.29777777777777775, |
|
"grad_norm": 2.8312222957611084, |
|
"learning_rate": 3.983547216509254e-05, |
|
"loss": 4.9315, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.29888888888888887, |
|
"grad_norm": 2.3425047397613525, |
|
"learning_rate": 3.976514169049814e-05, |
|
"loss": 4.3562, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.2110278606414795, |
|
"learning_rate": 3.969463130731183e-05, |
|
"loss": 4.3582, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.3011111111111111, |
|
"grad_norm": 2.364408493041992, |
|
"learning_rate": 3.962394187468039e-05, |
|
"loss": 4.8083, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.3022222222222222, |
|
"grad_norm": 3.1101303100585938, |
|
"learning_rate": 3.955307425393224e-05, |
|
"loss": 4.5352, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.30333333333333334, |
|
"grad_norm": 2.396379232406616, |
|
"learning_rate": 3.948202930856697e-05, |
|
"loss": 4.3274, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.30444444444444446, |
|
"grad_norm": 2.183039426803589, |
|
"learning_rate": 3.941080790424484e-05, |
|
"loss": 4.6847, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.3055555555555556, |
|
"grad_norm": 2.731586456298828, |
|
"learning_rate": 3.933941090877615e-05, |
|
"loss": 4.4002, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.30666666666666664, |
|
"grad_norm": 2.9183268547058105, |
|
"learning_rate": 3.92678391921108e-05, |
|
"loss": 4.7244, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.30777777777777776, |
|
"grad_norm": 2.450711965560913, |
|
"learning_rate": 3.919609362632753e-05, |
|
"loss": 4.5988, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.3088888888888889, |
|
"grad_norm": 3.4320664405822754, |
|
"learning_rate": 3.912417508562345e-05, |
|
"loss": 4.4192, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.2206807136535645, |
|
"learning_rate": 3.905208444630327e-05, |
|
"loss": 4.3554, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.3111111111111111, |
|
"grad_norm": 3.739584445953369, |
|
"learning_rate": 3.897982258676867e-05, |
|
"loss": 4.4398, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.31222222222222223, |
|
"grad_norm": 3.239889144897461, |
|
"learning_rate": 3.8907390387507625e-05, |
|
"loss": 4.3802, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.31333333333333335, |
|
"grad_norm": 2.5824975967407227, |
|
"learning_rate": 3.883478873108361e-05, |
|
"loss": 4.7278, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.31444444444444447, |
|
"grad_norm": 2.3867881298065186, |
|
"learning_rate": 3.8762018502124894e-05, |
|
"loss": 4.5456, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.31555555555555553, |
|
"grad_norm": 3.0963456630706787, |
|
"learning_rate": 3.868908058731376e-05, |
|
"loss": 3.8366, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.31666666666666665, |
|
"grad_norm": 2.538454532623291, |
|
"learning_rate": 3.861597587537568e-05, |
|
"loss": 4.2254, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.31777777777777777, |
|
"grad_norm": 3.2098913192749023, |
|
"learning_rate": 3.85427052570685e-05, |
|
"loss": 5.1547, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.3188888888888889, |
|
"grad_norm": 2.2710459232330322, |
|
"learning_rate": 3.8469269625171576e-05, |
|
"loss": 4.5172, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.4076178073883057, |
|
"learning_rate": 3.8395669874474915e-05, |
|
"loss": 4.2652, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.3211111111111111, |
|
"grad_norm": 2.152460813522339, |
|
"learning_rate": 3.832190690176825e-05, |
|
"loss": 4.4623, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.32222222222222224, |
|
"grad_norm": 2.5687735080718994, |
|
"learning_rate": 3.824798160583012e-05, |
|
"loss": 4.7519, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3233333333333333, |
|
"grad_norm": 2.60406494140625, |
|
"learning_rate": 3.8173894887416945e-05, |
|
"loss": 4.7309, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.3244444444444444, |
|
"grad_norm": 3.08526349067688, |
|
"learning_rate": 3.8099647649251986e-05, |
|
"loss": 4.5816, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.32555555555555554, |
|
"grad_norm": 3.4984188079833984, |
|
"learning_rate": 3.802524079601442e-05, |
|
"loss": 4.5818, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.32666666666666666, |
|
"grad_norm": 4.528430938720703, |
|
"learning_rate": 3.795067523432826e-05, |
|
"loss": 4.1268, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.3277777777777778, |
|
"grad_norm": 3.826263904571533, |
|
"learning_rate": 3.787595187275136e-05, |
|
"loss": 4.4605, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.3288888888888889, |
|
"grad_norm": 2.9818341732025146, |
|
"learning_rate": 3.780107162176429e-05, |
|
"loss": 4.789, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 3.642854928970337, |
|
"learning_rate": 3.7726035393759285e-05, |
|
"loss": 4.7423, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.33111111111111113, |
|
"grad_norm": 2.6310813426971436, |
|
"learning_rate": 3.765084410302909e-05, |
|
"loss": 4.1751, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.3322222222222222, |
|
"grad_norm": 4.018439769744873, |
|
"learning_rate": 3.757549866575588e-05, |
|
"loss": 4.5056, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 3.364558696746826, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 4.3372, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.33444444444444443, |
|
"grad_norm": 2.8973915576934814, |
|
"learning_rate": 3.742434902568889e-05, |
|
"loss": 4.3383, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.33555555555555555, |
|
"grad_norm": 3.0985381603240967, |
|
"learning_rate": 3.7348546664605777e-05, |
|
"loss": 4.3256, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.33666666666666667, |
|
"grad_norm": 2.420278310775757, |
|
"learning_rate": 3.727259384037852e-05, |
|
"loss": 4.1622, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.3377777777777778, |
|
"grad_norm": 2.669018030166626, |
|
"learning_rate": 3.719649147846832e-05, |
|
"loss": 4.6532, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.3388888888888889, |
|
"grad_norm": 2.5606846809387207, |
|
"learning_rate": 3.712024050615843e-05, |
|
"loss": 4.8205, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 2.7075791358947754, |
|
"learning_rate": 3.704384185254288e-05, |
|
"loss": 4.7873, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.3411111111111111, |
|
"grad_norm": 2.576284170150757, |
|
"learning_rate": 3.696729644851518e-05, |
|
"loss": 4.2416, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.3422222222222222, |
|
"grad_norm": 2.5355241298675537, |
|
"learning_rate": 3.689060522675689e-05, |
|
"loss": 4.6843, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.3433333333333333, |
|
"grad_norm": 3.172502040863037, |
|
"learning_rate": 3.681376912172636e-05, |
|
"loss": 4.8512, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.34444444444444444, |
|
"grad_norm": 3.2482614517211914, |
|
"learning_rate": 3.673678906964727e-05, |
|
"loss": 4.6384, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.34555555555555556, |
|
"grad_norm": 3.223466634750366, |
|
"learning_rate": 3.665966600849728e-05, |
|
"loss": 4.593, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.3466666666666667, |
|
"grad_norm": 3.437298536300659, |
|
"learning_rate": 3.6582400877996546e-05, |
|
"loss": 4.376, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.3477777777777778, |
|
"grad_norm": 3.4591798782348633, |
|
"learning_rate": 3.6504994619596294e-05, |
|
"loss": 4.5369, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.3488888888888889, |
|
"grad_norm": 2.1622931957244873, |
|
"learning_rate": 3.642744817646736e-05, |
|
"loss": 4.4165, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.5694704055786133, |
|
"learning_rate": 3.634976249348867e-05, |
|
"loss": 4.4281, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.3511111111111111, |
|
"grad_norm": 3.8053269386291504, |
|
"learning_rate": 3.627193851723577e-05, |
|
"loss": 4.4582, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.3522222222222222, |
|
"grad_norm": 2.7068371772766113, |
|
"learning_rate": 3.619397719596924e-05, |
|
"loss": 4.141, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.35333333333333333, |
|
"grad_norm": 3.1135122776031494, |
|
"learning_rate": 3.611587947962319e-05, |
|
"loss": 4.4257, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.35444444444444445, |
|
"grad_norm": 2.5378129482269287, |
|
"learning_rate": 3.603764631979363e-05, |
|
"loss": 4.495, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 3.563612937927246, |
|
"learning_rate": 3.5959278669726935e-05, |
|
"loss": 4.5916, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3566666666666667, |
|
"grad_norm": 3.765002727508545, |
|
"learning_rate": 3.588077748430819e-05, |
|
"loss": 4.7184, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.35777777777777775, |
|
"grad_norm": 2.8586552143096924, |
|
"learning_rate": 3.580214372004956e-05, |
|
"loss": 4.6151, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.35888888888888887, |
|
"grad_norm": 3.664820432662964, |
|
"learning_rate": 3.572337833507865e-05, |
|
"loss": 4.0914, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.808751344680786, |
|
"learning_rate": 3.564448228912682e-05, |
|
"loss": 4.4035, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.3611111111111111, |
|
"grad_norm": 3.5564141273498535, |
|
"learning_rate": 3.556545654351749e-05, |
|
"loss": 4.4184, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.3622222222222222, |
|
"grad_norm": 3.4853861331939697, |
|
"learning_rate": 3.548630206115443e-05, |
|
"loss": 4.5303, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.36333333333333334, |
|
"grad_norm": 3.2625653743743896, |
|
"learning_rate": 3.540701980651003e-05, |
|
"loss": 4.5295, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.36444444444444446, |
|
"grad_norm": 2.611847162246704, |
|
"learning_rate": 3.532761074561355e-05, |
|
"loss": 4.8323, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.3655555555555556, |
|
"grad_norm": 2.4942116737365723, |
|
"learning_rate": 3.524807584603932e-05, |
|
"loss": 4.245, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.36666666666666664, |
|
"grad_norm": 2.132793664932251, |
|
"learning_rate": 3.516841607689501e-05, |
|
"loss": 4.6669, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.36777777777777776, |
|
"grad_norm": 2.5251405239105225, |
|
"learning_rate": 3.5088632408809755e-05, |
|
"loss": 5.0771, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.3688888888888889, |
|
"grad_norm": 3.042750358581543, |
|
"learning_rate": 3.5008725813922386e-05, |
|
"loss": 4.562, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 3.445188283920288, |
|
"learning_rate": 3.4928697265869515e-05, |
|
"loss": 4.6474, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.3711111111111111, |
|
"grad_norm": 2.4179251194000244, |
|
"learning_rate": 3.484854773977378e-05, |
|
"loss": 4.4356, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.37222222222222223, |
|
"grad_norm": 3.5280330181121826, |
|
"learning_rate": 3.476827821223184e-05, |
|
"loss": 4.1546, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 3.564363718032837, |
|
"learning_rate": 3.4687889661302576e-05, |
|
"loss": 4.7374, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.37444444444444447, |
|
"grad_norm": 2.1853668689727783, |
|
"learning_rate": 3.460738306649509e-05, |
|
"loss": 4.4073, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.37555555555555553, |
|
"grad_norm": 4.435403823852539, |
|
"learning_rate": 3.452675940875686e-05, |
|
"loss": 4.4693, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.37666666666666665, |
|
"grad_norm": 2.2928574085235596, |
|
"learning_rate": 3.444601967046168e-05, |
|
"loss": 4.1636, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.37777777777777777, |
|
"grad_norm": 2.858842611312866, |
|
"learning_rate": 3.436516483539781e-05, |
|
"loss": 4.7424, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3788888888888889, |
|
"grad_norm": 2.4767720699310303, |
|
"learning_rate": 3.428419588875588e-05, |
|
"loss": 4.6306, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 4.104574680328369, |
|
"learning_rate": 3.4203113817116957e-05, |
|
"loss": 4.2592, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.3811111111111111, |
|
"grad_norm": 3.349961757659912, |
|
"learning_rate": 3.412191960844049e-05, |
|
"loss": 4.2351, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.38222222222222224, |
|
"grad_norm": 2.902287244796753, |
|
"learning_rate": 3.4040614252052305e-05, |
|
"loss": 4.1556, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.38333333333333336, |
|
"grad_norm": 2.7805283069610596, |
|
"learning_rate": 3.39591987386325e-05, |
|
"loss": 4.8241, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.3844444444444444, |
|
"grad_norm": 3.494743585586548, |
|
"learning_rate": 3.387767406020343e-05, |
|
"loss": 4.4414, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.38555555555555554, |
|
"grad_norm": 3.4807887077331543, |
|
"learning_rate": 3.3796041210117546e-05, |
|
"loss": 4.4356, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.38666666666666666, |
|
"grad_norm": 2.875729560852051, |
|
"learning_rate": 3.3714301183045385e-05, |
|
"loss": 4.6104, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.3877777777777778, |
|
"grad_norm": 2.3670778274536133, |
|
"learning_rate": 3.363245497496337e-05, |
|
"loss": 4.6181, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.3888888888888889, |
|
"grad_norm": 3.511547088623047, |
|
"learning_rate": 3.355050358314172e-05, |
|
"loss": 4.5967, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.4935336112976074, |
|
"learning_rate": 3.346844800613229e-05, |
|
"loss": 4.4003, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.39111111111111113, |
|
"grad_norm": 3.7835776805877686, |
|
"learning_rate": 3.338628924375638e-05, |
|
"loss": 4.3227, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.39222222222222225, |
|
"grad_norm": 3.659778594970703, |
|
"learning_rate": 3.330402829709258e-05, |
|
"loss": 4.735, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.3933333333333333, |
|
"grad_norm": 2.280862331390381, |
|
"learning_rate": 3.322166616846458e-05, |
|
"loss": 4.3854, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.39444444444444443, |
|
"grad_norm": 3.5552432537078857, |
|
"learning_rate": 3.313920386142892e-05, |
|
"loss": 4.494, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.39555555555555555, |
|
"grad_norm": 3.2624123096466064, |
|
"learning_rate": 3.305664238076278e-05, |
|
"loss": 4.441, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.39666666666666667, |
|
"grad_norm": 2.5981621742248535, |
|
"learning_rate": 3.2973982732451755e-05, |
|
"loss": 4.1962, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.3977777777777778, |
|
"grad_norm": 3.8529744148254395, |
|
"learning_rate": 3.289122592367757e-05, |
|
"loss": 4.1754, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.3988888888888889, |
|
"grad_norm": 3.049600839614868, |
|
"learning_rate": 3.2808372962805816e-05, |
|
"loss": 4.638, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.134927988052368, |
|
"learning_rate": 3.272542485937369e-05, |
|
"loss": 4.4763, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4011111111111111, |
|
"grad_norm": 2.298032522201538, |
|
"learning_rate": 3.264238262407764e-05, |
|
"loss": 4.8264, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.4022222222222222, |
|
"grad_norm": 3.3620493412017822, |
|
"learning_rate": 3.2559247268761115e-05, |
|
"loss": 4.3332, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.4033333333333333, |
|
"grad_norm": 3.7335774898529053, |
|
"learning_rate": 3.247601980640217e-05, |
|
"loss": 4.351, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.40444444444444444, |
|
"grad_norm": 2.7008893489837646, |
|
"learning_rate": 3.239270125110117e-05, |
|
"loss": 4.3373, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.40555555555555556, |
|
"grad_norm": 2.3377201557159424, |
|
"learning_rate": 3.230929261806842e-05, |
|
"loss": 4.6638, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.4066666666666667, |
|
"grad_norm": 2.796996831893921, |
|
"learning_rate": 3.222579492361179e-05, |
|
"loss": 4.4803, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.4077777777777778, |
|
"grad_norm": 3.004497766494751, |
|
"learning_rate": 3.214220918512434e-05, |
|
"loss": 4.8025, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.4088888888888889, |
|
"grad_norm": 2.762565851211548, |
|
"learning_rate": 3.205853642107192e-05, |
|
"loss": 4.3787, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.7421979904174805, |
|
"learning_rate": 3.1974777650980735e-05, |
|
"loss": 4.509, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.4111111111111111, |
|
"grad_norm": 2.6516170501708984, |
|
"learning_rate": 3.1890933895424976e-05, |
|
"loss": 4.8942, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4122222222222222, |
|
"grad_norm": 3.2065646648406982, |
|
"learning_rate": 3.180700617601436e-05, |
|
"loss": 4.4659, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.41333333333333333, |
|
"grad_norm": 3.3278090953826904, |
|
"learning_rate": 3.172299551538164e-05, |
|
"loss": 4.3103, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.41444444444444445, |
|
"grad_norm": 3.3703420162200928, |
|
"learning_rate": 3.163890293717022e-05, |
|
"loss": 4.4312, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.41555555555555557, |
|
"grad_norm": 3.629591464996338, |
|
"learning_rate": 3.155472946602162e-05, |
|
"loss": 4.8446, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 3.0027589797973633, |
|
"learning_rate": 3.147047612756302e-05, |
|
"loss": 4.5204, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.4177777777777778, |
|
"grad_norm": 3.2268624305725098, |
|
"learning_rate": 3.138614394839476e-05, |
|
"loss": 4.1844, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.41888888888888887, |
|
"grad_norm": 3.958721160888672, |
|
"learning_rate": 3.130173395607785e-05, |
|
"loss": 4.3071, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.1055030822753906, |
|
"learning_rate": 3.121724717912138e-05, |
|
"loss": 4.2903, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.4211111111111111, |
|
"grad_norm": 3.0535993576049805, |
|
"learning_rate": 3.1132684646970064e-05, |
|
"loss": 4.5096, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.4222222222222222, |
|
"grad_norm": 2.5504136085510254, |
|
"learning_rate": 3.104804738999169e-05, |
|
"loss": 4.4653, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.42333333333333334, |
|
"grad_norm": 3.1297409534454346, |
|
"learning_rate": 3.0963336439464526e-05, |
|
"loss": 4.3652, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.42444444444444446, |
|
"grad_norm": 2.2639200687408447, |
|
"learning_rate": 3.087855282756475e-05, |
|
"loss": 4.8119, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.4255555555555556, |
|
"grad_norm": 2.581587314605713, |
|
"learning_rate": 3.079369758735393e-05, |
|
"loss": 4.0573, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 2.61521577835083, |
|
"learning_rate": 3.0708771752766394e-05, |
|
"loss": 4.3563, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.42777777777777776, |
|
"grad_norm": 3.1898045539855957, |
|
"learning_rate": 3.062377635859663e-05, |
|
"loss": 5.091, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.4288888888888889, |
|
"grad_norm": 3.8822860717773438, |
|
"learning_rate": 3.053871244048669e-05, |
|
"loss": 4.1214, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.2095978260040283, |
|
"learning_rate": 3.045358103491357e-05, |
|
"loss": 4.6263, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.4311111111111111, |
|
"grad_norm": 2.3496782779693604, |
|
"learning_rate": 3.0368383179176585e-05, |
|
"loss": 4.2816, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.43222222222222223, |
|
"grad_norm": 3.5458877086639404, |
|
"learning_rate": 3.028311991138472e-05, |
|
"loss": 4.1913, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.43333333333333335, |
|
"grad_norm": 2.9822726249694824, |
|
"learning_rate": 3.0197792270443982e-05, |
|
"loss": 4.5714, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.43444444444444447, |
|
"grad_norm": 2.740432024002075, |
|
"learning_rate": 3.0112401296044757e-05, |
|
"loss": 4.565, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.43555555555555553, |
|
"grad_norm": 3.301563024520874, |
|
"learning_rate": 3.002694802864912e-05, |
|
"loss": 4.4515, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.43666666666666665, |
|
"grad_norm": 3.0773849487304688, |
|
"learning_rate": 2.9941433509478156e-05, |
|
"loss": 4.4196, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.43777777777777777, |
|
"grad_norm": 2.6573879718780518, |
|
"learning_rate": 2.98558587804993e-05, |
|
"loss": 4.2847, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.4388888888888889, |
|
"grad_norm": 3.498271942138672, |
|
"learning_rate": 2.9770224884413623e-05, |
|
"loss": 4.1811, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 3.6309518814086914, |
|
"learning_rate": 2.9684532864643122e-05, |
|
"loss": 4.223, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.4411111111111111, |
|
"grad_norm": 3.3358840942382812, |
|
"learning_rate": 2.9598783765318007e-05, |
|
"loss": 4.1402, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.44222222222222224, |
|
"grad_norm": 2.367056369781494, |
|
"learning_rate": 2.9512978631264006e-05, |
|
"loss": 4.3926, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.44333333333333336, |
|
"grad_norm": 3.7848873138427734, |
|
"learning_rate": 2.9427118507989586e-05, |
|
"loss": 4.1587, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 2.969773769378662, |
|
"learning_rate": 2.9341204441673266e-05, |
|
"loss": 4.1827, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.44555555555555554, |
|
"grad_norm": 3.8604671955108643, |
|
"learning_rate": 2.9255237479150816e-05, |
|
"loss": 4.7396, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.44666666666666666, |
|
"grad_norm": 2.6314451694488525, |
|
"learning_rate": 2.916921866790256e-05, |
|
"loss": 4.7571, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.4477777777777778, |
|
"grad_norm": 3.2930240631103516, |
|
"learning_rate": 2.908314905604056e-05, |
|
"loss": 4.6127, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.4488888888888889, |
|
"grad_norm": 2.809821367263794, |
|
"learning_rate": 2.8997029692295874e-05, |
|
"loss": 4.9782, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 3.108168363571167, |
|
"learning_rate": 2.8910861626005776e-05, |
|
"loss": 4.8604, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.45111111111111113, |
|
"grad_norm": 3.2190604209899902, |
|
"learning_rate": 2.8824645907100954e-05, |
|
"loss": 4.4294, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.45222222222222225, |
|
"grad_norm": 3.8671491146087646, |
|
"learning_rate": 2.8738383586092745e-05, |
|
"loss": 4.5494, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.4533333333333333, |
|
"grad_norm": 3.078843355178833, |
|
"learning_rate": 2.8652075714060295e-05, |
|
"loss": 4.4064, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.45444444444444443, |
|
"grad_norm": 2.9444501399993896, |
|
"learning_rate": 2.8565723342637796e-05, |
|
"loss": 4.6974, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.45555555555555555, |
|
"grad_norm": 3.3965888023376465, |
|
"learning_rate": 2.8479327524001636e-05, |
|
"loss": 4.6253, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.45666666666666667, |
|
"grad_norm": 2.8222734928131104, |
|
"learning_rate": 2.8392889310857612e-05, |
|
"loss": 4.4421, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.4577777777777778, |
|
"grad_norm": 3.996683359146118, |
|
"learning_rate": 2.8306409756428064e-05, |
|
"loss": 3.9591, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.4588888888888889, |
|
"grad_norm": 3.7605364322662354, |
|
"learning_rate": 2.8219889914439074e-05, |
|
"loss": 4.0585, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 3.199702739715576, |
|
"learning_rate": 2.8133330839107608e-05, |
|
"loss": 4.5694, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.46111111111111114, |
|
"grad_norm": 2.3009800910949707, |
|
"learning_rate": 2.8046733585128687e-05, |
|
"loss": 4.5112, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.4622222222222222, |
|
"grad_norm": 3.8310000896453857, |
|
"learning_rate": 2.7960099207662532e-05, |
|
"loss": 4.1967, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.4633333333333333, |
|
"grad_norm": 4.325408458709717, |
|
"learning_rate": 2.787342876232167e-05, |
|
"loss": 4.1846, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.46444444444444444, |
|
"grad_norm": 3.2290215492248535, |
|
"learning_rate": 2.7786723305158136e-05, |
|
"loss": 4.3148, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.46555555555555556, |
|
"grad_norm": 3.246396541595459, |
|
"learning_rate": 2.7699983892650573e-05, |
|
"loss": 4.2778, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.4666666666666667, |
|
"grad_norm": 3.091440200805664, |
|
"learning_rate": 2.761321158169134e-05, |
|
"loss": 4.6429, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.4677777777777778, |
|
"grad_norm": 2.5828170776367188, |
|
"learning_rate": 2.7526407429573657e-05, |
|
"loss": 4.6686, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.4688888888888889, |
|
"grad_norm": 2.3290863037109375, |
|
"learning_rate": 2.7439572493978736e-05, |
|
"loss": 4.1659, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 4.536559104919434, |
|
"learning_rate": 2.7352707832962865e-05, |
|
"loss": 3.9368, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.4711111111111111, |
|
"grad_norm": 2.5557074546813965, |
|
"learning_rate": 2.726581450494451e-05, |
|
"loss": 4.7282, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.4722222222222222, |
|
"grad_norm": 2.8363335132598877, |
|
"learning_rate": 2.717889356869146e-05, |
|
"loss": 4.4236, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.47333333333333333, |
|
"grad_norm": 4.480076313018799, |
|
"learning_rate": 2.7091946083307896e-05, |
|
"loss": 4.3532, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.47444444444444445, |
|
"grad_norm": 2.9685823917388916, |
|
"learning_rate": 2.7004973108221472e-05, |
|
"loss": 3.8564, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.47555555555555556, |
|
"grad_norm": 2.388016939163208, |
|
"learning_rate": 2.6917975703170466e-05, |
|
"loss": 4.6472, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.4766666666666667, |
|
"grad_norm": 3.0806846618652344, |
|
"learning_rate": 2.6830954928190794e-05, |
|
"loss": 4.4021, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.4777777777777778, |
|
"grad_norm": 2.4739530086517334, |
|
"learning_rate": 2.674391184360313e-05, |
|
"loss": 4.3841, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.47888888888888886, |
|
"grad_norm": 2.37337327003479, |
|
"learning_rate": 2.6656847510000012e-05, |
|
"loss": 4.5123, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.335357904434204, |
|
"learning_rate": 2.656976298823284e-05, |
|
"loss": 4.5225, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.4811111111111111, |
|
"grad_norm": 2.888369083404541, |
|
"learning_rate": 2.6482659339399045e-05, |
|
"loss": 4.0533, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.4822222222222222, |
|
"grad_norm": 3.412767171859741, |
|
"learning_rate": 2.6395537624829096e-05, |
|
"loss": 4.6112, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.48333333333333334, |
|
"grad_norm": 3.0738167762756348, |
|
"learning_rate": 2.63083989060736e-05, |
|
"loss": 4.0807, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.48444444444444446, |
|
"grad_norm": 2.038522720336914, |
|
"learning_rate": 2.6221244244890336e-05, |
|
"loss": 4.3389, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.4855555555555556, |
|
"grad_norm": 2.4719579219818115, |
|
"learning_rate": 2.6134074703231344e-05, |
|
"loss": 4.1841, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.4866666666666667, |
|
"grad_norm": 2.653308153152466, |
|
"learning_rate": 2.604689134322999e-05, |
|
"loss": 4.6983, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.48777777777777775, |
|
"grad_norm": 3.113577127456665, |
|
"learning_rate": 2.5959695227188004e-05, |
|
"loss": 4.46, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.4888888888888889, |
|
"grad_norm": 2.463456153869629, |
|
"learning_rate": 2.587248741756253e-05, |
|
"loss": 4.5327, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 3.733887195587158, |
|
"learning_rate": 2.578526897695321e-05, |
|
"loss": 4.2684, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.4911111111111111, |
|
"grad_norm": 3.1210319995880127, |
|
"learning_rate": 2.5698040968089225e-05, |
|
"loss": 4.2219, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.4922222222222222, |
|
"grad_norm": 2.173025369644165, |
|
"learning_rate": 2.5610804453816333e-05, |
|
"loss": 4.7141, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.49333333333333335, |
|
"grad_norm": 3.4339637756347656, |
|
"learning_rate": 2.5523560497083926e-05, |
|
"loss": 4.8913, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.49444444444444446, |
|
"grad_norm": 3.317446231842041, |
|
"learning_rate": 2.5436310160932092e-05, |
|
"loss": 4.6514, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.4955555555555556, |
|
"grad_norm": 3.317653179168701, |
|
"learning_rate": 2.5349054508478637e-05, |
|
"loss": 4.7476, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.49666666666666665, |
|
"grad_norm": 2.9672698974609375, |
|
"learning_rate": 2.5261794602906145e-05, |
|
"loss": 4.6061, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.49777777777777776, |
|
"grad_norm": 3.32000470161438, |
|
"learning_rate": 2.517453150744904e-05, |
|
"loss": 4.1218, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.4988888888888889, |
|
"grad_norm": 4.345942497253418, |
|
"learning_rate": 2.5087266285380596e-05, |
|
"loss": 4.4872, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.055283784866333, |
|
"learning_rate": 2.5e-05, |
|
"loss": 4.2396, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5011111111111111, |
|
"grad_norm": 3.0910513401031494, |
|
"learning_rate": 2.4912733714619417e-05, |
|
"loss": 4.9336, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.5022222222222222, |
|
"grad_norm": 2.6645710468292236, |
|
"learning_rate": 2.4825468492550964e-05, |
|
"loss": 4.7848, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.5033333333333333, |
|
"grad_norm": 3.63999342918396, |
|
"learning_rate": 2.4738205397093864e-05, |
|
"loss": 4.357, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.5044444444444445, |
|
"grad_norm": 2.507779598236084, |
|
"learning_rate": 2.4650945491521372e-05, |
|
"loss": 4.5025, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.5055555555555555, |
|
"grad_norm": 2.3816704750061035, |
|
"learning_rate": 2.4563689839067913e-05, |
|
"loss": 4.5438, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.5066666666666667, |
|
"grad_norm": 3.367776870727539, |
|
"learning_rate": 2.447643950291608e-05, |
|
"loss": 3.8387, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.5077777777777778, |
|
"grad_norm": 3.0507357120513916, |
|
"learning_rate": 2.4389195546183673e-05, |
|
"loss": 4.3121, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.5088888888888888, |
|
"grad_norm": 4.110062122344971, |
|
"learning_rate": 2.4301959031910784e-05, |
|
"loss": 4.2072, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 3.110203504562378, |
|
"learning_rate": 2.4214731023046793e-05, |
|
"loss": 4.3653, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.5111111111111111, |
|
"grad_norm": 3.3167712688446045, |
|
"learning_rate": 2.4127512582437485e-05, |
|
"loss": 4.2322, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5122222222222222, |
|
"grad_norm": 2.507969856262207, |
|
"learning_rate": 2.4040304772812002e-05, |
|
"loss": 4.382, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.5133333333333333, |
|
"grad_norm": 3.365709066390991, |
|
"learning_rate": 2.3953108656770016e-05, |
|
"loss": 4.0521, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.5144444444444445, |
|
"grad_norm": 2.8844547271728516, |
|
"learning_rate": 2.386592529676866e-05, |
|
"loss": 4.5033, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.5155555555555555, |
|
"grad_norm": 3.0520310401916504, |
|
"learning_rate": 2.377875575510967e-05, |
|
"loss": 4.8204, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.5166666666666667, |
|
"grad_norm": 4.429820537567139, |
|
"learning_rate": 2.3691601093926404e-05, |
|
"loss": 4.3879, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.5177777777777778, |
|
"grad_norm": 2.3193674087524414, |
|
"learning_rate": 2.3604462375170906e-05, |
|
"loss": 4.0077, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.5188888888888888, |
|
"grad_norm": 3.9970319271087646, |
|
"learning_rate": 2.3517340660600964e-05, |
|
"loss": 4.2043, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 2.8194077014923096, |
|
"learning_rate": 2.3430237011767167e-05, |
|
"loss": 4.5441, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.5211111111111111, |
|
"grad_norm": 2.7785353660583496, |
|
"learning_rate": 2.3343152490000004e-05, |
|
"loss": 5.0854, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.5222222222222223, |
|
"grad_norm": 3.0042474269866943, |
|
"learning_rate": 2.3256088156396868e-05, |
|
"loss": 4.7248, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5233333333333333, |
|
"grad_norm": 3.4051711559295654, |
|
"learning_rate": 2.3169045071809215e-05, |
|
"loss": 4.1062, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.5244444444444445, |
|
"grad_norm": 3.4314067363739014, |
|
"learning_rate": 2.3082024296829536e-05, |
|
"loss": 4.1021, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.5255555555555556, |
|
"grad_norm": 3.356543779373169, |
|
"learning_rate": 2.299502689177853e-05, |
|
"loss": 4.1495, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.5266666666666666, |
|
"grad_norm": 4.4954633712768555, |
|
"learning_rate": 2.2908053916692117e-05, |
|
"loss": 4.1691, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.5277777777777778, |
|
"grad_norm": 3.4520392417907715, |
|
"learning_rate": 2.2821106431308544e-05, |
|
"loss": 4.5688, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.5288888888888889, |
|
"grad_norm": 2.547987699508667, |
|
"learning_rate": 2.2734185495055503e-05, |
|
"loss": 4.8845, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 2.991994619369507, |
|
"learning_rate": 2.2647292167037144e-05, |
|
"loss": 4.5536, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.5311111111111111, |
|
"grad_norm": 3.409557819366455, |
|
"learning_rate": 2.2560427506021266e-05, |
|
"loss": 4.674, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.5322222222222223, |
|
"grad_norm": 2.5158376693725586, |
|
"learning_rate": 2.247359257042634e-05, |
|
"loss": 3.8624, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 5.460207939147949, |
|
"learning_rate": 2.238678841830867e-05, |
|
"loss": 3.9847, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5344444444444445, |
|
"grad_norm": 3.18215274810791, |
|
"learning_rate": 2.230001610734943e-05, |
|
"loss": 4.4479, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.5355555555555556, |
|
"grad_norm": 3.3176145553588867, |
|
"learning_rate": 2.2213276694841866e-05, |
|
"loss": 4.5647, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.5366666666666666, |
|
"grad_norm": 2.4659605026245117, |
|
"learning_rate": 2.212657123767834e-05, |
|
"loss": 4.6482, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.5377777777777778, |
|
"grad_norm": 3.418905019760132, |
|
"learning_rate": 2.2039900792337474e-05, |
|
"loss": 4.0517, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.5388888888888889, |
|
"grad_norm": 2.7777280807495117, |
|
"learning_rate": 2.195326641487132e-05, |
|
"loss": 3.9006, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.1448440551757812, |
|
"learning_rate": 2.186666916089239e-05, |
|
"loss": 4.5849, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.5411111111111111, |
|
"grad_norm": 2.304466485977173, |
|
"learning_rate": 2.1780110085560935e-05, |
|
"loss": 4.6166, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.5422222222222223, |
|
"grad_norm": 4.101543426513672, |
|
"learning_rate": 2.1693590243571938e-05, |
|
"loss": 4.1855, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.5433333333333333, |
|
"grad_norm": 4.019572734832764, |
|
"learning_rate": 2.1607110689142393e-05, |
|
"loss": 4.511, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.5444444444444444, |
|
"grad_norm": 3.2479324340820312, |
|
"learning_rate": 2.1520672475998373e-05, |
|
"loss": 4.694, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5455555555555556, |
|
"grad_norm": 2.5916500091552734, |
|
"learning_rate": 2.1434276657362213e-05, |
|
"loss": 4.4817, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.5466666666666666, |
|
"grad_norm": 2.4654204845428467, |
|
"learning_rate": 2.1347924285939714e-05, |
|
"loss": 4.241, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.5477777777777778, |
|
"grad_norm": 3.4962589740753174, |
|
"learning_rate": 2.1261616413907265e-05, |
|
"loss": 4.2751, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.5488888888888889, |
|
"grad_norm": 2.41613507270813, |
|
"learning_rate": 2.117535409289905e-05, |
|
"loss": 4.6923, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.8937885761260986, |
|
"learning_rate": 2.1089138373994223e-05, |
|
"loss": 4.5164, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.5511111111111111, |
|
"grad_norm": 3.6244256496429443, |
|
"learning_rate": 2.1002970307704132e-05, |
|
"loss": 4.0702, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.5522222222222222, |
|
"grad_norm": 2.670847177505493, |
|
"learning_rate": 2.0916850943959452e-05, |
|
"loss": 4.1953, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.5533333333333333, |
|
"grad_norm": 3.030318021774292, |
|
"learning_rate": 2.0830781332097446e-05, |
|
"loss": 4.2784, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.5544444444444444, |
|
"grad_norm": 3.4792563915252686, |
|
"learning_rate": 2.0744762520849193e-05, |
|
"loss": 4.7421, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 4.749427318572998, |
|
"learning_rate": 2.0658795558326743e-05, |
|
"loss": 4.0809, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5566666666666666, |
|
"grad_norm": 2.382559061050415, |
|
"learning_rate": 2.057288149201042e-05, |
|
"loss": 4.2373, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.5577777777777778, |
|
"grad_norm": 4.044615745544434, |
|
"learning_rate": 2.0487021368736003e-05, |
|
"loss": 4.3278, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.5588888888888889, |
|
"grad_norm": 2.4403457641601562, |
|
"learning_rate": 2.0401216234681995e-05, |
|
"loss": 4.6996, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 2.6123414039611816, |
|
"learning_rate": 2.031546713535688e-05, |
|
"loss": 3.7175, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.5611111111111111, |
|
"grad_norm": 2.930072784423828, |
|
"learning_rate": 2.022977511558638e-05, |
|
"loss": 3.8051, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.5622222222222222, |
|
"grad_norm": 2.092438220977783, |
|
"learning_rate": 2.0144141219500705e-05, |
|
"loss": 4.6131, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.5633333333333334, |
|
"grad_norm": 3.791438579559326, |
|
"learning_rate": 2.0058566490521847e-05, |
|
"loss": 4.5328, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.5644444444444444, |
|
"grad_norm": 2.1941120624542236, |
|
"learning_rate": 1.9973051971350888e-05, |
|
"loss": 4.0611, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.5655555555555556, |
|
"grad_norm": 2.723223924636841, |
|
"learning_rate": 1.9887598703955242e-05, |
|
"loss": 4.2184, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.5666666666666667, |
|
"grad_norm": 2.9518237113952637, |
|
"learning_rate": 1.980220772955602e-05, |
|
"loss": 4.3172, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5677777777777778, |
|
"grad_norm": 3.06872296333313, |
|
"learning_rate": 1.9716880088615285e-05, |
|
"loss": 4.2687, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.5688888888888889, |
|
"grad_norm": 2.8538174629211426, |
|
"learning_rate": 1.963161682082342e-05, |
|
"loss": 4.265, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.3108673095703125, |
|
"learning_rate": 1.9546418965086442e-05, |
|
"loss": 4.5094, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.5711111111111111, |
|
"grad_norm": 3.3742525577545166, |
|
"learning_rate": 1.946128755951332e-05, |
|
"loss": 4.4563, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.5722222222222222, |
|
"grad_norm": 2.6200695037841797, |
|
"learning_rate": 1.937622364140338e-05, |
|
"loss": 4.3567, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.5733333333333334, |
|
"grad_norm": 2.5701615810394287, |
|
"learning_rate": 1.9291228247233605e-05, |
|
"loss": 4.2645, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.5744444444444444, |
|
"grad_norm": 4.248501777648926, |
|
"learning_rate": 1.920630241264607e-05, |
|
"loss": 3.8413, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.5755555555555556, |
|
"grad_norm": 3.4751811027526855, |
|
"learning_rate": 1.912144717243525e-05, |
|
"loss": 4.258, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.5766666666666667, |
|
"grad_norm": 2.8151302337646484, |
|
"learning_rate": 1.9036663560535483e-05, |
|
"loss": 4.4939, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.5777777777777777, |
|
"grad_norm": 3.2205138206481934, |
|
"learning_rate": 1.895195261000831e-05, |
|
"loss": 4.7516, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5788888888888889, |
|
"grad_norm": 2.6713924407958984, |
|
"learning_rate": 1.8867315353029935e-05, |
|
"loss": 4.3591, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.1901915073394775, |
|
"learning_rate": 1.8782752820878634e-05, |
|
"loss": 4.2607, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.5811111111111111, |
|
"grad_norm": 3.473564863204956, |
|
"learning_rate": 1.869826604392216e-05, |
|
"loss": 4.125, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.5822222222222222, |
|
"grad_norm": 2.8697259426116943, |
|
"learning_rate": 1.8613856051605243e-05, |
|
"loss": 4.2696, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.5833333333333334, |
|
"grad_norm": 2.74684739112854, |
|
"learning_rate": 1.852952387243698e-05, |
|
"loss": 4.2943, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.5844444444444444, |
|
"grad_norm": 2.807659387588501, |
|
"learning_rate": 1.8445270533978388e-05, |
|
"loss": 4.6892, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.5855555555555556, |
|
"grad_norm": 2.5258119106292725, |
|
"learning_rate": 1.8361097062829778e-05, |
|
"loss": 4.4269, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 4.046256065368652, |
|
"learning_rate": 1.827700448461836e-05, |
|
"loss": 4.4027, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.5877777777777777, |
|
"grad_norm": 2.256350517272949, |
|
"learning_rate": 1.8192993823985643e-05, |
|
"loss": 4.1628, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.5888888888888889, |
|
"grad_norm": 2.6388349533081055, |
|
"learning_rate": 1.8109066104575023e-05, |
|
"loss": 5.2738, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 3.1763365268707275, |
|
"learning_rate": 1.802522234901927e-05, |
|
"loss": 4.4906, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.5911111111111111, |
|
"grad_norm": 2.969287157058716, |
|
"learning_rate": 1.7941463578928086e-05, |
|
"loss": 5.1068, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.5922222222222222, |
|
"grad_norm": 4.471690654754639, |
|
"learning_rate": 1.7857790814875663e-05, |
|
"loss": 4.1047, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.5933333333333334, |
|
"grad_norm": 3.2363221645355225, |
|
"learning_rate": 1.7774205076388206e-05, |
|
"loss": 4.587, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.5944444444444444, |
|
"grad_norm": 2.6446151733398438, |
|
"learning_rate": 1.7690707381931583e-05, |
|
"loss": 4.4606, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.5955555555555555, |
|
"grad_norm": 3.1010208129882812, |
|
"learning_rate": 1.7607298748898842e-05, |
|
"loss": 4.3764, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.5966666666666667, |
|
"grad_norm": 2.4426517486572266, |
|
"learning_rate": 1.7523980193597836e-05, |
|
"loss": 4.4518, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.5977777777777777, |
|
"grad_norm": 1.913076400756836, |
|
"learning_rate": 1.744075273123889e-05, |
|
"loss": 4.2522, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.5988888888888889, |
|
"grad_norm": 2.492178440093994, |
|
"learning_rate": 1.735761737592236e-05, |
|
"loss": 4.5726, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.457730531692505, |
|
"learning_rate": 1.7274575140626318e-05, |
|
"loss": 4.662, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6011111111111112, |
|
"grad_norm": 2.3263602256774902, |
|
"learning_rate": 1.7191627037194186e-05, |
|
"loss": 4.4692, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.6022222222222222, |
|
"grad_norm": 3.4461264610290527, |
|
"learning_rate": 1.7108774076322443e-05, |
|
"loss": 4.287, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.6033333333333334, |
|
"grad_norm": 3.4049248695373535, |
|
"learning_rate": 1.702601726754825e-05, |
|
"loss": 3.8536, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.6044444444444445, |
|
"grad_norm": 3.2425801753997803, |
|
"learning_rate": 1.6943357619237226e-05, |
|
"loss": 3.8095, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.6055555555555555, |
|
"grad_norm": 3.209322452545166, |
|
"learning_rate": 1.686079613857109e-05, |
|
"loss": 4.0113, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.6066666666666667, |
|
"grad_norm": 2.507138729095459, |
|
"learning_rate": 1.677833383153542e-05, |
|
"loss": 4.3496, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.6077777777777778, |
|
"grad_norm": 3.377285957336426, |
|
"learning_rate": 1.6695971702907426e-05, |
|
"loss": 4.2639, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.6088888888888889, |
|
"grad_norm": 3.625976324081421, |
|
"learning_rate": 1.6613710756243626e-05, |
|
"loss": 4.1149, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 2.757136821746826, |
|
"learning_rate": 1.6531551993867717e-05, |
|
"loss": 4.0329, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.6111111111111112, |
|
"grad_norm": 3.1707332134246826, |
|
"learning_rate": 1.6449496416858284e-05, |
|
"loss": 4.3594, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6122222222222222, |
|
"grad_norm": 3.1145691871643066, |
|
"learning_rate": 1.6367545025036636e-05, |
|
"loss": 4.6404, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.6133333333333333, |
|
"grad_norm": 3.2511072158813477, |
|
"learning_rate": 1.6285698816954624e-05, |
|
"loss": 4.0102, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.6144444444444445, |
|
"grad_norm": 3.0803847312927246, |
|
"learning_rate": 1.6203958789882456e-05, |
|
"loss": 4.2037, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.6155555555555555, |
|
"grad_norm": 2.6308162212371826, |
|
"learning_rate": 1.612232593979658e-05, |
|
"loss": 4.1265, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.6166666666666667, |
|
"grad_norm": 2.719158172607422, |
|
"learning_rate": 1.6040801261367493e-05, |
|
"loss": 4.4969, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.6177777777777778, |
|
"grad_norm": 4.231455326080322, |
|
"learning_rate": 1.5959385747947698e-05, |
|
"loss": 3.8209, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.6188888888888889, |
|
"grad_norm": 3.452610731124878, |
|
"learning_rate": 1.5878080391559508e-05, |
|
"loss": 4.8977, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.3357810974121094, |
|
"learning_rate": 1.5796886182883053e-05, |
|
"loss": 4.2065, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.6211111111111111, |
|
"grad_norm": 3.3810150623321533, |
|
"learning_rate": 1.5715804111244137e-05, |
|
"loss": 4.1377, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.6222222222222222, |
|
"grad_norm": 2.561292886734009, |
|
"learning_rate": 1.56348351646022e-05, |
|
"loss": 4.304, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6233333333333333, |
|
"grad_norm": 4.378098011016846, |
|
"learning_rate": 1.5553980329538326e-05, |
|
"loss": 4.2043, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.6244444444444445, |
|
"grad_norm": 2.817155599594116, |
|
"learning_rate": 1.547324059124315e-05, |
|
"loss": 4.8234, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.6255555555555555, |
|
"grad_norm": 2.7013378143310547, |
|
"learning_rate": 1.539261693350491e-05, |
|
"loss": 4.5842, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.6266666666666667, |
|
"grad_norm": 3.0469796657562256, |
|
"learning_rate": 1.5312110338697426e-05, |
|
"loss": 4.475, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.6277777777777778, |
|
"grad_norm": 2.944330930709839, |
|
"learning_rate": 1.523172178776816e-05, |
|
"loss": 4.7153, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.6288888888888889, |
|
"grad_norm": 3.1219630241394043, |
|
"learning_rate": 1.5151452260226224e-05, |
|
"loss": 4.2081, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.091395139694214, |
|
"learning_rate": 1.5071302734130489e-05, |
|
"loss": 3.8313, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.6311111111111111, |
|
"grad_norm": 3.610748767852783, |
|
"learning_rate": 1.4991274186077632e-05, |
|
"loss": 4.3207, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.6322222222222222, |
|
"grad_norm": 2.4512412548065186, |
|
"learning_rate": 1.4911367591190248e-05, |
|
"loss": 4.6405, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.6333333333333333, |
|
"grad_norm": 2.7447104454040527, |
|
"learning_rate": 1.4831583923104999e-05, |
|
"loss": 4.3182, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6344444444444445, |
|
"grad_norm": 2.834606409072876, |
|
"learning_rate": 1.475192415396068e-05, |
|
"loss": 4.0676, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.6355555555555555, |
|
"grad_norm": 2.1336636543273926, |
|
"learning_rate": 1.467238925438646e-05, |
|
"loss": 4.4181, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.6366666666666667, |
|
"grad_norm": 2.7370517253875732, |
|
"learning_rate": 1.4592980193489975e-05, |
|
"loss": 4.2872, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.6377777777777778, |
|
"grad_norm": 2.991546392440796, |
|
"learning_rate": 1.4513697938845572e-05, |
|
"loss": 3.8864, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.6388888888888888, |
|
"grad_norm": 2.664534330368042, |
|
"learning_rate": 1.443454345648252e-05, |
|
"loss": 4.5979, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.904681921005249, |
|
"learning_rate": 1.4355517710873184e-05, |
|
"loss": 4.2368, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.6411111111111111, |
|
"grad_norm": 3.317148447036743, |
|
"learning_rate": 1.4276621664921357e-05, |
|
"loss": 3.8986, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.6422222222222222, |
|
"grad_norm": 3.1722943782806396, |
|
"learning_rate": 1.4197856279950438e-05, |
|
"loss": 4.3489, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.6433333333333333, |
|
"grad_norm": 2.8325436115264893, |
|
"learning_rate": 1.4119222515691816e-05, |
|
"loss": 4.7594, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.6444444444444445, |
|
"grad_norm": 3.2218034267425537, |
|
"learning_rate": 1.4040721330273062e-05, |
|
"loss": 4.2976, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6455555555555555, |
|
"grad_norm": 3.36842679977417, |
|
"learning_rate": 1.3962353680206373e-05, |
|
"loss": 4.2204, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.6466666666666666, |
|
"grad_norm": 3.8246467113494873, |
|
"learning_rate": 1.388412052037682e-05, |
|
"loss": 4.1247, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.6477777777777778, |
|
"grad_norm": 3.131218910217285, |
|
"learning_rate": 1.380602280403076e-05, |
|
"loss": 4.3663, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.6488888888888888, |
|
"grad_norm": 3.3939664363861084, |
|
"learning_rate": 1.3728061482764238e-05, |
|
"loss": 4.3556, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.252523183822632, |
|
"learning_rate": 1.3650237506511331e-05, |
|
"loss": 4.0815, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.6511111111111111, |
|
"grad_norm": 4.023004055023193, |
|
"learning_rate": 1.3572551823532654e-05, |
|
"loss": 4.6529, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.6522222222222223, |
|
"grad_norm": 2.3044891357421875, |
|
"learning_rate": 1.349500538040371e-05, |
|
"loss": 4.7574, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.6533333333333333, |
|
"grad_norm": 2.99569034576416, |
|
"learning_rate": 1.3417599122003464e-05, |
|
"loss": 4.1623, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.6544444444444445, |
|
"grad_norm": 3.384570360183716, |
|
"learning_rate": 1.3340333991502724e-05, |
|
"loss": 4.3638, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.6555555555555556, |
|
"grad_norm": 3.1633384227752686, |
|
"learning_rate": 1.3263210930352737e-05, |
|
"loss": 4.2265, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6566666666666666, |
|
"grad_norm": 2.922513484954834, |
|
"learning_rate": 1.3186230878273653e-05, |
|
"loss": 4.2723, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.6577777777777778, |
|
"grad_norm": 2.408703327178955, |
|
"learning_rate": 1.3109394773243117e-05, |
|
"loss": 4.3487, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.6588888888888889, |
|
"grad_norm": 2.839890718460083, |
|
"learning_rate": 1.3032703551484832e-05, |
|
"loss": 4.3941, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.723208427429199, |
|
"learning_rate": 1.2956158147457115e-05, |
|
"loss": 4.5581, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.6611111111111111, |
|
"grad_norm": 3.162594795227051, |
|
"learning_rate": 1.2879759493841575e-05, |
|
"loss": 4.6902, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.6622222222222223, |
|
"grad_norm": 2.862002372741699, |
|
"learning_rate": 1.280350852153168e-05, |
|
"loss": 4.6792, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.6633333333333333, |
|
"grad_norm": 3.029798746109009, |
|
"learning_rate": 1.272740615962148e-05, |
|
"loss": 4.2943, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.6644444444444444, |
|
"grad_norm": 2.505032539367676, |
|
"learning_rate": 1.2651453335394231e-05, |
|
"loss": 4.373, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.6655555555555556, |
|
"grad_norm": 3.039720058441162, |
|
"learning_rate": 1.2575650974311119e-05, |
|
"loss": 4.5615, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 1.9847793579101562, |
|
"learning_rate": 1.2500000000000006e-05, |
|
"loss": 4.7977, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6677777777777778, |
|
"grad_norm": 2.6300618648529053, |
|
"learning_rate": 1.2424501334244123e-05, |
|
"loss": 4.3867, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.6688888888888889, |
|
"grad_norm": 3.1111793518066406, |
|
"learning_rate": 1.234915589697091e-05, |
|
"loss": 4.3681, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.274426221847534, |
|
"learning_rate": 1.2273964606240718e-05, |
|
"loss": 4.3242, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.6711111111111111, |
|
"grad_norm": 2.1483371257781982, |
|
"learning_rate": 1.2198928378235716e-05, |
|
"loss": 4.468, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.6722222222222223, |
|
"grad_norm": 3.4644694328308105, |
|
"learning_rate": 1.2124048127248644e-05, |
|
"loss": 4.5052, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.6733333333333333, |
|
"grad_norm": 2.960430860519409, |
|
"learning_rate": 1.2049324765671749e-05, |
|
"loss": 4.3371, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.6744444444444444, |
|
"grad_norm": 3.3487277030944824, |
|
"learning_rate": 1.19747592039856e-05, |
|
"loss": 4.8553, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.6755555555555556, |
|
"grad_norm": 3.0505776405334473, |
|
"learning_rate": 1.1900352350748026e-05, |
|
"loss": 4.2182, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.6766666666666666, |
|
"grad_norm": 3.3368704319000244, |
|
"learning_rate": 1.1826105112583061e-05, |
|
"loss": 3.9389, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.6777777777777778, |
|
"grad_norm": 3.4237194061279297, |
|
"learning_rate": 1.175201839416988e-05, |
|
"loss": 4.4959, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6788888888888889, |
|
"grad_norm": 2.389981269836426, |
|
"learning_rate": 1.167809309823175e-05, |
|
"loss": 4.1658, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 4.309886455535889, |
|
"learning_rate": 1.1604330125525079e-05, |
|
"loss": 3.8112, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.6811111111111111, |
|
"grad_norm": 2.3204903602600098, |
|
"learning_rate": 1.1530730374828422e-05, |
|
"loss": 4.2661, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.6822222222222222, |
|
"grad_norm": 3.4007372856140137, |
|
"learning_rate": 1.1457294742931507e-05, |
|
"loss": 4.3726, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.6833333333333333, |
|
"grad_norm": 3.3282923698425293, |
|
"learning_rate": 1.1384024124624324e-05, |
|
"loss": 4.3936, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.6844444444444444, |
|
"grad_norm": 3.03949236869812, |
|
"learning_rate": 1.1310919412686247e-05, |
|
"loss": 3.7927, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.6855555555555556, |
|
"grad_norm": 2.703687906265259, |
|
"learning_rate": 1.123798149787511e-05, |
|
"loss": 4.5, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.6866666666666666, |
|
"grad_norm": 3.9763340950012207, |
|
"learning_rate": 1.11652112689164e-05, |
|
"loss": 3.7004, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.6877777777777778, |
|
"grad_norm": 4.126737594604492, |
|
"learning_rate": 1.109260961249238e-05, |
|
"loss": 4.308, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.6888888888888889, |
|
"grad_norm": 2.2291946411132812, |
|
"learning_rate": 1.1020177413231334e-05, |
|
"loss": 4.4126, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.0533227920532227, |
|
"learning_rate": 1.0947915553696742e-05, |
|
"loss": 3.8855, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.6911111111111111, |
|
"grad_norm": 3.887996196746826, |
|
"learning_rate": 1.0875824914376553e-05, |
|
"loss": 4.3823, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.6922222222222222, |
|
"grad_norm": 2.5938265323638916, |
|
"learning_rate": 1.0803906373672476e-05, |
|
"loss": 4.2245, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 3.3216285705566406, |
|
"learning_rate": 1.0732160807889211e-05, |
|
"loss": 4.1916, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.6944444444444444, |
|
"grad_norm": 3.3373751640319824, |
|
"learning_rate": 1.0660589091223855e-05, |
|
"loss": 4.3503, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.6955555555555556, |
|
"grad_norm": 2.2890090942382812, |
|
"learning_rate": 1.058919209575517e-05, |
|
"loss": 4.0717, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.6966666666666667, |
|
"grad_norm": 4.276199817657471, |
|
"learning_rate": 1.0517970691433035e-05, |
|
"loss": 4.7337, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.6977777777777778, |
|
"grad_norm": 2.4809815883636475, |
|
"learning_rate": 1.0446925746067768e-05, |
|
"loss": 4.5435, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.6988888888888889, |
|
"grad_norm": 4.138044357299805, |
|
"learning_rate": 1.0376058125319613e-05, |
|
"loss": 4.2157, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.8675029277801514, |
|
"learning_rate": 1.0305368692688174e-05, |
|
"loss": 4.7462, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7011111111111111, |
|
"grad_norm": 2.406512498855591, |
|
"learning_rate": 1.0234858309501862e-05, |
|
"loss": 3.879, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.7022222222222222, |
|
"grad_norm": 2.695511817932129, |
|
"learning_rate": 1.0164527834907467e-05, |
|
"loss": 4.4111, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.7033333333333334, |
|
"grad_norm": 2.458010673522949, |
|
"learning_rate": 1.0094378125859602e-05, |
|
"loss": 4.5713, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.7044444444444444, |
|
"grad_norm": 3.0153543949127197, |
|
"learning_rate": 1.0024410037110357e-05, |
|
"loss": 4.3096, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.7055555555555556, |
|
"grad_norm": 2.6395087242126465, |
|
"learning_rate": 9.954624421198792e-06, |
|
"loss": 4.4095, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.7066666666666667, |
|
"grad_norm": 2.174259662628174, |
|
"learning_rate": 9.88502212844063e-06, |
|
"loss": 4.396, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.7077777777777777, |
|
"grad_norm": 2.7941372394561768, |
|
"learning_rate": 9.815604006917839e-06, |
|
"loss": 4.1945, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.7088888888888889, |
|
"grad_norm": 3.5960729122161865, |
|
"learning_rate": 9.746370902468311e-06, |
|
"loss": 4.4889, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.816847562789917, |
|
"learning_rate": 9.677323658675594e-06, |
|
"loss": 4.6575, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 2.2747933864593506, |
|
"learning_rate": 9.608463116858542e-06, |
|
"loss": 4.5163, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.7122222222222222, |
|
"grad_norm": 3.0500824451446533, |
|
"learning_rate": 9.539790116061151e-06, |
|
"loss": 4.5815, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.7133333333333334, |
|
"grad_norm": 3.7249062061309814, |
|
"learning_rate": 9.471305493042243e-06, |
|
"loss": 4.206, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.7144444444444444, |
|
"grad_norm": 3.3056654930114746, |
|
"learning_rate": 9.403010082265351e-06, |
|
"loss": 4.3168, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.7155555555555555, |
|
"grad_norm": 3.2421038150787354, |
|
"learning_rate": 9.334904715888495e-06, |
|
"loss": 4.815, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.7166666666666667, |
|
"grad_norm": 3.044701337814331, |
|
"learning_rate": 9.266990223754069e-06, |
|
"loss": 4.4632, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.7177777777777777, |
|
"grad_norm": 3.229823589324951, |
|
"learning_rate": 9.199267433378727e-06, |
|
"loss": 4.5471, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.7188888888888889, |
|
"grad_norm": 2.7537453174591064, |
|
"learning_rate": 9.131737169943314e-06, |
|
"loss": 4.0644, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.645606517791748, |
|
"learning_rate": 9.064400256282757e-06, |
|
"loss": 4.3359, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.7211111111111111, |
|
"grad_norm": 2.989220380783081, |
|
"learning_rate": 8.997257512876108e-06, |
|
"loss": 4.7121, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.7222222222222222, |
|
"grad_norm": 2.0871849060058594, |
|
"learning_rate": 8.930309757836517e-06, |
|
"loss": 4.7126, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7233333333333334, |
|
"grad_norm": 2.9518845081329346, |
|
"learning_rate": 8.863557806901233e-06, |
|
"loss": 3.8763, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.7244444444444444, |
|
"grad_norm": 3.165712356567383, |
|
"learning_rate": 8.797002473421728e-06, |
|
"loss": 4.0458, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.7255555555555555, |
|
"grad_norm": 2.202949285507202, |
|
"learning_rate": 8.73064456835373e-06, |
|
"loss": 4.5542, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.7266666666666667, |
|
"grad_norm": 3.328310489654541, |
|
"learning_rate": 8.664484900247363e-06, |
|
"loss": 4.3315, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.7277777777777777, |
|
"grad_norm": 3.02006459236145, |
|
"learning_rate": 8.598524275237322e-06, |
|
"loss": 4.4394, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.7288888888888889, |
|
"grad_norm": 2.5074918270111084, |
|
"learning_rate": 8.532763497032987e-06, |
|
"loss": 4.2377, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.847383975982666, |
|
"learning_rate": 8.467203366908707e-06, |
|
"loss": 4.1128, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.7311111111111112, |
|
"grad_norm": 2.9188661575317383, |
|
"learning_rate": 8.40184468369396e-06, |
|
"loss": 4.2968, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.7322222222222222, |
|
"grad_norm": 3.0603976249694824, |
|
"learning_rate": 8.33668824376369e-06, |
|
"loss": 4.175, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.7333333333333333, |
|
"grad_norm": 3.114797592163086, |
|
"learning_rate": 8.271734841028553e-06, |
|
"loss": 3.9943, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7344444444444445, |
|
"grad_norm": 3.7101423740386963, |
|
"learning_rate": 8.206985266925249e-06, |
|
"loss": 4.4357, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.7355555555555555, |
|
"grad_norm": 4.916779041290283, |
|
"learning_rate": 8.142440310406924e-06, |
|
"loss": 4.9196, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.7366666666666667, |
|
"grad_norm": 3.456704616546631, |
|
"learning_rate": 8.078100757933485e-06, |
|
"loss": 4.7176, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.7377777777777778, |
|
"grad_norm": 3.1686041355133057, |
|
"learning_rate": 8.013967393462094e-06, |
|
"loss": 4.3609, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.7388888888888889, |
|
"grad_norm": 2.5040740966796875, |
|
"learning_rate": 7.950040998437542e-06, |
|
"loss": 4.0855, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.923576831817627, |
|
"learning_rate": 7.886322351782783e-06, |
|
"loss": 3.7909, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.7411111111111112, |
|
"grad_norm": 3.781975269317627, |
|
"learning_rate": 7.822812229889428e-06, |
|
"loss": 4.4285, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.7422222222222222, |
|
"grad_norm": 2.2183735370635986, |
|
"learning_rate": 7.759511406608255e-06, |
|
"loss": 4.2021, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.7433333333333333, |
|
"grad_norm": 2.5517868995666504, |
|
"learning_rate": 7.696420653239833e-06, |
|
"loss": 4.0788, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.7444444444444445, |
|
"grad_norm": 3.1512372493743896, |
|
"learning_rate": 7.633540738525066e-06, |
|
"loss": 4.128, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7455555555555555, |
|
"grad_norm": 2.474193811416626, |
|
"learning_rate": 7.570872428635889e-06, |
|
"loss": 4.6547, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 3.1348423957824707, |
|
"learning_rate": 7.508416487165862e-06, |
|
"loss": 4.6837, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.7477777777777778, |
|
"grad_norm": 3.7456905841827393, |
|
"learning_rate": 7.4461736751209405e-06, |
|
"loss": 4.5965, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.7488888888888889, |
|
"grad_norm": 2.9826486110687256, |
|
"learning_rate": 7.384144750910133e-06, |
|
"loss": 4.1727, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.4859273433685303, |
|
"learning_rate": 7.3223304703363135e-06, |
|
"loss": 4.188, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.7511111111111111, |
|
"grad_norm": 3.679555892944336, |
|
"learning_rate": 7.260731586586983e-06, |
|
"loss": 4.3418, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.7522222222222222, |
|
"grad_norm": 4.997726917266846, |
|
"learning_rate": 7.19934885022509e-06, |
|
"loss": 4.0094, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.7533333333333333, |
|
"grad_norm": 3.3679285049438477, |
|
"learning_rate": 7.138183009179922e-06, |
|
"loss": 4.3927, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.7544444444444445, |
|
"grad_norm": 3.4834442138671875, |
|
"learning_rate": 7.0772348087379315e-06, |
|
"loss": 4.0955, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.7555555555555555, |
|
"grad_norm": 3.499994993209839, |
|
"learning_rate": 7.016504991533726e-06, |
|
"loss": 4.2323, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.7566666666666667, |
|
"grad_norm": 2.678922176361084, |
|
"learning_rate": 6.9559942975409465e-06, |
|
"loss": 4.549, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.7577777777777778, |
|
"grad_norm": 2.43112850189209, |
|
"learning_rate": 6.895703464063319e-06, |
|
"loss": 4.4337, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.7588888888888888, |
|
"grad_norm": 2.440561294555664, |
|
"learning_rate": 6.835633225725605e-06, |
|
"loss": 4.0068, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.2796149253845215, |
|
"learning_rate": 6.775784314464717e-06, |
|
"loss": 4.2503, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.7611111111111111, |
|
"grad_norm": 3.672053098678589, |
|
"learning_rate": 6.716157459520739e-06, |
|
"loss": 3.8174, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.7622222222222222, |
|
"grad_norm": 4.125499248504639, |
|
"learning_rate": 6.656753387428089e-06, |
|
"loss": 4.1682, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.7633333333333333, |
|
"grad_norm": 2.379180669784546, |
|
"learning_rate": 6.5975728220066425e-06, |
|
"loss": 3.9803, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.7644444444444445, |
|
"grad_norm": 2.5495798587799072, |
|
"learning_rate": 6.538616484352902e-06, |
|
"loss": 4.4606, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.7655555555555555, |
|
"grad_norm": 3.079115629196167, |
|
"learning_rate": 6.47988509283125e-06, |
|
"loss": 4.4226, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.7666666666666667, |
|
"grad_norm": 3.088437795639038, |
|
"learning_rate": 6.421379363065142e-06, |
|
"loss": 4.5023, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.7677777777777778, |
|
"grad_norm": 6.411847114562988, |
|
"learning_rate": 6.363100007928446e-06, |
|
"loss": 4.488, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.7688888888888888, |
|
"grad_norm": 2.622467517852783, |
|
"learning_rate": 6.305047737536707e-06, |
|
"loss": 3.7526, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.189143657684326, |
|
"learning_rate": 6.247223259238511e-06, |
|
"loss": 4.0969, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.7711111111111111, |
|
"grad_norm": 3.9353489875793457, |
|
"learning_rate": 6.189627277606894e-06, |
|
"loss": 4.0019, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.7722222222222223, |
|
"grad_norm": 2.4755685329437256, |
|
"learning_rate": 6.1322604944307e-06, |
|
"loss": 4.0007, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.7733333333333333, |
|
"grad_norm": 3.4721150398254395, |
|
"learning_rate": 6.075123608706093e-06, |
|
"loss": 4.1602, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.7744444444444445, |
|
"grad_norm": 2.571910858154297, |
|
"learning_rate": 6.01821731662798e-06, |
|
"loss": 4.5001, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.7755555555555556, |
|
"grad_norm": 2.0862197875976562, |
|
"learning_rate": 5.961542311581586e-06, |
|
"loss": 4.2366, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.7766666666666666, |
|
"grad_norm": 2.0852468013763428, |
|
"learning_rate": 5.905099284133952e-06, |
|
"loss": 4.1254, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.7777777777777778, |
|
"grad_norm": 3.239201784133911, |
|
"learning_rate": 5.848888922025553e-06, |
|
"loss": 4.0255, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7788888888888889, |
|
"grad_norm": 3.3855128288269043, |
|
"learning_rate": 5.792911910161922e-06, |
|
"loss": 4.4192, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.8477554321289062, |
|
"learning_rate": 5.737168930605272e-06, |
|
"loss": 4.6303, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.7811111111111111, |
|
"grad_norm": 2.2901785373687744, |
|
"learning_rate": 5.681660662566224e-06, |
|
"loss": 4.3732, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.7822222222222223, |
|
"grad_norm": 3.0778727531433105, |
|
"learning_rate": 5.626387782395512e-06, |
|
"loss": 3.9875, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.7833333333333333, |
|
"grad_norm": 2.725858449935913, |
|
"learning_rate": 5.571350963575728e-06, |
|
"loss": 4.274, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.7844444444444445, |
|
"grad_norm": 2.9397945404052734, |
|
"learning_rate": 5.5165508767131415e-06, |
|
"loss": 3.8244, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.7855555555555556, |
|
"grad_norm": 3.8967740535736084, |
|
"learning_rate": 5.461988189529529e-06, |
|
"loss": 4.3882, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.7866666666666666, |
|
"grad_norm": 3.6964597702026367, |
|
"learning_rate": 5.4076635668540075e-06, |
|
"loss": 4.6633, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.7877777777777778, |
|
"grad_norm": 3.322463035583496, |
|
"learning_rate": 5.3535776706149505e-06, |
|
"loss": 4.0363, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.7888888888888889, |
|
"grad_norm": 3.0608179569244385, |
|
"learning_rate": 5.299731159831953e-06, |
|
"loss": 3.9402, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 3.0244638919830322, |
|
"learning_rate": 5.24612469060774e-06, |
|
"loss": 4.7072, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.7911111111111111, |
|
"grad_norm": 3.1370954513549805, |
|
"learning_rate": 5.192758916120236e-06, |
|
"loss": 3.6739, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.7922222222222223, |
|
"grad_norm": 2.96083402633667, |
|
"learning_rate": 5.139634486614544e-06, |
|
"loss": 4.561, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.7933333333333333, |
|
"grad_norm": 3.8822271823883057, |
|
"learning_rate": 5.086752049395094e-06, |
|
"loss": 4.6279, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.7944444444444444, |
|
"grad_norm": 3.556574583053589, |
|
"learning_rate": 5.034112248817685e-06, |
|
"loss": 4.029, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.7955555555555556, |
|
"grad_norm": 2.4491796493530273, |
|
"learning_rate": 4.981715726281666e-06, |
|
"loss": 4.0463, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.7966666666666666, |
|
"grad_norm": 3.314884901046753, |
|
"learning_rate": 4.929563120222141e-06, |
|
"loss": 4.2907, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.7977777777777778, |
|
"grad_norm": 2.998528480529785, |
|
"learning_rate": 4.877655066102149e-06, |
|
"loss": 4.1238, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.7988888888888889, |
|
"grad_norm": 2.5107343196868896, |
|
"learning_rate": 4.825992196404957e-06, |
|
"loss": 4.8033, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.2697060108184814, |
|
"learning_rate": 4.7745751406263165e-06, |
|
"loss": 3.598, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.8011111111111111, |
|
"grad_norm": 2.1032586097717285, |
|
"learning_rate": 4.723404525266839e-06, |
|
"loss": 4.0662, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.8022222222222222, |
|
"grad_norm": 2.2804932594299316, |
|
"learning_rate": 4.672480973824311e-06, |
|
"loss": 4.6152, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.8033333333333333, |
|
"grad_norm": 3.123626470565796, |
|
"learning_rate": 4.621805106786142e-06, |
|
"loss": 4.8937, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.8044444444444444, |
|
"grad_norm": 3.6036055088043213, |
|
"learning_rate": 4.571377541621788e-06, |
|
"loss": 4.5689, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.8055555555555556, |
|
"grad_norm": 3.6055924892425537, |
|
"learning_rate": 4.521198892775203e-06, |
|
"loss": 3.8795, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.8066666666666666, |
|
"grad_norm": 2.7153923511505127, |
|
"learning_rate": 4.4712697716574e-06, |
|
"loss": 4.3875, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.8077777777777778, |
|
"grad_norm": 3.3169379234313965, |
|
"learning_rate": 4.421590786638951e-06, |
|
"loss": 3.9778, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.8088888888888889, |
|
"grad_norm": 3.1773722171783447, |
|
"learning_rate": 4.372162543042624e-06, |
|
"loss": 4.7571, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.2997097969055176, |
|
"learning_rate": 4.322985643135952e-06, |
|
"loss": 4.4563, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.8111111111111111, |
|
"grad_norm": 3.0270705223083496, |
|
"learning_rate": 4.274060686123959e-06, |
|
"loss": 4.1449, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.8122222222222222, |
|
"grad_norm": 3.159769296646118, |
|
"learning_rate": 4.225388268141797e-06, |
|
"loss": 4.2249, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.8133333333333334, |
|
"grad_norm": 3.2132275104522705, |
|
"learning_rate": 4.176968982247514e-06, |
|
"loss": 3.7882, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.8144444444444444, |
|
"grad_norm": 2.145144462585449, |
|
"learning_rate": 4.128803418414839e-06, |
|
"loss": 3.8867, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.8155555555555556, |
|
"grad_norm": 3.366910696029663, |
|
"learning_rate": 4.08089216352596e-06, |
|
"loss": 4.5035, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.8166666666666667, |
|
"grad_norm": 3.334970235824585, |
|
"learning_rate": 4.0332358013644016e-06, |
|
"loss": 3.9257, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.8177777777777778, |
|
"grad_norm": 3.1020681858062744, |
|
"learning_rate": 3.985834912607894e-06, |
|
"loss": 4.4492, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.8188888888888889, |
|
"grad_norm": 2.6478145122528076, |
|
"learning_rate": 3.938690074821313e-06, |
|
"loss": 3.7261, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.352097988128662, |
|
"learning_rate": 3.891801862449629e-06, |
|
"loss": 3.9445, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.8211111111111111, |
|
"grad_norm": 2.18900728225708, |
|
"learning_rate": 3.845170846810902e-06, |
|
"loss": 4.1073, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.8222222222222222, |
|
"grad_norm": 3.0373637676239014, |
|
"learning_rate": 3.798797596089351e-06, |
|
"loss": 4.4156, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.8233333333333334, |
|
"grad_norm": 2.5862083435058594, |
|
"learning_rate": 3.752682675328406e-06, |
|
"loss": 4.5457, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.8244444444444444, |
|
"grad_norm": 2.2058353424072266, |
|
"learning_rate": 3.7068266464238084e-06, |
|
"loss": 3.9846, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.8255555555555556, |
|
"grad_norm": 2.6216089725494385, |
|
"learning_rate": 3.661230068116811e-06, |
|
"loss": 4.0467, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.8266666666666667, |
|
"grad_norm": 2.514681816101074, |
|
"learning_rate": 3.6158934959873353e-06, |
|
"loss": 4.6638, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.8277777777777777, |
|
"grad_norm": 3.9455208778381348, |
|
"learning_rate": 3.5708174824471947e-06, |
|
"loss": 3.9174, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.8288888888888889, |
|
"grad_norm": 3.5356788635253906, |
|
"learning_rate": 3.5260025767333893e-06, |
|
"loss": 4.3817, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.5251104831695557, |
|
"learning_rate": 3.4814493249014116e-06, |
|
"loss": 4.3528, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.8311111111111111, |
|
"grad_norm": 3.5254111289978027, |
|
"learning_rate": 3.4371582698185633e-06, |
|
"loss": 3.896, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.8322222222222222, |
|
"grad_norm": 3.865900754928589, |
|
"learning_rate": 3.393129951157384e-06, |
|
"loss": 4.0786, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 3.2861948013305664, |
|
"learning_rate": 3.3493649053890326e-06, |
|
"loss": 4.2953, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8344444444444444, |
|
"grad_norm": 2.263437509536743, |
|
"learning_rate": 3.305863665776793e-06, |
|
"loss": 3.9898, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.8355555555555556, |
|
"grad_norm": 3.1119070053100586, |
|
"learning_rate": 3.262626762369525e-06, |
|
"loss": 4.3465, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.8366666666666667, |
|
"grad_norm": 3.526019334793091, |
|
"learning_rate": 3.219654721995266e-06, |
|
"loss": 4.4787, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.8377777777777777, |
|
"grad_norm": 3.2192864418029785, |
|
"learning_rate": 3.176948068254762e-06, |
|
"loss": 4.0421, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.8388888888888889, |
|
"grad_norm": 3.149193525314331, |
|
"learning_rate": 3.1345073215151066e-06, |
|
"loss": 4.7629, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.5595197677612305, |
|
"learning_rate": 3.092332998903416e-06, |
|
"loss": 4.4459, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.8411111111111111, |
|
"grad_norm": 2.972254991531372, |
|
"learning_rate": 3.0504256143004866e-06, |
|
"loss": 4.1769, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.8422222222222222, |
|
"grad_norm": 2.9443609714508057, |
|
"learning_rate": 3.0087856783345914e-06, |
|
"loss": 3.9543, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.8433333333333334, |
|
"grad_norm": 2.5057802200317383, |
|
"learning_rate": 2.967413698375196e-06, |
|
"loss": 4.6031, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.8444444444444444, |
|
"grad_norm": 3.078894853591919, |
|
"learning_rate": 2.9263101785268254e-06, |
|
"loss": 3.9584, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.8455555555555555, |
|
"grad_norm": 2.3315460681915283, |
|
"learning_rate": 2.8854756196229016e-06, |
|
"loss": 4.0571, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.8466666666666667, |
|
"grad_norm": 3.303471326828003, |
|
"learning_rate": 2.8449105192196316e-06, |
|
"loss": 4.3249, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.8477777777777777, |
|
"grad_norm": 3.592991590499878, |
|
"learning_rate": 2.8046153715899692e-06, |
|
"loss": 3.8605, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.8488888888888889, |
|
"grad_norm": 2.9544084072113037, |
|
"learning_rate": 2.764590667717562e-06, |
|
"loss": 4.0353, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 3.3249425888061523, |
|
"learning_rate": 2.7248368952908053e-06, |
|
"loss": 4.319, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.8511111111111112, |
|
"grad_norm": 4.874053001403809, |
|
"learning_rate": 2.6853545386968606e-06, |
|
"loss": 4.458, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.8522222222222222, |
|
"grad_norm": 2.6893086433410645, |
|
"learning_rate": 2.646144079015797e-06, |
|
"loss": 4.0627, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 2.3588078022003174, |
|
"learning_rate": 2.6072059940146775e-06, |
|
"loss": 4.5383, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.8544444444444445, |
|
"grad_norm": 2.887497663497925, |
|
"learning_rate": 2.5685407581417907e-06, |
|
"loss": 4.0523, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.8555555555555555, |
|
"grad_norm": 2.968337059020996, |
|
"learning_rate": 2.5301488425208296e-06, |
|
"loss": 3.6994, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.8566666666666667, |
|
"grad_norm": 2.897677183151245, |
|
"learning_rate": 2.492030714945162e-06, |
|
"loss": 4.2895, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.8577777777777778, |
|
"grad_norm": 4.195917129516602, |
|
"learning_rate": 2.454186839872158e-06, |
|
"loss": 3.8239, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.8588888888888889, |
|
"grad_norm": 3.7905352115631104, |
|
"learning_rate": 2.4166176784174795e-06, |
|
"loss": 4.0074, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 3.2009356021881104, |
|
"learning_rate": 2.379323688349516e-06, |
|
"loss": 4.156, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.8611111111111112, |
|
"grad_norm": 3.1370296478271484, |
|
"learning_rate": 2.3423053240837515e-06, |
|
"loss": 4.7336, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.8622222222222222, |
|
"grad_norm": 2.4123268127441406, |
|
"learning_rate": 2.3055630366772856e-06, |
|
"loss": 4.3871, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.8633333333333333, |
|
"grad_norm": 2.304750919342041, |
|
"learning_rate": 2.269097273823287e-06, |
|
"loss": 4.492, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.8644444444444445, |
|
"grad_norm": 3.0814766883850098, |
|
"learning_rate": 2.2329084798455746e-06, |
|
"loss": 4.9373, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.8655555555555555, |
|
"grad_norm": 3.1735880374908447, |
|
"learning_rate": 2.1969970956931762e-06, |
|
"loss": 4.2959, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.8666666666666667, |
|
"grad_norm": 2.108222007751465, |
|
"learning_rate": 2.1613635589349756e-06, |
|
"loss": 4.3394, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.8677777777777778, |
|
"grad_norm": 3.263927459716797, |
|
"learning_rate": 2.1260083037543817e-06, |
|
"loss": 4.5798, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.8688888888888889, |
|
"grad_norm": 2.2533228397369385, |
|
"learning_rate": 2.0909317609440095e-06, |
|
"loss": 4.4232, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.333630323410034, |
|
"learning_rate": 2.0561343579004715e-06, |
|
"loss": 4.1046, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.8711111111111111, |
|
"grad_norm": 3.1644370555877686, |
|
"learning_rate": 2.0216165186191407e-06, |
|
"loss": 4.2146, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.8722222222222222, |
|
"grad_norm": 3.5359654426574707, |
|
"learning_rate": 1.9873786636889906e-06, |
|
"loss": 3.8819, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.8733333333333333, |
|
"grad_norm": 4.151428699493408, |
|
"learning_rate": 1.95342121028749e-06, |
|
"loss": 3.9319, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.8744444444444445, |
|
"grad_norm": 3.5409927368164062, |
|
"learning_rate": 1.9197445721754776e-06, |
|
"loss": 4.463, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.8755555555555555, |
|
"grad_norm": 2.9611496925354004, |
|
"learning_rate": 1.8863491596921745e-06, |
|
"loss": 3.84, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.8766666666666667, |
|
"grad_norm": 2.815295934677124, |
|
"learning_rate": 1.8532353797501318e-06, |
|
"loss": 4.3042, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.8777777777777778, |
|
"grad_norm": 3.664135456085205, |
|
"learning_rate": 1.8204036358303173e-06, |
|
"loss": 3.7069, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.8788888888888889, |
|
"grad_norm": 2.666962146759033, |
|
"learning_rate": 1.787854327977162e-06, |
|
"loss": 3.9498, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.5214672088623047, |
|
"learning_rate": 1.7555878527937164e-06, |
|
"loss": 4.15, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.8811111111111111, |
|
"grad_norm": 3.4584808349609375, |
|
"learning_rate": 1.7236046034367958e-06, |
|
"loss": 4.2487, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.8822222222222222, |
|
"grad_norm": 3.4059367179870605, |
|
"learning_rate": 1.6919049696121958e-06, |
|
"loss": 4.1058, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.8833333333333333, |
|
"grad_norm": 2.7756285667419434, |
|
"learning_rate": 1.6604893375699594e-06, |
|
"loss": 4.0675, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.8844444444444445, |
|
"grad_norm": 2.357132911682129, |
|
"learning_rate": 1.629358090099639e-06, |
|
"loss": 4.3934, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.8855555555555555, |
|
"grad_norm": 3.2852492332458496, |
|
"learning_rate": 1.5985116065256684e-06, |
|
"loss": 4.4737, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.8866666666666667, |
|
"grad_norm": 2.238274335861206, |
|
"learning_rate": 1.5679502627027136e-06, |
|
"loss": 4.4498, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.8877777777777778, |
|
"grad_norm": 2.4234659671783447, |
|
"learning_rate": 1.5376744310111019e-06, |
|
"loss": 4.5667, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 3.9129526615142822, |
|
"learning_rate": 1.5076844803522922e-06, |
|
"loss": 4.4, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.525909900665283, |
|
"learning_rate": 1.4779807761443636e-06, |
|
"loss": 3.7143, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.8911111111111111, |
|
"grad_norm": 3.369392156600952, |
|
"learning_rate": 1.4485636803175829e-06, |
|
"loss": 3.9016, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.8922222222222222, |
|
"grad_norm": 3.97363543510437, |
|
"learning_rate": 1.4194335513099761e-06, |
|
"loss": 4.5144, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.8933333333333333, |
|
"grad_norm": 3.287062883377075, |
|
"learning_rate": 1.3905907440629752e-06, |
|
"loss": 3.7106, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.8944444444444445, |
|
"grad_norm": 3.274184465408325, |
|
"learning_rate": 1.362035610017079e-06, |
|
"loss": 4.6111, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.8955555555555555, |
|
"grad_norm": 2.367525815963745, |
|
"learning_rate": 1.333768497107593e-06, |
|
"loss": 4.1843, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.8966666666666666, |
|
"grad_norm": 3.559544324874878, |
|
"learning_rate": 1.305789749760361e-06, |
|
"loss": 4.0371, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.8977777777777778, |
|
"grad_norm": 2.5813517570495605, |
|
"learning_rate": 1.2780997088875869e-06, |
|
"loss": 4.2183, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.8988888888888888, |
|
"grad_norm": 3.533015012741089, |
|
"learning_rate": 1.250698711883691e-06, |
|
"loss": 4.4354, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.268846273422241, |
|
"learning_rate": 1.2235870926211619e-06, |
|
"loss": 4.167, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.9011111111111111, |
|
"grad_norm": 2.674999237060547, |
|
"learning_rate": 1.1967651814465354e-06, |
|
"loss": 4.5461, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.9022222222222223, |
|
"grad_norm": 2.9661448001861572, |
|
"learning_rate": 1.170233305176327e-06, |
|
"loss": 4.4819, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.9033333333333333, |
|
"grad_norm": 3.1719062328338623, |
|
"learning_rate": 1.1439917870930793e-06, |
|
"loss": 4.7351, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.9044444444444445, |
|
"grad_norm": 2.9525768756866455, |
|
"learning_rate": 1.1180409469414094e-06, |
|
"loss": 4.0522, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.9055555555555556, |
|
"grad_norm": 3.972069025039673, |
|
"learning_rate": 1.0923811009241142e-06, |
|
"loss": 3.8434, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 2.4678852558135986, |
|
"learning_rate": 1.067012561698319e-06, |
|
"loss": 4.2084, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.9077777777777778, |
|
"grad_norm": 2.2604753971099854, |
|
"learning_rate": 1.0419356383716688e-06, |
|
"loss": 4.3375, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.9088888888888889, |
|
"grad_norm": 3.653369426727295, |
|
"learning_rate": 1.0171506364985622e-06, |
|
"loss": 4.2001, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.612713575363159, |
|
"learning_rate": 9.926578580764234e-07, |
|
"loss": 3.8888, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.9111111111111111, |
|
"grad_norm": 3.2685186862945557, |
|
"learning_rate": 9.684576015420278e-07, |
|
"loss": 3.8959, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.9122222222222223, |
|
"grad_norm": 3.556123733520508, |
|
"learning_rate": 9.445501617678654e-07, |
|
"loss": 3.9459, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.9133333333333333, |
|
"grad_norm": 5.377536773681641, |
|
"learning_rate": 9.209358300585474e-07, |
|
"loss": 3.5557, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.9144444444444444, |
|
"grad_norm": 2.2200653553009033, |
|
"learning_rate": 8.976148941472501e-07, |
|
"loss": 4.1984, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.9155555555555556, |
|
"grad_norm": 2.290778875350952, |
|
"learning_rate": 8.745876381922147e-07, |
|
"loss": 3.9423, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.9166666666666666, |
|
"grad_norm": 3.241596221923828, |
|
"learning_rate": 8.51854342773295e-07, |
|
"loss": 4.2498, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.9177777777777778, |
|
"grad_norm": 4.267802715301514, |
|
"learning_rate": 8.294152848885157e-07, |
|
"loss": 3.9039, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.9188888888888889, |
|
"grad_norm": 2.7669568061828613, |
|
"learning_rate": 8.072707379507216e-07, |
|
"loss": 4.0191, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.132392168045044, |
|
"learning_rate": 7.854209717842231e-07, |
|
"loss": 4.8574, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.9211111111111111, |
|
"grad_norm": 2.444495916366577, |
|
"learning_rate": 7.638662526215284e-07, |
|
"loss": 4.0902, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.9222222222222223, |
|
"grad_norm": 3.362197160720825, |
|
"learning_rate": 7.426068431000882e-07, |
|
"loss": 4.5238, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.9233333333333333, |
|
"grad_norm": 4.276552200317383, |
|
"learning_rate": 7.216430022591008e-07, |
|
"loss": 3.5991, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.9244444444444444, |
|
"grad_norm": 2.867725372314453, |
|
"learning_rate": 7.009749855363456e-07, |
|
"loss": 4.3119, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.9255555555555556, |
|
"grad_norm": 3.021606922149658, |
|
"learning_rate": 6.806030447650879e-07, |
|
"loss": 3.9519, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.9266666666666666, |
|
"grad_norm": 3.299363851547241, |
|
"learning_rate": 6.605274281709928e-07, |
|
"loss": 3.9062, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.9277777777777778, |
|
"grad_norm": 3.487799644470215, |
|
"learning_rate": 6.407483803691216e-07, |
|
"loss": 4.1561, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.9288888888888889, |
|
"grad_norm": 3.5065345764160156, |
|
"learning_rate": 6.212661423609184e-07, |
|
"loss": 3.9922, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.3422510623931885, |
|
"learning_rate": 6.020809515313142e-07, |
|
"loss": 4.5369, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.9311111111111111, |
|
"grad_norm": 3.0932629108428955, |
|
"learning_rate": 5.83193041645802e-07, |
|
"loss": 3.8776, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.9322222222222222, |
|
"grad_norm": 2.2441000938415527, |
|
"learning_rate": 5.646026428476031e-07, |
|
"loss": 4.301, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 3.151946783065796, |
|
"learning_rate": 5.463099816548579e-07, |
|
"loss": 4.5223, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9344444444444444, |
|
"grad_norm": 2.9941930770874023, |
|
"learning_rate": 5.283152809578751e-07, |
|
"loss": 4.2136, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.9355555555555556, |
|
"grad_norm": 3.5801138877868652, |
|
"learning_rate": 5.106187600163987e-07, |
|
"loss": 4.0182, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.9366666666666666, |
|
"grad_norm": 2.327622413635254, |
|
"learning_rate": 4.932206344569562e-07, |
|
"loss": 4.3929, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.9377777777777778, |
|
"grad_norm": 2.793179512023926, |
|
"learning_rate": 4.7612111627021175e-07, |
|
"loss": 4.1198, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.9388888888888889, |
|
"grad_norm": 2.1341638565063477, |
|
"learning_rate": 4.5932041380840065e-07, |
|
"loss": 4.2686, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.8225035667419434, |
|
"learning_rate": 4.4281873178278475e-07, |
|
"loss": 4.3253, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.9411111111111111, |
|
"grad_norm": 2.065812349319458, |
|
"learning_rate": 4.26616271261146e-07, |
|
"loss": 4.5926, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.9422222222222222, |
|
"grad_norm": 3.443786859512329, |
|
"learning_rate": 4.107132296653549e-07, |
|
"loss": 4.6865, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.9433333333333334, |
|
"grad_norm": 3.73835825920105, |
|
"learning_rate": 3.95109800768953e-07, |
|
"loss": 4.357, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.9444444444444444, |
|
"grad_norm": 3.801854372024536, |
|
"learning_rate": 3.7980617469479953e-07, |
|
"loss": 4.4399, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9455555555555556, |
|
"grad_norm": 2.6715407371520996, |
|
"learning_rate": 3.6480253791274786e-07, |
|
"loss": 4.0495, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.9466666666666667, |
|
"grad_norm": 2.8497438430786133, |
|
"learning_rate": 3.5009907323737825e-07, |
|
"loss": 4.6042, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.9477777777777778, |
|
"grad_norm": 2.0767159461975098, |
|
"learning_rate": 3.3569595982576583e-07, |
|
"loss": 4.3431, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.9488888888888889, |
|
"grad_norm": 2.8185853958129883, |
|
"learning_rate": 3.215933731753024e-07, |
|
"loss": 4.2526, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.341989040374756, |
|
"learning_rate": 3.077914851215585e-07, |
|
"loss": 4.798, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.9511111111111111, |
|
"grad_norm": 2.7204387187957764, |
|
"learning_rate": 2.942904638361804e-07, |
|
"loss": 4.4965, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.9522222222222222, |
|
"grad_norm": 3.139683485031128, |
|
"learning_rate": 2.810904738248549e-07, |
|
"loss": 4.7001, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.9533333333333334, |
|
"grad_norm": 3.4762489795684814, |
|
"learning_rate": 2.681916759252917e-07, |
|
"loss": 4.2364, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.9544444444444444, |
|
"grad_norm": 3.4384474754333496, |
|
"learning_rate": 2.555942273052753e-07, |
|
"loss": 4.2242, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.9555555555555556, |
|
"grad_norm": 7.5540571212768555, |
|
"learning_rate": 2.4329828146074095e-07, |
|
"loss": 4.4938, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.9566666666666667, |
|
"grad_norm": 3.0467262268066406, |
|
"learning_rate": 2.3130398821391007e-07, |
|
"loss": 3.6816, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.9577777777777777, |
|
"grad_norm": 2.601795196533203, |
|
"learning_rate": 2.1961149371145795e-07, |
|
"loss": 4.1958, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.9588888888888889, |
|
"grad_norm": 4.4815216064453125, |
|
"learning_rate": 2.0822094042274032e-07, |
|
"loss": 3.994, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 3.6530420780181885, |
|
"learning_rate": 1.9713246713805588e-07, |
|
"loss": 4.3858, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.9611111111111111, |
|
"grad_norm": 3.1802806854248047, |
|
"learning_rate": 1.8634620896695043e-07, |
|
"loss": 4.0963, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.9622222222222222, |
|
"grad_norm": 3.68103289604187, |
|
"learning_rate": 1.7586229733657644e-07, |
|
"loss": 4.4613, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.9633333333333334, |
|
"grad_norm": 2.8591842651367188, |
|
"learning_rate": 1.6568085999008888e-07, |
|
"loss": 4.3585, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.9644444444444444, |
|
"grad_norm": 2.928494691848755, |
|
"learning_rate": 1.5580202098509077e-07, |
|
"loss": 4.3471, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.9655555555555555, |
|
"grad_norm": 2.9359610080718994, |
|
"learning_rate": 1.4622590069211516e-07, |
|
"loss": 4.5431, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.9666666666666667, |
|
"grad_norm": 2.3987393379211426, |
|
"learning_rate": 1.3695261579316777e-07, |
|
"loss": 3.9504, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.9677777777777777, |
|
"grad_norm": 2.556596040725708, |
|
"learning_rate": 1.2798227928029482e-07, |
|
"loss": 3.7707, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.9688888888888889, |
|
"grad_norm": 2.1739792823791504, |
|
"learning_rate": 1.193150004542204e-07, |
|
"loss": 4.1831, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.5820603370666504, |
|
"learning_rate": 1.109508849230001e-07, |
|
"loss": 4.4525, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.9711111111111111, |
|
"grad_norm": 4.120064735412598, |
|
"learning_rate": 1.0289003460074165e-07, |
|
"loss": 3.9264, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.9722222222222222, |
|
"grad_norm": 3.187326669692993, |
|
"learning_rate": 9.513254770636137e-08, |
|
"loss": 4.4163, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.9733333333333334, |
|
"grad_norm": 2.66953706741333, |
|
"learning_rate": 8.767851876239074e-08, |
|
"loss": 4.3193, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.9744444444444444, |
|
"grad_norm": 3.3307957649230957, |
|
"learning_rate": 8.052803859382174e-08, |
|
"loss": 4.6534, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.9755555555555555, |
|
"grad_norm": 3.617739200592041, |
|
"learning_rate": 7.368119432699383e-08, |
|
"loss": 4.4387, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.9766666666666667, |
|
"grad_norm": 2.623586654663086, |
|
"learning_rate": 6.71380693885476e-08, |
|
"loss": 4.3379, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.9777777777777777, |
|
"grad_norm": 3.5560302734375, |
|
"learning_rate": 6.089874350439506e-08, |
|
"loss": 4.536, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.9788888888888889, |
|
"grad_norm": 3.401707887649536, |
|
"learning_rate": 5.496329269875089e-08, |
|
"loss": 4.1187, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.7142131328582764, |
|
"learning_rate": 4.9331789293211026e-08, |
|
"loss": 4.5272, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.9811111111111112, |
|
"grad_norm": 3.471452236175537, |
|
"learning_rate": 4.400430190586724e-08, |
|
"loss": 4.081, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.9822222222222222, |
|
"grad_norm": 3.4180078506469727, |
|
"learning_rate": 3.8980895450474455e-08, |
|
"loss": 4.3884, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.9833333333333333, |
|
"grad_norm": 3.2058937549591064, |
|
"learning_rate": 3.426163113565417e-08, |
|
"loss": 4.178, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.9844444444444445, |
|
"grad_norm": 2.5083703994750977, |
|
"learning_rate": 2.9846566464150626e-08, |
|
"loss": 4.1643, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.9855555555555555, |
|
"grad_norm": 2.149996042251587, |
|
"learning_rate": 2.5735755232134118e-08, |
|
"loss": 4.0516, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.9866666666666667, |
|
"grad_norm": 3.1104636192321777, |
|
"learning_rate": 2.192924752854042e-08, |
|
"loss": 4.1596, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.9877777777777778, |
|
"grad_norm": 3.0319011211395264, |
|
"learning_rate": 1.842708973447127e-08, |
|
"loss": 4.1681, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.9888888888888889, |
|
"grad_norm": 2.075939893722534, |
|
"learning_rate": 1.522932452260595e-08, |
|
"loss": 4.1082, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.8124101161956787, |
|
"learning_rate": 1.233599085671e-08, |
|
"loss": 4.0637, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.9911111111111112, |
|
"grad_norm": 3.1139042377471924, |
|
"learning_rate": 9.747123991141194e-09, |
|
"loss": 3.8563, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.9922222222222222, |
|
"grad_norm": 2.973275661468506, |
|
"learning_rate": 7.462755470422078e-09, |
|
"loss": 4.8157, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.9933333333333333, |
|
"grad_norm": 3.155707359313965, |
|
"learning_rate": 5.48291312886251e-09, |
|
"loss": 4.6411, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.9944444444444445, |
|
"grad_norm": 4.164730548858643, |
|
"learning_rate": 3.807621090218261e-09, |
|
"loss": 3.6138, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.9955555555555555, |
|
"grad_norm": 2.4967424869537354, |
|
"learning_rate": 2.4368997673940297e-09, |
|
"loss": 3.9694, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.9966666666666667, |
|
"grad_norm": 2.2673118114471436, |
|
"learning_rate": 1.3707658621964215e-09, |
|
"loss": 4.1773, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.9977777777777778, |
|
"grad_norm": 2.343024730682373, |
|
"learning_rate": 6.092323651313292e-10, |
|
"loss": 4.2347, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.9988888888888889, |
|
"grad_norm": 3.1533758640289307, |
|
"learning_rate": 1.5230855524017708e-10, |
|
"loss": 4.2303, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 3.3015313148498535, |
|
"learning_rate": 0.0, |
|
"loss": 4.4286, |
|
"step": 900 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0254817236897792e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|