|
{ |
|
"best_metric": NaN, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-50", |
|
"epoch": 0.8658008658008658, |
|
"eval_steps": 50, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004329004329004329, |
|
"grad_norm": 0.33439210057258606, |
|
"learning_rate": 1.001e-05, |
|
"loss": 0.7374, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004329004329004329, |
|
"eval_loss": NaN, |
|
"eval_runtime": 11.016, |
|
"eval_samples_per_second": 8.805, |
|
"eval_steps_per_second": 2.269, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008658008658008658, |
|
"grad_norm": 0.30539026856422424, |
|
"learning_rate": 2.002e-05, |
|
"loss": 0.7, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.012987012987012988, |
|
"grad_norm": 0.30680182576179504, |
|
"learning_rate": 3.0029999999999995e-05, |
|
"loss": 0.7228, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.017316017316017316, |
|
"grad_norm": 0.6637282371520996, |
|
"learning_rate": 4.004e-05, |
|
"loss": 0.6918, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.021645021645021644, |
|
"grad_norm": 0.33853232860565186, |
|
"learning_rate": 5.005e-05, |
|
"loss": 0.7408, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.025974025974025976, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.005999999999999e-05, |
|
"loss": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.030303030303030304, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.006999999999998e-05, |
|
"loss": 0.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.03463203463203463, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.008e-05, |
|
"loss": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03896103896103896, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.009e-05, |
|
"loss": 0.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.04329004329004329, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001001, |
|
"loss": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.047619047619047616, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.957315789473684e-05, |
|
"loss": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.05194805194805195, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.904631578947367e-05, |
|
"loss": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.05627705627705628, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.851947368421052e-05, |
|
"loss": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.06060606060606061, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.799263157894736e-05, |
|
"loss": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.06493506493506493, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.746578947368421e-05, |
|
"loss": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06926406926406926, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.693894736842104e-05, |
|
"loss": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0735930735930736, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.641210526315789e-05, |
|
"loss": 0.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.07792207792207792, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.588526315789473e-05, |
|
"loss": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.08225108225108226, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.535842105263157e-05, |
|
"loss": 0.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.08658008658008658, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.483157894736841e-05, |
|
"loss": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09090909090909091, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.430473684210526e-05, |
|
"loss": 0.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.09523809523809523, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.37778947368421e-05, |
|
"loss": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.09956709956709957, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.325105263157894e-05, |
|
"loss": 0.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1038961038961039, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.272421052631578e-05, |
|
"loss": 0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.10822510822510822, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.219736842105263e-05, |
|
"loss": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.11255411255411256, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.167052631578946e-05, |
|
"loss": 0.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.11688311688311688, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.114368421052632e-05, |
|
"loss": 0.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.12121212121212122, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.061684210526315e-05, |
|
"loss": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.12554112554112554, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.009e-05, |
|
"loss": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.12987012987012986, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.956315789473683e-05, |
|
"loss": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1341991341991342, |
|
"grad_norm": 0.5805222988128662, |
|
"learning_rate": 8.903631578947368e-05, |
|
"loss": 0.7943, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.13852813852813853, |
|
"grad_norm": 0.39428913593292236, |
|
"learning_rate": 8.850947368421052e-05, |
|
"loss": 0.7229, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.6405807733535767, |
|
"learning_rate": 8.798263157894736e-05, |
|
"loss": 0.8673, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.1471861471861472, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.745578947368422e-05, |
|
"loss": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.15151515151515152, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.692894736842105e-05, |
|
"loss": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.15584415584415584, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.64021052631579e-05, |
|
"loss": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.16017316017316016, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.587526315789473e-05, |
|
"loss": 0.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.1645021645021645, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.534842105263157e-05, |
|
"loss": 0.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.16883116883116883, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.482157894736842e-05, |
|
"loss": 0.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.17316017316017315, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.429473684210525e-05, |
|
"loss": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1774891774891775, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.376789473684211e-05, |
|
"loss": 0.0, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.18181818181818182, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.324105263157894e-05, |
|
"loss": 0.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.18614718614718614, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.271421052631579e-05, |
|
"loss": 0.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.19047619047619047, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.218736842105262e-05, |
|
"loss": 0.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.19480519480519481, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.166052631578947e-05, |
|
"loss": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.19913419913419914, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.113368421052631e-05, |
|
"loss": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.20346320346320346, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.060684210526315e-05, |
|
"loss": 0.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.2077922077922078, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.008e-05, |
|
"loss": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.21212121212121213, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.955315789473684e-05, |
|
"loss": 0.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.21645021645021645, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.902631578947368e-05, |
|
"loss": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.21645021645021645, |
|
"eval_loss": NaN, |
|
"eval_runtime": 8.378, |
|
"eval_samples_per_second": 11.578, |
|
"eval_steps_per_second": 2.984, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.22077922077922077, |
|
"grad_norm": 0.34621065855026245, |
|
"learning_rate": 7.849947368421052e-05, |
|
"loss": 0.8347, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.22510822510822512, |
|
"grad_norm": 0.31127241253852844, |
|
"learning_rate": 7.797263157894736e-05, |
|
"loss": 0.7476, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.22943722943722944, |
|
"grad_norm": 0.3109810948371887, |
|
"learning_rate": 7.744578947368421e-05, |
|
"loss": 0.7386, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.23376623376623376, |
|
"grad_norm": 0.2887902855873108, |
|
"learning_rate": 7.691894736842104e-05, |
|
"loss": 0.6139, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 0.327152818441391, |
|
"learning_rate": 7.63921052631579e-05, |
|
"loss": 0.7329, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.24242424242424243, |
|
"grad_norm": 0.3406467139720917, |
|
"learning_rate": 7.586526315789473e-05, |
|
"loss": 0.7201, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.24675324675324675, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.533842105263158e-05, |
|
"loss": 0.0, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.2510822510822511, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.481157894736841e-05, |
|
"loss": 0.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.2554112554112554, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.428473684210526e-05, |
|
"loss": 0.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.2597402597402597, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.375789473684209e-05, |
|
"loss": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.26406926406926406, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.323105263157895e-05, |
|
"loss": 0.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.2683982683982684, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.270421052631578e-05, |
|
"loss": 0.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.2727272727272727, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.217736842105263e-05, |
|
"loss": 0.0, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.27705627705627706, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.165052631578947e-05, |
|
"loss": 0.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.2813852813852814, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.11236842105263e-05, |
|
"loss": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.059684210526315e-05, |
|
"loss": 0.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.29004329004329005, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.006999999999998e-05, |
|
"loss": 0.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.2943722943722944, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.954315789473684e-05, |
|
"loss": 0.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.2987012987012987, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.901631578947368e-05, |
|
"loss": 0.0, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.30303030303030304, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.848947368421052e-05, |
|
"loss": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.30735930735930733, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.796263157894737e-05, |
|
"loss": 0.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.3116883116883117, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.74357894736842e-05, |
|
"loss": 0.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.31601731601731603, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.690894736842105e-05, |
|
"loss": 0.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.3203463203463203, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.638210526315788e-05, |
|
"loss": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.3246753246753247, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.585526315789474e-05, |
|
"loss": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.329004329004329, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.532842105263157e-05, |
|
"loss": 0.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.480157894736842e-05, |
|
"loss": 0.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.33766233766233766, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.427473684210526e-05, |
|
"loss": 0.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.341991341991342, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.37478947368421e-05, |
|
"loss": 0.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.3463203463203463, |
|
"grad_norm": 0.5967077612876892, |
|
"learning_rate": 6.322105263157894e-05, |
|
"loss": 1.5417, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.35064935064935066, |
|
"grad_norm": 0.481616735458374, |
|
"learning_rate": 6.269421052631577e-05, |
|
"loss": 0.7137, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.354978354978355, |
|
"grad_norm": 0.5399643778800964, |
|
"learning_rate": 6.216736842105263e-05, |
|
"loss": 0.7938, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.3593073593073593, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.164052631578947e-05, |
|
"loss": 0.0, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.111368421052631e-05, |
|
"loss": 0.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.36796536796536794, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.058684210526315e-05, |
|
"loss": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3722943722943723, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.005999999999999e-05, |
|
"loss": 0.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.37662337662337664, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.953315789473684e-05, |
|
"loss": 0.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.38095238095238093, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.9006315789473676e-05, |
|
"loss": 0.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.3852813852813853, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.847947368421053e-05, |
|
"loss": 0.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.38961038961038963, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.795263157894737e-05, |
|
"loss": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3939393939393939, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.742578947368421e-05, |
|
"loss": 0.0, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.39826839826839827, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.6898947368421046e-05, |
|
"loss": 0.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.4025974025974026, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.6372105263157886e-05, |
|
"loss": 0.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.4069264069264069, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.584526315789473e-05, |
|
"loss": 0.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.41125541125541126, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.531842105263158e-05, |
|
"loss": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.4155844155844156, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.4791578947368424e-05, |
|
"loss": 0.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.4199134199134199, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.426473684210526e-05, |
|
"loss": 0.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.42424242424242425, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.37378947368421e-05, |
|
"loss": 0.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.321105263157894e-05, |
|
"loss": 0.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.4329004329004329, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.268421052631578e-05, |
|
"loss": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4329004329004329, |
|
"eval_loss": NaN, |
|
"eval_runtime": 8.3775, |
|
"eval_samples_per_second": 11.579, |
|
"eval_steps_per_second": 2.984, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.43722943722943725, |
|
"grad_norm": 0.3175669014453888, |
|
"learning_rate": 5.2157368421052626e-05, |
|
"loss": 0.6472, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.44155844155844154, |
|
"grad_norm": 0.3083650767803192, |
|
"learning_rate": 5.163052631578947e-05, |
|
"loss": 0.7949, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.4458874458874459, |
|
"grad_norm": 0.2869618237018585, |
|
"learning_rate": 5.110368421052632e-05, |
|
"loss": 0.6991, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.45021645021645024, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.057684210526316e-05, |
|
"loss": 0.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.005e-05, |
|
"loss": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.4588744588744589, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.9523157894736836e-05, |
|
"loss": 0.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.46320346320346323, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.899631578947368e-05, |
|
"loss": 0.0, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.4675324675324675, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.846947368421052e-05, |
|
"loss": 0.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.47186147186147187, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.794263157894737e-05, |
|
"loss": 0.0, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.7415789473684206e-05, |
|
"loss": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.4805194805194805, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.688894736842105e-05, |
|
"loss": 0.0, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.48484848484848486, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.636210526315789e-05, |
|
"loss": 0.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.48917748917748916, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.583526315789473e-05, |
|
"loss": 0.0, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.4935064935064935, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.530842105263158e-05, |
|
"loss": 0.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.49783549783549785, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.4781578947368416e-05, |
|
"loss": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.5021645021645021, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.425473684210526e-05, |
|
"loss": 0.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.5064935064935064, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.372789473684211e-05, |
|
"loss": 0.0, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.5108225108225108, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.320105263157895e-05, |
|
"loss": 0.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.5151515151515151, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.2674210526315786e-05, |
|
"loss": 0.0, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.5194805194805194, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.2147368421052626e-05, |
|
"loss": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5238095238095238, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.162052631578947e-05, |
|
"loss": 0.0, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.5281385281385281, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.109368421052631e-05, |
|
"loss": 0.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.5324675324675324, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.056684210526316e-05, |
|
"loss": 0.0, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.5367965367965368, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.004e-05, |
|
"loss": 0.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.5411255411255411, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.951315789473684e-05, |
|
"loss": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5454545454545454, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.898631578947368e-05, |
|
"loss": 0.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.5497835497835498, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.845947368421052e-05, |
|
"loss": 0.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.5541125541125541, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.7932631578947367e-05, |
|
"loss": 0.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.5584415584415584, |
|
"grad_norm": 0.5039573311805725, |
|
"learning_rate": 3.7405789473684206e-05, |
|
"loss": 0.7812, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.5627705627705628, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.6878947368421045e-05, |
|
"loss": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5670995670995671, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.635210526315789e-05, |
|
"loss": 0.0, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.582526315789474e-05, |
|
"loss": 0.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.5757575757575758, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.5298421052631576e-05, |
|
"loss": 0.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.5800865800865801, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.477157894736842e-05, |
|
"loss": 0.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.5844155844155844, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.424473684210526e-05, |
|
"loss": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5887445887445888, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.37178947368421e-05, |
|
"loss": 0.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.5930735930735931, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.319105263157894e-05, |
|
"loss": 0.0, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.5974025974025974, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.2664210526315786e-05, |
|
"loss": 0.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.6017316017316018, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.213736842105263e-05, |
|
"loss": 0.0, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.161052631578947e-05, |
|
"loss": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6103896103896104, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.108368421052632e-05, |
|
"loss": 0.0, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.6147186147186147, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.0556842105263156e-05, |
|
"loss": 0.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.6190476190476191, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.0029999999999995e-05, |
|
"loss": 0.0, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.6233766233766234, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.9503157894736838e-05, |
|
"loss": 0.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.6277056277056277, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.8976315789473684e-05, |
|
"loss": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.6320346320346321, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.8449473684210523e-05, |
|
"loss": 0.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.6363636363636364, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.7922631578947366e-05, |
|
"loss": 0.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.6406926406926406, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.7395789473684212e-05, |
|
"loss": 0.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.645021645021645, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.686894736842105e-05, |
|
"loss": 0.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.6493506493506493, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.634210526315789e-05, |
|
"loss": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6493506493506493, |
|
"eval_loss": NaN, |
|
"eval_runtime": 8.5995, |
|
"eval_samples_per_second": 11.28, |
|
"eval_steps_per_second": 2.907, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6536796536796536, |
|
"grad_norm": 0.3423425257205963, |
|
"learning_rate": 2.5815263157894736e-05, |
|
"loss": 0.6861, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.658008658008658, |
|
"grad_norm": 0.3168509304523468, |
|
"learning_rate": 2.528842105263158e-05, |
|
"loss": 0.7967, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.6623376623376623, |
|
"grad_norm": 0.29653236269950867, |
|
"learning_rate": 2.4761578947368418e-05, |
|
"loss": 0.7314, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.2796725034713745, |
|
"learning_rate": 2.423473684210526e-05, |
|
"loss": 0.6499, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.670995670995671, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3707894736842103e-05, |
|
"loss": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.6753246753246753, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3181052631578946e-05, |
|
"loss": 0.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.6796536796536796, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.265421052631579e-05, |
|
"loss": 0.0, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.683982683982684, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.212736842105263e-05, |
|
"loss": 0.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.6883116883116883, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1600526315789474e-05, |
|
"loss": 0.0, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.6926406926406926, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1073684210526313e-05, |
|
"loss": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.696969696969697, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0546842105263155e-05, |
|
"loss": 0.0, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.7012987012987013, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.002e-05, |
|
"loss": 0.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.7056277056277056, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.949315789473684e-05, |
|
"loss": 0.0, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.70995670995671, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.8966315789473683e-05, |
|
"loss": 0.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.8439473684210522e-05, |
|
"loss": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.7186147186147186, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.791263157894737e-05, |
|
"loss": 0.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.7229437229437229, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.738578947368421e-05, |
|
"loss": 0.0, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.685894736842105e-05, |
|
"loss": 0.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.7316017316017316, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.6332105263157893e-05, |
|
"loss": 0.0, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.7359307359307359, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5805263157894735e-05, |
|
"loss": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7402597402597403, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5278421052631578e-05, |
|
"loss": 0.0, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.7445887445887446, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4751578947368419e-05, |
|
"loss": 0.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.7489177489177489, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4224736842105262e-05, |
|
"loss": 0.0, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.7532467532467533, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3697894736842106e-05, |
|
"loss": 0.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.7575757575757576, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3171052631578945e-05, |
|
"loss": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7619047619047619, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.264421052631579e-05, |
|
"loss": 0.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.7662337662337663, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.211736842105263e-05, |
|
"loss": 0.0, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.7705627705627706, |
|
"grad_norm": 0.4464944899082184, |
|
"learning_rate": 1.1590526315789473e-05, |
|
"loss": 0.7245, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.7748917748917749, |
|
"grad_norm": 0.42561060190200806, |
|
"learning_rate": 1.1063684210526316e-05, |
|
"loss": 0.7586, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.7792207792207793, |
|
"grad_norm": 0.4319334626197815, |
|
"learning_rate": 1.0536842105263156e-05, |
|
"loss": 0.7706, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7835497835497836, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.001e-05, |
|
"loss": 0.0, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.7878787878787878, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.483157894736842e-06, |
|
"loss": 0.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.7922077922077922, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.956315789473684e-06, |
|
"loss": 0.0, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.7965367965367965, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.429473684210525e-06, |
|
"loss": 0.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.8008658008658008, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.902631578947368e-06, |
|
"loss": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.8051948051948052, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.3757894736842095e-06, |
|
"loss": 0.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.8095238095238095, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.848947368421053e-06, |
|
"loss": 0.0, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.8138528138528138, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.322105263157895e-06, |
|
"loss": 0.0, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.8181818181818182, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.7952631578947365e-06, |
|
"loss": 0.0, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.8225108225108225, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.268421052631578e-06, |
|
"loss": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8268398268398268, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.741578947368421e-06, |
|
"loss": 0.0, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.8311688311688312, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.2147368421052626e-06, |
|
"loss": 0.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.8354978354978355, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.6878947368421047e-06, |
|
"loss": 0.0, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.8398268398268398, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.1610526315789474e-06, |
|
"loss": 0.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.8441558441558441, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.634210526315789e-06, |
|
"loss": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.8484848484848485, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1073684210526313e-06, |
|
"loss": 0.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.8528138528138528, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5805263157894737e-06, |
|
"loss": 0.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0536842105263156e-06, |
|
"loss": 0.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.8614718614718615, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.268421052631578e-07, |
|
"loss": 0.0, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.8658008658008658, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8658008658008658, |
|
"eval_loss": NaN, |
|
"eval_runtime": 8.7972, |
|
"eval_samples_per_second": 11.026, |
|
"eval_steps_per_second": 2.842, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 3 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0444398149605786e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|