{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 200,
  "global_step": 254,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007874015748031496,
      "grad_norm": 0.048636828926147534,
      "learning_rate": 3.846153846153847e-06,
      "loss": 0.1544,
      "step": 1
    },
    {
      "epoch": 0.015748031496062992,
      "grad_norm": 0.06829153340487745,
      "learning_rate": 7.692307692307694e-06,
      "loss": 0.2093,
      "step": 2
    },
    {
      "epoch": 0.023622047244094488,
      "grad_norm": 0.14998917767699588,
      "learning_rate": 1.153846153846154e-05,
      "loss": 0.3021,
      "step": 3
    },
    {
      "epoch": 0.031496062992125984,
      "grad_norm": 0.05894881793284517,
      "learning_rate": 1.5384615384615387e-05,
      "loss": 0.1696,
      "step": 4
    },
    {
      "epoch": 0.03937007874015748,
      "grad_norm": 0.06797578046068338,
      "learning_rate": 1.923076923076923e-05,
      "loss": 0.1865,
      "step": 5
    },
    {
      "epoch": 0.047244094488188976,
      "grad_norm": 0.04989501389059178,
      "learning_rate": 2.307692307692308e-05,
      "loss": 0.1527,
      "step": 6
    },
    {
      "epoch": 0.05511811023622047,
      "grad_norm": 0.13344186213137674,
      "learning_rate": 2.6923076923076923e-05,
      "loss": 0.2722,
      "step": 7
    },
    {
      "epoch": 0.06299212598425197,
      "grad_norm": 0.10816772928150016,
      "learning_rate": 3.0769230769230774e-05,
      "loss": 0.243,
      "step": 8
    },
    {
      "epoch": 0.07086614173228346,
      "grad_norm": 0.04182475098336472,
      "learning_rate": 3.461538461538462e-05,
      "loss": 0.1224,
      "step": 9
    },
    {
      "epoch": 0.07874015748031496,
      "grad_norm": 0.1554026910734981,
      "learning_rate": 3.846153846153846e-05,
      "loss": 0.2677,
      "step": 10
    },
    {
      "epoch": 0.08661417322834646,
      "grad_norm": 0.15617841077219685,
      "learning_rate": 4.230769230769231e-05,
      "loss": 0.2695,
      "step": 11
    },
    {
      "epoch": 0.09448818897637795,
      "grad_norm": 0.07385844760092473,
      "learning_rate": 4.615384615384616e-05,
      "loss": 0.1463,
      "step": 12
    },
    {
      "epoch": 0.10236220472440945,
      "grad_norm": 0.16533108037457117,
      "learning_rate": 5e-05,
      "loss": 0.2453,
      "step": 13
    },
    {
      "epoch": 0.11023622047244094,
      "grad_norm": 0.25293589244560055,
      "learning_rate": 5.384615384615385e-05,
      "loss": 0.2824,
      "step": 14
    },
    {
      "epoch": 0.11811023622047244,
      "grad_norm": 0.08466230707261538,
      "learning_rate": 5.769230769230769e-05,
      "loss": 0.133,
      "step": 15
    },
    {
      "epoch": 0.12598425196850394,
      "grad_norm": 0.1754852847424689,
      "learning_rate": 6.153846153846155e-05,
      "loss": 0.2315,
      "step": 16
    },
    {
      "epoch": 0.13385826771653545,
      "grad_norm": 0.11115487227175415,
      "learning_rate": 6.538461538461539e-05,
      "loss": 0.1498,
      "step": 17
    },
    {
      "epoch": 0.14173228346456693,
      "grad_norm": 0.11637176629211,
      "learning_rate": 6.923076923076924e-05,
      "loss": 0.1766,
      "step": 18
    },
    {
      "epoch": 0.14960629921259844,
      "grad_norm": 0.08400494585959933,
      "learning_rate": 7.307692307692307e-05,
      "loss": 0.1329,
      "step": 19
    },
    {
      "epoch": 0.15748031496062992,
      "grad_norm": 0.06131863575653607,
      "learning_rate": 7.692307692307693e-05,
      "loss": 0.1113,
      "step": 20
    },
    {
      "epoch": 0.16535433070866143,
      "grad_norm": 0.07242179489824115,
      "learning_rate": 8.076923076923078e-05,
      "loss": 0.1365,
      "step": 21
    },
    {
      "epoch": 0.1732283464566929,
      "grad_norm": 0.09210836600277003,
      "learning_rate": 8.461538461538461e-05,
      "loss": 0.1275,
      "step": 22
    },
    {
      "epoch": 0.18110236220472442,
      "grad_norm": 0.08327248894343327,
      "learning_rate": 8.846153846153847e-05,
      "loss": 0.1333,
      "step": 23
    },
    {
      "epoch": 0.1889763779527559,
      "grad_norm": 0.12488058178149539,
      "learning_rate": 9.230769230769232e-05,
      "loss": 0.1552,
      "step": 24
    },
    {
      "epoch": 0.1968503937007874,
      "grad_norm": 0.0715803184402007,
      "learning_rate": 9.615384615384617e-05,
      "loss": 0.1005,
      "step": 25
    },
    {
      "epoch": 0.2047244094488189,
      "grad_norm": 0.08165576376897732,
      "learning_rate": 0.0001,
      "loss": 0.129,
      "step": 26
    },
    {
      "epoch": 0.2125984251968504,
      "grad_norm": 0.06777023964931363,
      "learning_rate": 9.999525361252996e-05,
      "loss": 0.0818,
      "step": 27
    },
    {
      "epoch": 0.2204724409448819,
      "grad_norm": 0.08188897028908457,
      "learning_rate": 9.998101535124758e-05,
      "loss": 0.1067,
      "step": 28
    },
    {
      "epoch": 0.2283464566929134,
      "grad_norm": 0.05955056508032511,
      "learning_rate": 9.995728791936504e-05,
      "loss": 0.0781,
      "step": 29
    },
    {
      "epoch": 0.23622047244094488,
      "grad_norm": 0.10052278242607535,
      "learning_rate": 9.992407582166581e-05,
      "loss": 0.1132,
      "step": 30
    },
    {
      "epoch": 0.2440944881889764,
      "grad_norm": 0.07776039997533422,
      "learning_rate": 9.988138536364922e-05,
      "loss": 0.0974,
      "step": 31
    },
    {
      "epoch": 0.25196850393700787,
      "grad_norm": 0.08388901010421355,
      "learning_rate": 9.98292246503335e-05,
      "loss": 0.1009,
      "step": 32
    },
    {
      "epoch": 0.25984251968503935,
      "grad_norm": 0.06560022866292156,
      "learning_rate": 9.976760358471686e-05,
      "loss": 0.1105,
      "step": 33
    },
    {
      "epoch": 0.2677165354330709,
      "grad_norm": 0.05590070356365914,
      "learning_rate": 9.969653386589748e-05,
      "loss": 0.1103,
      "step": 34
    },
    {
      "epoch": 0.2755905511811024,
      "grad_norm": 0.04905786983982704,
      "learning_rate": 9.961602898685226e-05,
      "loss": 0.0881,
      "step": 35
    },
    {
      "epoch": 0.28346456692913385,
      "grad_norm": 0.05490757430787913,
      "learning_rate": 9.952610423187516e-05,
      "loss": 0.0918,
      "step": 36
    },
    {
      "epoch": 0.29133858267716534,
      "grad_norm": 0.043950941798686236,
      "learning_rate": 9.942677667367541e-05,
      "loss": 0.0881,
      "step": 37
    },
    {
      "epoch": 0.2992125984251969,
      "grad_norm": 0.05071542251542392,
      "learning_rate": 9.931806517013612e-05,
      "loss": 0.0729,
      "step": 38
    },
    {
      "epoch": 0.30708661417322836,
      "grad_norm": 0.040398473430997464,
      "learning_rate": 9.9199990360734e-05,
      "loss": 0.0793,
      "step": 39
    },
    {
      "epoch": 0.31496062992125984,
      "grad_norm": 0.037634938852860234,
      "learning_rate": 9.90725746626209e-05,
      "loss": 0.0776,
      "step": 40
    },
    {
      "epoch": 0.3228346456692913,
      "grad_norm": 0.0479244660692272,
      "learning_rate": 9.893584226636772e-05,
      "loss": 0.0801,
      "step": 41
    },
    {
      "epoch": 0.33070866141732286,
      "grad_norm": 0.05346016466242833,
      "learning_rate": 9.878981913137179e-05,
      "loss": 0.0817,
      "step": 42
    },
    {
      "epoch": 0.33858267716535434,
      "grad_norm": 0.07429962714762295,
      "learning_rate": 9.86345329809282e-05,
      "loss": 0.0942,
      "step": 43
    },
    {
      "epoch": 0.3464566929133858,
      "grad_norm": 0.036250514133953575,
      "learning_rate": 9.847001329696653e-05,
      "loss": 0.0761,
      "step": 44
    },
    {
      "epoch": 0.3543307086614173,
      "grad_norm": 0.03881655826068151,
      "learning_rate": 9.829629131445342e-05,
      "loss": 0.0751,
      "step": 45
    },
    {
      "epoch": 0.36220472440944884,
      "grad_norm": 0.040273401041872256,
      "learning_rate": 9.811340001546251e-05,
      "loss": 0.0912,
      "step": 46
    },
    {
      "epoch": 0.3700787401574803,
      "grad_norm": 0.05203429733895773,
      "learning_rate": 9.792137412291265e-05,
      "loss": 0.0894,
      "step": 47
    },
    {
      "epoch": 0.3779527559055118,
      "grad_norm": 0.04242863105236421,
      "learning_rate": 9.772025009397537e-05,
      "loss": 0.0804,
      "step": 48
    },
    {
      "epoch": 0.3858267716535433,
      "grad_norm": 0.040515570278649306,
      "learning_rate": 9.751006611315356e-05,
      "loss": 0.0717,
      "step": 49
    },
    {
      "epoch": 0.3937007874015748,
      "grad_norm": 0.04280194987120334,
      "learning_rate": 9.729086208503174e-05,
      "loss": 0.0713,
      "step": 50
    },
    {
      "epoch": 0.4015748031496063,
      "grad_norm": 0.07612965297736185,
      "learning_rate": 9.706267962669998e-05,
      "loss": 0.0935,
      "step": 51
    },
    {
      "epoch": 0.4094488188976378,
      "grad_norm": 0.046501972727124995,
      "learning_rate": 9.682556205985274e-05,
      "loss": 0.0558,
      "step": 52
    },
    {
      "epoch": 0.41732283464566927,
      "grad_norm": 0.039381255305059475,
      "learning_rate": 9.657955440256395e-05,
      "loss": 0.0741,
      "step": 53
    },
    {
      "epoch": 0.4251968503937008,
      "grad_norm": 0.040869656588052625,
      "learning_rate": 9.632470336074009e-05,
      "loss": 0.0693,
      "step": 54
    },
    {
      "epoch": 0.4330708661417323,
      "grad_norm": 0.034972215143239324,
      "learning_rate": 9.606105731925283e-05,
      "loss": 0.0654,
      "step": 55
    },
    {
      "epoch": 0.4409448818897638,
      "grad_norm": 0.04651590524829242,
      "learning_rate": 9.578866633275288e-05,
      "loss": 0.0895,
      "step": 56
    },
    {
      "epoch": 0.44881889763779526,
      "grad_norm": 0.03790710812187682,
      "learning_rate": 9.550758211616684e-05,
      "loss": 0.0587,
      "step": 57
    },
    {
      "epoch": 0.4566929133858268,
      "grad_norm": 0.06064386306880854,
      "learning_rate": 9.521785803487889e-05,
      "loss": 0.0704,
      "step": 58
    },
    {
      "epoch": 0.4645669291338583,
      "grad_norm": 0.04665702311958301,
      "learning_rate": 9.491954909459895e-05,
      "loss": 0.0785,
      "step": 59
    },
    {
      "epoch": 0.47244094488188976,
      "grad_norm": 0.03845050550687295,
      "learning_rate": 9.46127119309197e-05,
      "loss": 0.0707,
      "step": 60
    },
    {
      "epoch": 0.48031496062992124,
      "grad_norm": 0.06770425704069039,
      "learning_rate": 9.42974047985639e-05,
      "loss": 0.0989,
      "step": 61
    },
    {
      "epoch": 0.4881889763779528,
      "grad_norm": 0.0493379370118964,
      "learning_rate": 9.397368756032445e-05,
      "loss": 0.0773,
      "step": 62
    },
    {
      "epoch": 0.49606299212598426,
      "grad_norm": 0.05424018637256655,
      "learning_rate": 9.364162167569907e-05,
      "loss": 0.0862,
      "step": 63
    },
    {
      "epoch": 0.5039370078740157,
      "grad_norm": 0.035888638018077,
      "learning_rate": 9.330127018922194e-05,
      "loss": 0.0696,
      "step": 64
    },
    {
      "epoch": 0.5118110236220472,
      "grad_norm": 0.05419778194628531,
      "learning_rate": 9.295269771849427e-05,
      "loss": 0.0655,
      "step": 65
    },
    {
      "epoch": 0.5196850393700787,
      "grad_norm": 0.042949223060275404,
      "learning_rate": 9.259597044191636e-05,
      "loss": 0.0607,
      "step": 66
    },
    {
      "epoch": 0.5275590551181102,
      "grad_norm": 0.04264699828582032,
      "learning_rate": 9.223115608612325e-05,
      "loss": 0.0647,
      "step": 67
    },
    {
      "epoch": 0.5354330708661418,
      "grad_norm": 0.04465981374861148,
      "learning_rate": 9.185832391312644e-05,
      "loss": 0.0721,
      "step": 68
    },
    {
      "epoch": 0.5433070866141733,
      "grad_norm": 0.15086641781783566,
      "learning_rate": 9.147754470716408e-05,
      "loss": 0.0588,
      "step": 69
    },
    {
      "epoch": 0.5511811023622047,
      "grad_norm": 0.03608693394469442,
      "learning_rate": 9.108889076126226e-05,
      "loss": 0.0598,
      "step": 70
    },
    {
      "epoch": 0.5590551181102362,
      "grad_norm": 0.04512147014513813,
      "learning_rate": 9.069243586350975e-05,
      "loss": 0.0683,
      "step": 71
    },
    {
      "epoch": 0.5669291338582677,
      "grad_norm": 0.034531651539235535,
      "learning_rate": 9.028825528304892e-05,
      "loss": 0.0534,
      "step": 72
    },
    {
      "epoch": 0.5748031496062992,
      "grad_norm": 0.04001741026747657,
      "learning_rate": 8.987642575578545e-05,
      "loss": 0.0679,
      "step": 73
    },
    {
      "epoch": 0.5826771653543307,
      "grad_norm": 0.0627167723825249,
      "learning_rate": 8.945702546981969e-05,
      "loss": 0.0843,
      "step": 74
    },
    {
      "epoch": 0.5905511811023622,
      "grad_norm": 0.041162273807440045,
      "learning_rate": 8.903013405060211e-05,
      "loss": 0.0769,
      "step": 75
    },
    {
      "epoch": 0.5984251968503937,
      "grad_norm": 0.06413471441605206,
      "learning_rate": 8.859583254581605e-05,
      "loss": 0.079,
      "step": 76
    },
    {
      "epoch": 0.6062992125984252,
      "grad_norm": 0.038139347016296524,
      "learning_rate": 8.815420340999033e-05,
      "loss": 0.061,
      "step": 77
    },
    {
      "epoch": 0.6141732283464567,
      "grad_norm": 0.05637898761021731,
      "learning_rate": 8.770533048884482e-05,
      "loss": 0.0619,
      "step": 78
    },
    {
      "epoch": 0.6220472440944882,
      "grad_norm": 0.0357824590275947,
      "learning_rate": 8.724929900337186e-05,
      "loss": 0.0584,
      "step": 79
    },
    {
      "epoch": 0.6299212598425197,
      "grad_norm": 0.051212360329821134,
      "learning_rate": 8.678619553365659e-05,
      "loss": 0.0839,
      "step": 80
    },
    {
      "epoch": 0.6377952755905512,
      "grad_norm": 0.05049167558999489,
      "learning_rate": 8.631610800243926e-05,
      "loss": 0.0589,
      "step": 81
    },
    {
      "epoch": 0.6456692913385826,
      "grad_norm": 0.03961922655074931,
      "learning_rate": 8.583912565842257e-05,
      "loss": 0.0657,
      "step": 82
    },
    {
      "epoch": 0.6535433070866141,
      "grad_norm": 0.04037566238898647,
      "learning_rate": 8.535533905932738e-05,
      "loss": 0.0747,
      "step": 83
    },
    {
      "epoch": 0.6614173228346457,
      "grad_norm": 0.039919573702817145,
      "learning_rate": 8.486484005469977e-05,
      "loss": 0.0719,
      "step": 84
    },
    {
      "epoch": 0.6692913385826772,
      "grad_norm": 0.033793800349021845,
      "learning_rate": 8.436772176847294e-05,
      "loss": 0.0582,
      "step": 85
    },
    {
      "epoch": 0.6771653543307087,
      "grad_norm": 0.033700515363253886,
      "learning_rate": 8.386407858128706e-05,
      "loss": 0.0669,
      "step": 86
    },
    {
      "epoch": 0.6850393700787402,
      "grad_norm": 0.040605736837914866,
      "learning_rate": 8.335400611257067e-05,
      "loss": 0.0652,
      "step": 87
    },
    {
      "epoch": 0.6929133858267716,
      "grad_norm": 0.054522693875205565,
      "learning_rate": 8.283760120238672e-05,
      "loss": 0.0717,
      "step": 88
    },
    {
      "epoch": 0.7007874015748031,
      "grad_norm": 0.03702465616892497,
      "learning_rate": 8.231496189304704e-05,
      "loss": 0.0731,
      "step": 89
    },
    {
      "epoch": 0.7086614173228346,
      "grad_norm": 0.03285710986831703,
      "learning_rate": 8.178618741049842e-05,
      "loss": 0.0668,
      "step": 90
    },
    {
      "epoch": 0.7165354330708661,
      "grad_norm": 0.053063329951814404,
      "learning_rate": 8.125137814548393e-05,
      "loss": 0.067,
      "step": 91
    },
    {
      "epoch": 0.7244094488188977,
      "grad_norm": 0.040180722630613536,
      "learning_rate": 8.07106356344834e-05,
      "loss": 0.0698,
      "step": 92
    },
    {
      "epoch": 0.7322834645669292,
      "grad_norm": 0.0416900575795704,
      "learning_rate": 8.016406254043595e-05,
      "loss": 0.0725,
      "step": 93
    },
    {
      "epoch": 0.7401574803149606,
      "grad_norm": 0.03869655273152511,
      "learning_rate": 7.961176263324901e-05,
      "loss": 0.0659,
      "step": 94
    },
    {
      "epoch": 0.7480314960629921,
      "grad_norm": 0.044825068618721646,
      "learning_rate": 7.905384077009693e-05,
      "loss": 0.0731,
      "step": 95
    },
    {
      "epoch": 0.7559055118110236,
      "grad_norm": 0.044483649059968176,
      "learning_rate": 7.849040287551331e-05,
      "loss": 0.0634,
      "step": 96
    },
    {
      "epoch": 0.7637795275590551,
      "grad_norm": 0.04017053730657027,
      "learning_rate": 7.79215559212807e-05,
      "loss": 0.078,
      "step": 97
    },
    {
      "epoch": 0.7716535433070866,
      "grad_norm": 0.04145460320890759,
      "learning_rate": 7.734740790612136e-05,
      "loss": 0.0768,
      "step": 98
    },
    {
      "epoch": 0.7795275590551181,
      "grad_norm": 0.03633052402965569,
      "learning_rate": 7.676806783519304e-05,
      "loss": 0.0664,
      "step": 99
    },
    {
      "epoch": 0.7874015748031497,
      "grad_norm": 0.04021689685346706,
      "learning_rate": 7.618364569939391e-05,
      "loss": 0.0674,
      "step": 100
    },
    {
      "epoch": 0.7952755905511811,
      "grad_norm": 0.060396460565417545,
      "learning_rate": 7.559425245448006e-05,
      "loss": 0.0694,
      "step": 101
    },
    {
      "epoch": 0.8031496062992126,
      "grad_norm": 0.03969555967865163,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.0559,
      "step": 102
    },
    {
      "epoch": 0.8110236220472441,
      "grad_norm": 0.038622088305060684,
      "learning_rate": 7.440100115804991e-05,
      "loss": 0.0678,
      "step": 103
    },
    {
      "epoch": 0.8188976377952756,
      "grad_norm": 0.0408599543219439,
      "learning_rate": 7.379736965185368e-05,
      "loss": 0.0596,
      "step": 104
    },
    {
      "epoch": 0.8267716535433071,
      "grad_norm": 0.03612580723785473,
      "learning_rate": 7.318922008417203e-05,
      "loss": 0.0613,
      "step": 105
    },
    {
      "epoch": 0.8346456692913385,
      "grad_norm": 0.051121513111016405,
      "learning_rate": 7.257666791554448e-05,
      "loss": 0.0681,
      "step": 106
    },
    {
      "epoch": 0.84251968503937,
      "grad_norm": 0.053947523356233006,
      "learning_rate": 7.195982944236851e-05,
      "loss": 0.0642,
      "step": 107
    },
    {
      "epoch": 0.8503937007874016,
      "grad_norm": 0.04708378321870759,
      "learning_rate": 7.133882177482019e-05,
      "loss": 0.0714,
      "step": 108
    },
    {
      "epoch": 0.8582677165354331,
      "grad_norm": 0.05157517691265944,
      "learning_rate": 7.071376281461994e-05,
      "loss": 0.0679,
      "step": 109
    },
    {
      "epoch": 0.8661417322834646,
      "grad_norm": 0.043251187130909544,
      "learning_rate": 7.008477123264848e-05,
      "loss": 0.0638,
      "step": 110
    },
    {
      "epoch": 0.8740157480314961,
      "grad_norm": 0.03498275614195765,
      "learning_rate": 6.94519664464163e-05,
      "loss": 0.0545,
      "step": 111
    },
    {
      "epoch": 0.8818897637795275,
      "grad_norm": 0.04509157725839411,
      "learning_rate": 6.881546859739179e-05,
      "loss": 0.072,
      "step": 112
    },
    {
      "epoch": 0.889763779527559,
      "grad_norm": 0.03949179802033183,
      "learning_rate": 6.817539852819149e-05,
      "loss": 0.0679,
      "step": 113
    },
    {
      "epoch": 0.8976377952755905,
      "grad_norm": 0.04824956524429339,
      "learning_rate": 6.753187775963773e-05,
      "loss": 0.0602,
      "step": 114
    },
    {
      "epoch": 0.905511811023622,
      "grad_norm": 0.05052923605617683,
      "learning_rate": 6.688502846768696e-05,
      "loss": 0.07,
      "step": 115
    },
    {
      "epoch": 0.9133858267716536,
      "grad_norm": 0.049650560302814,
      "learning_rate": 6.623497346023418e-05,
      "loss": 0.0588,
      "step": 116
    },
    {
      "epoch": 0.9212598425196851,
      "grad_norm": 0.04693328297700466,
      "learning_rate": 6.558183615379707e-05,
      "loss": 0.0848,
      "step": 117
    },
    {
      "epoch": 0.9291338582677166,
      "grad_norm": 0.05162117635179258,
      "learning_rate": 6.492574055008473e-05,
      "loss": 0.0779,
      "step": 118
    },
    {
      "epoch": 0.937007874015748,
      "grad_norm": 0.04642140808574063,
      "learning_rate": 6.426681121245527e-05,
      "loss": 0.0711,
      "step": 119
    },
    {
      "epoch": 0.9448818897637795,
      "grad_norm": 0.03821222457115445,
      "learning_rate": 6.360517324226676e-05,
      "loss": 0.0597,
      "step": 120
    },
    {
      "epoch": 0.952755905511811,
      "grad_norm": 0.07405595024512149,
      "learning_rate": 6.294095225512603e-05,
      "loss": 0.0691,
      "step": 121
    },
    {
      "epoch": 0.9606299212598425,
      "grad_norm": 0.04965701613939319,
      "learning_rate": 6.227427435703997e-05,
      "loss": 0.0697,
      "step": 122
    },
    {
      "epoch": 0.968503937007874,
      "grad_norm": 0.053755810398937655,
      "learning_rate": 6.16052661204734e-05,
      "loss": 0.0713,
      "step": 123
    },
    {
      "epoch": 0.9763779527559056,
      "grad_norm": 0.041060956975168636,
      "learning_rate": 6.09340545603188e-05,
      "loss": 0.0752,
      "step": 124
    },
    {
      "epoch": 0.984251968503937,
      "grad_norm": 0.05257209833173453,
      "learning_rate": 6.026076710978171e-05,
      "loss": 0.0749,
      "step": 125
    },
    {
      "epoch": 0.9921259842519685,
      "grad_norm": 0.04634248903143434,
      "learning_rate": 5.958553159618693e-05,
      "loss": 0.0721,
      "step": 126
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.04595655646478586,
      "learning_rate": 5.890847621670966e-05,
      "loss": 0.0643,
      "step": 127
    },
    {
      "epoch": 1.0078740157480315,
      "grad_norm": 0.0428889003832125,
      "learning_rate": 5.8229729514036705e-05,
      "loss": 0.0592,
      "step": 128
    },
    {
      "epoch": 1.015748031496063,
      "grad_norm": 0.06513209718503772,
      "learning_rate": 5.7549420351961844e-05,
      "loss": 0.0661,
      "step": 129
    },
    {
      "epoch": 1.0236220472440944,
      "grad_norm": 0.048508681912102766,
      "learning_rate": 5.686767789092041e-05,
      "loss": 0.062,
      "step": 130
    },
    {
      "epoch": 1.031496062992126,
      "grad_norm": 0.03942306111426942,
      "learning_rate": 5.618463156346739e-05,
      "loss": 0.06,
      "step": 131
    },
    {
      "epoch": 1.0393700787401574,
      "grad_norm": 0.05920893966651,
      "learning_rate": 5.550041104970397e-05,
      "loss": 0.0574,
      "step": 132
    },
    {
      "epoch": 1.047244094488189,
      "grad_norm": 0.06537752227653837,
      "learning_rate": 5.481514625265709e-05,
      "loss": 0.0789,
      "step": 133
    },
    {
      "epoch": 1.0551181102362204,
      "grad_norm": 0.052133215264540934,
      "learning_rate": 5.4128967273616625e-05,
      "loss": 0.0539,
      "step": 134
    },
    {
      "epoch": 1.0629921259842519,
      "grad_norm": 0.03961482885456455,
      "learning_rate": 5.344200438743489e-05,
      "loss": 0.0584,
      "step": 135
    },
    {
      "epoch": 1.0708661417322836,
      "grad_norm": 0.04991806278320797,
      "learning_rate": 5.2754388017793274e-05,
      "loss": 0.0597,
      "step": 136
    },
    {
      "epoch": 1.078740157480315,
      "grad_norm": 0.05656237899859479,
      "learning_rate": 5.2066248712440656e-05,
      "loss": 0.0758,
      "step": 137
    },
    {
      "epoch": 1.0866141732283465,
      "grad_norm": 0.055792787158515794,
      "learning_rate": 5.1377717118408105e-05,
      "loss": 0.0754,
      "step": 138
    },
    {
      "epoch": 1.094488188976378,
      "grad_norm": 0.058089701897722286,
      "learning_rate": 5.068892395720483e-05,
      "loss": 0.0673,
      "step": 139
    },
    {
      "epoch": 1.1023622047244095,
      "grad_norm": 0.049023325150838494,
      "learning_rate": 5e-05,
      "loss": 0.0591,
      "step": 140
    },
    {
      "epoch": 1.110236220472441,
      "grad_norm": 0.04294743238196575,
      "learning_rate": 4.9311076042795185e-05,
      "loss": 0.0573,
      "step": 141
    },
    {
      "epoch": 1.1181102362204725,
      "grad_norm": 0.04629796470945678,
      "learning_rate": 4.8622282881591906e-05,
      "loss": 0.0662,
      "step": 142
    },
    {
      "epoch": 1.125984251968504,
      "grad_norm": 0.04706455796489872,
      "learning_rate": 4.7933751287559335e-05,
      "loss": 0.0705,
      "step": 143
    },
    {
      "epoch": 1.1338582677165354,
      "grad_norm": 0.04805424749077547,
      "learning_rate": 4.7245611982206724e-05,
      "loss": 0.0617,
      "step": 144
    },
    {
      "epoch": 1.141732283464567,
      "grad_norm": 0.04537362622805077,
      "learning_rate": 4.6557995612565144e-05,
      "loss": 0.0563,
      "step": 145
    },
    {
      "epoch": 1.1496062992125984,
      "grad_norm": 0.043403479823984015,
      "learning_rate": 4.5871032726383386e-05,
      "loss": 0.0534,
      "step": 146
    },
    {
      "epoch": 1.1574803149606299,
      "grad_norm": 0.03764537967703119,
      "learning_rate": 4.518485374734292e-05,
      "loss": 0.0515,
      "step": 147
    },
    {
      "epoch": 1.1653543307086613,
      "grad_norm": 0.056823438107632934,
      "learning_rate": 4.449958895029604e-05,
      "loss": 0.0742,
      "step": 148
    },
    {
      "epoch": 1.1732283464566928,
      "grad_norm": 0.04193687232087755,
      "learning_rate": 4.381536843653262e-05,
      "loss": 0.0542,
      "step": 149
    },
    {
      "epoch": 1.1811023622047245,
      "grad_norm": 0.046536924814026416,
      "learning_rate": 4.3132322109079596e-05,
      "loss": 0.0855,
      "step": 150
    },
    {
      "epoch": 1.188976377952756,
      "grad_norm": 0.05457156199530363,
      "learning_rate": 4.2450579648038154e-05,
      "loss": 0.0553,
      "step": 151
    },
    {
      "epoch": 1.1968503937007875,
      "grad_norm": 0.04033132170380821,
      "learning_rate": 4.17702704859633e-05,
      "loss": 0.0538,
      "step": 152
    },
    {
      "epoch": 1.204724409448819,
      "grad_norm": 0.049656351067623145,
      "learning_rate": 4.109152378329036e-05,
      "loss": 0.0542,
      "step": 153
    },
    {
      "epoch": 1.2125984251968505,
      "grad_norm": 0.03998164718927446,
      "learning_rate": 4.0414468403813095e-05,
      "loss": 0.0521,
      "step": 154
    },
    {
      "epoch": 1.220472440944882,
      "grad_norm": 0.05745317845238628,
      "learning_rate": 3.973923289021829e-05,
      "loss": 0.0609,
      "step": 155
    },
    {
      "epoch": 1.2283464566929134,
      "grad_norm": 0.042323711634250194,
      "learning_rate": 3.9065945439681214e-05,
      "loss": 0.0626,
      "step": 156
    },
    {
      "epoch": 1.236220472440945,
      "grad_norm": 0.05723447684288661,
      "learning_rate": 3.839473387952662e-05,
      "loss": 0.0643,
      "step": 157
    },
    {
      "epoch": 1.2440944881889764,
      "grad_norm": 0.05609057194605591,
      "learning_rate": 3.772572564296005e-05,
      "loss": 0.0446,
      "step": 158
    },
    {
      "epoch": 1.2519685039370079,
      "grad_norm": 0.04383475827201075,
      "learning_rate": 3.705904774487396e-05,
      "loss": 0.054,
      "step": 159
    },
    {
      "epoch": 1.2598425196850394,
      "grad_norm": 0.04721149501353491,
      "learning_rate": 3.639482675773324e-05,
      "loss": 0.0768,
      "step": 160
    },
    {
      "epoch": 1.2677165354330708,
      "grad_norm": 0.044660845340955664,
      "learning_rate": 3.5733188787544745e-05,
      "loss": 0.0433,
      "step": 161
    },
    {
      "epoch": 1.2755905511811023,
      "grad_norm": 0.05604911676781777,
      "learning_rate": 3.5074259449915284e-05,
      "loss": 0.0621,
      "step": 162
    },
    {
      "epoch": 1.2834645669291338,
      "grad_norm": 0.049412392554855175,
      "learning_rate": 3.4418163846202944e-05,
      "loss": 0.067,
      "step": 163
    },
    {
      "epoch": 1.2913385826771653,
      "grad_norm": 0.05304697118137444,
      "learning_rate": 3.3765026539765834e-05,
      "loss": 0.06,
      "step": 164
    },
    {
      "epoch": 1.2992125984251968,
      "grad_norm": 0.05294702902408293,
      "learning_rate": 3.3114971532313056e-05,
      "loss": 0.067,
      "step": 165
    },
    {
      "epoch": 1.3070866141732282,
      "grad_norm": 0.041258642954892746,
      "learning_rate": 3.2468122240362284e-05,
      "loss": 0.0472,
      "step": 166
    },
    {
      "epoch": 1.3149606299212597,
      "grad_norm": 0.053885127357226115,
      "learning_rate": 3.18246014718085e-05,
      "loss": 0.0679,
      "step": 167
    },
    {
      "epoch": 1.3228346456692912,
      "grad_norm": 0.06157836754578656,
      "learning_rate": 3.118453140260823e-05,
      "loss": 0.0735,
      "step": 168
    },
    {
      "epoch": 1.330708661417323,
      "grad_norm": 0.05459458021353128,
      "learning_rate": 3.0548033553583705e-05,
      "loss": 0.0618,
      "step": 169
    },
    {
      "epoch": 1.3385826771653544,
      "grad_norm": 0.06138229722884799,
      "learning_rate": 2.991522876735154e-05,
      "loss": 0.06,
      "step": 170
    },
    {
      "epoch": 1.3464566929133859,
      "grad_norm": 0.04607977533183901,
      "learning_rate": 2.928623718538006e-05,
      "loss": 0.056,
      "step": 171
    },
    {
      "epoch": 1.3543307086614174,
      "grad_norm": 0.04457994569263475,
      "learning_rate": 2.866117822517982e-05,
      "loss": 0.0505,
      "step": 172
    },
    {
      "epoch": 1.3622047244094488,
      "grad_norm": 0.05347674759185391,
      "learning_rate": 2.804017055763149e-05,
      "loss": 0.0701,
      "step": 173
    },
    {
      "epoch": 1.3700787401574803,
      "grad_norm": 0.04603095018943048,
      "learning_rate": 2.7423332084455544e-05,
      "loss": 0.0661,
      "step": 174
    },
    {
      "epoch": 1.3779527559055118,
      "grad_norm": 0.04535204168167475,
      "learning_rate": 2.681077991582797e-05,
      "loss": 0.0638,
      "step": 175
    },
    {
      "epoch": 1.3858267716535433,
      "grad_norm": 0.044079347580669566,
      "learning_rate": 2.6202630348146324e-05,
      "loss": 0.0483,
      "step": 176
    },
    {
      "epoch": 1.3937007874015748,
      "grad_norm": 0.04787312823352356,
      "learning_rate": 2.5598998841950107e-05,
      "loss": 0.0485,
      "step": 177
    },
    {
      "epoch": 1.4015748031496063,
      "grad_norm": 0.053556935205416635,
      "learning_rate": 2.500000000000001e-05,
      "loss": 0.0598,
      "step": 178
    },
    {
      "epoch": 1.4094488188976377,
      "grad_norm": 0.048892583117746094,
      "learning_rate": 2.4405747545519963e-05,
      "loss": 0.0671,
      "step": 179
    },
    {
      "epoch": 1.4173228346456692,
      "grad_norm": 0.05573731864284115,
      "learning_rate": 2.381635430060611e-05,
      "loss": 0.0519,
      "step": 180
    },
    {
      "epoch": 1.425196850393701,
      "grad_norm": 0.04104144198286972,
      "learning_rate": 2.323193216480698e-05,
      "loss": 0.0619,
      "step": 181
    },
    {
      "epoch": 1.4330708661417324,
      "grad_norm": 0.04902777007420262,
      "learning_rate": 2.2652592093878666e-05,
      "loss": 0.0587,
      "step": 182
    },
    {
      "epoch": 1.4409448818897639,
      "grad_norm": 0.04484354432801945,
      "learning_rate": 2.207844407871929e-05,
      "loss": 0.0554,
      "step": 183
    },
    {
      "epoch": 1.4488188976377954,
      "grad_norm": 0.06590936087340947,
      "learning_rate": 2.150959712448669e-05,
      "loss": 0.0811,
      "step": 184
    },
    {
      "epoch": 1.4566929133858268,
      "grad_norm": 0.04102430626416167,
      "learning_rate": 2.094615922990309e-05,
      "loss": 0.0428,
      "step": 185
    },
    {
      "epoch": 1.4645669291338583,
      "grad_norm": 0.041794329984435635,
      "learning_rate": 2.0388237366751006e-05,
      "loss": 0.0605,
      "step": 186
    },
    {
      "epoch": 1.4724409448818898,
      "grad_norm": 0.05045021005842732,
      "learning_rate": 1.9835937459564064e-05,
      "loss": 0.0536,
      "step": 187
    },
    {
      "epoch": 1.4803149606299213,
      "grad_norm": 0.04618763538981042,
      "learning_rate": 1.928936436551661e-05,
      "loss": 0.0556,
      "step": 188
    },
    {
      "epoch": 1.4881889763779528,
      "grad_norm": 0.05106802741492108,
      "learning_rate": 1.874862185451608e-05,
      "loss": 0.0709,
      "step": 189
    },
    {
      "epoch": 1.4960629921259843,
      "grad_norm": 0.05597419656508567,
      "learning_rate": 1.821381258950161e-05,
      "loss": 0.0489,
      "step": 190
    },
    {
      "epoch": 1.5039370078740157,
      "grad_norm": 0.06222180812872232,
      "learning_rate": 1.768503810695295e-05,
      "loss": 0.0578,
      "step": 191
    },
    {
      "epoch": 1.5118110236220472,
      "grad_norm": 0.04964108845251158,
      "learning_rate": 1.7162398797613282e-05,
      "loss": 0.0555,
      "step": 192
    },
    {
      "epoch": 1.5196850393700787,
      "grad_norm": 0.05088603506896626,
      "learning_rate": 1.6645993887429345e-05,
      "loss": 0.0622,
      "step": 193
    },
    {
      "epoch": 1.5275590551181102,
      "grad_norm": 0.046495091752318314,
      "learning_rate": 1.6135921418712956e-05,
      "loss": 0.0595,
      "step": 194
    },
    {
      "epoch": 1.5354330708661417,
      "grad_norm": 0.053917463079799346,
      "learning_rate": 1.563227823152708e-05,
      "loss": 0.075,
      "step": 195
    },
    {
      "epoch": 1.5433070866141732,
      "grad_norm": 0.08260510658354386,
      "learning_rate": 1.5135159945300231e-05,
      "loss": 0.0716,
      "step": 196
    },
    {
      "epoch": 1.5511811023622046,
      "grad_norm": 0.04738267952758889,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 0.0587,
      "step": 197
    },
    {
      "epoch": 1.5590551181102361,
      "grad_norm": 0.051915775011872965,
      "learning_rate": 1.4160874341577446e-05,
      "loss": 0.0521,
      "step": 198
    },
    {
      "epoch": 1.5669291338582676,
      "grad_norm": 0.05481705129948632,
      "learning_rate": 1.368389199756075e-05,
      "loss": 0.0752,
      "step": 199
    },
    {
      "epoch": 1.574803149606299,
      "grad_norm": 0.05217737291956296,
      "learning_rate": 1.3213804466343421e-05,
      "loss": 0.0736,
      "step": 200
    },
    {
      "epoch": 1.574803149606299,
      "eval_loss": 0.06070369854569435,
      "eval_runtime": 6.4146,
      "eval_samples_per_second": 0.935,
      "eval_steps_per_second": 0.312,
      "step": 200
    },
    {
      "epoch": 1.5826771653543306,
      "grad_norm": 0.04678999221530293,
      "learning_rate": 1.275070099662815e-05,
      "loss": 0.0707,
      "step": 201
    },
    {
      "epoch": 1.590551181102362,
      "grad_norm": 0.05124530088869734,
      "learning_rate": 1.2294669511155193e-05,
      "loss": 0.0559,
      "step": 202
    },
    {
      "epoch": 1.5984251968503937,
      "grad_norm": 0.05350993198205092,
      "learning_rate": 1.1845796590009683e-05,
      "loss": 0.0768,
      "step": 203
    },
    {
      "epoch": 1.6062992125984252,
      "grad_norm": 0.05328374514051326,
      "learning_rate": 1.1404167454183957e-05,
      "loss": 0.0638,
      "step": 204
    },
    {
      "epoch": 1.6141732283464567,
      "grad_norm": 0.049069065326141866,
      "learning_rate": 1.0969865949397901e-05,
      "loss": 0.0701,
      "step": 205
    },
    {
      "epoch": 1.6220472440944882,
      "grad_norm": 0.051454451624391905,
      "learning_rate": 1.0542974530180327e-05,
      "loss": 0.0535,
      "step": 206
    },
    {
      "epoch": 1.6299212598425197,
      "grad_norm": 0.04437769885060027,
      "learning_rate": 1.012357424421455e-05,
      "loss": 0.0521,
      "step": 207
    },
    {
      "epoch": 1.6377952755905512,
      "grad_norm": 0.04825172010643271,
      "learning_rate": 9.711744716951093e-06,
      "loss": 0.0689,
      "step": 208
    },
    {
      "epoch": 1.6456692913385826,
      "grad_norm": 0.06554675030762395,
      "learning_rate": 9.307564136490254e-06,
      "loss": 0.0612,
      "step": 209
    },
    {
      "epoch": 1.6535433070866141,
      "grad_norm": 0.05866366054712351,
      "learning_rate": 8.911109238737747e-06,
      "loss": 0.0791,
      "step": 210
    },
    {
      "epoch": 1.6614173228346458,
      "grad_norm": 0.04336852944598847,
      "learning_rate": 8.522455292835934e-06,
      "loss": 0.0539,
      "step": 211
    },
    {
      "epoch": 1.6692913385826773,
      "grad_norm": 0.051947799814034856,
      "learning_rate": 8.141676086873572e-06,
      "loss": 0.0639,
      "step": 212
    },
    {
      "epoch": 1.6771653543307088,
      "grad_norm": 0.04351599998197119,
      "learning_rate": 7.768843913876756e-06,
      "loss": 0.0482,
      "step": 213
    },
    {
      "epoch": 1.6850393700787403,
      "grad_norm": 0.06480464685922364,
      "learning_rate": 7.404029558083653e-06,
      "loss": 0.0662,
      "step": 214
    },
    {
      "epoch": 1.6929133858267718,
      "grad_norm": 0.04654351665030391,
      "learning_rate": 7.047302281505736e-06,
      "loss": 0.0566,
      "step": 215
    },
    {
      "epoch": 1.7007874015748032,
      "grad_norm": 0.04652775620652711,
      "learning_rate": 6.698729810778065e-06,
      "loss": 0.0571,
      "step": 216
    },
    {
      "epoch": 1.7086614173228347,
      "grad_norm": 0.047224618696043545,
      "learning_rate": 6.3583783243009285e-06,
      "loss": 0.0592,
      "step": 217
    },
    {
      "epoch": 1.7165354330708662,
      "grad_norm": 0.053615270349897666,
      "learning_rate": 6.026312439675552e-06,
      "loss": 0.0646,
      "step": 218
    },
    {
      "epoch": 1.7244094488188977,
      "grad_norm": 0.04665853226611898,
      "learning_rate": 5.702595201436101e-06,
      "loss": 0.0574,
      "step": 219
    },
    {
      "epoch": 1.7322834645669292,
      "grad_norm": 0.05099339647037513,
      "learning_rate": 5.387288069080299e-06,
      "loss": 0.0614,
      "step": 220
    },
    {
      "epoch": 1.7401574803149606,
      "grad_norm": 0.043289965587830936,
      "learning_rate": 5.080450905401057e-06,
      "loss": 0.0468,
      "step": 221
    },
    {
      "epoch": 1.7480314960629921,
      "grad_norm": 0.0692890477383107,
      "learning_rate": 4.782141965121128e-06,
      "loss": 0.0493,
      "step": 222
    },
    {
      "epoch": 1.7559055118110236,
      "grad_norm": 0.0403442528103141,
      "learning_rate": 4.492417883833155e-06,
      "loss": 0.0476,
      "step": 223
    },
    {
      "epoch": 1.763779527559055,
      "grad_norm": 0.04191520401385937,
      "learning_rate": 4.2113336672471245e-06,
      "loss": 0.052,
      "step": 224
    },
    {
      "epoch": 1.7716535433070866,
      "grad_norm": 0.054916613323571846,
      "learning_rate": 3.9389426807471766e-06,
      "loss": 0.0653,
      "step": 225
    },
    {
      "epoch": 1.779527559055118,
      "grad_norm": 0.04750102972177334,
      "learning_rate": 3.675296639259912e-06,
      "loss": 0.0578,
      "step": 226
    },
    {
      "epoch": 1.7874015748031495,
      "grad_norm": 0.0588876838593015,
      "learning_rate": 3.420445597436056e-06,
      "loss": 0.0565,
      "step": 227
    },
    {
      "epoch": 1.795275590551181,
      "grad_norm": 0.050319040408150564,
      "learning_rate": 3.1744379401472677e-06,
      "loss": 0.0546,
      "step": 228
    },
    {
      "epoch": 1.8031496062992125,
      "grad_norm": 0.05754277450687693,
      "learning_rate": 2.9373203733000232e-06,
      "loss": 0.0686,
      "step": 229
    },
    {
      "epoch": 1.811023622047244,
      "grad_norm": 0.05586902250961728,
      "learning_rate": 2.7091379149682685e-06,
      "loss": 0.0673,
      "step": 230
    },
    {
      "epoch": 1.8188976377952755,
      "grad_norm": 0.05258380689743355,
      "learning_rate": 2.4899338868464404e-06,
      "loss": 0.0656,
      "step": 231
    },
    {
      "epoch": 1.826771653543307,
      "grad_norm": 0.0444146353656455,
      "learning_rate": 2.2797499060246253e-06,
      "loss": 0.0543,
      "step": 232
    },
    {
      "epoch": 1.8346456692913384,
      "grad_norm": 0.050066903014096574,
      "learning_rate": 2.0786258770873647e-06,
      "loss": 0.0599,
      "step": 233
    },
    {
      "epoch": 1.84251968503937,
      "grad_norm": 0.045770675687925415,
      "learning_rate": 1.8865999845374793e-06,
      "loss": 0.0599,
      "step": 234
    },
    {
      "epoch": 1.8503937007874016,
      "grad_norm": 0.05464584148563528,
      "learning_rate": 1.70370868554659e-06,
      "loss": 0.0685,
      "step": 235
    },
    {
      "epoch": 1.858267716535433,
      "grad_norm": 0.04968886797769799,
      "learning_rate": 1.5299867030334814e-06,
      "loss": 0.068,
      "step": 236
    },
    {
      "epoch": 1.8661417322834646,
      "grad_norm": 0.0552386892230046,
      "learning_rate": 1.3654670190718034e-06,
      "loss": 0.0668,
      "step": 237
    },
    {
      "epoch": 1.874015748031496,
      "grad_norm": 0.042676978639831414,
      "learning_rate": 1.210180868628219e-06,
      "loss": 0.053,
      "step": 238
    },
    {
      "epoch": 1.8818897637795275,
      "grad_norm": 0.04702355316793717,
      "learning_rate": 1.064157733632276e-06,
      "loss": 0.0513,
      "step": 239
    },
    {
      "epoch": 1.889763779527559,
      "grad_norm": 0.07317497229013392,
      "learning_rate": 9.274253373791064e-07,
      "loss": 0.0609,
      "step": 240
    },
    {
      "epoch": 1.8976377952755905,
      "grad_norm": 0.04768857996113459,
      "learning_rate": 8.000096392660029e-07,
      "loss": 0.0523,
      "step": 241
    },
    {
      "epoch": 1.905511811023622,
      "grad_norm": 0.0600627581282451,
      "learning_rate": 6.819348298638839e-07,
      "loss": 0.0756,
      "step": 242
    },
    {
      "epoch": 1.9133858267716537,
      "grad_norm": 0.048224067594923814,
      "learning_rate": 5.732233263245845e-07,
      "loss": 0.0568,
      "step": 243
    },
    {
      "epoch": 1.9212598425196852,
      "grad_norm": 0.051208440325568635,
      "learning_rate": 4.738957681248379e-07,
      "loss": 0.0576,
      "step": 244
    },
    {
      "epoch": 1.9291338582677167,
      "grad_norm": 0.05305714441239726,
      "learning_rate": 3.839710131477492e-07,
      "loss": 0.0578,
      "step": 245
    },
    {
      "epoch": 1.9370078740157481,
      "grad_norm": 0.0539897244444409,
      "learning_rate": 3.034661341025258e-07,
      "loss": 0.0586,
      "step": 246
    },
    {
      "epoch": 1.9448818897637796,
      "grad_norm": 0.04186116850231722,
      "learning_rate": 2.323964152831426e-07,
      "loss": 0.0511,
      "step": 247
    },
    {
      "epoch": 1.952755905511811,
      "grad_norm": 0.049234512985121694,
      "learning_rate": 1.7077534966650766e-07,
      "loss": 0.0597,
      "step": 248
    },
    {
      "epoch": 1.9606299212598426,
      "grad_norm": 0.0551149693713877,
      "learning_rate": 1.1861463635077785e-07,
      "loss": 0.0684,
      "step": 249
    },
    {
      "epoch": 1.968503937007874,
      "grad_norm": 0.05304684823721321,
      "learning_rate": 7.59241783341913e-08,
      "loss": 0.0682,
      "step": 250
    },
    {
      "epoch": 1.9763779527559056,
      "grad_norm": 0.07253818377949055,
      "learning_rate": 4.2712080634949024e-08,
      "loss": 0.0637,
      "step": 251
    },
    {
      "epoch": 1.984251968503937,
      "grad_norm": 0.053065918527288605,
      "learning_rate": 1.8984648752429225e-08,
      "loss": 0.0693,
      "step": 252
    },
    {
      "epoch": 1.9921259842519685,
      "grad_norm": 0.05734495408696275,
      "learning_rate": 4.746387470044855e-09,
      "loss": 0.061,
      "step": 253
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.05703286573729645,
      "learning_rate": 0.0,
      "loss": 0.0489,
      "step": 254
    },
    {
      "epoch": 2.0,
      "step": 254,
      "total_flos": 643226180517888.0,
      "train_loss": 0.07837846240131405,
      "train_runtime": 1925.3112,
      "train_samples_per_second": 0.525,
      "train_steps_per_second": 0.132
    }
  ],
  "logging_steps": 1,
  "max_steps": 254,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 643226180517888.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}