{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 254,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007874015748031496,
"grad_norm": 0.048636828926147534,
"learning_rate": 3.846153846153847e-06,
"loss": 0.1544,
"step": 1
},
{
"epoch": 0.015748031496062992,
"grad_norm": 0.06829153340487745,
"learning_rate": 7.692307692307694e-06,
"loss": 0.2093,
"step": 2
},
{
"epoch": 0.023622047244094488,
"grad_norm": 0.14998917767699588,
"learning_rate": 1.153846153846154e-05,
"loss": 0.3021,
"step": 3
},
{
"epoch": 0.031496062992125984,
"grad_norm": 0.05894881793284517,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.1696,
"step": 4
},
{
"epoch": 0.03937007874015748,
"grad_norm": 0.06797578046068338,
"learning_rate": 1.923076923076923e-05,
"loss": 0.1865,
"step": 5
},
{
"epoch": 0.047244094488188976,
"grad_norm": 0.04989501389059178,
"learning_rate": 2.307692307692308e-05,
"loss": 0.1527,
"step": 6
},
{
"epoch": 0.05511811023622047,
"grad_norm": 0.13344186213137674,
"learning_rate": 2.6923076923076923e-05,
"loss": 0.2722,
"step": 7
},
{
"epoch": 0.06299212598425197,
"grad_norm": 0.10816772928150016,
"learning_rate": 3.0769230769230774e-05,
"loss": 0.243,
"step": 8
},
{
"epoch": 0.07086614173228346,
"grad_norm": 0.04182475098336472,
"learning_rate": 3.461538461538462e-05,
"loss": 0.1224,
"step": 9
},
{
"epoch": 0.07874015748031496,
"grad_norm": 0.1554026910734981,
"learning_rate": 3.846153846153846e-05,
"loss": 0.2677,
"step": 10
},
{
"epoch": 0.08661417322834646,
"grad_norm": 0.15617841077219685,
"learning_rate": 4.230769230769231e-05,
"loss": 0.2695,
"step": 11
},
{
"epoch": 0.09448818897637795,
"grad_norm": 0.07385844760092473,
"learning_rate": 4.615384615384616e-05,
"loss": 0.1463,
"step": 12
},
{
"epoch": 0.10236220472440945,
"grad_norm": 0.16533108037457117,
"learning_rate": 5e-05,
"loss": 0.2453,
"step": 13
},
{
"epoch": 0.11023622047244094,
"grad_norm": 0.25293589244560055,
"learning_rate": 5.384615384615385e-05,
"loss": 0.2824,
"step": 14
},
{
"epoch": 0.11811023622047244,
"grad_norm": 0.08466230707261538,
"learning_rate": 5.769230769230769e-05,
"loss": 0.133,
"step": 15
},
{
"epoch": 0.12598425196850394,
"grad_norm": 0.1754852847424689,
"learning_rate": 6.153846153846155e-05,
"loss": 0.2315,
"step": 16
},
{
"epoch": 0.13385826771653545,
"grad_norm": 0.11115487227175415,
"learning_rate": 6.538461538461539e-05,
"loss": 0.1498,
"step": 17
},
{
"epoch": 0.14173228346456693,
"grad_norm": 0.11637176629211,
"learning_rate": 6.923076923076924e-05,
"loss": 0.1766,
"step": 18
},
{
"epoch": 0.14960629921259844,
"grad_norm": 0.08400494585959933,
"learning_rate": 7.307692307692307e-05,
"loss": 0.1329,
"step": 19
},
{
"epoch": 0.15748031496062992,
"grad_norm": 0.06131863575653607,
"learning_rate": 7.692307692307693e-05,
"loss": 0.1113,
"step": 20
},
{
"epoch": 0.16535433070866143,
"grad_norm": 0.07242179489824115,
"learning_rate": 8.076923076923078e-05,
"loss": 0.1365,
"step": 21
},
{
"epoch": 0.1732283464566929,
"grad_norm": 0.09210836600277003,
"learning_rate": 8.461538461538461e-05,
"loss": 0.1275,
"step": 22
},
{
"epoch": 0.18110236220472442,
"grad_norm": 0.08327248894343327,
"learning_rate": 8.846153846153847e-05,
"loss": 0.1333,
"step": 23
},
{
"epoch": 0.1889763779527559,
"grad_norm": 0.12488058178149539,
"learning_rate": 9.230769230769232e-05,
"loss": 0.1552,
"step": 24
},
{
"epoch": 0.1968503937007874,
"grad_norm": 0.0715803184402007,
"learning_rate": 9.615384615384617e-05,
"loss": 0.1005,
"step": 25
},
{
"epoch": 0.2047244094488189,
"grad_norm": 0.08165576376897732,
"learning_rate": 0.0001,
"loss": 0.129,
"step": 26
},
{
"epoch": 0.2125984251968504,
"grad_norm": 0.06777023964931363,
"learning_rate": 9.999525361252996e-05,
"loss": 0.0818,
"step": 27
},
{
"epoch": 0.2204724409448819,
"grad_norm": 0.08188897028908457,
"learning_rate": 9.998101535124758e-05,
"loss": 0.1067,
"step": 28
},
{
"epoch": 0.2283464566929134,
"grad_norm": 0.05955056508032511,
"learning_rate": 9.995728791936504e-05,
"loss": 0.0781,
"step": 29
},
{
"epoch": 0.23622047244094488,
"grad_norm": 0.10052278242607535,
"learning_rate": 9.992407582166581e-05,
"loss": 0.1132,
"step": 30
},
{
"epoch": 0.2440944881889764,
"grad_norm": 0.07776039997533422,
"learning_rate": 9.988138536364922e-05,
"loss": 0.0974,
"step": 31
},
{
"epoch": 0.25196850393700787,
"grad_norm": 0.08388901010421355,
"learning_rate": 9.98292246503335e-05,
"loss": 0.1009,
"step": 32
},
{
"epoch": 0.25984251968503935,
"grad_norm": 0.06560022866292156,
"learning_rate": 9.976760358471686e-05,
"loss": 0.1105,
"step": 33
},
{
"epoch": 0.2677165354330709,
"grad_norm": 0.05590070356365914,
"learning_rate": 9.969653386589748e-05,
"loss": 0.1103,
"step": 34
},
{
"epoch": 0.2755905511811024,
"grad_norm": 0.04905786983982704,
"learning_rate": 9.961602898685226e-05,
"loss": 0.0881,
"step": 35
},
{
"epoch": 0.28346456692913385,
"grad_norm": 0.05490757430787913,
"learning_rate": 9.952610423187516e-05,
"loss": 0.0918,
"step": 36
},
{
"epoch": 0.29133858267716534,
"grad_norm": 0.043950941798686236,
"learning_rate": 9.942677667367541e-05,
"loss": 0.0881,
"step": 37
},
{
"epoch": 0.2992125984251969,
"grad_norm": 0.05071542251542392,
"learning_rate": 9.931806517013612e-05,
"loss": 0.0729,
"step": 38
},
{
"epoch": 0.30708661417322836,
"grad_norm": 0.040398473430997464,
"learning_rate": 9.9199990360734e-05,
"loss": 0.0793,
"step": 39
},
{
"epoch": 0.31496062992125984,
"grad_norm": 0.037634938852860234,
"learning_rate": 9.90725746626209e-05,
"loss": 0.0776,
"step": 40
},
{
"epoch": 0.3228346456692913,
"grad_norm": 0.0479244660692272,
"learning_rate": 9.893584226636772e-05,
"loss": 0.0801,
"step": 41
},
{
"epoch": 0.33070866141732286,
"grad_norm": 0.05346016466242833,
"learning_rate": 9.878981913137179e-05,
"loss": 0.0817,
"step": 42
},
{
"epoch": 0.33858267716535434,
"grad_norm": 0.07429962714762295,
"learning_rate": 9.86345329809282e-05,
"loss": 0.0942,
"step": 43
},
{
"epoch": 0.3464566929133858,
"grad_norm": 0.036250514133953575,
"learning_rate": 9.847001329696653e-05,
"loss": 0.0761,
"step": 44
},
{
"epoch": 0.3543307086614173,
"grad_norm": 0.03881655826068151,
"learning_rate": 9.829629131445342e-05,
"loss": 0.0751,
"step": 45
},
{
"epoch": 0.36220472440944884,
"grad_norm": 0.040273401041872256,
"learning_rate": 9.811340001546251e-05,
"loss": 0.0912,
"step": 46
},
{
"epoch": 0.3700787401574803,
"grad_norm": 0.05203429733895773,
"learning_rate": 9.792137412291265e-05,
"loss": 0.0894,
"step": 47
},
{
"epoch": 0.3779527559055118,
"grad_norm": 0.04242863105236421,
"learning_rate": 9.772025009397537e-05,
"loss": 0.0804,
"step": 48
},
{
"epoch": 0.3858267716535433,
"grad_norm": 0.040515570278649306,
"learning_rate": 9.751006611315356e-05,
"loss": 0.0717,
"step": 49
},
{
"epoch": 0.3937007874015748,
"grad_norm": 0.04280194987120334,
"learning_rate": 9.729086208503174e-05,
"loss": 0.0713,
"step": 50
},
{
"epoch": 0.4015748031496063,
"grad_norm": 0.07612965297736185,
"learning_rate": 9.706267962669998e-05,
"loss": 0.0935,
"step": 51
},
{
"epoch": 0.4094488188976378,
"grad_norm": 0.046501972727124995,
"learning_rate": 9.682556205985274e-05,
"loss": 0.0558,
"step": 52
},
{
"epoch": 0.41732283464566927,
"grad_norm": 0.039381255305059475,
"learning_rate": 9.657955440256395e-05,
"loss": 0.0741,
"step": 53
},
{
"epoch": 0.4251968503937008,
"grad_norm": 0.040869656588052625,
"learning_rate": 9.632470336074009e-05,
"loss": 0.0693,
"step": 54
},
{
"epoch": 0.4330708661417323,
"grad_norm": 0.034972215143239324,
"learning_rate": 9.606105731925283e-05,
"loss": 0.0654,
"step": 55
},
{
"epoch": 0.4409448818897638,
"grad_norm": 0.04651590524829242,
"learning_rate": 9.578866633275288e-05,
"loss": 0.0895,
"step": 56
},
{
"epoch": 0.44881889763779526,
"grad_norm": 0.03790710812187682,
"learning_rate": 9.550758211616684e-05,
"loss": 0.0587,
"step": 57
},
{
"epoch": 0.4566929133858268,
"grad_norm": 0.06064386306880854,
"learning_rate": 9.521785803487889e-05,
"loss": 0.0704,
"step": 58
},
{
"epoch": 0.4645669291338583,
"grad_norm": 0.04665702311958301,
"learning_rate": 9.491954909459895e-05,
"loss": 0.0785,
"step": 59
},
{
"epoch": 0.47244094488188976,
"grad_norm": 0.03845050550687295,
"learning_rate": 9.46127119309197e-05,
"loss": 0.0707,
"step": 60
},
{
"epoch": 0.48031496062992124,
"grad_norm": 0.06770425704069039,
"learning_rate": 9.42974047985639e-05,
"loss": 0.0989,
"step": 61
},
{
"epoch": 0.4881889763779528,
"grad_norm": 0.0493379370118964,
"learning_rate": 9.397368756032445e-05,
"loss": 0.0773,
"step": 62
},
{
"epoch": 0.49606299212598426,
"grad_norm": 0.05424018637256655,
"learning_rate": 9.364162167569907e-05,
"loss": 0.0862,
"step": 63
},
{
"epoch": 0.5039370078740157,
"grad_norm": 0.035888638018077,
"learning_rate": 9.330127018922194e-05,
"loss": 0.0696,
"step": 64
},
{
"epoch": 0.5118110236220472,
"grad_norm": 0.05419778194628531,
"learning_rate": 9.295269771849427e-05,
"loss": 0.0655,
"step": 65
},
{
"epoch": 0.5196850393700787,
"grad_norm": 0.042949223060275404,
"learning_rate": 9.259597044191636e-05,
"loss": 0.0607,
"step": 66
},
{
"epoch": 0.5275590551181102,
"grad_norm": 0.04264699828582032,
"learning_rate": 9.223115608612325e-05,
"loss": 0.0647,
"step": 67
},
{
"epoch": 0.5354330708661418,
"grad_norm": 0.04465981374861148,
"learning_rate": 9.185832391312644e-05,
"loss": 0.0721,
"step": 68
},
{
"epoch": 0.5433070866141733,
"grad_norm": 0.15086641781783566,
"learning_rate": 9.147754470716408e-05,
"loss": 0.0588,
"step": 69
},
{
"epoch": 0.5511811023622047,
"grad_norm": 0.03608693394469442,
"learning_rate": 9.108889076126226e-05,
"loss": 0.0598,
"step": 70
},
{
"epoch": 0.5590551181102362,
"grad_norm": 0.04512147014513813,
"learning_rate": 9.069243586350975e-05,
"loss": 0.0683,
"step": 71
},
{
"epoch": 0.5669291338582677,
"grad_norm": 0.034531651539235535,
"learning_rate": 9.028825528304892e-05,
"loss": 0.0534,
"step": 72
},
{
"epoch": 0.5748031496062992,
"grad_norm": 0.04001741026747657,
"learning_rate": 8.987642575578545e-05,
"loss": 0.0679,
"step": 73
},
{
"epoch": 0.5826771653543307,
"grad_norm": 0.0627167723825249,
"learning_rate": 8.945702546981969e-05,
"loss": 0.0843,
"step": 74
},
{
"epoch": 0.5905511811023622,
"grad_norm": 0.041162273807440045,
"learning_rate": 8.903013405060211e-05,
"loss": 0.0769,
"step": 75
},
{
"epoch": 0.5984251968503937,
"grad_norm": 0.06413471441605206,
"learning_rate": 8.859583254581605e-05,
"loss": 0.079,
"step": 76
},
{
"epoch": 0.6062992125984252,
"grad_norm": 0.038139347016296524,
"learning_rate": 8.815420340999033e-05,
"loss": 0.061,
"step": 77
},
{
"epoch": 0.6141732283464567,
"grad_norm": 0.05637898761021731,
"learning_rate": 8.770533048884482e-05,
"loss": 0.0619,
"step": 78
},
{
"epoch": 0.6220472440944882,
"grad_norm": 0.0357824590275947,
"learning_rate": 8.724929900337186e-05,
"loss": 0.0584,
"step": 79
},
{
"epoch": 0.6299212598425197,
"grad_norm": 0.051212360329821134,
"learning_rate": 8.678619553365659e-05,
"loss": 0.0839,
"step": 80
},
{
"epoch": 0.6377952755905512,
"grad_norm": 0.05049167558999489,
"learning_rate": 8.631610800243926e-05,
"loss": 0.0589,
"step": 81
},
{
"epoch": 0.6456692913385826,
"grad_norm": 0.03961922655074931,
"learning_rate": 8.583912565842257e-05,
"loss": 0.0657,
"step": 82
},
{
"epoch": 0.6535433070866141,
"grad_norm": 0.04037566238898647,
"learning_rate": 8.535533905932738e-05,
"loss": 0.0747,
"step": 83
},
{
"epoch": 0.6614173228346457,
"grad_norm": 0.039919573702817145,
"learning_rate": 8.486484005469977e-05,
"loss": 0.0719,
"step": 84
},
{
"epoch": 0.6692913385826772,
"grad_norm": 0.033793800349021845,
"learning_rate": 8.436772176847294e-05,
"loss": 0.0582,
"step": 85
},
{
"epoch": 0.6771653543307087,
"grad_norm": 0.033700515363253886,
"learning_rate": 8.386407858128706e-05,
"loss": 0.0669,
"step": 86
},
{
"epoch": 0.6850393700787402,
"grad_norm": 0.040605736837914866,
"learning_rate": 8.335400611257067e-05,
"loss": 0.0652,
"step": 87
},
{
"epoch": 0.6929133858267716,
"grad_norm": 0.054522693875205565,
"learning_rate": 8.283760120238672e-05,
"loss": 0.0717,
"step": 88
},
{
"epoch": 0.7007874015748031,
"grad_norm": 0.03702465616892497,
"learning_rate": 8.231496189304704e-05,
"loss": 0.0731,
"step": 89
},
{
"epoch": 0.7086614173228346,
"grad_norm": 0.03285710986831703,
"learning_rate": 8.178618741049842e-05,
"loss": 0.0668,
"step": 90
},
{
"epoch": 0.7165354330708661,
"grad_norm": 0.053063329951814404,
"learning_rate": 8.125137814548393e-05,
"loss": 0.067,
"step": 91
},
{
"epoch": 0.7244094488188977,
"grad_norm": 0.040180722630613536,
"learning_rate": 8.07106356344834e-05,
"loss": 0.0698,
"step": 92
},
{
"epoch": 0.7322834645669292,
"grad_norm": 0.0416900575795704,
"learning_rate": 8.016406254043595e-05,
"loss": 0.0725,
"step": 93
},
{
"epoch": 0.7401574803149606,
"grad_norm": 0.03869655273152511,
"learning_rate": 7.961176263324901e-05,
"loss": 0.0659,
"step": 94
},
{
"epoch": 0.7480314960629921,
"grad_norm": 0.044825068618721646,
"learning_rate": 7.905384077009693e-05,
"loss": 0.0731,
"step": 95
},
{
"epoch": 0.7559055118110236,
"grad_norm": 0.044483649059968176,
"learning_rate": 7.849040287551331e-05,
"loss": 0.0634,
"step": 96
},
{
"epoch": 0.7637795275590551,
"grad_norm": 0.04017053730657027,
"learning_rate": 7.79215559212807e-05,
"loss": 0.078,
"step": 97
},
{
"epoch": 0.7716535433070866,
"grad_norm": 0.04145460320890759,
"learning_rate": 7.734740790612136e-05,
"loss": 0.0768,
"step": 98
},
{
"epoch": 0.7795275590551181,
"grad_norm": 0.03633052402965569,
"learning_rate": 7.676806783519304e-05,
"loss": 0.0664,
"step": 99
},
{
"epoch": 0.7874015748031497,
"grad_norm": 0.04021689685346706,
"learning_rate": 7.618364569939391e-05,
"loss": 0.0674,
"step": 100
},
{
"epoch": 0.7952755905511811,
"grad_norm": 0.060396460565417545,
"learning_rate": 7.559425245448006e-05,
"loss": 0.0694,
"step": 101
},
{
"epoch": 0.8031496062992126,
"grad_norm": 0.03969555967865163,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0559,
"step": 102
},
{
"epoch": 0.8110236220472441,
"grad_norm": 0.038622088305060684,
"learning_rate": 7.440100115804991e-05,
"loss": 0.0678,
"step": 103
},
{
"epoch": 0.8188976377952756,
"grad_norm": 0.0408599543219439,
"learning_rate": 7.379736965185368e-05,
"loss": 0.0596,
"step": 104
},
{
"epoch": 0.8267716535433071,
"grad_norm": 0.03612580723785473,
"learning_rate": 7.318922008417203e-05,
"loss": 0.0613,
"step": 105
},
{
"epoch": 0.8346456692913385,
"grad_norm": 0.051121513111016405,
"learning_rate": 7.257666791554448e-05,
"loss": 0.0681,
"step": 106
},
{
"epoch": 0.84251968503937,
"grad_norm": 0.053947523356233006,
"learning_rate": 7.195982944236851e-05,
"loss": 0.0642,
"step": 107
},
{
"epoch": 0.8503937007874016,
"grad_norm": 0.04708378321870759,
"learning_rate": 7.133882177482019e-05,
"loss": 0.0714,
"step": 108
},
{
"epoch": 0.8582677165354331,
"grad_norm": 0.05157517691265944,
"learning_rate": 7.071376281461994e-05,
"loss": 0.0679,
"step": 109
},
{
"epoch": 0.8661417322834646,
"grad_norm": 0.043251187130909544,
"learning_rate": 7.008477123264848e-05,
"loss": 0.0638,
"step": 110
},
{
"epoch": 0.8740157480314961,
"grad_norm": 0.03498275614195765,
"learning_rate": 6.94519664464163e-05,
"loss": 0.0545,
"step": 111
},
{
"epoch": 0.8818897637795275,
"grad_norm": 0.04509157725839411,
"learning_rate": 6.881546859739179e-05,
"loss": 0.072,
"step": 112
},
{
"epoch": 0.889763779527559,
"grad_norm": 0.03949179802033183,
"learning_rate": 6.817539852819149e-05,
"loss": 0.0679,
"step": 113
},
{
"epoch": 0.8976377952755905,
"grad_norm": 0.04824956524429339,
"learning_rate": 6.753187775963773e-05,
"loss": 0.0602,
"step": 114
},
{
"epoch": 0.905511811023622,
"grad_norm": 0.05052923605617683,
"learning_rate": 6.688502846768696e-05,
"loss": 0.07,
"step": 115
},
{
"epoch": 0.9133858267716536,
"grad_norm": 0.049650560302814,
"learning_rate": 6.623497346023418e-05,
"loss": 0.0588,
"step": 116
},
{
"epoch": 0.9212598425196851,
"grad_norm": 0.04693328297700466,
"learning_rate": 6.558183615379707e-05,
"loss": 0.0848,
"step": 117
},
{
"epoch": 0.9291338582677166,
"grad_norm": 0.05162117635179258,
"learning_rate": 6.492574055008473e-05,
"loss": 0.0779,
"step": 118
},
{
"epoch": 0.937007874015748,
"grad_norm": 0.04642140808574063,
"learning_rate": 6.426681121245527e-05,
"loss": 0.0711,
"step": 119
},
{
"epoch": 0.9448818897637795,
"grad_norm": 0.03821222457115445,
"learning_rate": 6.360517324226676e-05,
"loss": 0.0597,
"step": 120
},
{
"epoch": 0.952755905511811,
"grad_norm": 0.07405595024512149,
"learning_rate": 6.294095225512603e-05,
"loss": 0.0691,
"step": 121
},
{
"epoch": 0.9606299212598425,
"grad_norm": 0.04965701613939319,
"learning_rate": 6.227427435703997e-05,
"loss": 0.0697,
"step": 122
},
{
"epoch": 0.968503937007874,
"grad_norm": 0.053755810398937655,
"learning_rate": 6.16052661204734e-05,
"loss": 0.0713,
"step": 123
},
{
"epoch": 0.9763779527559056,
"grad_norm": 0.041060956975168636,
"learning_rate": 6.09340545603188e-05,
"loss": 0.0752,
"step": 124
},
{
"epoch": 0.984251968503937,
"grad_norm": 0.05257209833173453,
"learning_rate": 6.026076710978171e-05,
"loss": 0.0749,
"step": 125
},
{
"epoch": 0.9921259842519685,
"grad_norm": 0.04634248903143434,
"learning_rate": 5.958553159618693e-05,
"loss": 0.0721,
"step": 126
},
{
"epoch": 1.0,
"grad_norm": 0.04595655646478586,
"learning_rate": 5.890847621670966e-05,
"loss": 0.0643,
"step": 127
},
{
"epoch": 1.0078740157480315,
"grad_norm": 0.0428889003832125,
"learning_rate": 5.8229729514036705e-05,
"loss": 0.0592,
"step": 128
},
{
"epoch": 1.015748031496063,
"grad_norm": 0.06513209718503772,
"learning_rate": 5.7549420351961844e-05,
"loss": 0.0661,
"step": 129
},
{
"epoch": 1.0236220472440944,
"grad_norm": 0.048508681912102766,
"learning_rate": 5.686767789092041e-05,
"loss": 0.062,
"step": 130
},
{
"epoch": 1.031496062992126,
"grad_norm": 0.03942306111426942,
"learning_rate": 5.618463156346739e-05,
"loss": 0.06,
"step": 131
},
{
"epoch": 1.0393700787401574,
"grad_norm": 0.05920893966651,
"learning_rate": 5.550041104970397e-05,
"loss": 0.0574,
"step": 132
},
{
"epoch": 1.047244094488189,
"grad_norm": 0.06537752227653837,
"learning_rate": 5.481514625265709e-05,
"loss": 0.0789,
"step": 133
},
{
"epoch": 1.0551181102362204,
"grad_norm": 0.052133215264540934,
"learning_rate": 5.4128967273616625e-05,
"loss": 0.0539,
"step": 134
},
{
"epoch": 1.0629921259842519,
"grad_norm": 0.03961482885456455,
"learning_rate": 5.344200438743489e-05,
"loss": 0.0584,
"step": 135
},
{
"epoch": 1.0708661417322836,
"grad_norm": 0.04991806278320797,
"learning_rate": 5.2754388017793274e-05,
"loss": 0.0597,
"step": 136
},
{
"epoch": 1.078740157480315,
"grad_norm": 0.05656237899859479,
"learning_rate": 5.2066248712440656e-05,
"loss": 0.0758,
"step": 137
},
{
"epoch": 1.0866141732283465,
"grad_norm": 0.055792787158515794,
"learning_rate": 5.1377717118408105e-05,
"loss": 0.0754,
"step": 138
},
{
"epoch": 1.094488188976378,
"grad_norm": 0.058089701897722286,
"learning_rate": 5.068892395720483e-05,
"loss": 0.0673,
"step": 139
},
{
"epoch": 1.1023622047244095,
"grad_norm": 0.049023325150838494,
"learning_rate": 5e-05,
"loss": 0.0591,
"step": 140
},
{
"epoch": 1.110236220472441,
"grad_norm": 0.04294743238196575,
"learning_rate": 4.9311076042795185e-05,
"loss": 0.0573,
"step": 141
},
{
"epoch": 1.1181102362204725,
"grad_norm": 0.04629796470945678,
"learning_rate": 4.8622282881591906e-05,
"loss": 0.0662,
"step": 142
},
{
"epoch": 1.125984251968504,
"grad_norm": 0.04706455796489872,
"learning_rate": 4.7933751287559335e-05,
"loss": 0.0705,
"step": 143
},
{
"epoch": 1.1338582677165354,
"grad_norm": 0.04805424749077547,
"learning_rate": 4.7245611982206724e-05,
"loss": 0.0617,
"step": 144
},
{
"epoch": 1.141732283464567,
"grad_norm": 0.04537362622805077,
"learning_rate": 4.6557995612565144e-05,
"loss": 0.0563,
"step": 145
},
{
"epoch": 1.1496062992125984,
"grad_norm": 0.043403479823984015,
"learning_rate": 4.5871032726383386e-05,
"loss": 0.0534,
"step": 146
},
{
"epoch": 1.1574803149606299,
"grad_norm": 0.03764537967703119,
"learning_rate": 4.518485374734292e-05,
"loss": 0.0515,
"step": 147
},
{
"epoch": 1.1653543307086613,
"grad_norm": 0.056823438107632934,
"learning_rate": 4.449958895029604e-05,
"loss": 0.0742,
"step": 148
},
{
"epoch": 1.1732283464566928,
"grad_norm": 0.04193687232087755,
"learning_rate": 4.381536843653262e-05,
"loss": 0.0542,
"step": 149
},
{
"epoch": 1.1811023622047245,
"grad_norm": 0.046536924814026416,
"learning_rate": 4.3132322109079596e-05,
"loss": 0.0855,
"step": 150
},
{
"epoch": 1.188976377952756,
"grad_norm": 0.05457156199530363,
"learning_rate": 4.2450579648038154e-05,
"loss": 0.0553,
"step": 151
},
{
"epoch": 1.1968503937007875,
"grad_norm": 0.04033132170380821,
"learning_rate": 4.17702704859633e-05,
"loss": 0.0538,
"step": 152
},
{
"epoch": 1.204724409448819,
"grad_norm": 0.049656351067623145,
"learning_rate": 4.109152378329036e-05,
"loss": 0.0542,
"step": 153
},
{
"epoch": 1.2125984251968505,
"grad_norm": 0.03998164718927446,
"learning_rate": 4.0414468403813095e-05,
"loss": 0.0521,
"step": 154
},
{
"epoch": 1.220472440944882,
"grad_norm": 0.05745317845238628,
"learning_rate": 3.973923289021829e-05,
"loss": 0.0609,
"step": 155
},
{
"epoch": 1.2283464566929134,
"grad_norm": 0.042323711634250194,
"learning_rate": 3.9065945439681214e-05,
"loss": 0.0626,
"step": 156
},
{
"epoch": 1.236220472440945,
"grad_norm": 0.05723447684288661,
"learning_rate": 3.839473387952662e-05,
"loss": 0.0643,
"step": 157
},
{
"epoch": 1.2440944881889764,
"grad_norm": 0.05609057194605591,
"learning_rate": 3.772572564296005e-05,
"loss": 0.0446,
"step": 158
},
{
"epoch": 1.2519685039370079,
"grad_norm": 0.04383475827201075,
"learning_rate": 3.705904774487396e-05,
"loss": 0.054,
"step": 159
},
{
"epoch": 1.2598425196850394,
"grad_norm": 0.04721149501353491,
"learning_rate": 3.639482675773324e-05,
"loss": 0.0768,
"step": 160
},
{
"epoch": 1.2677165354330708,
"grad_norm": 0.044660845340955664,
"learning_rate": 3.5733188787544745e-05,
"loss": 0.0433,
"step": 161
},
{
"epoch": 1.2755905511811023,
"grad_norm": 0.05604911676781777,
"learning_rate": 3.5074259449915284e-05,
"loss": 0.0621,
"step": 162
},
{
"epoch": 1.2834645669291338,
"grad_norm": 0.049412392554855175,
"learning_rate": 3.4418163846202944e-05,
"loss": 0.067,
"step": 163
},
{
"epoch": 1.2913385826771653,
"grad_norm": 0.05304697118137444,
"learning_rate": 3.3765026539765834e-05,
"loss": 0.06,
"step": 164
},
{
"epoch": 1.2992125984251968,
"grad_norm": 0.05294702902408293,
"learning_rate": 3.3114971532313056e-05,
"loss": 0.067,
"step": 165
},
{
"epoch": 1.3070866141732282,
"grad_norm": 0.041258642954892746,
"learning_rate": 3.2468122240362284e-05,
"loss": 0.0472,
"step": 166
},
{
"epoch": 1.3149606299212597,
"grad_norm": 0.053885127357226115,
"learning_rate": 3.18246014718085e-05,
"loss": 0.0679,
"step": 167
},
{
"epoch": 1.3228346456692912,
"grad_norm": 0.06157836754578656,
"learning_rate": 3.118453140260823e-05,
"loss": 0.0735,
"step": 168
},
{
"epoch": 1.330708661417323,
"grad_norm": 0.05459458021353128,
"learning_rate": 3.0548033553583705e-05,
"loss": 0.0618,
"step": 169
},
{
"epoch": 1.3385826771653544,
"grad_norm": 0.06138229722884799,
"learning_rate": 2.991522876735154e-05,
"loss": 0.06,
"step": 170
},
{
"epoch": 1.3464566929133859,
"grad_norm": 0.04607977533183901,
"learning_rate": 2.928623718538006e-05,
"loss": 0.056,
"step": 171
},
{
"epoch": 1.3543307086614174,
"grad_norm": 0.04457994569263475,
"learning_rate": 2.866117822517982e-05,
"loss": 0.0505,
"step": 172
},
{
"epoch": 1.3622047244094488,
"grad_norm": 0.05347674759185391,
"learning_rate": 2.804017055763149e-05,
"loss": 0.0701,
"step": 173
},
{
"epoch": 1.3700787401574803,
"grad_norm": 0.04603095018943048,
"learning_rate": 2.7423332084455544e-05,
"loss": 0.0661,
"step": 174
},
{
"epoch": 1.3779527559055118,
"grad_norm": 0.04535204168167475,
"learning_rate": 2.681077991582797e-05,
"loss": 0.0638,
"step": 175
},
{
"epoch": 1.3858267716535433,
"grad_norm": 0.044079347580669566,
"learning_rate": 2.6202630348146324e-05,
"loss": 0.0483,
"step": 176
},
{
"epoch": 1.3937007874015748,
"grad_norm": 0.04787312823352356,
"learning_rate": 2.5598998841950107e-05,
"loss": 0.0485,
"step": 177
},
{
"epoch": 1.4015748031496063,
"grad_norm": 0.053556935205416635,
"learning_rate": 2.500000000000001e-05,
"loss": 0.0598,
"step": 178
},
{
"epoch": 1.4094488188976377,
"grad_norm": 0.048892583117746094,
"learning_rate": 2.4405747545519963e-05,
"loss": 0.0671,
"step": 179
},
{
"epoch": 1.4173228346456692,
"grad_norm": 0.05573731864284115,
"learning_rate": 2.381635430060611e-05,
"loss": 0.0519,
"step": 180
},
{
"epoch": 1.425196850393701,
"grad_norm": 0.04104144198286972,
"learning_rate": 2.323193216480698e-05,
"loss": 0.0619,
"step": 181
},
{
"epoch": 1.4330708661417324,
"grad_norm": 0.04902777007420262,
"learning_rate": 2.2652592093878666e-05,
"loss": 0.0587,
"step": 182
},
{
"epoch": 1.4409448818897639,
"grad_norm": 0.04484354432801945,
"learning_rate": 2.207844407871929e-05,
"loss": 0.0554,
"step": 183
},
{
"epoch": 1.4488188976377954,
"grad_norm": 0.06590936087340947,
"learning_rate": 2.150959712448669e-05,
"loss": 0.0811,
"step": 184
},
{
"epoch": 1.4566929133858268,
"grad_norm": 0.04102430626416167,
"learning_rate": 2.094615922990309e-05,
"loss": 0.0428,
"step": 185
},
{
"epoch": 1.4645669291338583,
"grad_norm": 0.041794329984435635,
"learning_rate": 2.0388237366751006e-05,
"loss": 0.0605,
"step": 186
},
{
"epoch": 1.4724409448818898,
"grad_norm": 0.05045021005842732,
"learning_rate": 1.9835937459564064e-05,
"loss": 0.0536,
"step": 187
},
{
"epoch": 1.4803149606299213,
"grad_norm": 0.04618763538981042,
"learning_rate": 1.928936436551661e-05,
"loss": 0.0556,
"step": 188
},
{
"epoch": 1.4881889763779528,
"grad_norm": 0.05106802741492108,
"learning_rate": 1.874862185451608e-05,
"loss": 0.0709,
"step": 189
},
{
"epoch": 1.4960629921259843,
"grad_norm": 0.05597419656508567,
"learning_rate": 1.821381258950161e-05,
"loss": 0.0489,
"step": 190
},
{
"epoch": 1.5039370078740157,
"grad_norm": 0.06222180812872232,
"learning_rate": 1.768503810695295e-05,
"loss": 0.0578,
"step": 191
},
{
"epoch": 1.5118110236220472,
"grad_norm": 0.04964108845251158,
"learning_rate": 1.7162398797613282e-05,
"loss": 0.0555,
"step": 192
},
{
"epoch": 1.5196850393700787,
"grad_norm": 0.05088603506896626,
"learning_rate": 1.6645993887429345e-05,
"loss": 0.0622,
"step": 193
},
{
"epoch": 1.5275590551181102,
"grad_norm": 0.046495091752318314,
"learning_rate": 1.6135921418712956e-05,
"loss": 0.0595,
"step": 194
},
{
"epoch": 1.5354330708661417,
"grad_norm": 0.053917463079799346,
"learning_rate": 1.563227823152708e-05,
"loss": 0.075,
"step": 195
},
{
"epoch": 1.5433070866141732,
"grad_norm": 0.08260510658354386,
"learning_rate": 1.5135159945300231e-05,
"loss": 0.0716,
"step": 196
},
{
"epoch": 1.5511811023622046,
"grad_norm": 0.04738267952758889,
"learning_rate": 1.4644660940672627e-05,
"loss": 0.0587,
"step": 197
},
{
"epoch": 1.5590551181102361,
"grad_norm": 0.051915775011872965,
"learning_rate": 1.4160874341577446e-05,
"loss": 0.0521,
"step": 198
},
{
"epoch": 1.5669291338582676,
"grad_norm": 0.05481705129948632,
"learning_rate": 1.368389199756075e-05,
"loss": 0.0752,
"step": 199
},
{
"epoch": 1.574803149606299,
"grad_norm": 0.05217737291956296,
"learning_rate": 1.3213804466343421e-05,
"loss": 0.0736,
"step": 200
},
{
"epoch": 1.574803149606299,
"eval_loss": 0.06070369854569435,
"eval_runtime": 6.4146,
"eval_samples_per_second": 0.935,
"eval_steps_per_second": 0.312,
"step": 200
},
{
"epoch": 1.5826771653543306,
"grad_norm": 0.04678999221530293,
"learning_rate": 1.275070099662815e-05,
"loss": 0.0707,
"step": 201
},
{
"epoch": 1.590551181102362,
"grad_norm": 0.05124530088869734,
"learning_rate": 1.2294669511155193e-05,
"loss": 0.0559,
"step": 202
},
{
"epoch": 1.5984251968503937,
"grad_norm": 0.05350993198205092,
"learning_rate": 1.1845796590009683e-05,
"loss": 0.0768,
"step": 203
},
{
"epoch": 1.6062992125984252,
"grad_norm": 0.05328374514051326,
"learning_rate": 1.1404167454183957e-05,
"loss": 0.0638,
"step": 204
},
{
"epoch": 1.6141732283464567,
"grad_norm": 0.049069065326141866,
"learning_rate": 1.0969865949397901e-05,
"loss": 0.0701,
"step": 205
},
{
"epoch": 1.6220472440944882,
"grad_norm": 0.051454451624391905,
"learning_rate": 1.0542974530180327e-05,
"loss": 0.0535,
"step": 206
},
{
"epoch": 1.6299212598425197,
"grad_norm": 0.04437769885060027,
"learning_rate": 1.012357424421455e-05,
"loss": 0.0521,
"step": 207
},
{
"epoch": 1.6377952755905512,
"grad_norm": 0.04825172010643271,
"learning_rate": 9.711744716951093e-06,
"loss": 0.0689,
"step": 208
},
{
"epoch": 1.6456692913385826,
"grad_norm": 0.06554675030762395,
"learning_rate": 9.307564136490254e-06,
"loss": 0.0612,
"step": 209
},
{
"epoch": 1.6535433070866141,
"grad_norm": 0.05866366054712351,
"learning_rate": 8.911109238737747e-06,
"loss": 0.0791,
"step": 210
},
{
"epoch": 1.6614173228346458,
"grad_norm": 0.04336852944598847,
"learning_rate": 8.522455292835934e-06,
"loss": 0.0539,
"step": 211
},
{
"epoch": 1.6692913385826773,
"grad_norm": 0.051947799814034856,
"learning_rate": 8.141676086873572e-06,
"loss": 0.0639,
"step": 212
},
{
"epoch": 1.6771653543307088,
"grad_norm": 0.04351599998197119,
"learning_rate": 7.768843913876756e-06,
"loss": 0.0482,
"step": 213
},
{
"epoch": 1.6850393700787403,
"grad_norm": 0.06480464685922364,
"learning_rate": 7.404029558083653e-06,
"loss": 0.0662,
"step": 214
},
{
"epoch": 1.6929133858267718,
"grad_norm": 0.04654351665030391,
"learning_rate": 7.047302281505736e-06,
"loss": 0.0566,
"step": 215
},
{
"epoch": 1.7007874015748032,
"grad_norm": 0.04652775620652711,
"learning_rate": 6.698729810778065e-06,
"loss": 0.0571,
"step": 216
},
{
"epoch": 1.7086614173228347,
"grad_norm": 0.047224618696043545,
"learning_rate": 6.3583783243009285e-06,
"loss": 0.0592,
"step": 217
},
{
"epoch": 1.7165354330708662,
"grad_norm": 0.053615270349897666,
"learning_rate": 6.026312439675552e-06,
"loss": 0.0646,
"step": 218
},
{
"epoch": 1.7244094488188977,
"grad_norm": 0.04665853226611898,
"learning_rate": 5.702595201436101e-06,
"loss": 0.0574,
"step": 219
},
{
"epoch": 1.7322834645669292,
"grad_norm": 0.05099339647037513,
"learning_rate": 5.387288069080299e-06,
"loss": 0.0614,
"step": 220
},
{
"epoch": 1.7401574803149606,
"grad_norm": 0.043289965587830936,
"learning_rate": 5.080450905401057e-06,
"loss": 0.0468,
"step": 221
},
{
"epoch": 1.7480314960629921,
"grad_norm": 0.0692890477383107,
"learning_rate": 4.782141965121128e-06,
"loss": 0.0493,
"step": 222
},
{
"epoch": 1.7559055118110236,
"grad_norm": 0.0403442528103141,
"learning_rate": 4.492417883833155e-06,
"loss": 0.0476,
"step": 223
},
{
"epoch": 1.763779527559055,
"grad_norm": 0.04191520401385937,
"learning_rate": 4.2113336672471245e-06,
"loss": 0.052,
"step": 224
},
{
"epoch": 1.7716535433070866,
"grad_norm": 0.054916613323571846,
"learning_rate": 3.9389426807471766e-06,
"loss": 0.0653,
"step": 225
},
{
"epoch": 1.779527559055118,
"grad_norm": 0.04750102972177334,
"learning_rate": 3.675296639259912e-06,
"loss": 0.0578,
"step": 226
},
{
"epoch": 1.7874015748031495,
"grad_norm": 0.0588876838593015,
"learning_rate": 3.420445597436056e-06,
"loss": 0.0565,
"step": 227
},
{
"epoch": 1.795275590551181,
"grad_norm": 0.050319040408150564,
"learning_rate": 3.1744379401472677e-06,
"loss": 0.0546,
"step": 228
},
{
"epoch": 1.8031496062992125,
"grad_norm": 0.05754277450687693,
"learning_rate": 2.9373203733000232e-06,
"loss": 0.0686,
"step": 229
},
{
"epoch": 1.811023622047244,
"grad_norm": 0.05586902250961728,
"learning_rate": 2.7091379149682685e-06,
"loss": 0.0673,
"step": 230
},
{
"epoch": 1.8188976377952755,
"grad_norm": 0.05258380689743355,
"learning_rate": 2.4899338868464404e-06,
"loss": 0.0656,
"step": 231
},
{
"epoch": 1.826771653543307,
"grad_norm": 0.0444146353656455,
"learning_rate": 2.2797499060246253e-06,
"loss": 0.0543,
"step": 232
},
{
"epoch": 1.8346456692913384,
"grad_norm": 0.050066903014096574,
"learning_rate": 2.0786258770873647e-06,
"loss": 0.0599,
"step": 233
},
{
"epoch": 1.84251968503937,
"grad_norm": 0.045770675687925415,
"learning_rate": 1.8865999845374793e-06,
"loss": 0.0599,
"step": 234
},
{
"epoch": 1.8503937007874016,
"grad_norm": 0.05464584148563528,
"learning_rate": 1.70370868554659e-06,
"loss": 0.0685,
"step": 235
},
{
"epoch": 1.858267716535433,
"grad_norm": 0.04968886797769799,
"learning_rate": 1.5299867030334814e-06,
"loss": 0.068,
"step": 236
},
{
"epoch": 1.8661417322834646,
"grad_norm": 0.0552386892230046,
"learning_rate": 1.3654670190718034e-06,
"loss": 0.0668,
"step": 237
},
{
"epoch": 1.874015748031496,
"grad_norm": 0.042676978639831414,
"learning_rate": 1.210180868628219e-06,
"loss": 0.053,
"step": 238
},
{
"epoch": 1.8818897637795275,
"grad_norm": 0.04702355316793717,
"learning_rate": 1.064157733632276e-06,
"loss": 0.0513,
"step": 239
},
{
"epoch": 1.889763779527559,
"grad_norm": 0.07317497229013392,
"learning_rate": 9.274253373791064e-07,
"loss": 0.0609,
"step": 240
},
{
"epoch": 1.8976377952755905,
"grad_norm": 0.04768857996113459,
"learning_rate": 8.000096392660029e-07,
"loss": 0.0523,
"step": 241
},
{
"epoch": 1.905511811023622,
"grad_norm": 0.0600627581282451,
"learning_rate": 6.819348298638839e-07,
"loss": 0.0756,
"step": 242
},
{
"epoch": 1.9133858267716537,
"grad_norm": 0.048224067594923814,
"learning_rate": 5.732233263245845e-07,
"loss": 0.0568,
"step": 243
},
{
"epoch": 1.9212598425196852,
"grad_norm": 0.051208440325568635,
"learning_rate": 4.738957681248379e-07,
"loss": 0.0576,
"step": 244
},
{
"epoch": 1.9291338582677167,
"grad_norm": 0.05305714441239726,
"learning_rate": 3.839710131477492e-07,
"loss": 0.0578,
"step": 245
},
{
"epoch": 1.9370078740157481,
"grad_norm": 0.0539897244444409,
"learning_rate": 3.034661341025258e-07,
"loss": 0.0586,
"step": 246
},
{
"epoch": 1.9448818897637796,
"grad_norm": 0.04186116850231722,
"learning_rate": 2.323964152831426e-07,
"loss": 0.0511,
"step": 247
},
{
"epoch": 1.952755905511811,
"grad_norm": 0.049234512985121694,
"learning_rate": 1.7077534966650766e-07,
"loss": 0.0597,
"step": 248
},
{
"epoch": 1.9606299212598426,
"grad_norm": 0.0551149693713877,
"learning_rate": 1.1861463635077785e-07,
"loss": 0.0684,
"step": 249
},
{
"epoch": 1.968503937007874,
"grad_norm": 0.05304684823721321,
"learning_rate": 7.59241783341913e-08,
"loss": 0.0682,
"step": 250
},
{
"epoch": 1.9763779527559056,
"grad_norm": 0.07253818377949055,
"learning_rate": 4.2712080634949024e-08,
"loss": 0.0637,
"step": 251
},
{
"epoch": 1.984251968503937,
"grad_norm": 0.053065918527288605,
"learning_rate": 1.8984648752429225e-08,
"loss": 0.0693,
"step": 252
},
{
"epoch": 1.9921259842519685,
"grad_norm": 0.05734495408696275,
"learning_rate": 4.746387470044855e-09,
"loss": 0.061,
"step": 253
},
{
"epoch": 2.0,
"grad_norm": 0.05703286573729645,
"learning_rate": 0.0,
"loss": 0.0489,
"step": 254
},
{
"epoch": 2.0,
"step": 254,
"total_flos": 643226180517888.0,
"train_loss": 0.07837846240131405,
"train_runtime": 1925.3112,
"train_samples_per_second": 0.525,
"train_steps_per_second": 0.132
}
],
"logging_steps": 1,
"max_steps": 254,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 643226180517888.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}