{
"best_metric": 0.0005893517518416047,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 0.10787486515641856,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005393743257820927,
"grad_norm": 0.9549857378005981,
"learning_rate": 1.001e-05,
"loss": 0.2799,
"step": 1
},
{
"epoch": 0.0005393743257820927,
"eval_loss": 0.4979684054851532,
"eval_runtime": 45.1708,
"eval_samples_per_second": 17.29,
"eval_steps_per_second": 4.339,
"step": 1
},
{
"epoch": 0.0010787486515641855,
"grad_norm": 0.6719993948936462,
"learning_rate": 2.002e-05,
"loss": 0.1665,
"step": 2
},
{
"epoch": 0.0016181229773462784,
"grad_norm": 0.5584933757781982,
"learning_rate": 3.0029999999999995e-05,
"loss": 0.1785,
"step": 3
},
{
"epoch": 0.002157497303128371,
"grad_norm": 0.6560541391372681,
"learning_rate": 4.004e-05,
"loss": 0.1566,
"step": 4
},
{
"epoch": 0.002696871628910464,
"grad_norm": 0.8838739991188049,
"learning_rate": 5.005e-05,
"loss": 0.1356,
"step": 5
},
{
"epoch": 0.003236245954692557,
"grad_norm": 0.6256913542747498,
"learning_rate": 6.005999999999999e-05,
"loss": 0.0931,
"step": 6
},
{
"epoch": 0.0037756202804746495,
"grad_norm": 0.6155531406402588,
"learning_rate": 7.006999999999998e-05,
"loss": 0.0667,
"step": 7
},
{
"epoch": 0.004314994606256742,
"grad_norm": 0.584822416305542,
"learning_rate": 8.008e-05,
"loss": 0.0352,
"step": 8
},
{
"epoch": 0.0048543689320388345,
"grad_norm": 1.1850945949554443,
"learning_rate": 9.009e-05,
"loss": 0.0139,
"step": 9
},
{
"epoch": 0.005393743257820928,
"grad_norm": 0.36779069900512695,
"learning_rate": 0.0001001,
"loss": 0.0119,
"step": 10
},
{
"epoch": 0.005933117583603021,
"grad_norm": 1.6826813220977783,
"learning_rate": 9.957315789473684e-05,
"loss": 0.0115,
"step": 11
},
{
"epoch": 0.006472491909385114,
"grad_norm": 0.3343207538127899,
"learning_rate": 9.904631578947367e-05,
"loss": 0.0034,
"step": 12
},
{
"epoch": 0.007011866235167206,
"grad_norm": 0.03047369234263897,
"learning_rate": 9.851947368421052e-05,
"loss": 0.0005,
"step": 13
},
{
"epoch": 0.007551240560949299,
"grad_norm": 0.04409059137105942,
"learning_rate": 9.799263157894736e-05,
"loss": 0.0003,
"step": 14
},
{
"epoch": 0.008090614886731391,
"grad_norm": 3.0516834259033203,
"learning_rate": 9.746578947368421e-05,
"loss": 0.0131,
"step": 15
},
{
"epoch": 0.008629989212513484,
"grad_norm": 0.3956446945667267,
"learning_rate": 9.693894736842104e-05,
"loss": 0.001,
"step": 16
},
{
"epoch": 0.009169363538295576,
"grad_norm": 1.5148561000823975,
"learning_rate": 9.641210526315789e-05,
"loss": 0.0061,
"step": 17
},
{
"epoch": 0.009708737864077669,
"grad_norm": 0.49977800250053406,
"learning_rate": 9.588526315789473e-05,
"loss": 0.0017,
"step": 18
},
{
"epoch": 0.010248112189859764,
"grad_norm": 0.012113417498767376,
"learning_rate": 9.535842105263157e-05,
"loss": 0.0001,
"step": 19
},
{
"epoch": 0.010787486515641856,
"grad_norm": 0.008402293547987938,
"learning_rate": 9.483157894736841e-05,
"loss": 0.0002,
"step": 20
},
{
"epoch": 0.011326860841423949,
"grad_norm": 0.002001185668632388,
"learning_rate": 9.430473684210526e-05,
"loss": 0.0,
"step": 21
},
{
"epoch": 0.011866235167206042,
"grad_norm": 0.5545786619186401,
"learning_rate": 9.37778947368421e-05,
"loss": 0.0068,
"step": 22
},
{
"epoch": 0.012405609492988134,
"grad_norm": 0.06889030337333679,
"learning_rate": 9.325105263157894e-05,
"loss": 0.0004,
"step": 23
},
{
"epoch": 0.012944983818770227,
"grad_norm": 0.15830865502357483,
"learning_rate": 9.272421052631578e-05,
"loss": 0.0011,
"step": 24
},
{
"epoch": 0.01348435814455232,
"grad_norm": 0.7862672805786133,
"learning_rate": 9.219736842105263e-05,
"loss": 0.0063,
"step": 25
},
{
"epoch": 0.014023732470334413,
"grad_norm": 0.009982357732951641,
"learning_rate": 9.167052631578946e-05,
"loss": 0.0001,
"step": 26
},
{
"epoch": 0.014563106796116505,
"grad_norm": 0.009793877601623535,
"learning_rate": 9.114368421052632e-05,
"loss": 0.0001,
"step": 27
},
{
"epoch": 0.015102481121898598,
"grad_norm": 0.006467557977885008,
"learning_rate": 9.061684210526315e-05,
"loss": 0.0001,
"step": 28
},
{
"epoch": 0.01564185544768069,
"grad_norm": 0.012111790478229523,
"learning_rate": 9.009e-05,
"loss": 0.0001,
"step": 29
},
{
"epoch": 0.016181229773462782,
"grad_norm": 0.06000563129782677,
"learning_rate": 8.956315789473683e-05,
"loss": 0.0007,
"step": 30
},
{
"epoch": 0.016720604099244876,
"grad_norm": 0.03677884489297867,
"learning_rate": 8.903631578947368e-05,
"loss": 0.0007,
"step": 31
},
{
"epoch": 0.017259978425026967,
"grad_norm": 0.03280596062541008,
"learning_rate": 8.850947368421052e-05,
"loss": 0.0003,
"step": 32
},
{
"epoch": 0.01779935275080906,
"grad_norm": 0.002885522786527872,
"learning_rate": 8.798263157894736e-05,
"loss": 0.0001,
"step": 33
},
{
"epoch": 0.018338727076591153,
"grad_norm": 0.4575079679489136,
"learning_rate": 8.745578947368422e-05,
"loss": 0.0156,
"step": 34
},
{
"epoch": 0.018878101402373247,
"grad_norm": 0.0030563841573894024,
"learning_rate": 8.692894736842105e-05,
"loss": 0.0,
"step": 35
},
{
"epoch": 0.019417475728155338,
"grad_norm": 0.0014249957166612148,
"learning_rate": 8.64021052631579e-05,
"loss": 0.0,
"step": 36
},
{
"epoch": 0.019956850053937433,
"grad_norm": 0.0230252668261528,
"learning_rate": 8.587526315789473e-05,
"loss": 0.0001,
"step": 37
},
{
"epoch": 0.020496224379719527,
"grad_norm": 0.005644981749355793,
"learning_rate": 8.534842105263157e-05,
"loss": 0.0001,
"step": 38
},
{
"epoch": 0.021035598705501618,
"grad_norm": 0.02447853423655033,
"learning_rate": 8.482157894736842e-05,
"loss": 0.0001,
"step": 39
},
{
"epoch": 0.021574973031283712,
"grad_norm": 0.004401453770697117,
"learning_rate": 8.429473684210525e-05,
"loss": 0.0001,
"step": 40
},
{
"epoch": 0.022114347357065803,
"grad_norm": 0.05064772441983223,
"learning_rate": 8.376789473684211e-05,
"loss": 0.0004,
"step": 41
},
{
"epoch": 0.022653721682847898,
"grad_norm": 1.4466474056243896,
"learning_rate": 8.324105263157894e-05,
"loss": 0.0453,
"step": 42
},
{
"epoch": 0.02319309600862999,
"grad_norm": 0.0030545571353286505,
"learning_rate": 8.271421052631579e-05,
"loss": 0.0001,
"step": 43
},
{
"epoch": 0.023732470334412083,
"grad_norm": 0.45695409178733826,
"learning_rate": 8.218736842105262e-05,
"loss": 0.0142,
"step": 44
},
{
"epoch": 0.024271844660194174,
"grad_norm": 0.45072630047798157,
"learning_rate": 8.166052631578947e-05,
"loss": 0.0007,
"step": 45
},
{
"epoch": 0.02481121898597627,
"grad_norm": 0.20009317994117737,
"learning_rate": 8.113368421052631e-05,
"loss": 0.0005,
"step": 46
},
{
"epoch": 0.02535059331175836,
"grad_norm": 0.01844087988138199,
"learning_rate": 8.060684210526315e-05,
"loss": 0.0001,
"step": 47
},
{
"epoch": 0.025889967637540454,
"grad_norm": 0.009140203706920147,
"learning_rate": 8.008e-05,
"loss": 0.0002,
"step": 48
},
{
"epoch": 0.026429341963322545,
"grad_norm": 1.665168046951294,
"learning_rate": 7.955315789473684e-05,
"loss": 0.0267,
"step": 49
},
{
"epoch": 0.02696871628910464,
"grad_norm": 1.6538095474243164,
"learning_rate": 7.902631578947368e-05,
"loss": 0.0311,
"step": 50
},
{
"epoch": 0.02696871628910464,
"eval_loss": 0.05669600889086723,
"eval_runtime": 43.6492,
"eval_samples_per_second": 17.893,
"eval_steps_per_second": 4.49,
"step": 50
},
{
"epoch": 0.02750809061488673,
"grad_norm": 2.739877223968506,
"learning_rate": 7.849947368421052e-05,
"loss": 0.0604,
"step": 51
},
{
"epoch": 0.028047464940668825,
"grad_norm": 0.010129177011549473,
"learning_rate": 7.797263157894736e-05,
"loss": 0.0001,
"step": 52
},
{
"epoch": 0.028586839266450916,
"grad_norm": 0.03367248922586441,
"learning_rate": 7.744578947368421e-05,
"loss": 0.0002,
"step": 53
},
{
"epoch": 0.02912621359223301,
"grad_norm": 1.0684995651245117,
"learning_rate": 7.691894736842104e-05,
"loss": 0.0011,
"step": 54
},
{
"epoch": 0.0296655879180151,
"grad_norm": 0.006163384765386581,
"learning_rate": 7.63921052631579e-05,
"loss": 0.0,
"step": 55
},
{
"epoch": 0.030204962243797196,
"grad_norm": 0.004382474347949028,
"learning_rate": 7.586526315789473e-05,
"loss": 0.0001,
"step": 56
},
{
"epoch": 0.030744336569579287,
"grad_norm": 0.006588236894458532,
"learning_rate": 7.533842105263158e-05,
"loss": 0.0001,
"step": 57
},
{
"epoch": 0.03128371089536138,
"grad_norm": 0.019274592399597168,
"learning_rate": 7.481157894736841e-05,
"loss": 0.0001,
"step": 58
},
{
"epoch": 0.03182308522114347,
"grad_norm": 0.006948168855160475,
"learning_rate": 7.428473684210526e-05,
"loss": 0.0001,
"step": 59
},
{
"epoch": 0.032362459546925564,
"grad_norm": 0.001927727716974914,
"learning_rate": 7.375789473684209e-05,
"loss": 0.0001,
"step": 60
},
{
"epoch": 0.03290183387270766,
"grad_norm": 0.05395427718758583,
"learning_rate": 7.323105263157895e-05,
"loss": 0.0004,
"step": 61
},
{
"epoch": 0.03344120819848975,
"grad_norm": 0.14390406012535095,
"learning_rate": 7.270421052631578e-05,
"loss": 0.0006,
"step": 62
},
{
"epoch": 0.03398058252427184,
"grad_norm": 0.04436345770955086,
"learning_rate": 7.217736842105263e-05,
"loss": 0.0004,
"step": 63
},
{
"epoch": 0.034519956850053934,
"grad_norm": 0.005591034423559904,
"learning_rate": 7.165052631578947e-05,
"loss": 0.0001,
"step": 64
},
{
"epoch": 0.03505933117583603,
"grad_norm": 1.5156598091125488,
"learning_rate": 7.11236842105263e-05,
"loss": 0.0033,
"step": 65
},
{
"epoch": 0.03559870550161812,
"grad_norm": 0.20944921672344208,
"learning_rate": 7.059684210526315e-05,
"loss": 0.0015,
"step": 66
},
{
"epoch": 0.036138079827400214,
"grad_norm": 0.02203325368463993,
"learning_rate": 7.006999999999998e-05,
"loss": 0.0002,
"step": 67
},
{
"epoch": 0.036677454153182305,
"grad_norm": 0.012391135096549988,
"learning_rate": 6.954315789473684e-05,
"loss": 0.0001,
"step": 68
},
{
"epoch": 0.0372168284789644,
"grad_norm": 0.003814356168732047,
"learning_rate": 6.901631578947368e-05,
"loss": 0.0001,
"step": 69
},
{
"epoch": 0.037756202804746494,
"grad_norm": 0.43400076031684875,
"learning_rate": 6.848947368421052e-05,
"loss": 0.0108,
"step": 70
},
{
"epoch": 0.038295577130528585,
"grad_norm": 0.001942815724760294,
"learning_rate": 6.796263157894737e-05,
"loss": 0.0001,
"step": 71
},
{
"epoch": 0.038834951456310676,
"grad_norm": 0.007045631296932697,
"learning_rate": 6.74357894736842e-05,
"loss": 0.0001,
"step": 72
},
{
"epoch": 0.039374325782092774,
"grad_norm": 0.0019868926610797644,
"learning_rate": 6.690894736842105e-05,
"loss": 0.0,
"step": 73
},
{
"epoch": 0.039913700107874865,
"grad_norm": 0.3928805887699127,
"learning_rate": 6.638210526315788e-05,
"loss": 0.001,
"step": 74
},
{
"epoch": 0.040453074433656956,
"grad_norm": 0.00091419683303684,
"learning_rate": 6.585526315789474e-05,
"loss": 0.0,
"step": 75
},
{
"epoch": 0.040992448759439054,
"grad_norm": 0.00108022999484092,
"learning_rate": 6.532842105263157e-05,
"loss": 0.0,
"step": 76
},
{
"epoch": 0.041531823085221145,
"grad_norm": 0.004014655947685242,
"learning_rate": 6.480157894736842e-05,
"loss": 0.0001,
"step": 77
},
{
"epoch": 0.042071197411003236,
"grad_norm": 0.0030816688667982817,
"learning_rate": 6.427473684210526e-05,
"loss": 0.0001,
"step": 78
},
{
"epoch": 0.04261057173678533,
"grad_norm": 0.0016585150733590126,
"learning_rate": 6.37478947368421e-05,
"loss": 0.0,
"step": 79
},
{
"epoch": 0.043149946062567425,
"grad_norm": 0.00667704688385129,
"learning_rate": 6.322105263157894e-05,
"loss": 0.0001,
"step": 80
},
{
"epoch": 0.043689320388349516,
"grad_norm": 0.007074718829244375,
"learning_rate": 6.269421052631577e-05,
"loss": 0.0001,
"step": 81
},
{
"epoch": 0.04422869471413161,
"grad_norm": 0.002394700888544321,
"learning_rate": 6.216736842105263e-05,
"loss": 0.0,
"step": 82
},
{
"epoch": 0.0447680690399137,
"grad_norm": 0.0016268681501969695,
"learning_rate": 6.164052631578947e-05,
"loss": 0.0,
"step": 83
},
{
"epoch": 0.045307443365695796,
"grad_norm": 0.00599514739587903,
"learning_rate": 6.111368421052631e-05,
"loss": 0.0001,
"step": 84
},
{
"epoch": 0.04584681769147789,
"grad_norm": 0.0022963311057537794,
"learning_rate": 6.058684210526315e-05,
"loss": 0.0,
"step": 85
},
{
"epoch": 0.04638619201725998,
"grad_norm": 0.007355900481343269,
"learning_rate": 6.005999999999999e-05,
"loss": 0.0001,
"step": 86
},
{
"epoch": 0.04692556634304207,
"grad_norm": 0.02201763354241848,
"learning_rate": 5.953315789473684e-05,
"loss": 0.0002,
"step": 87
},
{
"epoch": 0.04746494066882417,
"grad_norm": 0.005624725949019194,
"learning_rate": 5.9006315789473676e-05,
"loss": 0.0001,
"step": 88
},
{
"epoch": 0.04800431499460626,
"grad_norm": 0.04811863973736763,
"learning_rate": 5.847947368421053e-05,
"loss": 0.0002,
"step": 89
},
{
"epoch": 0.04854368932038835,
"grad_norm": 0.0029467041604220867,
"learning_rate": 5.795263157894737e-05,
"loss": 0.0001,
"step": 90
},
{
"epoch": 0.04908306364617044,
"grad_norm": 0.03302125632762909,
"learning_rate": 5.742578947368421e-05,
"loss": 0.0001,
"step": 91
},
{
"epoch": 0.04962243797195254,
"grad_norm": 0.0017132150242105126,
"learning_rate": 5.6898947368421046e-05,
"loss": 0.0,
"step": 92
},
{
"epoch": 0.05016181229773463,
"grad_norm": 0.0011448146542534232,
"learning_rate": 5.6372105263157886e-05,
"loss": 0.0,
"step": 93
},
{
"epoch": 0.05070118662351672,
"grad_norm": 0.0019763971213251352,
"learning_rate": 5.584526315789473e-05,
"loss": 0.0,
"step": 94
},
{
"epoch": 0.05124056094929881,
"grad_norm": 0.0009319214150309563,
"learning_rate": 5.531842105263158e-05,
"loss": 0.0,
"step": 95
},
{
"epoch": 0.05177993527508091,
"grad_norm": 0.03375813364982605,
"learning_rate": 5.4791578947368424e-05,
"loss": 0.0005,
"step": 96
},
{
"epoch": 0.052319309600863,
"grad_norm": 0.16154968738555908,
"learning_rate": 5.426473684210526e-05,
"loss": 0.0025,
"step": 97
},
{
"epoch": 0.05285868392664509,
"grad_norm": 0.0034327851608395576,
"learning_rate": 5.37378947368421e-05,
"loss": 0.0,
"step": 98
},
{
"epoch": 0.05339805825242718,
"grad_norm": 0.0067018670961260796,
"learning_rate": 5.321105263157894e-05,
"loss": 0.0001,
"step": 99
},
{
"epoch": 0.05393743257820928,
"grad_norm": 0.21445177495479584,
"learning_rate": 5.268421052631578e-05,
"loss": 0.0017,
"step": 100
},
{
"epoch": 0.05393743257820928,
"eval_loss": 0.0010083840461447835,
"eval_runtime": 43.6657,
"eval_samples_per_second": 17.886,
"eval_steps_per_second": 4.489,
"step": 100
},
{
"epoch": 0.05447680690399137,
"grad_norm": 0.459598571062088,
"learning_rate": 5.2157368421052626e-05,
"loss": 0.0081,
"step": 101
},
{
"epoch": 0.05501618122977346,
"grad_norm": 0.0010549342259764671,
"learning_rate": 5.163052631578947e-05,
"loss": 0.0,
"step": 102
},
{
"epoch": 0.05555555555555555,
"grad_norm": 0.002258673310279846,
"learning_rate": 5.110368421052632e-05,
"loss": 0.0,
"step": 103
},
{
"epoch": 0.05609492988133765,
"grad_norm": 0.0001912364095915109,
"learning_rate": 5.057684210526316e-05,
"loss": 0.0,
"step": 104
},
{
"epoch": 0.05663430420711974,
"grad_norm": 0.0003946495126001537,
"learning_rate": 5.005e-05,
"loss": 0.0,
"step": 105
},
{
"epoch": 0.05717367853290183,
"grad_norm": 0.0004527179116848856,
"learning_rate": 4.9523157894736836e-05,
"loss": 0.0,
"step": 106
},
{
"epoch": 0.05771305285868392,
"grad_norm": 0.0007224463624879718,
"learning_rate": 4.899631578947368e-05,
"loss": 0.0,
"step": 107
},
{
"epoch": 0.05825242718446602,
"grad_norm": 0.0027744832914322615,
"learning_rate": 4.846947368421052e-05,
"loss": 0.0001,
"step": 108
},
{
"epoch": 0.05879180151024811,
"grad_norm": 0.0003102615592069924,
"learning_rate": 4.794263157894737e-05,
"loss": 0.0,
"step": 109
},
{
"epoch": 0.0593311758360302,
"grad_norm": 0.00021459658455569297,
"learning_rate": 4.7415789473684206e-05,
"loss": 0.0,
"step": 110
},
{
"epoch": 0.059870550161812294,
"grad_norm": 0.00045238586608320475,
"learning_rate": 4.688894736842105e-05,
"loss": 0.0,
"step": 111
},
{
"epoch": 0.06040992448759439,
"grad_norm": 0.00048040415276773274,
"learning_rate": 4.636210526315789e-05,
"loss": 0.0,
"step": 112
},
{
"epoch": 0.06094929881337648,
"grad_norm": 0.00028796499827876687,
"learning_rate": 4.583526315789473e-05,
"loss": 0.0,
"step": 113
},
{
"epoch": 0.061488673139158574,
"grad_norm": 0.0009007256594486535,
"learning_rate": 4.530842105263158e-05,
"loss": 0.0,
"step": 114
},
{
"epoch": 0.06202804746494067,
"grad_norm": 0.001204671454615891,
"learning_rate": 4.4781578947368416e-05,
"loss": 0.0,
"step": 115
},
{
"epoch": 0.06256742179072276,
"grad_norm": 0.007920761592686176,
"learning_rate": 4.425473684210526e-05,
"loss": 0.0,
"step": 116
},
{
"epoch": 0.06310679611650485,
"grad_norm": 0.000548014766536653,
"learning_rate": 4.372789473684211e-05,
"loss": 0.0,
"step": 117
},
{
"epoch": 0.06364617044228695,
"grad_norm": 0.0007022125646471977,
"learning_rate": 4.320105263157895e-05,
"loss": 0.0,
"step": 118
},
{
"epoch": 0.06418554476806904,
"grad_norm": 0.000792205857578665,
"learning_rate": 4.2674210526315786e-05,
"loss": 0.0,
"step": 119
},
{
"epoch": 0.06472491909385113,
"grad_norm": 0.0005556220421567559,
"learning_rate": 4.2147368421052626e-05,
"loss": 0.0,
"step": 120
},
{
"epoch": 0.06526429341963323,
"grad_norm": 0.0013536950573325157,
"learning_rate": 4.162052631578947e-05,
"loss": 0.0,
"step": 121
},
{
"epoch": 0.06580366774541532,
"grad_norm": 0.001718906918540597,
"learning_rate": 4.109368421052631e-05,
"loss": 0.0,
"step": 122
},
{
"epoch": 0.06634304207119741,
"grad_norm": 0.016501938924193382,
"learning_rate": 4.056684210526316e-05,
"loss": 0.0001,
"step": 123
},
{
"epoch": 0.0668824163969795,
"grad_norm": 0.2795630395412445,
"learning_rate": 4.004e-05,
"loss": 0.0005,
"step": 124
},
{
"epoch": 0.0674217907227616,
"grad_norm": 0.035502709448337555,
"learning_rate": 3.951315789473684e-05,
"loss": 0.0001,
"step": 125
},
{
"epoch": 0.06796116504854369,
"grad_norm": 0.002328946255147457,
"learning_rate": 3.898631578947368e-05,
"loss": 0.0,
"step": 126
},
{
"epoch": 0.06850053937432578,
"grad_norm": 0.0006106572109274566,
"learning_rate": 3.845947368421052e-05,
"loss": 0.0,
"step": 127
},
{
"epoch": 0.06903991370010787,
"grad_norm": 0.0013282396830618382,
"learning_rate": 3.7932631578947367e-05,
"loss": 0.0,
"step": 128
},
{
"epoch": 0.06957928802588997,
"grad_norm": 0.41567087173461914,
"learning_rate": 3.7405789473684206e-05,
"loss": 0.0035,
"step": 129
},
{
"epoch": 0.07011866235167206,
"grad_norm": 0.0007774877594783902,
"learning_rate": 3.6878947368421045e-05,
"loss": 0.0,
"step": 130
},
{
"epoch": 0.07065803667745416,
"grad_norm": 0.05387866869568825,
"learning_rate": 3.635210526315789e-05,
"loss": 0.0006,
"step": 131
},
{
"epoch": 0.07119741100323625,
"grad_norm": 0.0006187845719978213,
"learning_rate": 3.582526315789474e-05,
"loss": 0.0,
"step": 132
},
{
"epoch": 0.07173678532901834,
"grad_norm": 0.3249104917049408,
"learning_rate": 3.5298421052631576e-05,
"loss": 0.0058,
"step": 133
},
{
"epoch": 0.07227615965480043,
"grad_norm": 0.0009034304530359805,
"learning_rate": 3.477157894736842e-05,
"loss": 0.0,
"step": 134
},
{
"epoch": 0.07281553398058252,
"grad_norm": 0.0008392046438530087,
"learning_rate": 3.424473684210526e-05,
"loss": 0.0,
"step": 135
},
{
"epoch": 0.07335490830636461,
"grad_norm": 0.001984312431886792,
"learning_rate": 3.37178947368421e-05,
"loss": 0.0,
"step": 136
},
{
"epoch": 0.07389428263214672,
"grad_norm": 0.05136461928486824,
"learning_rate": 3.319105263157894e-05,
"loss": 0.0003,
"step": 137
},
{
"epoch": 0.0744336569579288,
"grad_norm": 0.0010661619016900659,
"learning_rate": 3.2664210526315786e-05,
"loss": 0.0,
"step": 138
},
{
"epoch": 0.0749730312837109,
"grad_norm": 0.007086303550750017,
"learning_rate": 3.213736842105263e-05,
"loss": 0.0001,
"step": 139
},
{
"epoch": 0.07551240560949299,
"grad_norm": 0.0012311493046581745,
"learning_rate": 3.161052631578947e-05,
"loss": 0.0,
"step": 140
},
{
"epoch": 0.07605177993527508,
"grad_norm": 0.0016970309661701322,
"learning_rate": 3.108368421052632e-05,
"loss": 0.0,
"step": 141
},
{
"epoch": 0.07659115426105717,
"grad_norm": 0.6121507883071899,
"learning_rate": 3.0556842105263156e-05,
"loss": 0.0016,
"step": 142
},
{
"epoch": 0.07713052858683926,
"grad_norm": 0.0011852330062538385,
"learning_rate": 3.0029999999999995e-05,
"loss": 0.0,
"step": 143
},
{
"epoch": 0.07766990291262135,
"grad_norm": 0.0008150951471179724,
"learning_rate": 2.9503157894736838e-05,
"loss": 0.0,
"step": 144
},
{
"epoch": 0.07820927723840346,
"grad_norm": 0.0009472601232118905,
"learning_rate": 2.8976315789473684e-05,
"loss": 0.0,
"step": 145
},
{
"epoch": 0.07874865156418555,
"grad_norm": 0.003503723070025444,
"learning_rate": 2.8449473684210523e-05,
"loss": 0.0001,
"step": 146
},
{
"epoch": 0.07928802588996764,
"grad_norm": 0.0022005646023899317,
"learning_rate": 2.7922631578947366e-05,
"loss": 0.0001,
"step": 147
},
{
"epoch": 0.07982740021574973,
"grad_norm": 0.5109080672264099,
"learning_rate": 2.7395789473684212e-05,
"loss": 0.0101,
"step": 148
},
{
"epoch": 0.08036677454153182,
"grad_norm": 0.0015276124468073249,
"learning_rate": 2.686894736842105e-05,
"loss": 0.0001,
"step": 149
},
{
"epoch": 0.08090614886731391,
"grad_norm": 0.0025751348584890366,
"learning_rate": 2.634210526315789e-05,
"loss": 0.0001,
"step": 150
},
{
"epoch": 0.08090614886731391,
"eval_loss": 0.0005893517518416047,
"eval_runtime": 43.6869,
"eval_samples_per_second": 17.877,
"eval_steps_per_second": 4.486,
"step": 150
},
{
"epoch": 0.081445523193096,
"grad_norm": 0.0056745377369225025,
"learning_rate": 2.5815263157894736e-05,
"loss": 0.0,
"step": 151
},
{
"epoch": 0.08198489751887811,
"grad_norm": 0.00784875638782978,
"learning_rate": 2.528842105263158e-05,
"loss": 0.0001,
"step": 152
},
{
"epoch": 0.0825242718446602,
"grad_norm": 0.06929171085357666,
"learning_rate": 2.4761578947368418e-05,
"loss": 0.0015,
"step": 153
},
{
"epoch": 0.08306364617044229,
"grad_norm": 0.0055921077728271484,
"learning_rate": 2.423473684210526e-05,
"loss": 0.0001,
"step": 154
},
{
"epoch": 0.08360302049622438,
"grad_norm": 0.002456638030707836,
"learning_rate": 2.3707894736842103e-05,
"loss": 0.0,
"step": 155
},
{
"epoch": 0.08414239482200647,
"grad_norm": 0.016230836510658264,
"learning_rate": 2.3181052631578946e-05,
"loss": 0.0002,
"step": 156
},
{
"epoch": 0.08468176914778856,
"grad_norm": 0.0014073759084567428,
"learning_rate": 2.265421052631579e-05,
"loss": 0.0,
"step": 157
},
{
"epoch": 0.08522114347357065,
"grad_norm": 0.18957898020744324,
"learning_rate": 2.212736842105263e-05,
"loss": 0.002,
"step": 158
},
{
"epoch": 0.08576051779935275,
"grad_norm": 0.001173275290057063,
"learning_rate": 2.1600526315789474e-05,
"loss": 0.0,
"step": 159
},
{
"epoch": 0.08629989212513485,
"grad_norm": 0.0007326776976697147,
"learning_rate": 2.1073684210526313e-05,
"loss": 0.0,
"step": 160
},
{
"epoch": 0.08683926645091694,
"grad_norm": 0.008999370969831944,
"learning_rate": 2.0546842105263155e-05,
"loss": 0.0001,
"step": 161
},
{
"epoch": 0.08737864077669903,
"grad_norm": 0.0007513080490753055,
"learning_rate": 2.002e-05,
"loss": 0.0,
"step": 162
},
{
"epoch": 0.08791801510248112,
"grad_norm": 0.0003164597728755325,
"learning_rate": 1.949315789473684e-05,
"loss": 0.0,
"step": 163
},
{
"epoch": 0.08845738942826321,
"grad_norm": 0.0006100442842580378,
"learning_rate": 1.8966315789473683e-05,
"loss": 0.0,
"step": 164
},
{
"epoch": 0.0889967637540453,
"grad_norm": 0.002358382800593972,
"learning_rate": 1.8439473684210522e-05,
"loss": 0.0,
"step": 165
},
{
"epoch": 0.0895361380798274,
"grad_norm": 0.0008702076738700271,
"learning_rate": 1.791263157894737e-05,
"loss": 0.0,
"step": 166
},
{
"epoch": 0.09007551240560949,
"grad_norm": 0.0013868837850168347,
"learning_rate": 1.738578947368421e-05,
"loss": 0.0,
"step": 167
},
{
"epoch": 0.09061488673139159,
"grad_norm": 0.0006682085804641247,
"learning_rate": 1.685894736842105e-05,
"loss": 0.0,
"step": 168
},
{
"epoch": 0.09115426105717368,
"grad_norm": 0.0009037026320584118,
"learning_rate": 1.6332105263157893e-05,
"loss": 0.0,
"step": 169
},
{
"epoch": 0.09169363538295577,
"grad_norm": 0.0003754556237254292,
"learning_rate": 1.5805263157894735e-05,
"loss": 0.0,
"step": 170
},
{
"epoch": 0.09223300970873786,
"grad_norm": 0.0007883550715632737,
"learning_rate": 1.5278421052631578e-05,
"loss": 0.0,
"step": 171
},
{
"epoch": 0.09277238403451996,
"grad_norm": 0.00029830558924004436,
"learning_rate": 1.4751578947368419e-05,
"loss": 0.0,
"step": 172
},
{
"epoch": 0.09331175836030205,
"grad_norm": 0.00025384570471942425,
"learning_rate": 1.4224736842105262e-05,
"loss": 0.0,
"step": 173
},
{
"epoch": 0.09385113268608414,
"grad_norm": 0.000792273145634681,
"learning_rate": 1.3697894736842106e-05,
"loss": 0.0,
"step": 174
},
{
"epoch": 0.09439050701186623,
"grad_norm": 0.00023794101434759796,
"learning_rate": 1.3171052631578945e-05,
"loss": 0.0,
"step": 175
},
{
"epoch": 0.09492988133764833,
"grad_norm": 0.0003109975659754127,
"learning_rate": 1.264421052631579e-05,
"loss": 0.0,
"step": 176
},
{
"epoch": 0.09546925566343042,
"grad_norm": 0.0019688678439706564,
"learning_rate": 1.211736842105263e-05,
"loss": 0.0,
"step": 177
},
{
"epoch": 0.09600862998921252,
"grad_norm": 0.00809526164084673,
"learning_rate": 1.1590526315789473e-05,
"loss": 0.0001,
"step": 178
},
{
"epoch": 0.0965480043149946,
"grad_norm": 0.0008922089473344386,
"learning_rate": 1.1063684210526316e-05,
"loss": 0.0,
"step": 179
},
{
"epoch": 0.0970873786407767,
"grad_norm": 0.0005603457102552056,
"learning_rate": 1.0536842105263156e-05,
"loss": 0.0,
"step": 180
},
{
"epoch": 0.09762675296655879,
"grad_norm": 0.001602335018105805,
"learning_rate": 1.001e-05,
"loss": 0.0,
"step": 181
},
{
"epoch": 0.09816612729234088,
"grad_norm": 0.000997183844447136,
"learning_rate": 9.483157894736842e-06,
"loss": 0.0,
"step": 182
},
{
"epoch": 0.09870550161812297,
"grad_norm": 0.0009953331900760531,
"learning_rate": 8.956315789473684e-06,
"loss": 0.0,
"step": 183
},
{
"epoch": 0.09924487594390508,
"grad_norm": 0.0015611740527674556,
"learning_rate": 8.429473684210525e-06,
"loss": 0.0,
"step": 184
},
{
"epoch": 0.09978425026968717,
"grad_norm": 0.08797088265419006,
"learning_rate": 7.902631578947368e-06,
"loss": 0.0011,
"step": 185
},
{
"epoch": 0.10032362459546926,
"grad_norm": 0.0015272749587893486,
"learning_rate": 7.3757894736842095e-06,
"loss": 0.0,
"step": 186
},
{
"epoch": 0.10086299892125135,
"grad_norm": 0.007411181926727295,
"learning_rate": 6.848947368421053e-06,
"loss": 0.0001,
"step": 187
},
{
"epoch": 0.10140237324703344,
"grad_norm": 0.001111446414142847,
"learning_rate": 6.322105263157895e-06,
"loss": 0.0,
"step": 188
},
{
"epoch": 0.10194174757281553,
"grad_norm": 0.0008587180054746568,
"learning_rate": 5.7952631578947365e-06,
"loss": 0.0,
"step": 189
},
{
"epoch": 0.10248112189859762,
"grad_norm": 0.008806944824755192,
"learning_rate": 5.268421052631578e-06,
"loss": 0.0001,
"step": 190
},
{
"epoch": 0.10302049622437973,
"grad_norm": 0.059028804302215576,
"learning_rate": 4.741578947368421e-06,
"loss": 0.0008,
"step": 191
},
{
"epoch": 0.10355987055016182,
"grad_norm": 0.0017502488335594535,
"learning_rate": 4.2147368421052626e-06,
"loss": 0.0,
"step": 192
},
{
"epoch": 0.10409924487594391,
"grad_norm": 0.0008180232835002244,
"learning_rate": 3.6878947368421047e-06,
"loss": 0.0,
"step": 193
},
{
"epoch": 0.104638619201726,
"grad_norm": 0.0005253468989394605,
"learning_rate": 3.1610526315789474e-06,
"loss": 0.0,
"step": 194
},
{
"epoch": 0.10517799352750809,
"grad_norm": 0.007774029858410358,
"learning_rate": 2.634210526315789e-06,
"loss": 0.0001,
"step": 195
},
{
"epoch": 0.10571736785329018,
"grad_norm": 0.001689290627837181,
"learning_rate": 2.1073684210526313e-06,
"loss": 0.0,
"step": 196
},
{
"epoch": 0.10625674217907227,
"grad_norm": 0.005365308839827776,
"learning_rate": 1.5805263157894737e-06,
"loss": 0.0001,
"step": 197
},
{
"epoch": 0.10679611650485436,
"grad_norm": 0.0035137736704200506,
"learning_rate": 1.0536842105263156e-06,
"loss": 0.0,
"step": 198
},
{
"epoch": 0.10733549083063647,
"grad_norm": 0.0472775362432003,
"learning_rate": 5.268421052631578e-07,
"loss": 0.0005,
"step": 199
},
{
"epoch": 0.10787486515641856,
"grad_norm": 0.029273375868797302,
"learning_rate": 0.0,
"loss": 0.0003,
"step": 200
},
{
"epoch": 0.10787486515641856,
"eval_loss": 0.000882925814948976,
"eval_runtime": 43.7697,
"eval_samples_per_second": 17.843,
"eval_steps_per_second": 4.478,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.580280733197926e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}