clip-flow-qwen-vl-7b-final / trainer_state.json
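The JSON below is the Transformers Trainer state saved with this checkpoint; its "log_history" array records epoch, grad_norm, learning_rate, and loss at each optimizer step. A minimal sketch of how these entries could be loaded and plotted follows, assuming Python with matplotlib available; the local file path and plotting choices are illustrative assumptions, not part of the saved state.

import json
import matplotlib.pyplot as plt

# Assumed local path to this file; adjust as needed.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step training entries (each carries a "loss" field).
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]
lrs = [e["learning_rate"] for e in entries]

fig, ax_loss = plt.subplots()
ax_loss.plot(steps, losses, label="training loss")
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("loss")

# Second y-axis for the learning-rate schedule logged alongside the loss.
ax_lr = ax_loss.twinx()
ax_lr.plot(steps, lrs, color="tab:orange", label="learning rate")
ax_lr.set_ylabel("learning rate")

fig.tight_layout()
plt.show()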
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1393,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007178750897343862,
"grad_norm": 0.28594130277633667,
"learning_rate": 0.0,
"loss": 1.2147,
"step": 1
},
{
"epoch": 0.0014357501794687725,
"grad_norm": 0.2666853368282318,
"learning_rate": 4.000000000000001e-06,
"loss": 1.226,
"step": 2
},
{
"epoch": 0.0021536252692031586,
"grad_norm": 0.2672528922557831,
"learning_rate": 8.000000000000001e-06,
"loss": 1.2442,
"step": 3
},
{
"epoch": 0.002871500358937545,
"grad_norm": 2.0374395847320557,
"learning_rate": 1.2e-05,
"loss": 1.6612,
"step": 4
},
{
"epoch": 0.003589375448671931,
"grad_norm": 0.2744747996330261,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.2383,
"step": 5
},
{
"epoch": 0.004307250538406317,
"grad_norm": 0.2731163501739502,
"learning_rate": 2e-05,
"loss": 1.2612,
"step": 6
},
{
"epoch": 0.005025125628140704,
"grad_norm": 0.5806142687797546,
"learning_rate": 1.9999987192609945e-05,
"loss": 1.4832,
"step": 7
},
{
"epoch": 0.00574300071787509,
"grad_norm": 0.29958781599998474,
"learning_rate": 1.9999948770505387e-05,
"loss": 1.2451,
"step": 8
},
{
"epoch": 0.006460875807609476,
"grad_norm": 0.8398181796073914,
"learning_rate": 1.9999884733883163e-05,
"loss": 1.2251,
"step": 9
},
{
"epoch": 0.007178750897343862,
"grad_norm": 0.5620901584625244,
"learning_rate": 1.9999795083071327e-05,
"loss": 1.4215,
"step": 10
},
{
"epoch": 0.007896625987078248,
"grad_norm": 0.31047117710113525,
"learning_rate": 1.999967981852916e-05,
"loss": 1.2814,
"step": 11
},
{
"epoch": 0.008614501076812634,
"grad_norm": 0.33838632702827454,
"learning_rate": 1.9999538940847157e-05,
"loss": 1.2543,
"step": 12
},
{
"epoch": 0.00933237616654702,
"grad_norm": 0.5756534934043884,
"learning_rate": 1.9999372450747025e-05,
"loss": 1.3828,
"step": 13
},
{
"epoch": 0.010050251256281407,
"grad_norm": 0.6112161874771118,
"learning_rate": 1.9999180349081687e-05,
"loss": 1.4476,
"step": 14
},
{
"epoch": 0.010768126346015794,
"grad_norm": 0.3352164030075073,
"learning_rate": 1.9998962636835273e-05,
"loss": 1.2413,
"step": 15
},
{
"epoch": 0.01148600143575018,
"grad_norm": 0.28125348687171936,
"learning_rate": 1.999871931512311e-05,
"loss": 1.1692,
"step": 16
},
{
"epoch": 0.012203876525484566,
"grad_norm": 0.8347467184066772,
"learning_rate": 1.9998450385191728e-05,
"loss": 1.5037,
"step": 17
},
{
"epoch": 0.012921751615218953,
"grad_norm": 0.3188466429710388,
"learning_rate": 1.999815584841884e-05,
"loss": 1.2483,
"step": 18
},
{
"epoch": 0.013639626704953339,
"grad_norm": 0.36748605966567993,
"learning_rate": 1.9997835706313347e-05,
"loss": 1.2355,
"step": 19
},
{
"epoch": 0.014357501794687724,
"grad_norm": 0.37445810437202454,
"learning_rate": 1.999748996051532e-05,
"loss": 1.2169,
"step": 20
},
{
"epoch": 0.01507537688442211,
"grad_norm": 0.3378954827785492,
"learning_rate": 1.9997118612796006e-05,
"loss": 1.2259,
"step": 21
},
{
"epoch": 0.015793251974156496,
"grad_norm": 0.28010478615760803,
"learning_rate": 1.9996721665057797e-05,
"loss": 1.2048,
"step": 22
},
{
"epoch": 0.016511127063890883,
"grad_norm": 1.0997238159179688,
"learning_rate": 1.999629911933424e-05,
"loss": 1.3627,
"step": 23
},
{
"epoch": 0.01722900215362527,
"grad_norm": 0.28337937593460083,
"learning_rate": 1.9995850977790022e-05,
"loss": 1.2185,
"step": 24
},
{
"epoch": 0.017946877243359655,
"grad_norm": 0.3515605628490448,
"learning_rate": 1.9995377242720946e-05,
"loss": 1.2808,
"step": 25
},
{
"epoch": 0.01866475233309404,
"grad_norm": 0.37158203125,
"learning_rate": 1.999487791655394e-05,
"loss": 1.2446,
"step": 26
},
{
"epoch": 0.019382627422828428,
"grad_norm": 0.3290520906448364,
"learning_rate": 1.9994353001847027e-05,
"loss": 1.2313,
"step": 27
},
{
"epoch": 0.020100502512562814,
"grad_norm": 0.4342392086982727,
"learning_rate": 1.9993802501289328e-05,
"loss": 1.3652,
"step": 28
},
{
"epoch": 0.0208183776022972,
"grad_norm": 0.6782366037368774,
"learning_rate": 1.9993226417701028e-05,
"loss": 1.4407,
"step": 29
},
{
"epoch": 0.021536252692031587,
"grad_norm": 1.0299732685089111,
"learning_rate": 1.999262475403338e-05,
"loss": 1.4609,
"step": 30
},
{
"epoch": 0.022254127781765973,
"grad_norm": 0.27583467960357666,
"learning_rate": 1.9991997513368674e-05,
"loss": 1.205,
"step": 31
},
{
"epoch": 0.02297200287150036,
"grad_norm": 0.4443705677986145,
"learning_rate": 1.9991344698920248e-05,
"loss": 1.3496,
"step": 32
},
{
"epoch": 0.023689877961234746,
"grad_norm": 0.3160634934902191,
"learning_rate": 1.9990666314032436e-05,
"loss": 1.2201,
"step": 33
},
{
"epoch": 0.024407753050969132,
"grad_norm": 0.5006676316261292,
"learning_rate": 1.998996236218057e-05,
"loss": 1.4296,
"step": 34
},
{
"epoch": 0.02512562814070352,
"grad_norm": 0.42285463213920593,
"learning_rate": 1.998923284697097e-05,
"loss": 1.402,
"step": 35
},
{
"epoch": 0.025843503230437905,
"grad_norm": 0.5728902220726013,
"learning_rate": 1.9988477772140907e-05,
"loss": 1.2901,
"step": 36
},
{
"epoch": 0.02656137832017229,
"grad_norm": 0.758953869342804,
"learning_rate": 1.9987697141558603e-05,
"loss": 1.3689,
"step": 37
},
{
"epoch": 0.027279253409906678,
"grad_norm": 0.28098437190055847,
"learning_rate": 1.9986890959223182e-05,
"loss": 1.1861,
"step": 38
},
{
"epoch": 0.02799712849964106,
"grad_norm": 0.3095812201499939,
"learning_rate": 1.998605922926469e-05,
"loss": 1.2307,
"step": 39
},
{
"epoch": 0.028715003589375447,
"grad_norm": 0.31783121824264526,
"learning_rate": 1.998520195594404e-05,
"loss": 1.1731,
"step": 40
},
{
"epoch": 0.029432878679109833,
"grad_norm": 0.5449131727218628,
"learning_rate": 1.9984319143653006e-05,
"loss": 1.4122,
"step": 41
},
{
"epoch": 0.03015075376884422,
"grad_norm": 0.3425084352493286,
"learning_rate": 1.9983410796914197e-05,
"loss": 1.232,
"step": 42
},
{
"epoch": 0.030868628858578606,
"grad_norm": 0.3560698628425598,
"learning_rate": 1.998247692038103e-05,
"loss": 1.228,
"step": 43
},
{
"epoch": 0.03158650394831299,
"grad_norm": 0.2846856415271759,
"learning_rate": 1.998151751883772e-05,
"loss": 1.1939,
"step": 44
},
{
"epoch": 0.03230437903804738,
"grad_norm": 0.3970009684562683,
"learning_rate": 1.9980532597199235e-05,
"loss": 1.3507,
"step": 45
},
{
"epoch": 0.033022254127781765,
"grad_norm": 0.780319094657898,
"learning_rate": 1.9979522160511284e-05,
"loss": 1.4147,
"step": 46
},
{
"epoch": 0.03374012921751615,
"grad_norm": 0.32033440470695496,
"learning_rate": 1.997848621395029e-05,
"loss": 1.204,
"step": 47
},
{
"epoch": 0.03445800430725054,
"grad_norm": 0.645374059677124,
"learning_rate": 1.9977424762823366e-05,
"loss": 1.2397,
"step": 48
},
{
"epoch": 0.035175879396984924,
"grad_norm": 0.4876931607723236,
"learning_rate": 1.9976337812568273e-05,
"loss": 1.2226,
"step": 49
},
{
"epoch": 0.03589375448671931,
"grad_norm": 0.293576180934906,
"learning_rate": 1.997522536875341e-05,
"loss": 1.2281,
"step": 50
},
{
"epoch": 0.0366116295764537,
"grad_norm": 0.9625378847122192,
"learning_rate": 1.9974087437077786e-05,
"loss": 1.2895,
"step": 51
},
{
"epoch": 0.03732950466618808,
"grad_norm": 0.2653510272502899,
"learning_rate": 1.9972924023370966e-05,
"loss": 1.1359,
"step": 52
},
{
"epoch": 0.03804737975592247,
"grad_norm": 0.46903130412101746,
"learning_rate": 1.9971735133593074e-05,
"loss": 1.1813,
"step": 53
},
{
"epoch": 0.038765254845656856,
"grad_norm": 0.33105671405792236,
"learning_rate": 1.9970520773834734e-05,
"loss": 1.2488,
"step": 54
},
{
"epoch": 0.03948312993539124,
"grad_norm": 0.3274335265159607,
"learning_rate": 1.996928095031706e-05,
"loss": 1.1956,
"step": 55
},
{
"epoch": 0.04020100502512563,
"grad_norm": 0.30211400985717773,
"learning_rate": 1.9968015669391612e-05,
"loss": 1.1859,
"step": 56
},
{
"epoch": 0.040918880114860015,
"grad_norm": 0.4520847797393799,
"learning_rate": 1.9966724937540375e-05,
"loss": 1.3556,
"step": 57
},
{
"epoch": 0.0416367552045944,
"grad_norm": 0.26978108286857605,
"learning_rate": 1.9965408761375703e-05,
"loss": 1.2434,
"step": 58
},
{
"epoch": 0.04235463029432879,
"grad_norm": 0.37947210669517517,
"learning_rate": 1.996406714764031e-05,
"loss": 1.1514,
"step": 59
},
{
"epoch": 0.043072505384063174,
"grad_norm": 0.363207072019577,
"learning_rate": 1.9962700103207232e-05,
"loss": 1.1866,
"step": 60
},
{
"epoch": 0.04379038047379756,
"grad_norm": 0.3245425522327423,
"learning_rate": 1.9961307635079768e-05,
"loss": 1.1572,
"step": 61
},
{
"epoch": 0.04450825556353195,
"grad_norm": 0.27696144580841064,
"learning_rate": 1.9959889750391474e-05,
"loss": 1.2074,
"step": 62
},
{
"epoch": 0.04522613065326633,
"grad_norm": 0.28961682319641113,
"learning_rate": 1.9958446456406117e-05,
"loss": 1.1847,
"step": 63
},
{
"epoch": 0.04594400574300072,
"grad_norm": 0.3816603422164917,
"learning_rate": 1.9956977760517618e-05,
"loss": 1.2932,
"step": 64
},
{
"epoch": 0.046661880832735106,
"grad_norm": 0.3908121883869171,
"learning_rate": 1.995548367025005e-05,
"loss": 1.2683,
"step": 65
},
{
"epoch": 0.04737975592246949,
"grad_norm": 0.3329205811023712,
"learning_rate": 1.9953964193257564e-05,
"loss": 1.2455,
"step": 66
},
{
"epoch": 0.04809763101220388,
"grad_norm": 0.5550855398178101,
"learning_rate": 1.9952419337324384e-05,
"loss": 1.2555,
"step": 67
},
{
"epoch": 0.048815506101938265,
"grad_norm": 0.28543972969055176,
"learning_rate": 1.995084911036473e-05,
"loss": 1.1661,
"step": 68
},
{
"epoch": 0.04953338119167265,
"grad_norm": 0.26742851734161377,
"learning_rate": 1.994925352042281e-05,
"loss": 1.2163,
"step": 69
},
{
"epoch": 0.05025125628140704,
"grad_norm": 0.32335177063941956,
"learning_rate": 1.994763257567276e-05,
"loss": 1.1836,
"step": 70
},
{
"epoch": 0.050969131371141424,
"grad_norm": 0.6627066135406494,
"learning_rate": 1.9945986284418607e-05,
"loss": 1.3515,
"step": 71
},
{
"epoch": 0.05168700646087581,
"grad_norm": 1.2874963283538818,
"learning_rate": 1.994431465509423e-05,
"loss": 1.6039,
"step": 72
},
{
"epoch": 0.0524048815506102,
"grad_norm": 0.4531334638595581,
"learning_rate": 1.994261769626332e-05,
"loss": 1.2178,
"step": 73
},
{
"epoch": 0.05312275664034458,
"grad_norm": 0.26209312677383423,
"learning_rate": 1.994089541661931e-05,
"loss": 1.1948,
"step": 74
},
{
"epoch": 0.05384063173007897,
"grad_norm": 0.31642234325408936,
"learning_rate": 1.9939147824985366e-05,
"loss": 1.2251,
"step": 75
},
{
"epoch": 0.054558506819813356,
"grad_norm": 0.681365430355072,
"learning_rate": 1.993737493031433e-05,
"loss": 1.2787,
"step": 76
},
{
"epoch": 0.05527638190954774,
"grad_norm": 0.3119295537471771,
"learning_rate": 1.993557674168866e-05,
"loss": 1.2423,
"step": 77
},
{
"epoch": 0.05599425699928212,
"grad_norm": 0.295529842376709,
"learning_rate": 1.9933753268320394e-05,
"loss": 1.1692,
"step": 78
},
{
"epoch": 0.05671213208901651,
"grad_norm": 0.4035413861274719,
"learning_rate": 1.9931904519551106e-05,
"loss": 1.3287,
"step": 79
},
{
"epoch": 0.057430007178750894,
"grad_norm": 0.596662700176239,
"learning_rate": 1.993003050485186e-05,
"loss": 1.3592,
"step": 80
},
{
"epoch": 0.05814788226848528,
"grad_norm": 0.4334593117237091,
"learning_rate": 1.992813123382314e-05,
"loss": 1.307,
"step": 81
},
{
"epoch": 0.05886575735821967,
"grad_norm": 0.27314937114715576,
"learning_rate": 1.9926206716194843e-05,
"loss": 1.1615,
"step": 82
},
{
"epoch": 0.05958363244795405,
"grad_norm": 0.46125203371047974,
"learning_rate": 1.9924256961826177e-05,
"loss": 1.4522,
"step": 83
},
{
"epoch": 0.06030150753768844,
"grad_norm": 0.2750553488731384,
"learning_rate": 1.9922281980705655e-05,
"loss": 1.1959,
"step": 84
},
{
"epoch": 0.061019382627422826,
"grad_norm": 0.31477832794189453,
"learning_rate": 1.9920281782951013e-05,
"loss": 1.2452,
"step": 85
},
{
"epoch": 0.06173725771715721,
"grad_norm": 0.4933525323867798,
"learning_rate": 1.991825637880918e-05,
"loss": 1.3354,
"step": 86
},
{
"epoch": 0.0624551328068916,
"grad_norm": 0.32180190086364746,
"learning_rate": 1.9916205778656207e-05,
"loss": 1.1748,
"step": 87
},
{
"epoch": 0.06317300789662599,
"grad_norm": 0.7428871393203735,
"learning_rate": 1.9914129992997235e-05,
"loss": 1.21,
"step": 88
},
{
"epoch": 0.06389088298636038,
"grad_norm": 0.531775176525116,
"learning_rate": 1.9912029032466415e-05,
"loss": 1.3917,
"step": 89
},
{
"epoch": 0.06460875807609476,
"grad_norm": 0.4157712459564209,
"learning_rate": 1.9909902907826883e-05,
"loss": 1.2385,
"step": 90
},
{
"epoch": 0.06532663316582915,
"grad_norm": 1.7364940643310547,
"learning_rate": 1.990775162997068e-05,
"loss": 1.532,
"step": 91
},
{
"epoch": 0.06604450825556353,
"grad_norm": 0.5754929184913635,
"learning_rate": 1.9905575209918704e-05,
"loss": 1.3588,
"step": 92
},
{
"epoch": 0.06676238334529792,
"grad_norm": 0.3331480622291565,
"learning_rate": 1.9903373658820667e-05,
"loss": 1.1613,
"step": 93
},
{
"epoch": 0.0674802584350323,
"grad_norm": 0.35004234313964844,
"learning_rate": 1.990114698795501e-05,
"loss": 1.206,
"step": 94
},
{
"epoch": 0.0681981335247667,
"grad_norm": 0.36566850543022156,
"learning_rate": 1.989889520872887e-05,
"loss": 1.1561,
"step": 95
},
{
"epoch": 0.06891600861450108,
"grad_norm": 0.5920188426971436,
"learning_rate": 1.9896618332678022e-05,
"loss": 1.4349,
"step": 96
},
{
"epoch": 0.06963388370423547,
"grad_norm": 0.3459800183773041,
"learning_rate": 1.9894316371466794e-05,
"loss": 1.1681,
"step": 97
},
{
"epoch": 0.07035175879396985,
"grad_norm": 0.49259281158447266,
"learning_rate": 1.9891989336888034e-05,
"loss": 1.312,
"step": 98
},
{
"epoch": 0.07106963388370424,
"grad_norm": 0.31321045756340027,
"learning_rate": 1.988963724086304e-05,
"loss": 1.1947,
"step": 99
},
{
"epoch": 0.07178750897343862,
"grad_norm": 0.38120272755622864,
"learning_rate": 1.988726009544149e-05,
"loss": 1.2053,
"step": 100
},
{
"epoch": 0.07250538406317301,
"grad_norm": 0.3773294985294342,
"learning_rate": 1.9884857912801402e-05,
"loss": 1.1776,
"step": 101
},
{
"epoch": 0.0732232591529074,
"grad_norm": 0.9980801939964294,
"learning_rate": 1.988243070524905e-05,
"loss": 1.1999,
"step": 102
},
{
"epoch": 0.07394113424264179,
"grad_norm": 0.2763481140136719,
"learning_rate": 1.9879978485218913e-05,
"loss": 1.161,
"step": 103
},
{
"epoch": 0.07465900933237617,
"grad_norm": 0.2802128493785858,
"learning_rate": 1.9877501265273606e-05,
"loss": 1.1897,
"step": 104
},
{
"epoch": 0.07537688442211055,
"grad_norm": 1.2885147333145142,
"learning_rate": 1.9874999058103813e-05,
"loss": 1.5123,
"step": 105
},
{
"epoch": 0.07609475951184494,
"grad_norm": 0.3465665280818939,
"learning_rate": 1.9872471876528238e-05,
"loss": 1.1916,
"step": 106
},
{
"epoch": 0.07681263460157932,
"grad_norm": 0.4765189588069916,
"learning_rate": 1.9869919733493517e-05,
"loss": 1.2785,
"step": 107
},
{
"epoch": 0.07753050969131371,
"grad_norm": 0.3360905349254608,
"learning_rate": 1.986734264207417e-05,
"loss": 1.1136,
"step": 108
},
{
"epoch": 0.07824838478104809,
"grad_norm": 0.41545817255973816,
"learning_rate": 1.9864740615472516e-05,
"loss": 1.2077,
"step": 109
},
{
"epoch": 0.07896625987078248,
"grad_norm": 0.2709164619445801,
"learning_rate": 1.986211366701863e-05,
"loss": 1.2149,
"step": 110
},
{
"epoch": 0.07968413496051686,
"grad_norm": 0.28264105319976807,
"learning_rate": 1.9859461810170248e-05,
"loss": 1.1749,
"step": 111
},
{
"epoch": 0.08040201005025126,
"grad_norm": 0.2910136282444,
"learning_rate": 1.9856785058512723e-05,
"loss": 1.1727,
"step": 112
},
{
"epoch": 0.08111988513998564,
"grad_norm": 0.2950994372367859,
"learning_rate": 1.9854083425758933e-05,
"loss": 1.1458,
"step": 113
},
{
"epoch": 0.08183776022972003,
"grad_norm": 0.40134933590888977,
"learning_rate": 1.9851356925749217e-05,
"loss": 1.2594,
"step": 114
},
{
"epoch": 0.08255563531945441,
"grad_norm": 0.38769832253456116,
"learning_rate": 1.9848605572451326e-05,
"loss": 1.1522,
"step": 115
},
{
"epoch": 0.0832735104091888,
"grad_norm": 0.24265125393867493,
"learning_rate": 1.9845829379960313e-05,
"loss": 1.1218,
"step": 116
},
{
"epoch": 0.08399138549892318,
"grad_norm": 0.2472332864999771,
"learning_rate": 1.9843028362498496e-05,
"loss": 1.1572,
"step": 117
},
{
"epoch": 0.08470926058865758,
"grad_norm": 0.37674251198768616,
"learning_rate": 1.984020253441536e-05,
"loss": 1.1687,
"step": 118
},
{
"epoch": 0.08542713567839195,
"grad_norm": 0.27769795060157776,
"learning_rate": 1.98373519101875e-05,
"loss": 1.183,
"step": 119
},
{
"epoch": 0.08614501076812635,
"grad_norm": 1.3421275615692139,
"learning_rate": 1.9834476504418538e-05,
"loss": 1.3964,
"step": 120
},
{
"epoch": 0.08686288585786073,
"grad_norm": 0.3361498713493347,
"learning_rate": 1.983157633183905e-05,
"loss": 1.1567,
"step": 121
},
{
"epoch": 0.08758076094759512,
"grad_norm": 0.253305584192276,
"learning_rate": 1.9828651407306497e-05,
"loss": 1.1364,
"step": 122
},
{
"epoch": 0.0882986360373295,
"grad_norm": 0.48132067918777466,
"learning_rate": 1.9825701745805136e-05,
"loss": 1.3131,
"step": 123
},
{
"epoch": 0.0890165111270639,
"grad_norm": 0.399525910615921,
"learning_rate": 1.982272736244595e-05,
"loss": 1.2078,
"step": 124
},
{
"epoch": 0.08973438621679827,
"grad_norm": 0.48796120285987854,
"learning_rate": 1.9819728272466578e-05,
"loss": 1.2872,
"step": 125
},
{
"epoch": 0.09045226130653267,
"grad_norm": 0.37568381428718567,
"learning_rate": 1.9816704491231225e-05,
"loss": 1.0992,
"step": 126
},
{
"epoch": 0.09117013639626705,
"grad_norm": 0.2590203881263733,
"learning_rate": 1.9813656034230593e-05,
"loss": 1.2027,
"step": 127
},
{
"epoch": 0.09188801148600144,
"grad_norm": 0.5362399816513062,
"learning_rate": 1.9810582917081786e-05,
"loss": 1.25,
"step": 128
},
{
"epoch": 0.09260588657573582,
"grad_norm": 0.46542394161224365,
"learning_rate": 1.980748515552825e-05,
"loss": 1.1606,
"step": 129
},
{
"epoch": 0.09332376166547021,
"grad_norm": 0.26864808797836304,
"learning_rate": 1.980436276543969e-05,
"loss": 1.138,
"step": 130
},
{
"epoch": 0.09404163675520459,
"grad_norm": 0.36033275723457336,
"learning_rate": 1.980121576281196e-05,
"loss": 1.1465,
"step": 131
},
{
"epoch": 0.09475951184493898,
"grad_norm": 0.3540986180305481,
"learning_rate": 1.9798044163767023e-05,
"loss": 1.1093,
"step": 132
},
{
"epoch": 0.09547738693467336,
"grad_norm": 0.4412473738193512,
"learning_rate": 1.979484798455284e-05,
"loss": 1.2397,
"step": 133
},
{
"epoch": 0.09619526202440776,
"grad_norm": 0.2563040852546692,
"learning_rate": 1.97916272415433e-05,
"loss": 1.1977,
"step": 134
},
{
"epoch": 0.09691313711414214,
"grad_norm": 0.29979026317596436,
"learning_rate": 1.9788381951238122e-05,
"loss": 1.1749,
"step": 135
},
{
"epoch": 0.09763101220387653,
"grad_norm": 0.5110599398612976,
"learning_rate": 1.9785112130262792e-05,
"loss": 1.4412,
"step": 136
},
{
"epoch": 0.09834888729361091,
"grad_norm": 0.3159632980823517,
"learning_rate": 1.978181779536845e-05,
"loss": 1.1907,
"step": 137
},
{
"epoch": 0.0990667623833453,
"grad_norm": 0.2782893776893616,
"learning_rate": 1.9778498963431837e-05,
"loss": 1.176,
"step": 138
},
{
"epoch": 0.09978463747307968,
"grad_norm": 0.5137550234794617,
"learning_rate": 1.977515565145518e-05,
"loss": 1.2715,
"step": 139
},
{
"epoch": 0.10050251256281408,
"grad_norm": 1.1845322847366333,
"learning_rate": 1.9771787876566124e-05,
"loss": 1.1766,
"step": 140
},
{
"epoch": 0.10122038765254845,
"grad_norm": 0.5801928639411926,
"learning_rate": 1.976839565601762e-05,
"loss": 1.2851,
"step": 141
},
{
"epoch": 0.10193826274228285,
"grad_norm": 0.6315880417823792,
"learning_rate": 1.9764979007187875e-05,
"loss": 1.2729,
"step": 142
},
{
"epoch": 0.10265613783201723,
"grad_norm": 0.5468786358833313,
"learning_rate": 1.976153794758023e-05,
"loss": 1.1672,
"step": 143
},
{
"epoch": 0.10337401292175162,
"grad_norm": 0.3717634379863739,
"learning_rate": 1.975807249482307e-05,
"loss": 1.2785,
"step": 144
},
{
"epoch": 0.104091888011486,
"grad_norm": 0.334043949842453,
"learning_rate": 1.9754582666669776e-05,
"loss": 1.1835,
"step": 145
},
{
"epoch": 0.1048097631012204,
"grad_norm": 0.3045234978199005,
"learning_rate": 1.9751068480998573e-05,
"loss": 1.1433,
"step": 146
},
{
"epoch": 0.10552763819095477,
"grad_norm": 0.2606176733970642,
"learning_rate": 1.974752995581248e-05,
"loss": 1.1382,
"step": 147
},
{
"epoch": 0.10624551328068917,
"grad_norm": 0.28245827555656433,
"learning_rate": 1.974396710923921e-05,
"loss": 1.2097,
"step": 148
},
{
"epoch": 0.10696338837042355,
"grad_norm": 0.26672127842903137,
"learning_rate": 1.9740379959531063e-05,
"loss": 1.1527,
"step": 149
},
{
"epoch": 0.10768126346015794,
"grad_norm": 0.27380216121673584,
"learning_rate": 1.9736768525064854e-05,
"loss": 1.1372,
"step": 150
},
{
"epoch": 0.10839913854989232,
"grad_norm": 0.27657026052474976,
"learning_rate": 1.9733132824341802e-05,
"loss": 1.1499,
"step": 151
},
{
"epoch": 0.10911701363962671,
"grad_norm": 0.32931819558143616,
"learning_rate": 1.9729472875987442e-05,
"loss": 1.1688,
"step": 152
},
{
"epoch": 0.10983488872936109,
"grad_norm": 0.3223106265068054,
"learning_rate": 1.972578869875153e-05,
"loss": 1.2062,
"step": 153
},
{
"epoch": 0.11055276381909548,
"grad_norm": 0.2810954749584198,
"learning_rate": 1.9722080311507938e-05,
"loss": 1.1553,
"step": 154
},
{
"epoch": 0.11127063890882986,
"grad_norm": 0.2533334195613861,
"learning_rate": 1.9718347733254578e-05,
"loss": 1.1613,
"step": 155
},
{
"epoch": 0.11198851399856424,
"grad_norm": 0.44092583656311035,
"learning_rate": 1.971459098311328e-05,
"loss": 1.2391,
"step": 156
},
{
"epoch": 0.11270638908829864,
"grad_norm": 0.4267832636833191,
"learning_rate": 1.971081008032971e-05,
"loss": 1.329,
"step": 157
},
{
"epoch": 0.11342426417803302,
"grad_norm": 0.3565402030944824,
"learning_rate": 1.970700504427327e-05,
"loss": 1.1892,
"step": 158
},
{
"epoch": 0.11414213926776741,
"grad_norm": 0.4627990424633026,
"learning_rate": 1.9703175894436987e-05,
"loss": 1.2786,
"step": 159
},
{
"epoch": 0.11486001435750179,
"grad_norm": 0.31331735849380493,
"learning_rate": 1.9699322650437433e-05,
"loss": 1.1981,
"step": 160
},
{
"epoch": 0.11557788944723618,
"grad_norm": 0.3424057960510254,
"learning_rate": 1.9695445332014605e-05,
"loss": 1.1984,
"step": 161
},
{
"epoch": 0.11629576453697056,
"grad_norm": 0.41167309880256653,
"learning_rate": 1.969154395903183e-05,
"loss": 1.2147,
"step": 162
},
{
"epoch": 0.11701363962670495,
"grad_norm": 0.2450089007616043,
"learning_rate": 1.968761855147568e-05,
"loss": 1.1662,
"step": 163
},
{
"epoch": 0.11773151471643933,
"grad_norm": 0.39046624302864075,
"learning_rate": 1.9683669129455838e-05,
"loss": 1.1795,
"step": 164
},
{
"epoch": 0.11844938980617373,
"grad_norm": 0.28102466464042664,
"learning_rate": 1.967969571320502e-05,
"loss": 1.1091,
"step": 165
},
{
"epoch": 0.1191672648959081,
"grad_norm": 0.5738700032234192,
"learning_rate": 1.9675698323078864e-05,
"loss": 1.3535,
"step": 166
},
{
"epoch": 0.1198851399856425,
"grad_norm": 0.31552526354789734,
"learning_rate": 1.9671676979555827e-05,
"loss": 1.1579,
"step": 167
},
{
"epoch": 0.12060301507537688,
"grad_norm": 0.280458003282547,
"learning_rate": 1.9667631703237073e-05,
"loss": 1.1366,
"step": 168
},
{
"epoch": 0.12132089016511127,
"grad_norm": 0.3525103032588959,
"learning_rate": 1.9663562514846367e-05,
"loss": 1.1681,
"step": 169
},
{
"epoch": 0.12203876525484565,
"grad_norm": 0.24137817323207855,
"learning_rate": 1.9659469435229993e-05,
"loss": 1.1573,
"step": 170
},
{
"epoch": 0.12275664034458005,
"grad_norm": 0.5218951106071472,
"learning_rate": 1.9655352485356615e-05,
"loss": 1.2824,
"step": 171
},
{
"epoch": 0.12347451543431442,
"grad_norm": 2.0925486087799072,
"learning_rate": 1.965121168631718e-05,
"loss": 1.4668,
"step": 172
},
{
"epoch": 0.12419239052404882,
"grad_norm": 0.40912044048309326,
"learning_rate": 1.9647047059324828e-05,
"loss": 1.3401,
"step": 173
},
{
"epoch": 0.1249102656137832,
"grad_norm": 0.44498252868652344,
"learning_rate": 1.9642858625714754e-05,
"loss": 1.2355,
"step": 174
},
{
"epoch": 0.12562814070351758,
"grad_norm": 0.47769415378570557,
"learning_rate": 1.9638646406944123e-05,
"loss": 1.2284,
"step": 175
},
{
"epoch": 0.12634601579325197,
"grad_norm": 0.26920589804649353,
"learning_rate": 1.963441042459194e-05,
"loss": 1.0832,
"step": 176
},
{
"epoch": 0.12706389088298636,
"grad_norm": 0.23602105677127838,
"learning_rate": 1.963015070035897e-05,
"loss": 1.1441,
"step": 177
},
{
"epoch": 0.12778176597272076,
"grad_norm": 0.31674906611442566,
"learning_rate": 1.9625867256067578e-05,
"loss": 1.1578,
"step": 178
},
{
"epoch": 0.12849964106245512,
"grad_norm": 0.26973602175712585,
"learning_rate": 1.962156011366167e-05,
"loss": 1.0848,
"step": 179
},
{
"epoch": 0.12921751615218952,
"grad_norm": 0.3377930223941803,
"learning_rate": 1.961722929520654e-05,
"loss": 1.253,
"step": 180
},
{
"epoch": 0.1299353912419239,
"grad_norm": 0.5862683653831482,
"learning_rate": 1.9612874822888787e-05,
"loss": 1.3045,
"step": 181
},
{
"epoch": 0.1306532663316583,
"grad_norm": 2.053889036178589,
"learning_rate": 1.960849671901618e-05,
"loss": 1.2307,
"step": 182
},
{
"epoch": 0.13137114142139267,
"grad_norm": 0.3686397671699524,
"learning_rate": 1.9604095006017546e-05,
"loss": 1.1462,
"step": 183
},
{
"epoch": 0.13208901651112706,
"grad_norm": 0.2870405912399292,
"learning_rate": 1.9599669706442676e-05,
"loss": 1.1773,
"step": 184
},
{
"epoch": 0.13280689160086145,
"grad_norm": 0.4055803120136261,
"learning_rate": 1.9595220842962178e-05,
"loss": 1.1849,
"step": 185
},
{
"epoch": 0.13352476669059585,
"grad_norm": 0.3482927978038788,
"learning_rate": 1.959074843836739e-05,
"loss": 1.1565,
"step": 186
},
{
"epoch": 0.1342426417803302,
"grad_norm": 0.348791241645813,
"learning_rate": 1.958625251557024e-05,
"loss": 1.2002,
"step": 187
},
{
"epoch": 0.1349605168700646,
"grad_norm": 0.6808797121047974,
"learning_rate": 1.9581733097603145e-05,
"loss": 1.3044,
"step": 188
},
{
"epoch": 0.135678391959799,
"grad_norm": 0.2962851822376251,
"learning_rate": 1.9577190207618884e-05,
"loss": 1.1848,
"step": 189
},
{
"epoch": 0.1363962670495334,
"grad_norm": 0.2507338523864746,
"learning_rate": 1.9572623868890483e-05,
"loss": 1.2179,
"step": 190
},
{
"epoch": 0.13711414213926776,
"grad_norm": 0.3296150863170624,
"learning_rate": 1.956803410481109e-05,
"loss": 1.1572,
"step": 191
},
{
"epoch": 0.13783201722900215,
"grad_norm": 0.40111851692199707,
"learning_rate": 1.9563420938893875e-05,
"loss": 1.156,
"step": 192
},
{
"epoch": 0.13854989231873654,
"grad_norm": 0.6545300483703613,
"learning_rate": 1.955878439477187e-05,
"loss": 1.1174,
"step": 193
},
{
"epoch": 0.13926776740847094,
"grad_norm": 0.3253189027309418,
"learning_rate": 1.95541244961979e-05,
"loss": 1.1701,
"step": 194
},
{
"epoch": 0.1399856424982053,
"grad_norm": 0.2571970522403717,
"learning_rate": 1.954944126704441e-05,
"loss": 1.1431,
"step": 195
},
{
"epoch": 0.1407035175879397,
"grad_norm": 0.2755349278450012,
"learning_rate": 1.9544734731303384e-05,
"loss": 1.0937,
"step": 196
},
{
"epoch": 0.1414213926776741,
"grad_norm": 0.45541664958000183,
"learning_rate": 1.9540004913086196e-05,
"loss": 1.1376,
"step": 197
},
{
"epoch": 0.14213926776740848,
"grad_norm": 0.3663983941078186,
"learning_rate": 1.9535251836623493e-05,
"loss": 1.1753,
"step": 198
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.2416490763425827,
"learning_rate": 1.953047552626508e-05,
"loss": 1.1567,
"step": 199
},
{
"epoch": 0.14357501794687724,
"grad_norm": 0.3104878067970276,
"learning_rate": 1.9525676006479785e-05,
"loss": 1.1494,
"step": 200
},
{
"epoch": 0.14429289303661164,
"grad_norm": 0.5359750390052795,
"learning_rate": 1.9520853301855335e-05,
"loss": 1.2076,
"step": 201
},
{
"epoch": 0.14501076812634603,
"grad_norm": 0.2803167700767517,
"learning_rate": 1.951600743709824e-05,
"loss": 1.1267,
"step": 202
},
{
"epoch": 0.1457286432160804,
"grad_norm": 0.2752203643321991,
"learning_rate": 1.951113843703364e-05,
"loss": 1.1385,
"step": 203
},
{
"epoch": 0.1464465183058148,
"grad_norm": 0.7712585926055908,
"learning_rate": 1.950624632660522e-05,
"loss": 1.3345,
"step": 204
},
{
"epoch": 0.14716439339554918,
"grad_norm": 0.4798975884914398,
"learning_rate": 1.950133113087504e-05,
"loss": 1.2706,
"step": 205
},
{
"epoch": 0.14788226848528357,
"grad_norm": 0.5709019303321838,
"learning_rate": 1.9496392875023432e-05,
"loss": 1.1709,
"step": 206
},
{
"epoch": 0.14860014357501794,
"grad_norm": 0.449605256319046,
"learning_rate": 1.9491431584348866e-05,
"loss": 1.1443,
"step": 207
},
{
"epoch": 0.14931801866475233,
"grad_norm": 0.42168155312538147,
"learning_rate": 1.9486447284267817e-05,
"loss": 1.1603,
"step": 208
},
{
"epoch": 0.15003589375448673,
"grad_norm": 0.3384092450141907,
"learning_rate": 1.948144000031463e-05,
"loss": 1.1259,
"step": 209
},
{
"epoch": 0.1507537688442211,
"grad_norm": 0.26280686259269714,
"learning_rate": 1.9476409758141407e-05,
"loss": 1.1936,
"step": 210
},
{
"epoch": 0.15147164393395549,
"grad_norm": 0.39539268612861633,
"learning_rate": 1.947135658351785e-05,
"loss": 1.2688,
"step": 211
},
{
"epoch": 0.15218951902368988,
"grad_norm": 0.26841455698013306,
"learning_rate": 1.9466280502331157e-05,
"loss": 1.106,
"step": 212
},
{
"epoch": 0.15290739411342427,
"grad_norm": 0.3093603849411011,
"learning_rate": 1.9461181540585864e-05,
"loss": 1.1567,
"step": 213
},
{
"epoch": 0.15362526920315864,
"grad_norm": 0.2611793875694275,
"learning_rate": 1.945605972440373e-05,
"loss": 1.142,
"step": 214
},
{
"epoch": 0.15434314429289303,
"grad_norm": 0.24437759816646576,
"learning_rate": 1.9450915080023595e-05,
"loss": 1.1771,
"step": 215
},
{
"epoch": 0.15506101938262742,
"grad_norm": 0.6352970600128174,
"learning_rate": 1.9445747633801244e-05,
"loss": 1.2487,
"step": 216
},
{
"epoch": 0.15577889447236182,
"grad_norm": 0.2860240638256073,
"learning_rate": 1.9440557412209276e-05,
"loss": 1.1032,
"step": 217
},
{
"epoch": 0.15649676956209618,
"grad_norm": 0.2164093255996704,
"learning_rate": 1.943534444183697e-05,
"loss": 1.1733,
"step": 218
},
{
"epoch": 0.15721464465183058,
"grad_norm": 0.35152971744537354,
"learning_rate": 1.9430108749390144e-05,
"loss": 1.1196,
"step": 219
},
{
"epoch": 0.15793251974156497,
"grad_norm": 0.30813077092170715,
"learning_rate": 1.9424850361691018e-05,
"loss": 1.1169,
"step": 220
},
{
"epoch": 0.15865039483129936,
"grad_norm": 35.50032424926758,
"learning_rate": 1.9419569305678085e-05,
"loss": 1.6415,
"step": 221
},
{
"epoch": 0.15936826992103373,
"grad_norm": 0.662775993347168,
"learning_rate": 1.9414265608405957e-05,
"loss": 1.103,
"step": 222
},
{
"epoch": 0.16008614501076812,
"grad_norm": 0.3208405375480652,
"learning_rate": 1.940893929704525e-05,
"loss": 1.1864,
"step": 223
},
{
"epoch": 0.16080402010050251,
"grad_norm": 2.3010807037353516,
"learning_rate": 1.9403590398882412e-05,
"loss": 1.3652,
"step": 224
},
{
"epoch": 0.1615218951902369,
"grad_norm": 0.46707963943481445,
"learning_rate": 1.9398218941319623e-05,
"loss": 1.1541,
"step": 225
},
{
"epoch": 0.16223977027997127,
"grad_norm": 0.44534140825271606,
"learning_rate": 1.9392824951874617e-05,
"loss": 1.124,
"step": 226
},
{
"epoch": 0.16295764536970567,
"grad_norm": 0.45970529317855835,
"learning_rate": 1.938740845818057e-05,
"loss": 1.0864,
"step": 227
},
{
"epoch": 0.16367552045944006,
"grad_norm": 0.2933219075202942,
"learning_rate": 1.9381969487985937e-05,
"loss": 1.1468,
"step": 228
},
{
"epoch": 0.16439339554917445,
"grad_norm": 0.34940993785858154,
"learning_rate": 1.9376508069154326e-05,
"loss": 1.1782,
"step": 229
},
{
"epoch": 0.16511127063890882,
"grad_norm": 0.2643601596355438,
"learning_rate": 1.9371024229664344e-05,
"loss": 1.1619,
"step": 230
},
{
"epoch": 0.1658291457286432,
"grad_norm": 0.5773788690567017,
"learning_rate": 1.9365517997609458e-05,
"loss": 1.3177,
"step": 231
},
{
"epoch": 0.1665470208183776,
"grad_norm": 0.2795402705669403,
"learning_rate": 1.9359989401197854e-05,
"loss": 1.2282,
"step": 232
},
{
"epoch": 0.167264895908112,
"grad_norm": 1.007430076599121,
"learning_rate": 1.935443846875229e-05,
"loss": 1.5002,
"step": 233
},
{
"epoch": 0.16798277099784636,
"grad_norm": 0.35091131925582886,
"learning_rate": 1.934886522870995e-05,
"loss": 1.1605,
"step": 234
},
{
"epoch": 0.16870064608758076,
"grad_norm": 0.41040295362472534,
"learning_rate": 1.934326970962229e-05,
"loss": 1.1619,
"step": 235
},
{
"epoch": 0.16941852117731515,
"grad_norm": 0.24819721281528473,
"learning_rate": 1.9337651940154915e-05,
"loss": 1.152,
"step": 236
},
{
"epoch": 0.17013639626704954,
"grad_norm": 0.31907832622528076,
"learning_rate": 1.933201194908741e-05,
"loss": 1.1385,
"step": 237
},
{
"epoch": 0.1708542713567839,
"grad_norm": 0.5197360515594482,
"learning_rate": 1.9326349765313203e-05,
"loss": 1.1882,
"step": 238
},
{
"epoch": 0.1715721464465183,
"grad_norm": 0.526611328125,
"learning_rate": 1.9320665417839405e-05,
"loss": 1.1695,
"step": 239
},
{
"epoch": 0.1722900215362527,
"grad_norm": 0.4685799479484558,
"learning_rate": 1.9314958935786685e-05,
"loss": 1.2111,
"step": 240
},
{
"epoch": 0.1730078966259871,
"grad_norm": 0.3952530026435852,
"learning_rate": 1.9309230348389097e-05,
"loss": 1.1435,
"step": 241
},
{
"epoch": 0.17372577171572146,
"grad_norm": 0.7753391861915588,
"learning_rate": 1.9303479684993944e-05,
"loss": 1.3008,
"step": 242
},
{
"epoch": 0.17444364680545585,
"grad_norm": 0.2788830101490021,
"learning_rate": 1.9297706975061618e-05,
"loss": 1.1781,
"step": 243
},
{
"epoch": 0.17516152189519024,
"grad_norm": 0.6017007827758789,
"learning_rate": 1.929191224816546e-05,
"loss": 1.2131,
"step": 244
},
{
"epoch": 0.17587939698492464,
"grad_norm": 0.324994832277298,
"learning_rate": 1.9286095533991608e-05,
"loss": 1.1453,
"step": 245
},
{
"epoch": 0.176597272074659,
"grad_norm": 0.5178776979446411,
"learning_rate": 1.928025686233882e-05,
"loss": 1.1078,
"step": 246
},
{
"epoch": 0.1773151471643934,
"grad_norm": 0.371931254863739,
"learning_rate": 1.9274396263118366e-05,
"loss": 1.1255,
"step": 247
},
{
"epoch": 0.1780330222541278,
"grad_norm": 0.38106516003608704,
"learning_rate": 1.926851376635383e-05,
"loss": 1.1887,
"step": 248
},
{
"epoch": 0.17875089734386218,
"grad_norm": 0.35742947459220886,
"learning_rate": 1.926260940218099e-05,
"loss": 1.1045,
"step": 249
},
{
"epoch": 0.17946877243359655,
"grad_norm": 0.5332513451576233,
"learning_rate": 1.9256683200847638e-05,
"loss": 1.2387,
"step": 250
},
{
"epoch": 0.18018664752333094,
"grad_norm": 0.24585315585136414,
"learning_rate": 1.9250735192713447e-05,
"loss": 1.0728,
"step": 251
},
{
"epoch": 0.18090452261306533,
"grad_norm": 0.4742366075515747,
"learning_rate": 1.92447654082498e-05,
"loss": 1.2066,
"step": 252
},
{
"epoch": 0.18162239770279973,
"grad_norm": 0.2356850951910019,
"learning_rate": 1.9238773878039638e-05,
"loss": 1.0949,
"step": 253
},
{
"epoch": 0.1823402727925341,
"grad_norm": 0.24765734374523163,
"learning_rate": 1.9232760632777312e-05,
"loss": 1.1209,
"step": 254
},
{
"epoch": 0.18305814788226848,
"grad_norm": 0.4552963674068451,
"learning_rate": 1.922672570326841e-05,
"loss": 1.1351,
"step": 255
},
{
"epoch": 0.18377602297200288,
"grad_norm": 0.28184211254119873,
"learning_rate": 1.922066912042961e-05,
"loss": 1.1333,
"step": 256
},
{
"epoch": 0.18449389806173727,
"grad_norm": 0.32718122005462646,
"learning_rate": 1.921459091528852e-05,
"loss": 1.2047,
"step": 257
},
{
"epoch": 0.18521177315147164,
"grad_norm": 0.4896772503852844,
"learning_rate": 1.9208491118983517e-05,
"loss": 1.2929,
"step": 258
},
{
"epoch": 0.18592964824120603,
"grad_norm": 0.6823757886886597,
"learning_rate": 1.9202369762763587e-05,
"loss": 1.3192,
"step": 259
},
{
"epoch": 0.18664752333094042,
"grad_norm": 0.28378769755363464,
"learning_rate": 1.9196226877988175e-05,
"loss": 1.1805,
"step": 260
},
{
"epoch": 0.1873653984206748,
"grad_norm": 0.43191203474998474,
"learning_rate": 1.9190062496127008e-05,
"loss": 1.0961,
"step": 261
},
{
"epoch": 0.18808327351040918,
"grad_norm": 0.2138504683971405,
"learning_rate": 1.9183876648759938e-05,
"loss": 1.1073,
"step": 262
},
{
"epoch": 0.18880114860014358,
"grad_norm": 0.25294116139411926,
"learning_rate": 1.9177669367576794e-05,
"loss": 1.108,
"step": 263
},
{
"epoch": 0.18951902368987797,
"grad_norm": 0.2560763955116272,
"learning_rate": 1.9171440684377204e-05,
"loss": 1.1339,
"step": 264
},
{
"epoch": 0.19023689877961233,
"grad_norm": 0.2744760811328888,
"learning_rate": 1.9165190631070435e-05,
"loss": 1.2059,
"step": 265
},
{
"epoch": 0.19095477386934673,
"grad_norm": 0.2920529842376709,
"learning_rate": 1.915891923967524e-05,
"loss": 1.1597,
"step": 266
},
{
"epoch": 0.19167264895908112,
"grad_norm": 0.4311828315258026,
"learning_rate": 1.9152626542319673e-05,
"loss": 1.2409,
"step": 267
},
{
"epoch": 0.19239052404881551,
"grad_norm": 0.29880067706108093,
"learning_rate": 1.9146312571240954e-05,
"loss": 1.1518,
"step": 268
},
{
"epoch": 0.19310839913854988,
"grad_norm": 0.4842053949832916,
"learning_rate": 1.9139977358785277e-05,
"loss": 1.2759,
"step": 269
},
{
"epoch": 0.19382627422828427,
"grad_norm": 0.3157087564468384,
"learning_rate": 1.9133620937407657e-05,
"loss": 1.0769,
"step": 270
},
{
"epoch": 0.19454414931801867,
"grad_norm": 0.3534604012966156,
"learning_rate": 1.912724333967176e-05,
"loss": 1.1963,
"step": 271
},
{
"epoch": 0.19526202440775306,
"grad_norm": 0.4059816598892212,
"learning_rate": 1.912084459824974e-05,
"loss": 1.2315,
"step": 272
},
{
"epoch": 0.19597989949748743,
"grad_norm": 0.521371066570282,
"learning_rate": 1.9114424745922065e-05,
"loss": 1.1831,
"step": 273
},
{
"epoch": 0.19669777458722182,
"grad_norm": 2.541219472885132,
"learning_rate": 1.910798381557736e-05,
"loss": 1.4695,
"step": 274
},
{
"epoch": 0.1974156496769562,
"grad_norm": 0.3894226551055908,
"learning_rate": 1.9101521840212225e-05,
"loss": 1.1516,
"step": 275
},
{
"epoch": 0.1981335247666906,
"grad_norm": 0.3359840512275696,
"learning_rate": 1.9095038852931078e-05,
"loss": 1.124,
"step": 276
},
{
"epoch": 0.19885139985642497,
"grad_norm": 0.336245059967041,
"learning_rate": 1.9088534886945978e-05,
"loss": 1.1803,
"step": 277
},
{
"epoch": 0.19956927494615936,
"grad_norm": 0.2564994990825653,
"learning_rate": 1.908200997557645e-05,
"loss": 1.0962,
"step": 278
},
{
"epoch": 0.20028715003589376,
"grad_norm": 0.26010560989379883,
"learning_rate": 1.907546415224934e-05,
"loss": 1.126,
"step": 279
},
{
"epoch": 0.20100502512562815,
"grad_norm": 0.6532833576202393,
"learning_rate": 1.90688974504986e-05,
"loss": 1.2128,
"step": 280
},
{
"epoch": 0.20172290021536252,
"grad_norm": 0.3625889718532562,
"learning_rate": 1.9062309903965166e-05,
"loss": 1.1788,
"step": 281
},
{
"epoch": 0.2024407753050969,
"grad_norm": 0.6255800127983093,
"learning_rate": 1.905570154639674e-05,
"loss": 1.328,
"step": 282
},
{
"epoch": 0.2031586503948313,
"grad_norm": 0.28508058190345764,
"learning_rate": 1.9049072411647652e-05,
"loss": 1.147,
"step": 283
},
{
"epoch": 0.2038765254845657,
"grad_norm": 2.5196280479431152,
"learning_rate": 1.9042422533678668e-05,
"loss": 1.3916,
"step": 284
},
{
"epoch": 0.20459440057430006,
"grad_norm": 0.6198713183403015,
"learning_rate": 1.903575194655682e-05,
"loss": 1.2978,
"step": 285
},
{
"epoch": 0.20531227566403445,
"grad_norm": 0.37188202142715454,
"learning_rate": 1.902906068445523e-05,
"loss": 1.165,
"step": 286
},
{
"epoch": 0.20603015075376885,
"grad_norm": 0.24965988099575043,
"learning_rate": 1.902234878165294e-05,
"loss": 1.1278,
"step": 287
},
{
"epoch": 0.20674802584350324,
"grad_norm": 0.7749665379524231,
"learning_rate": 1.9015616272534733e-05,
"loss": 1.2764,
"step": 288
},
{
"epoch": 0.2074659009332376,
"grad_norm": 0.3034153878688812,
"learning_rate": 1.9008863191590964e-05,
"loss": 1.105,
"step": 289
},
{
"epoch": 0.208183776022972,
"grad_norm": 0.33309271931648254,
"learning_rate": 1.9002089573417357e-05,
"loss": 1.1371,
"step": 290
},
{
"epoch": 0.2089016511127064,
"grad_norm": 0.30133146047592163,
"learning_rate": 1.899529545271487e-05,
"loss": 1.1086,
"step": 291
},
{
"epoch": 0.2096195262024408,
"grad_norm": 0.4333893954753876,
"learning_rate": 1.8988480864289483e-05,
"loss": 1.1897,
"step": 292
},
{
"epoch": 0.21033740129217515,
"grad_norm": 0.31405553221702576,
"learning_rate": 1.898164584305203e-05,
"loss": 1.1297,
"step": 293
},
{
"epoch": 0.21105527638190955,
"grad_norm": 0.24001596868038177,
"learning_rate": 1.8974790424018025e-05,
"loss": 1.1327,
"step": 294
},
{
"epoch": 0.21177315147164394,
"grad_norm": 0.3047102987766266,
"learning_rate": 1.8967914642307476e-05,
"loss": 1.1492,
"step": 295
},
{
"epoch": 0.21249102656137833,
"grad_norm": 0.3045389652252197,
"learning_rate": 1.896101853314472e-05,
"loss": 1.1083,
"step": 296
},
{
"epoch": 0.2132089016511127,
"grad_norm": 0.3781316578388214,
"learning_rate": 1.8954102131858206e-05,
"loss": 1.157,
"step": 297
},
{
"epoch": 0.2139267767408471,
"grad_norm": 0.38882461190223694,
"learning_rate": 1.8947165473880363e-05,
"loss": 1.1802,
"step": 298
},
{
"epoch": 0.21464465183058148,
"grad_norm": 0.5264966487884521,
"learning_rate": 1.8940208594747386e-05,
"loss": 1.0983,
"step": 299
},
{
"epoch": 0.21536252692031588,
"grad_norm": 0.4941420555114746,
"learning_rate": 1.8933231530099058e-05,
"loss": 1.2014,
"step": 300
},
{
"epoch": 0.21608040201005024,
"grad_norm": 0.2968602180480957,
"learning_rate": 1.8926234315678576e-05,
"loss": 1.1773,
"step": 301
},
{
"epoch": 0.21679827709978464,
"grad_norm": 0.36839526891708374,
"learning_rate": 1.8919216987332358e-05,
"loss": 1.194,
"step": 302
},
{
"epoch": 0.21751615218951903,
"grad_norm": 0.27106499671936035,
"learning_rate": 1.891217958100987e-05,
"loss": 1.1406,
"step": 303
},
{
"epoch": 0.21823402727925342,
"grad_norm": 0.4870721995830536,
"learning_rate": 1.890512213276344e-05,
"loss": 1.1759,
"step": 304
},
{
"epoch": 0.2189519023689878,
"grad_norm": 3.0273375511169434,
"learning_rate": 1.8898044678748054e-05,
"loss": 1.0953,
"step": 305
},
{
"epoch": 0.21966977745872218,
"grad_norm": 0.5378653407096863,
"learning_rate": 1.889094725522121e-05,
"loss": 1.2847,
"step": 306
},
{
"epoch": 0.22038765254845658,
"grad_norm": 0.26293638348579407,
"learning_rate": 1.888382989854269e-05,
"loss": 1.118,
"step": 307
},
{
"epoch": 0.22110552763819097,
"grad_norm": 0.3193033039569855,
"learning_rate": 1.8876692645174398e-05,
"loss": 1.1277,
"step": 308
},
{
"epoch": 0.22182340272792533,
"grad_norm": 0.47098249197006226,
"learning_rate": 1.8869535531680177e-05,
"loss": 1.2661,
"step": 309
},
{
"epoch": 0.22254127781765973,
"grad_norm": 0.45273101329803467,
"learning_rate": 1.8862358594725596e-05,
"loss": 1.1281,
"step": 310
},
{
"epoch": 0.22325915290739412,
"grad_norm": 0.27840176224708557,
"learning_rate": 1.8855161871077792e-05,
"loss": 1.1138,
"step": 311
},
{
"epoch": 0.22397702799712849,
"grad_norm": 0.25028085708618164,
"learning_rate": 1.884794539760526e-05,
"loss": 1.0966,
"step": 312
},
{
"epoch": 0.22469490308686288,
"grad_norm": 0.33616170287132263,
"learning_rate": 1.884070921127768e-05,
"loss": 1.1496,
"step": 313
},
{
"epoch": 0.22541277817659727,
"grad_norm": 0.34810954332351685,
"learning_rate": 1.8833453349165714e-05,
"loss": 1.1693,
"step": 314
},
{
"epoch": 0.22613065326633167,
"grad_norm": 0.3485371768474579,
"learning_rate": 1.8826177848440828e-05,
"loss": 1.1069,
"step": 315
},
{
"epoch": 0.22684852835606603,
"grad_norm": 0.3029579222202301,
"learning_rate": 1.8818882746375087e-05,
"loss": 1.0823,
"step": 316
},
{
"epoch": 0.22756640344580042,
"grad_norm": 0.24140548706054688,
"learning_rate": 1.8811568080340984e-05,
"loss": 1.1347,
"step": 317
},
{
"epoch": 0.22828427853553482,
"grad_norm": 0.25040584802627563,
"learning_rate": 1.8804233887811226e-05,
"loss": 1.1236,
"step": 318
},
{
"epoch": 0.2290021536252692,
"grad_norm": 0.6614571213722229,
"learning_rate": 1.8796880206358563e-05,
"loss": 1.3318,
"step": 319
},
{
"epoch": 0.22972002871500358,
"grad_norm": 0.2212487757205963,
"learning_rate": 1.8789507073655576e-05,
"loss": 1.1516,
"step": 320
},
{
"epoch": 0.23043790380473797,
"grad_norm": 0.24655699729919434,
"learning_rate": 1.8782114527474504e-05,
"loss": 1.1223,
"step": 321
},
{
"epoch": 0.23115577889447236,
"grad_norm": 0.23814330995082855,
"learning_rate": 1.8774702605687037e-05,
"loss": 1.1611,
"step": 322
},
{
"epoch": 0.23187365398420676,
"grad_norm": 0.3251168727874756,
"learning_rate": 1.876727134626412e-05,
"loss": 1.1245,
"step": 323
},
{
"epoch": 0.23259152907394112,
"grad_norm": 0.24903184175491333,
"learning_rate": 1.8759820787275777e-05,
"loss": 1.0912,
"step": 324
},
{
"epoch": 0.23330940416367552,
"grad_norm": 0.29283952713012695,
"learning_rate": 1.875235096689088e-05,
"loss": 1.1674,
"step": 325
},
{
"epoch": 0.2340272792534099,
"grad_norm": 0.24461185932159424,
"learning_rate": 1.8744861923377003e-05,
"loss": 1.1241,
"step": 326
},
{
"epoch": 0.2347451543431443,
"grad_norm": 4.819317817687988,
"learning_rate": 1.8737353695100183e-05,
"loss": 1.1464,
"step": 327
},
{
"epoch": 0.23546302943287867,
"grad_norm": 0.7460237741470337,
"learning_rate": 1.8729826320524737e-05,
"loss": 1.4698,
"step": 328
},
{
"epoch": 0.23618090452261306,
"grad_norm": 0.3449172377586365,
"learning_rate": 1.8722279838213082e-05,
"loss": 1.1501,
"step": 329
},
{
"epoch": 0.23689877961234745,
"grad_norm": 0.30681246519088745,
"learning_rate": 1.8714714286825512e-05,
"loss": 1.091,
"step": 330
},
{
"epoch": 0.23761665470208185,
"grad_norm": 0.4400181770324707,
"learning_rate": 1.8707129705120012e-05,
"loss": 1.1017,
"step": 331
},
{
"epoch": 0.2383345297918162,
"grad_norm": 0.5187708735466003,
"learning_rate": 1.8699526131952067e-05,
"loss": 1.2389,
"step": 332
},
{
"epoch": 0.2390524048815506,
"grad_norm": 0.44073569774627686,
"learning_rate": 1.869190360627444e-05,
"loss": 1.1137,
"step": 333
},
{
"epoch": 0.239770279971285,
"grad_norm": 0.3202749490737915,
"learning_rate": 1.8684262167136998e-05,
"loss": 1.1693,
"step": 334
},
{
"epoch": 0.2404881550610194,
"grad_norm": 0.2710592448711395,
"learning_rate": 1.8676601853686502e-05,
"loss": 1.1118,
"step": 335
},
{
"epoch": 0.24120603015075376,
"grad_norm": 0.2804523706436157,
"learning_rate": 1.866892270516639e-05,
"loss": 1.0901,
"step": 336
},
{
"epoch": 0.24192390524048815,
"grad_norm": 0.2933681905269623,
"learning_rate": 1.8661224760916618e-05,
"loss": 1.1076,
"step": 337
},
{
"epoch": 0.24264178033022255,
"grad_norm": 0.29812660813331604,
"learning_rate": 1.86535080603734e-05,
"loss": 1.0992,
"step": 338
},
{
"epoch": 0.24335965541995694,
"grad_norm": 0.42179393768310547,
"learning_rate": 1.8645772643069064e-05,
"loss": 1.1911,
"step": 339
},
{
"epoch": 0.2440775305096913,
"grad_norm": 0.3564510941505432,
"learning_rate": 1.8638018548631808e-05,
"loss": 1.1381,
"step": 340
},
{
"epoch": 0.2447954055994257,
"grad_norm": 0.45639586448669434,
"learning_rate": 1.8630245816785516e-05,
"loss": 1.2329,
"step": 341
},
{
"epoch": 0.2455132806891601,
"grad_norm": 0.28465136885643005,
"learning_rate": 1.862245448734956e-05,
"loss": 1.1781,
"step": 342
},
{
"epoch": 0.24623115577889448,
"grad_norm": 0.2306850552558899,
"learning_rate": 1.861464460023856e-05,
"loss": 1.1508,
"step": 343
},
{
"epoch": 0.24694903086862885,
"grad_norm": 0.6161120533943176,
"learning_rate": 1.8606816195462244e-05,
"loss": 1.2499,
"step": 344
},
{
"epoch": 0.24766690595836324,
"grad_norm": 0.25378888845443726,
"learning_rate": 1.8598969313125175e-05,
"loss": 1.1348,
"step": 345
},
{
"epoch": 0.24838478104809764,
"grad_norm": 0.7147504687309265,
"learning_rate": 1.859110399342659e-05,
"loss": 1.0882,
"step": 346
},
{
"epoch": 0.24910265613783203,
"grad_norm": 1.6163944005966187,
"learning_rate": 1.858322027666017e-05,
"loss": 1.466,
"step": 347
},
{
"epoch": 0.2498205312275664,
"grad_norm": 0.3895832598209381,
"learning_rate": 1.8575318203213857e-05,
"loss": 1.1098,
"step": 348
},
{
"epoch": 0.2505384063173008,
"grad_norm": 0.3162965476512909,
"learning_rate": 1.856739781356962e-05,
"loss": 1.1079,
"step": 349
},
{
"epoch": 0.25125628140703515,
"grad_norm": 0.31275758147239685,
"learning_rate": 1.855945914830327e-05,
"loss": 1.1573,
"step": 350
},
{
"epoch": 0.2519741564967696,
"grad_norm": 0.5552882552146912,
"learning_rate": 1.8551502248084236e-05,
"loss": 1.2549,
"step": 351
},
{
"epoch": 0.25269203158650394,
"grad_norm": 0.3312999904155731,
"learning_rate": 1.8543527153675375e-05,
"loss": 1.1306,
"step": 352
},
{
"epoch": 0.25340990667623836,
"grad_norm": 0.30454468727111816,
"learning_rate": 1.8535533905932738e-05,
"loss": 1.1809,
"step": 353
},
{
"epoch": 0.2541277817659727,
"grad_norm": 0.3623380959033966,
"learning_rate": 1.8527522545805387e-05,
"loss": 1.123,
"step": 354
},
{
"epoch": 0.2548456568557071,
"grad_norm": 0.3103772699832916,
"learning_rate": 1.8519493114335162e-05,
"loss": 1.1182,
"step": 355
},
{
"epoch": 0.2555635319454415,
"grad_norm": 0.2671801447868347,
"learning_rate": 1.8511445652656494e-05,
"loss": 1.1652,
"step": 356
},
{
"epoch": 0.2562814070351759,
"grad_norm": 0.38310304284095764,
"learning_rate": 1.850338020199617e-05,
"loss": 1.1408,
"step": 357
},
{
"epoch": 0.25699928212491024,
"grad_norm": 0.3919181823730469,
"learning_rate": 1.849529680367314e-05,
"loss": 1.0756,
"step": 358
},
{
"epoch": 0.25771715721464467,
"grad_norm": 0.29616478085517883,
"learning_rate": 1.84871954990983e-05,
"loss": 1.1288,
"step": 359
},
{
"epoch": 0.25843503230437903,
"grad_norm": 0.32790932059288025,
"learning_rate": 1.8479076329774275e-05,
"loss": 1.0784,
"step": 360
},
{
"epoch": 0.25915290739411345,
"grad_norm": 0.43394553661346436,
"learning_rate": 1.8470939337295214e-05,
"loss": 1.1948,
"step": 361
},
{
"epoch": 0.2598707824838478,
"grad_norm": 0.2332722693681717,
"learning_rate": 1.8462784563346568e-05,
"loss": 1.0912,
"step": 362
},
{
"epoch": 0.2605886575735822,
"grad_norm": 0.2292940765619278,
"learning_rate": 1.845461204970489e-05,
"loss": 1.1111,
"step": 363
},
{
"epoch": 0.2613065326633166,
"grad_norm": 0.4375426769256592,
"learning_rate": 1.8446421838237605e-05,
"loss": 1.1876,
"step": 364
},
{
"epoch": 0.26202440775305097,
"grad_norm": 0.23707646131515503,
"learning_rate": 1.8438213970902813e-05,
"loss": 1.1889,
"step": 365
},
{
"epoch": 0.26274228284278534,
"grad_norm": 0.36057260632514954,
"learning_rate": 1.8429988489749048e-05,
"loss": 1.1589,
"step": 366
},
{
"epoch": 0.26346015793251976,
"grad_norm": 0.33151188492774963,
"learning_rate": 1.842174543691509e-05,
"loss": 1.1056,
"step": 367
},
{
"epoch": 0.2641780330222541,
"grad_norm": 0.45307648181915283,
"learning_rate": 1.841348485462974e-05,
"loss": 1.1998,
"step": 368
},
{
"epoch": 0.2648959081119885,
"grad_norm": 5.533525466918945,
"learning_rate": 1.8405206785211595e-05,
"loss": 1.1409,
"step": 369
},
{
"epoch": 0.2656137832017229,
"grad_norm": 0.3712926506996155,
"learning_rate": 1.8396911271068843e-05,
"loss": 1.0697,
"step": 370
},
{
"epoch": 0.2663316582914573,
"grad_norm": 0.41767457127571106,
"learning_rate": 1.8388598354699034e-05,
"loss": 1.1379,
"step": 371
},
{
"epoch": 0.2670495333811917,
"grad_norm": 0.42338940501213074,
"learning_rate": 1.838026807868888e-05,
"loss": 1.1044,
"step": 372
},
{
"epoch": 0.26776740847092606,
"grad_norm": 9.821000099182129,
"learning_rate": 1.837192048571401e-05,
"loss": 1.1888,
"step": 373
},
{
"epoch": 0.2684852835606604,
"grad_norm": 0.2834051549434662,
"learning_rate": 1.836355561853878e-05,
"loss": 1.135,
"step": 374
},
{
"epoch": 0.26920315865039485,
"grad_norm": 0.44755667448043823,
"learning_rate": 1.835517352001604e-05,
"loss": 1.233,
"step": 375
},
{
"epoch": 0.2699210337401292,
"grad_norm": 0.2938573658466339,
"learning_rate": 1.8346774233086907e-05,
"loss": 1.1495,
"step": 376
},
{
"epoch": 0.2706389088298636,
"grad_norm": 0.41367679834365845,
"learning_rate": 1.833835780078056e-05,
"loss": 1.2121,
"step": 377
},
{
"epoch": 0.271356783919598,
"grad_norm": 0.2853800654411316,
"learning_rate": 1.8329924266214013e-05,
"loss": 1.1065,
"step": 378
},
{
"epoch": 0.27207465900933236,
"grad_norm": 0.5363768339157104,
"learning_rate": 1.832147367259189e-05,
"loss": 1.1133,
"step": 379
},
{
"epoch": 0.2727925340990668,
"grad_norm": 0.22328981757164001,
"learning_rate": 1.831300606320621e-05,
"loss": 1.0409,
"step": 380
},
{
"epoch": 0.27351040918880115,
"grad_norm": 0.48021677136421204,
"learning_rate": 1.8304521481436168e-05,
"loss": 1.2496,
"step": 381
},
{
"epoch": 0.2742282842785355,
"grad_norm": 0.2657185196876526,
"learning_rate": 1.8296019970747904e-05,
"loss": 1.1268,
"step": 382
},
{
"epoch": 0.27494615936826994,
"grad_norm": 0.3658749759197235,
"learning_rate": 1.8287501574694274e-05,
"loss": 1.1838,
"step": 383
},
{
"epoch": 0.2756640344580043,
"grad_norm": 0.29449597001075745,
"learning_rate": 1.8278966336914655e-05,
"loss": 1.1806,
"step": 384
},
{
"epoch": 0.27638190954773867,
"grad_norm": 0.7698999643325806,
"learning_rate": 1.8270414301134696e-05,
"loss": 1.2385,
"step": 385
},
{
"epoch": 0.2770997846374731,
"grad_norm": 0.3127557933330536,
"learning_rate": 1.8261845511166093e-05,
"loss": 1.0942,
"step": 386
},
{
"epoch": 0.27781765972720746,
"grad_norm": 0.2682400047779083,
"learning_rate": 1.8253260010906383e-05,
"loss": 1.1652,
"step": 387
},
{
"epoch": 0.2785355348169419,
"grad_norm": 0.4573856592178345,
"learning_rate": 1.824465784433871e-05,
"loss": 1.2724,
"step": 388
},
{
"epoch": 0.27925340990667624,
"grad_norm": 0.30964091420173645,
"learning_rate": 1.8236039055531588e-05,
"loss": 1.0837,
"step": 389
},
{
"epoch": 0.2799712849964106,
"grad_norm": 0.5770804286003113,
"learning_rate": 1.82274036886387e-05,
"loss": 1.3268,
"step": 390
},
{
"epoch": 0.28068916008614503,
"grad_norm": 0.23842279613018036,
"learning_rate": 1.8218751787898648e-05,
"loss": 1.086,
"step": 391
},
{
"epoch": 0.2814070351758794,
"grad_norm": 0.21970561146736145,
"learning_rate": 1.821008339763474e-05,
"loss": 1.1039,
"step": 392
},
{
"epoch": 0.28212491026561376,
"grad_norm": 0.5542939305305481,
"learning_rate": 1.8201398562254754e-05,
"loss": 1.2382,
"step": 393
},
{
"epoch": 0.2828427853553482,
"grad_norm": 0.36761996150016785,
"learning_rate": 1.8192697326250723e-05,
"loss": 1.2037,
"step": 394
},
{
"epoch": 0.28356066044508255,
"grad_norm": 0.23678098618984222,
"learning_rate": 1.81839797341987e-05,
"loss": 1.0795,
"step": 395
},
{
"epoch": 0.28427853553481697,
"grad_norm": 0.2703116238117218,
"learning_rate": 1.8175245830758515e-05,
"loss": 1.0809,
"step": 396
},
{
"epoch": 0.28499641062455133,
"grad_norm": 0.6142400503158569,
"learning_rate": 1.8166495660673586e-05,
"loss": 1.2617,
"step": 397
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.2628495395183563,
"learning_rate": 1.8157729268770635e-05,
"loss": 1.137,
"step": 398
},
{
"epoch": 0.2864321608040201,
"grad_norm": 0.30924415588378906,
"learning_rate": 1.814894669995951e-05,
"loss": 1.156,
"step": 399
},
{
"epoch": 0.2871500358937545,
"grad_norm": 0.2843545973300934,
"learning_rate": 1.8140147999232928e-05,
"loss": 1.1346,
"step": 400
},
{
"epoch": 0.28786791098348885,
"grad_norm": 0.7029257416725159,
"learning_rate": 1.8131333211666236e-05,
"loss": 1.2186,
"step": 401
},
{
"epoch": 0.28858578607322327,
"grad_norm": 0.2657949924468994,
"learning_rate": 1.8122502382417212e-05,
"loss": 1.1465,
"step": 402
},
{
"epoch": 0.28930366116295764,
"grad_norm": 0.2940109372138977,
"learning_rate": 1.81136555567258e-05,
"loss": 1.1007,
"step": 403
},
{
"epoch": 0.29002153625269206,
"grad_norm": 0.5118001699447632,
"learning_rate": 1.8104792779913905e-05,
"loss": 1.179,
"step": 404
},
{
"epoch": 0.2907394113424264,
"grad_norm": 0.27588653564453125,
"learning_rate": 1.8095914097385135e-05,
"loss": 1.1536,
"step": 405
},
{
"epoch": 0.2914572864321608,
"grad_norm": 0.2666616141796112,
"learning_rate": 1.8087019554624597e-05,
"loss": 1.1004,
"step": 406
},
{
"epoch": 0.2921751615218952,
"grad_norm": 0.2696448564529419,
"learning_rate": 1.807810919719864e-05,
"loss": 1.116,
"step": 407
},
{
"epoch": 0.2928930366116296,
"grad_norm": 0.44319644570350647,
"learning_rate": 1.806918307075463e-05,
"loss": 1.1544,
"step": 408
},
{
"epoch": 0.29361091170136394,
"grad_norm": 0.2961446940898895,
"learning_rate": 1.8060241221020724e-05,
"loss": 1.1079,
"step": 409
},
{
"epoch": 0.29432878679109836,
"grad_norm": 0.535943865776062,
"learning_rate": 1.8051283693805627e-05,
"loss": 1.2377,
"step": 410
},
{
"epoch": 0.29504666188083273,
"grad_norm": 0.22256693243980408,
"learning_rate": 1.804231053499835e-05,
"loss": 1.1848,
"step": 411
},
{
"epoch": 0.29576453697056715,
"grad_norm": 0.22374652326107025,
"learning_rate": 1.8033321790567996e-05,
"loss": 1.1023,
"step": 412
},
{
"epoch": 0.2964824120603015,
"grad_norm": 0.241849884390831,
"learning_rate": 1.802431750656351e-05,
"loss": 1.081,
"step": 413
},
{
"epoch": 0.2972002871500359,
"grad_norm": 0.21531161665916443,
"learning_rate": 1.8015297729113437e-05,
"loss": 1.1114,
"step": 414
},
{
"epoch": 0.2979181622397703,
"grad_norm": 0.5768148899078369,
"learning_rate": 1.80062625044257e-05,
"loss": 1.1076,
"step": 415
},
{
"epoch": 0.29863603732950467,
"grad_norm": 0.9602562189102173,
"learning_rate": 1.7997211878787367e-05,
"loss": 1.2154,
"step": 416
},
{
"epoch": 0.29935391241923903,
"grad_norm": 0.5085827112197876,
"learning_rate": 1.7988145898564383e-05,
"loss": 1.0853,
"step": 417
},
{
"epoch": 0.30007178750897345,
"grad_norm": 0.25142329931259155,
"learning_rate": 1.7979064610201373e-05,
"loss": 1.0924,
"step": 418
},
{
"epoch": 0.3007896625987078,
"grad_norm": 0.35126984119415283,
"learning_rate": 1.7969968060221378e-05,
"loss": 1.0696,
"step": 419
},
{
"epoch": 0.3015075376884422,
"grad_norm": 0.3864904046058655,
"learning_rate": 1.7960856295225618e-05,
"loss": 1.1267,
"step": 420
},
{
"epoch": 0.3022254127781766,
"grad_norm": 0.40476828813552856,
"learning_rate": 1.7951729361893274e-05,
"loss": 1.1587,
"step": 421
},
{
"epoch": 0.30294328786791097,
"grad_norm": 36.076351165771484,
"learning_rate": 1.7942587306981214e-05,
"loss": 1.0542,
"step": 422
},
{
"epoch": 0.3036611629576454,
"grad_norm": 0.2805517911911011,
"learning_rate": 1.7933430177323788e-05,
"loss": 1.1292,
"step": 423
},
{
"epoch": 0.30437903804737976,
"grad_norm": 0.28951379656791687,
"learning_rate": 1.792425801983257e-05,
"loss": 1.0955,
"step": 424
},
{
"epoch": 0.3050969131371141,
"grad_norm": 0.25895097851753235,
"learning_rate": 1.7915070881496114e-05,
"loss": 1.1066,
"step": 425
},
{
"epoch": 0.30581478822684854,
"grad_norm": 0.38622137904167175,
"learning_rate": 1.7905868809379737e-05,
"loss": 1.1115,
"step": 426
},
{
"epoch": 0.3065326633165829,
"grad_norm": 0.7420461177825928,
"learning_rate": 1.7896651850625235e-05,
"loss": 1.185,
"step": 427
},
{
"epoch": 0.3072505384063173,
"grad_norm": 0.25646522641181946,
"learning_rate": 1.788742005245069e-05,
"loss": 1.1046,
"step": 428
},
{
"epoch": 0.3079684134960517,
"grad_norm": 0.38760238885879517,
"learning_rate": 1.78781734621502e-05,
"loss": 1.1659,
"step": 429
},
{
"epoch": 0.30868628858578606,
"grad_norm": 0.3064006567001343,
"learning_rate": 1.786891212709364e-05,
"loss": 1.092,
"step": 430
},
{
"epoch": 0.3094041636755205,
"grad_norm": 0.4092943072319031,
"learning_rate": 1.7859636094726415e-05,
"loss": 1.1406,
"step": 431
},
{
"epoch": 0.31012203876525485,
"grad_norm": 0.39417821168899536,
"learning_rate": 1.7850345412569237e-05,
"loss": 1.0614,
"step": 432
},
{
"epoch": 0.3108399138549892,
"grad_norm": 0.3639516830444336,
"learning_rate": 1.784104012821786e-05,
"loss": 1.1227,
"step": 433
},
{
"epoch": 0.31155778894472363,
"grad_norm": 0.2613206207752228,
"learning_rate": 1.7831720289342852e-05,
"loss": 1.0766,
"step": 434
},
{
"epoch": 0.312275664034458,
"grad_norm": 0.44435498118400574,
"learning_rate": 1.7822385943689333e-05,
"loss": 1.2368,
"step": 435
},
{
"epoch": 0.31299353912419237,
"grad_norm": 381.1724853515625,
"learning_rate": 1.7813037139076743e-05,
"loss": 1.1427,
"step": 436
},
{
"epoch": 0.3137114142139268,
"grad_norm": 0.36013710498809814,
"learning_rate": 1.7803673923398602e-05,
"loss": 1.1707,
"step": 437
},
{
"epoch": 0.31442928930366115,
"grad_norm": 0.4431007504463196,
"learning_rate": 1.7794296344622245e-05,
"loss": 1.1485,
"step": 438
},
{
"epoch": 0.3151471643933956,
"grad_norm": 0.31186437606811523,
"learning_rate": 1.7784904450788608e-05,
"loss": 1.1456,
"step": 439
},
{
"epoch": 0.31586503948312994,
"grad_norm": 0.28195399045944214,
"learning_rate": 1.7775498290011935e-05,
"loss": 1.1428,
"step": 440
},
{
"epoch": 0.3165829145728643,
"grad_norm": 0.28521496057510376,
"learning_rate": 1.7766077910479584e-05,
"loss": 1.0742,
"step": 441
},
{
"epoch": 0.3173007896625987,
"grad_norm": 0.22959643602371216,
"learning_rate": 1.7756643360451743e-05,
"loss": 1.0827,
"step": 442
},
{
"epoch": 0.3180186647523331,
"grad_norm": 0.4172683358192444,
"learning_rate": 1.7747194688261194e-05,
"loss": 1.0979,
"step": 443
},
{
"epoch": 0.31873653984206746,
"grad_norm": 0.3398095667362213,
"learning_rate": 1.7737731942313077e-05,
"loss": 1.0966,
"step": 444
},
{
"epoch": 0.3194544149318019,
"grad_norm": 0.3017900586128235,
"learning_rate": 1.7728255171084614e-05,
"loss": 1.1515,
"step": 445
},
{
"epoch": 0.32017229002153624,
"grad_norm": 0.34165719151496887,
"learning_rate": 1.7718764423124892e-05,
"loss": 1.129,
"step": 446
},
{
"epoch": 0.32089016511127066,
"grad_norm": 0.46820157766342163,
"learning_rate": 1.7709259747054594e-05,
"loss": 1.1185,
"step": 447
},
{
"epoch": 0.32160804020100503,
"grad_norm": 0.3458547294139862,
"learning_rate": 1.769974119156576e-05,
"loss": 1.1134,
"step": 448
},
{
"epoch": 0.3223259152907394,
"grad_norm": 0.4865923821926117,
"learning_rate": 1.7690208805421526e-05,
"loss": 1.1727,
"step": 449
},
{
"epoch": 0.3230437903804738,
"grad_norm": 0.2898300886154175,
"learning_rate": 1.7680662637455892e-05,
"loss": 1.0716,
"step": 450
},
{
"epoch": 0.3237616654702082,
"grad_norm": 0.32885676622390747,
"learning_rate": 1.7671102736573454e-05,
"loss": 1.0732,
"step": 451
},
{
"epoch": 0.32447954055994255,
"grad_norm": 0.3227647542953491,
"learning_rate": 1.7661529151749164e-05,
"loss": 1.0888,
"step": 452
},
{
"epoch": 0.32519741564967697,
"grad_norm": 0.3530004620552063,
"learning_rate": 1.7651941932028077e-05,
"loss": 1.1758,
"step": 453
},
{
"epoch": 0.32591529073941133,
"grad_norm": 0.3144376277923584,
"learning_rate": 1.76423411265251e-05,
"loss": 1.2014,
"step": 454
},
{
"epoch": 0.32663316582914576,
"grad_norm": 0.24854792654514313,
"learning_rate": 1.7632726784424733e-05,
"loss": 1.1555,
"step": 455
},
{
"epoch": 0.3273510409188801,
"grad_norm": 0.33343812823295593,
"learning_rate": 1.762309895498083e-05,
"loss": 1.1325,
"step": 456
},
{
"epoch": 0.3280689160086145,
"grad_norm": 0.2514241635799408,
"learning_rate": 1.761345768751634e-05,
"loss": 1.1321,
"step": 457
},
{
"epoch": 0.3287867910983489,
"grad_norm": 0.2710927724838257,
"learning_rate": 1.760380303142305e-05,
"loss": 1.1312,
"step": 458
},
{
"epoch": 0.3295046661880833,
"grad_norm": 0.2550286650657654,
"learning_rate": 1.759413503616133e-05,
"loss": 1.1223,
"step": 459
},
{
"epoch": 0.33022254127781764,
"grad_norm": 0.3207646310329437,
"learning_rate": 1.7584453751259913e-05,
"loss": 1.0433,
"step": 460
},
{
"epoch": 0.33094041636755206,
"grad_norm": 0.24946732819080353,
"learning_rate": 1.7574759226315583e-05,
"loss": 1.1089,
"step": 461
},
{
"epoch": 0.3316582914572864,
"grad_norm": 0.3521324694156647,
"learning_rate": 1.7565051510992966e-05,
"loss": 1.1433,
"step": 462
},
{
"epoch": 0.33237616654702085,
"grad_norm": 0.2754557728767395,
"learning_rate": 1.7555330655024263e-05,
"loss": 1.1118,
"step": 463
},
{
"epoch": 0.3330940416367552,
"grad_norm": 0.23766492307186127,
"learning_rate": 1.7545596708208993e-05,
"loss": 1.083,
"step": 464
},
{
"epoch": 0.3338119167264896,
"grad_norm": 0.34119418263435364,
"learning_rate": 1.7535849720413732e-05,
"loss": 1.1183,
"step": 465
},
{
"epoch": 0.334529791816224,
"grad_norm": 0.4450787305831909,
"learning_rate": 1.7526089741571878e-05,
"loss": 1.153,
"step": 466
},
{
"epoch": 0.33524766690595836,
"grad_norm": 0.3177962899208069,
"learning_rate": 1.7516316821683363e-05,
"loss": 1.1197,
"step": 467
},
{
"epoch": 0.33596554199569273,
"grad_norm": 0.38232341408729553,
"learning_rate": 1.7506531010814436e-05,
"loss": 1.1731,
"step": 468
},
{
"epoch": 0.33668341708542715,
"grad_norm": 0.24796949326992035,
"learning_rate": 1.749673235909737e-05,
"loss": 1.1165,
"step": 469
},
{
"epoch": 0.3374012921751615,
"grad_norm": 1.157897710800171,
"learning_rate": 1.7486920916730228e-05,
"loss": 1.2673,
"step": 470
},
{
"epoch": 0.3381191672648959,
"grad_norm": 114.63349914550781,
"learning_rate": 1.74770967339766e-05,
"loss": 1.1722,
"step": 471
},
{
"epoch": 0.3388370423546303,
"grad_norm": 0.2927913963794708,
"learning_rate": 1.7467259861165335e-05,
"loss": 1.116,
"step": 472
},
{
"epoch": 0.33955491744436467,
"grad_norm": 0.2694433033466339,
"learning_rate": 1.7457410348690312e-05,
"loss": 1.1104,
"step": 473
},
{
"epoch": 0.3402727925340991,
"grad_norm": 0.3550788164138794,
"learning_rate": 1.744754824701014e-05,
"loss": 1.1374,
"step": 474
},
{
"epoch": 0.34099066762383345,
"grad_norm": 0.3682781755924225,
"learning_rate": 1.7437673606647935e-05,
"loss": 1.1573,
"step": 475
},
{
"epoch": 0.3417085427135678,
"grad_norm": 0.6073625683784485,
"learning_rate": 1.7427786478191042e-05,
"loss": 1.2432,
"step": 476
},
{
"epoch": 0.34242641780330224,
"grad_norm": 0.268215149641037,
"learning_rate": 1.741788691229079e-05,
"loss": 1.1172,
"step": 477
},
{
"epoch": 0.3431442928930366,
"grad_norm": 0.6302129626274109,
"learning_rate": 1.7407974959662223e-05,
"loss": 1.1231,
"step": 478
},
{
"epoch": 0.34386216798277097,
"grad_norm": 0.31168705224990845,
"learning_rate": 1.7398050671083833e-05,
"loss": 1.1742,
"step": 479
},
{
"epoch": 0.3445800430725054,
"grad_norm": 0.42064017057418823,
"learning_rate": 1.7388114097397312e-05,
"loss": 1.1146,
"step": 480
},
{
"epoch": 0.34529791816223976,
"grad_norm": 0.5136387944221497,
"learning_rate": 1.7378165289507296e-05,
"loss": 1.321,
"step": 481
},
{
"epoch": 0.3460157932519742,
"grad_norm": 0.5692410469055176,
"learning_rate": 1.7368204298381086e-05,
"loss": 1.0915,
"step": 482
},
{
"epoch": 0.34673366834170855,
"grad_norm": 0.42343536019325256,
"learning_rate": 1.7358231175048402e-05,
"loss": 1.1114,
"step": 483
},
{
"epoch": 0.3474515434314429,
"grad_norm": 0.9200416803359985,
"learning_rate": 1.734824597060112e-05,
"loss": 1.0989,
"step": 484
},
{
"epoch": 0.34816941852117733,
"grad_norm": 1.0450941324234009,
"learning_rate": 1.7338248736192998e-05,
"loss": 1.143,
"step": 485
},
{
"epoch": 0.3488872936109117,
"grad_norm": 0.3270246684551239,
"learning_rate": 1.732823952303943e-05,
"loss": 1.0929,
"step": 486
},
{
"epoch": 0.34960516870064606,
"grad_norm": 0.6713224053382874,
"learning_rate": 1.7318218382417177e-05,
"loss": 1.1557,
"step": 487
},
{
"epoch": 0.3503230437903805,
"grad_norm": 0.559257984161377,
"learning_rate": 1.73081853656641e-05,
"loss": 1.2446,
"step": 488
},
{
"epoch": 0.35104091888011485,
"grad_norm": 0.2514427900314331,
"learning_rate": 1.7298140524178905e-05,
"loss": 1.1262,
"step": 489
},
{
"epoch": 0.35175879396984927,
"grad_norm": 0.4912594258785248,
"learning_rate": 1.7288083909420866e-05,
"loss": 1.2154,
"step": 490
},
{
"epoch": 0.35247666905958364,
"grad_norm": 0.2987563908100128,
"learning_rate": 1.7278015572909586e-05,
"loss": 1.028,
"step": 491
},
{
"epoch": 0.353194544149318,
"grad_norm": 0.6520453095436096,
"learning_rate": 1.7267935566224707e-05,
"loss": 1.1186,
"step": 492
},
{
"epoch": 0.3539124192390524,
"grad_norm": 0.28420624136924744,
"learning_rate": 1.7257843941005656e-05,
"loss": 1.0904,
"step": 493
},
{
"epoch": 0.3546302943287868,
"grad_norm": 0.27295732498168945,
"learning_rate": 1.7247740748951398e-05,
"loss": 1.0454,
"step": 494
},
{
"epoch": 0.35534816941852115,
"grad_norm": 0.32143691182136536,
"learning_rate": 1.7237626041820124e-05,
"loss": 1.0734,
"step": 495
},
{
"epoch": 0.3560660445082556,
"grad_norm": 0.25867822766304016,
"learning_rate": 1.722749987142905e-05,
"loss": 1.0591,
"step": 496
},
{
"epoch": 0.35678391959798994,
"grad_norm": 0.2761097848415375,
"learning_rate": 1.721736228965409e-05,
"loss": 1.1629,
"step": 497
},
{
"epoch": 0.35750179468772436,
"grad_norm": 0.2398725152015686,
"learning_rate": 1.720721334842963e-05,
"loss": 1.1519,
"step": 498
},
{
"epoch": 0.3582196697774587,
"grad_norm": 0.22945886850357056,
"learning_rate": 1.719705309974826e-05,
"loss": 1.0608,
"step": 499
},
{
"epoch": 0.3589375448671931,
"grad_norm": 0.23391370475292206,
"learning_rate": 1.7186881595660478e-05,
"loss": 1.063,
"step": 500
},
{
"epoch": 0.3596554199569275,
"grad_norm": 0.8408838510513306,
"learning_rate": 1.7176698888274455e-05,
"loss": 1.3951,
"step": 501
},
{
"epoch": 0.3603732950466619,
"grad_norm": 0.42259481549263,
"learning_rate": 1.7166505029755752e-05,
"loss": 1.1798,
"step": 502
},
{
"epoch": 0.36109117013639624,
"grad_norm": 0.3819274306297302,
"learning_rate": 1.715630007232706e-05,
"loss": 1.2017,
"step": 503
},
{
"epoch": 0.36180904522613067,
"grad_norm": 0.2353987842798233,
"learning_rate": 1.714608406826793e-05,
"loss": 1.0769,
"step": 504
},
{
"epoch": 0.36252692031586503,
"grad_norm": 0.23951439559459686,
"learning_rate": 1.713585706991449e-05,
"loss": 1.0743,
"step": 505
},
{
"epoch": 0.36324479540559945,
"grad_norm": 0.2518669664859772,
"learning_rate": 1.7125619129659215e-05,
"loss": 1.0942,
"step": 506
},
{
"epoch": 0.3639626704953338,
"grad_norm": 0.5533793568611145,
"learning_rate": 1.7115370299950616e-05,
"loss": 1.1229,
"step": 507
},
{
"epoch": 0.3646805455850682,
"grad_norm": 0.38030925393104553,
"learning_rate": 1.7105110633293e-05,
"loss": 1.1829,
"step": 508
},
{
"epoch": 0.3653984206748026,
"grad_norm": 0.2151963710784912,
"learning_rate": 1.7094840182246186e-05,
"loss": 1.1415,
"step": 509
},
{
"epoch": 0.36611629576453697,
"grad_norm": 0.47541192173957825,
"learning_rate": 1.7084558999425244e-05,
"loss": 1.3636,
"step": 510
},
{
"epoch": 0.36683417085427134,
"grad_norm": 0.3854585886001587,
"learning_rate": 1.7074267137500224e-05,
"loss": 1.1861,
"step": 511
},
{
"epoch": 0.36755204594400576,
"grad_norm": 0.4030381143093109,
"learning_rate": 1.7063964649195876e-05,
"loss": 1.1549,
"step": 512
},
{
"epoch": 0.3682699210337401,
"grad_norm": 0.24959807097911835,
"learning_rate": 1.7053651587291397e-05,
"loss": 1.1284,
"step": 513
},
{
"epoch": 0.36898779612347454,
"grad_norm": 0.3913261294364929,
"learning_rate": 1.7043328004620155e-05,
"loss": 1.0848,
"step": 514
},
{
"epoch": 0.3697056712132089,
"grad_norm": 0.251136839389801,
"learning_rate": 1.7032993954069403e-05,
"loss": 1.1282,
"step": 515
},
{
"epoch": 0.3704235463029433,
"grad_norm": 0.2389630526304245,
"learning_rate": 1.7022649488580028e-05,
"loss": 1.0215,
"step": 516
},
{
"epoch": 0.3711414213926777,
"grad_norm": 0.24776770174503326,
"learning_rate": 1.7012294661146278e-05,
"loss": 1.0338,
"step": 517
},
{
"epoch": 0.37185929648241206,
"grad_norm": 0.4876779317855835,
"learning_rate": 1.700192952481547e-05,
"loss": 1.0991,
"step": 518
},
{
"epoch": 0.3725771715721464,
"grad_norm": 0.31186917424201965,
"learning_rate": 1.699155413268775e-05,
"loss": 1.093,
"step": 519
},
{
"epoch": 0.37329504666188085,
"grad_norm": 0.23691536486148834,
"learning_rate": 1.698116853791579e-05,
"loss": 1.0681,
"step": 520
},
{
"epoch": 0.3740129217516152,
"grad_norm": 2.3807168006896973,
"learning_rate": 1.6970772793704536e-05,
"loss": 1.1821,
"step": 521
},
{
"epoch": 0.3747307968413496,
"grad_norm": 0.45725804567337036,
"learning_rate": 1.696036695331093e-05,
"loss": 1.2203,
"step": 522
},
{
"epoch": 0.375448671931084,
"grad_norm": 0.27984505891799927,
"learning_rate": 1.694995107004364e-05,
"loss": 1.1389,
"step": 523
},
{
"epoch": 0.37616654702081836,
"grad_norm": 0.32176896929740906,
"learning_rate": 1.6939525197262763e-05,
"loss": 1.1442,
"step": 524
},
{
"epoch": 0.3768844221105528,
"grad_norm": 0.2559623718261719,
"learning_rate": 1.69290893883796e-05,
"loss": 1.0509,
"step": 525
},
{
"epoch": 0.37760229720028715,
"grad_norm": 0.34299102425575256,
"learning_rate": 1.6918643696856335e-05,
"loss": 1.0765,
"step": 526
},
{
"epoch": 0.3783201722900215,
"grad_norm": 0.36113616824150085,
"learning_rate": 1.690818817620579e-05,
"loss": 1.1227,
"step": 527
},
{
"epoch": 0.37903804737975594,
"grad_norm": 0.7385759353637695,
"learning_rate": 1.689772287999113e-05,
"loss": 1.2611,
"step": 528
},
{
"epoch": 0.3797559224694903,
"grad_norm": 0.4936607778072357,
"learning_rate": 1.688724786182562e-05,
"loss": 1.2145,
"step": 529
},
{
"epoch": 0.38047379755922467,
"grad_norm": 0.6105899214744568,
"learning_rate": 1.6876763175372306e-05,
"loss": 1.0898,
"step": 530
},
{
"epoch": 0.3811916726489591,
"grad_norm": 0.33757439255714417,
"learning_rate": 1.686626887434378e-05,
"loss": 1.1311,
"step": 531
},
{
"epoch": 0.38190954773869346,
"grad_norm": 0.22930863499641418,
"learning_rate": 1.6855765012501884e-05,
"loss": 1.1404,
"step": 532
},
{
"epoch": 0.3826274228284279,
"grad_norm": 0.24472716450691223,
"learning_rate": 1.6845251643657442e-05,
"loss": 1.0967,
"step": 533
},
{
"epoch": 0.38334529791816224,
"grad_norm": 0.9064115285873413,
"learning_rate": 1.683472882166998e-05,
"loss": 1.2872,
"step": 534
},
{
"epoch": 0.3840631730078966,
"grad_norm": 0.49683505296707153,
"learning_rate": 1.6824196600447446e-05,
"loss": 1.1155,
"step": 535
},
{
"epoch": 0.38478104809763103,
"grad_norm": 0.21948783099651337,
"learning_rate": 1.6813655033945958e-05,
"loss": 1.0843,
"step": 536
},
{
"epoch": 0.3854989231873654,
"grad_norm": 0.2846214175224304,
"learning_rate": 1.6803104176169486e-05,
"loss": 1.0879,
"step": 537
},
{
"epoch": 0.38621679827709976,
"grad_norm": 0.36232081055641174,
"learning_rate": 1.6792544081169618e-05,
"loss": 1.165,
"step": 538
},
{
"epoch": 0.3869346733668342,
"grad_norm": 0.3394874036312103,
"learning_rate": 1.678197480304525e-05,
"loss": 1.092,
"step": 539
},
{
"epoch": 0.38765254845656855,
"grad_norm": 0.38495680689811707,
"learning_rate": 1.677139639594234e-05,
"loss": 1.1141,
"step": 540
},
{
"epoch": 0.38837042354630297,
"grad_norm": 0.2894507050514221,
"learning_rate": 1.6760808914053588e-05,
"loss": 1.0822,
"step": 541
},
{
"epoch": 0.38908829863603733,
"grad_norm": 0.5585907101631165,
"learning_rate": 1.675021241161821e-05,
"loss": 1.1023,
"step": 542
},
{
"epoch": 0.3898061737257717,
"grad_norm": 0.26919737458229065,
"learning_rate": 1.673960694292161e-05,
"loss": 1.1548,
"step": 543
},
{
"epoch": 0.3905240488155061,
"grad_norm": 0.2462243139743805,
"learning_rate": 1.672899256229515e-05,
"loss": 1.1222,
"step": 544
},
{
"epoch": 0.3912419239052405,
"grad_norm": 0.244044229388237,
"learning_rate": 1.671836932411583e-05,
"loss": 1.0474,
"step": 545
},
{
"epoch": 0.39195979899497485,
"grad_norm": 0.39821985363960266,
"learning_rate": 1.6707737282806033e-05,
"loss": 1.0614,
"step": 546
},
{
"epoch": 0.39267767408470927,
"grad_norm": 0.5356500148773193,
"learning_rate": 1.6697096492833234e-05,
"loss": 1.166,
"step": 547
},
{
"epoch": 0.39339554917444364,
"grad_norm": 0.28749582171440125,
"learning_rate": 1.6686447008709737e-05,
"loss": 1.125,
"step": 548
},
{
"epoch": 0.39411342426417806,
"grad_norm": 0.2848498225212097,
"learning_rate": 1.667578888499238e-05,
"loss": 1.1676,
"step": 549
},
{
"epoch": 0.3948312993539124,
"grad_norm": 0.31614670157432556,
"learning_rate": 1.6665122176282265e-05,
"loss": 1.1118,
"step": 550
},
{
"epoch": 0.3955491744436468,
"grad_norm": 0.3202102482318878,
"learning_rate": 1.6654446937224467e-05,
"loss": 1.0995,
"step": 551
},
{
"epoch": 0.3962670495333812,
"grad_norm": 0.525974452495575,
"learning_rate": 1.6643763222507766e-05,
"loss": 1.2701,
"step": 552
},
{
"epoch": 0.3969849246231156,
"grad_norm": 2.5211613178253174,
"learning_rate": 1.6633071086864366e-05,
"loss": 1.5002,
"step": 553
},
{
"epoch": 0.39770279971284994,
"grad_norm": 0.6316677331924438,
"learning_rate": 1.6622370585069604e-05,
"loss": 1.2834,
"step": 554
},
{
"epoch": 0.39842067480258436,
"grad_norm": 0.31538158655166626,
"learning_rate": 1.6611661771941686e-05,
"loss": 1.1316,
"step": 555
},
{
"epoch": 0.39913854989231873,
"grad_norm": 0.35625991225242615,
"learning_rate": 1.6600944702341386e-05,
"loss": 1.0744,
"step": 556
},
{
"epoch": 0.39985642498205315,
"grad_norm": 0.815897524356842,
"learning_rate": 1.6590219431171782e-05,
"loss": 1.4233,
"step": 557
},
{
"epoch": 0.4005743000717875,
"grad_norm": 0.24649186432361603,
"learning_rate": 1.6579486013377965e-05,
"loss": 1.0445,
"step": 558
},
{
"epoch": 0.4012921751615219,
"grad_norm": 0.38415780663490295,
"learning_rate": 1.656874450394676e-05,
"loss": 1.0642,
"step": 559
},
{
"epoch": 0.4020100502512563,
"grad_norm": 5.395670413970947,
"learning_rate": 1.6557994957906456e-05,
"loss": 1.419,
"step": 560
},
{
"epoch": 0.40272792534099067,
"grad_norm": 0.43887314200401306,
"learning_rate": 1.6547237430326494e-05,
"loss": 1.2503,
"step": 561
},
{
"epoch": 0.40344580043072503,
"grad_norm": 0.508529007434845,
"learning_rate": 1.6536471976317227e-05,
"loss": 1.1596,
"step": 562
},
{
"epoch": 0.40416367552045945,
"grad_norm": 0.9722962379455566,
"learning_rate": 1.6525698651029585e-05,
"loss": 1.1408,
"step": 563
},
{
"epoch": 0.4048815506101938,
"grad_norm": 0.6140845417976379,
"learning_rate": 1.651491750965486e-05,
"loss": 1.1447,
"step": 564
},
{
"epoch": 0.40559942569992824,
"grad_norm": 0.3369700312614441,
"learning_rate": 1.650412860742435e-05,
"loss": 1.087,
"step": 565
},
{
"epoch": 0.4063173007896626,
"grad_norm": 0.25257930159568787,
"learning_rate": 1.6493331999609133e-05,
"loss": 1.0692,
"step": 566
},
{
"epoch": 0.40703517587939697,
"grad_norm": 0.2892700433731079,
"learning_rate": 1.6482527741519755e-05,
"loss": 1.1293,
"step": 567
},
{
"epoch": 0.4077530509691314,
"grad_norm": 0.4760481119155884,
"learning_rate": 1.647171588850595e-05,
"loss": 1.0839,
"step": 568
},
{
"epoch": 0.40847092605886576,
"grad_norm": 0.3079492449760437,
"learning_rate": 1.6460896495956377e-05,
"loss": 1.1194,
"step": 569
},
{
"epoch": 0.4091888011486001,
"grad_norm": 0.26826906204223633,
"learning_rate": 1.6450069619298302e-05,
"loss": 1.0403,
"step": 570
},
{
"epoch": 0.40990667623833454,
"grad_norm": 0.3646854758262634,
"learning_rate": 1.6439235313997332e-05,
"loss": 1.0847,
"step": 571
},
{
"epoch": 0.4106245513280689,
"grad_norm": 0.265591561794281,
"learning_rate": 1.6428393635557146e-05,
"loss": 1.0887,
"step": 572
},
{
"epoch": 0.4113424264178033,
"grad_norm": 0.3478126525878906,
"learning_rate": 1.641754463951918e-05,
"loss": 1.0468,
"step": 573
},
{
"epoch": 0.4120603015075377,
"grad_norm": 0.7069317698478699,
"learning_rate": 1.640668838146237e-05,
"loss": 1.0561,
"step": 574
},
{
"epoch": 0.41277817659727206,
"grad_norm": 0.2756154239177704,
"learning_rate": 1.639582491700284e-05,
"loss": 1.1029,
"step": 575
},
{
"epoch": 0.4134960516870065,
"grad_norm": 0.2211773544549942,
"learning_rate": 1.638495430179365e-05,
"loss": 1.0336,
"step": 576
},
{
"epoch": 0.41421392677674085,
"grad_norm": 0.5828403830528259,
"learning_rate": 1.637407659152447e-05,
"loss": 1.1292,
"step": 577
},
{
"epoch": 0.4149318018664752,
"grad_norm": 0.2376655787229538,
"learning_rate": 1.6363191841921346e-05,
"loss": 1.0904,
"step": 578
},
{
"epoch": 0.41564967695620963,
"grad_norm": 0.24718521535396576,
"learning_rate": 1.6352300108746365e-05,
"loss": 1.0573,
"step": 579
},
{
"epoch": 0.416367552045944,
"grad_norm": 0.6126704216003418,
"learning_rate": 1.6341401447797397e-05,
"loss": 1.3006,
"step": 580
},
{
"epoch": 0.41708542713567837,
"grad_norm": 0.2663542926311493,
"learning_rate": 1.6330495914907803e-05,
"loss": 1.1387,
"step": 581
},
{
"epoch": 0.4178033022254128,
"grad_norm": 0.24516524374485016,
"learning_rate": 1.631958356594615e-05,
"loss": 1.0717,
"step": 582
},
{
"epoch": 0.41852117731514715,
"grad_norm": 0.3285379707813263,
"learning_rate": 1.6308664456815914e-05,
"loss": 1.0615,
"step": 583
},
{
"epoch": 0.4192390524048816,
"grad_norm": 0.2067587524652481,
"learning_rate": 1.6297738643455225e-05,
"loss": 1.1256,
"step": 584
},
{
"epoch": 0.41995692749461594,
"grad_norm": 0.25473785400390625,
"learning_rate": 1.6286806181836535e-05,
"loss": 1.0668,
"step": 585
},
{
"epoch": 0.4206748025843503,
"grad_norm": 0.23833997547626495,
"learning_rate": 1.6275867127966364e-05,
"loss": 1.0937,
"step": 586
},
{
"epoch": 0.4213926776740847,
"grad_norm": 0.2214917689561844,
"learning_rate": 1.6264921537885005e-05,
"loss": 1.0395,
"step": 587
},
{
"epoch": 0.4221105527638191,
"grad_norm": 0.2731074392795563,
"learning_rate": 1.625396946766624e-05,
"loss": 1.1332,
"step": 588
},
{
"epoch": 0.42282842785355346,
"grad_norm": 1.6967939138412476,
"learning_rate": 1.6243010973417033e-05,
"loss": 1.175,
"step": 589
},
{
"epoch": 0.4235463029432879,
"grad_norm": 0.6087107062339783,
"learning_rate": 1.623204611127728e-05,
"loss": 1.3019,
"step": 590
},
{
"epoch": 0.42426417803302224,
"grad_norm": 0.337079793214798,
"learning_rate": 1.6221074937419476e-05,
"loss": 1.1454,
"step": 591
},
{
"epoch": 0.42498205312275666,
"grad_norm": 0.5039341449737549,
"learning_rate": 1.621009750804847e-05,
"loss": 1.2322,
"step": 592
},
{
"epoch": 0.42569992821249103,
"grad_norm": 0.42420458793640137,
"learning_rate": 1.6199113879401143e-05,
"loss": 1.0776,
"step": 593
},
{
"epoch": 0.4264178033022254,
"grad_norm": 0.3929119110107422,
"learning_rate": 1.618812410774615e-05,
"loss": 1.1037,
"step": 594
},
{
"epoch": 0.4271356783919598,
"grad_norm": 0.24679097533226013,
"learning_rate": 1.61771282493836e-05,
"loss": 1.0428,
"step": 595
},
{
"epoch": 0.4278535534816942,
"grad_norm": 0.1979144811630249,
"learning_rate": 1.6166126360644798e-05,
"loss": 1.0692,
"step": 596
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.871056854724884,
"learning_rate": 1.6155118497891936e-05,
"loss": 1.378,
"step": 597
},
{
"epoch": 0.42928930366116297,
"grad_norm": 0.24336977303028107,
"learning_rate": 1.6144104717517802e-05,
"loss": 1.0514,
"step": 598
},
{
"epoch": 0.43000717875089733,
"grad_norm": 0.27994900941848755,
"learning_rate": 1.6133085075945518e-05,
"loss": 1.0876,
"step": 599
},
{
"epoch": 0.43072505384063176,
"grad_norm": 0.447373628616333,
"learning_rate": 1.6122059629628223e-05,
"loss": 1.2451,
"step": 600
},
{
"epoch": 0.4314429289303661,
"grad_norm": 0.2424324005842209,
"learning_rate": 1.611102843504879e-05,
"loss": 1.125,
"step": 601
},
{
"epoch": 0.4321608040201005,
"grad_norm": 0.29035624861717224,
"learning_rate": 1.609999154871954e-05,
"loss": 1.0825,
"step": 602
},
{
"epoch": 0.4328786791098349,
"grad_norm": 0.2490774393081665,
"learning_rate": 1.608894902718196e-05,
"loss": 1.0882,
"step": 603
},
{
"epoch": 0.4335965541995693,
"grad_norm": 0.2781699001789093,
"learning_rate": 1.607790092700641e-05,
"loss": 1.0924,
"step": 604
},
{
"epoch": 0.43431442928930364,
"grad_norm": 0.5178611278533936,
"learning_rate": 1.6066847304791808e-05,
"loss": 1.148,
"step": 605
},
{
"epoch": 0.43503230437903806,
"grad_norm": 0.30101197957992554,
"learning_rate": 1.6055788217165384e-05,
"loss": 1.0633,
"step": 606
},
{
"epoch": 0.4357501794687724,
"grad_norm": 0.22836507856845856,
"learning_rate": 1.6044723720782353e-05,
"loss": 1.1176,
"step": 607
},
{
"epoch": 0.43646805455850685,
"grad_norm": 0.31232619285583496,
"learning_rate": 1.6033653872325646e-05,
"loss": 1.1251,
"step": 608
},
{
"epoch": 0.4371859296482412,
"grad_norm": 0.7165313363075256,
"learning_rate": 1.6022578728505605e-05,
"loss": 1.1678,
"step": 609
},
{
"epoch": 0.4379038047379756,
"grad_norm": 0.40292295813560486,
"learning_rate": 1.6011498346059714e-05,
"loss": 1.1498,
"step": 610
},
{
"epoch": 0.43862167982771,
"grad_norm": 0.4741491973400116,
"learning_rate": 1.6000412781752274e-05,
"loss": 1.1067,
"step": 611
},
{
"epoch": 0.43933955491744436,
"grad_norm": 0.2265220284461975,
"learning_rate": 1.598932209237415e-05,
"loss": 1.0956,
"step": 612
},
{
"epoch": 0.44005743000717873,
"grad_norm": 0.24188081920146942,
"learning_rate": 1.5978226334742454e-05,
"loss": 1.0616,
"step": 613
},
{
"epoch": 0.44077530509691315,
"grad_norm": 0.26520058512687683,
"learning_rate": 1.5967125565700268e-05,
"loss": 1.0825,
"step": 614
},
{
"epoch": 0.4414931801866475,
"grad_norm": 0.2989540100097656,
"learning_rate": 1.595601984211634e-05,
"loss": 1.0677,
"step": 615
},
{
"epoch": 0.44221105527638194,
"grad_norm": 0.24873438477516174,
"learning_rate": 1.5944909220884802e-05,
"loss": 1.1293,
"step": 616
},
{
"epoch": 0.4429289303661163,
"grad_norm": 0.266328364610672,
"learning_rate": 1.593379375892488e-05,
"loss": 1.0891,
"step": 617
},
{
"epoch": 0.44364680545585067,
"grad_norm": 0.519723117351532,
"learning_rate": 1.59226735131806e-05,
"loss": 1.1946,
"step": 618
},
{
"epoch": 0.4443646805455851,
"grad_norm": 0.27866214513778687,
"learning_rate": 1.5911548540620482e-05,
"loss": 1.0822,
"step": 619
},
{
"epoch": 0.44508255563531945,
"grad_norm": 0.22386035323143005,
"learning_rate": 1.5900418898237282e-05,
"loss": 1.0814,
"step": 620
},
{
"epoch": 0.4458004307250538,
"grad_norm": 0.2788839638233185,
"learning_rate": 1.5889284643047664e-05,
"loss": 1.0965,
"step": 621
},
{
"epoch": 0.44651830581478824,
"grad_norm": 0.41083183884620667,
"learning_rate": 1.587814583209193e-05,
"loss": 1.0948,
"step": 622
},
{
"epoch": 0.4472361809045226,
"grad_norm": 1.0803130865097046,
"learning_rate": 1.5867002522433714e-05,
"loss": 1.3797,
"step": 623
},
{
"epoch": 0.44795405599425697,
"grad_norm": 0.38585397601127625,
"learning_rate": 1.5855854771159706e-05,
"loss": 1.1765,
"step": 624
},
{
"epoch": 0.4486719310839914,
"grad_norm": 0.25229373574256897,
"learning_rate": 1.5844702635379342e-05,
"loss": 1.122,
"step": 625
},
{
"epoch": 0.44938980617372576,
"grad_norm": 0.40471434593200684,
"learning_rate": 1.583354617222453e-05,
"loss": 1.1815,
"step": 626
},
{
"epoch": 0.4501076812634602,
"grad_norm": 0.31672340631484985,
"learning_rate": 1.5822385438849327e-05,
"loss": 1.1198,
"step": 627
},
{
"epoch": 0.45082555635319455,
"grad_norm": 0.2720855176448822,
"learning_rate": 1.5811220492429692e-05,
"loss": 1.1041,
"step": 628
},
{
"epoch": 0.4515434314429289,
"grad_norm": 0.5343842506408691,
"learning_rate": 1.580005139016315e-05,
"loss": 1.1088,
"step": 629
},
{
"epoch": 0.45226130653266333,
"grad_norm": 0.784042239189148,
"learning_rate": 1.5788878189268516e-05,
"loss": 1.3126,
"step": 630
},
{
"epoch": 0.4529791816223977,
"grad_norm": 0.26885804533958435,
"learning_rate": 1.5777700946985616e-05,
"loss": 1.1121,
"step": 631
},
{
"epoch": 0.45369705671213206,
"grad_norm": 0.2665066719055176,
"learning_rate": 1.5766519720574964e-05,
"loss": 1.0813,
"step": 632
},
{
"epoch": 0.4544149318018665,
"grad_norm": 0.3135133683681488,
"learning_rate": 1.5755334567317492e-05,
"loss": 1.0603,
"step": 633
},
{
"epoch": 0.45513280689160085,
"grad_norm": 0.38875702023506165,
"learning_rate": 1.574414554451425e-05,
"loss": 1.1628,
"step": 634
},
{
"epoch": 0.45585068198133527,
"grad_norm": 0.22252540290355682,
"learning_rate": 1.5732952709486108e-05,
"loss": 1.0928,
"step": 635
},
{
"epoch": 0.45656855707106964,
"grad_norm": 0.36836034059524536,
"learning_rate": 1.572175611957347e-05,
"loss": 1.0618,
"step": 636
},
{
"epoch": 0.457286432160804,
"grad_norm": 0.3460846543312073,
"learning_rate": 1.5710555832135974e-05,
"loss": 1.0675,
"step": 637
},
{
"epoch": 0.4580043072505384,
"grad_norm": 0.23035378754138947,
"learning_rate": 1.5699351904552197e-05,
"loss": 1.1211,
"step": 638
},
{
"epoch": 0.4587221823402728,
"grad_norm": 0.22755472362041473,
"learning_rate": 1.568814439421937e-05,
"loss": 1.0974,
"step": 639
},
{
"epoch": 0.45944005743000715,
"grad_norm": 0.7951919436454773,
"learning_rate": 1.567693335855307e-05,
"loss": 1.2301,
"step": 640
},
{
"epoch": 0.4601579325197416,
"grad_norm": 0.38795220851898193,
"learning_rate": 1.5665718854986946e-05,
"loss": 1.2064,
"step": 641
},
{
"epoch": 0.46087580760947594,
"grad_norm": 0.4807037115097046,
"learning_rate": 1.5654500940972405e-05,
"loss": 1.2368,
"step": 642
},
{
"epoch": 0.46159368269921036,
"grad_norm": 0.40683814883232117,
"learning_rate": 1.5643279673978328e-05,
"loss": 1.0689,
"step": 643
},
{
"epoch": 0.4623115577889447,
"grad_norm": 0.25337550044059753,
"learning_rate": 1.563205511149077e-05,
"loss": 1.0846,
"step": 644
},
{
"epoch": 0.4630294328786791,
"grad_norm": 0.24819082021713257,
"learning_rate": 1.562082731101267e-05,
"loss": 1.1202,
"step": 645
},
{
"epoch": 0.4637473079684135,
"grad_norm": 0.5900534987449646,
"learning_rate": 1.560959633006356e-05,
"loss": 1.1474,
"step": 646
},
{
"epoch": 0.4644651830581479,
"grad_norm": 0.2737368643283844,
"learning_rate": 1.5598362226179256e-05,
"loss": 1.1296,
"step": 647
},
{
"epoch": 0.46518305814788224,
"grad_norm": 0.3514651358127594,
"learning_rate": 1.558712505691159e-05,
"loss": 1.0973,
"step": 648
},
{
"epoch": 0.46590093323761667,
"grad_norm": 0.2335313856601715,
"learning_rate": 1.5575884879828068e-05,
"loss": 1.0952,
"step": 649
},
{
"epoch": 0.46661880832735103,
"grad_norm": 0.2619829773902893,
"learning_rate": 1.5564641752511638e-05,
"loss": 1.1108,
"step": 650
},
{
"epoch": 0.46733668341708545,
"grad_norm": 0.5428041219711304,
"learning_rate": 1.555339573256034e-05,
"loss": 1.2138,
"step": 651
},
{
"epoch": 0.4680545585068198,
"grad_norm": 0.35615596175193787,
"learning_rate": 1.5542146877587042e-05,
"loss": 1.1146,
"step": 652
},
{
"epoch": 0.4687724335965542,
"grad_norm": 0.31751585006713867,
"learning_rate": 1.5530895245219132e-05,
"loss": 1.1038,
"step": 653
},
{
"epoch": 0.4694903086862886,
"grad_norm": 0.46363669633865356,
"learning_rate": 1.5519640893098227e-05,
"loss": 1.0462,
"step": 654
},
{
"epoch": 0.47020818377602297,
"grad_norm": 0.23141205310821533,
"learning_rate": 1.550838387887988e-05,
"loss": 1.0729,
"step": 655
},
{
"epoch": 0.47092605886575734,
"grad_norm": 0.2929474711418152,
"learning_rate": 1.549712426023328e-05,
"loss": 1.0883,
"step": 656
},
{
"epoch": 0.47164393395549176,
"grad_norm": 0.2350241243839264,
"learning_rate": 1.5485862094840954e-05,
"loss": 1.1719,
"step": 657
},
{
"epoch": 0.4723618090452261,
"grad_norm": 0.41748782992362976,
"learning_rate": 1.5474597440398485e-05,
"loss": 1.1433,
"step": 658
},
{
"epoch": 0.47307968413496054,
"grad_norm": 0.3260294795036316,
"learning_rate": 1.5463330354614203e-05,
"loss": 1.0893,
"step": 659
},
{
"epoch": 0.4737975592246949,
"grad_norm": 0.3263336420059204,
"learning_rate": 1.5452060895208886e-05,
"loss": 1.1113,
"step": 660
},
{
"epoch": 0.4745154343144293,
"grad_norm": 0.24381938576698303,
"learning_rate": 1.5440789119915484e-05,
"loss": 1.1028,
"step": 661
},
{
"epoch": 0.4752333094041637,
"grad_norm": 1.0602526664733887,
"learning_rate": 1.5429515086478804e-05,
"loss": 1.1219,
"step": 662
},
{
"epoch": 0.47595118449389806,
"grad_norm": 0.31142809987068176,
"learning_rate": 1.5418238852655228e-05,
"loss": 1.0123,
"step": 663
},
{
"epoch": 0.4766690595836324,
"grad_norm": 0.24484698474407196,
"learning_rate": 1.5406960476212403e-05,
"loss": 1.092,
"step": 664
},
{
"epoch": 0.47738693467336685,
"grad_norm": 0.8904862403869629,
"learning_rate": 1.5395680014928957e-05,
"loss": 1.4171,
"step": 665
},
{
"epoch": 0.4781048097631012,
"grad_norm": 0.4050968289375305,
"learning_rate": 1.538439752659419e-05,
"loss": 1.1503,
"step": 666
},
{
"epoch": 0.47882268485283563,
"grad_norm": 0.3236379027366638,
"learning_rate": 1.5373113069007804e-05,
"loss": 1.1444,
"step": 667
},
{
"epoch": 0.47954055994257,
"grad_norm": 0.21954692900180817,
"learning_rate": 1.536182669997957e-05,
"loss": 1.0946,
"step": 668
},
{
"epoch": 0.48025843503230436,
"grad_norm": 0.8067034482955933,
"learning_rate": 1.5350538477329065e-05,
"loss": 1.2717,
"step": 669
},
{
"epoch": 0.4809763101220388,
"grad_norm": 0.2779107689857483,
"learning_rate": 1.533924845888536e-05,
"loss": 1.1264,
"step": 670
},
{
"epoch": 0.48169418521177315,
"grad_norm": 0.3051629960536957,
"learning_rate": 1.5327956702486716e-05,
"loss": 1.1387,
"step": 671
},
{
"epoch": 0.4824120603015075,
"grad_norm": 0.4867023825645447,
"learning_rate": 1.531666326598031e-05,
"loss": 1.1812,
"step": 672
},
{
"epoch": 0.48312993539124194,
"grad_norm": 0.2711983025074005,
"learning_rate": 1.5305368207221918e-05,
"loss": 1.0567,
"step": 673
},
{
"epoch": 0.4838478104809763,
"grad_norm": 2.6231706142425537,
"learning_rate": 1.5294071584075628e-05,
"loss": 1.1421,
"step": 674
},
{
"epoch": 0.48456568557071067,
"grad_norm": 0.21511957049369812,
"learning_rate": 1.5282773454413547e-05,
"loss": 1.0781,
"step": 675
},
{
"epoch": 0.4852835606604451,
"grad_norm": 0.3087017834186554,
"learning_rate": 1.5271473876115495e-05,
"loss": 1.0447,
"step": 676
},
{
"epoch": 0.48600143575017946,
"grad_norm": 0.4581223726272583,
"learning_rate": 1.526017290706871e-05,
"loss": 1.2175,
"step": 677
},
{
"epoch": 0.4867193108399139,
"grad_norm": 0.2953665554523468,
"learning_rate": 1.5248870605167572e-05,
"loss": 1.0522,
"step": 678
},
{
"epoch": 0.48743718592964824,
"grad_norm": 0.26089370250701904,
"learning_rate": 1.5237567028313263e-05,
"loss": 1.0479,
"step": 679
},
{
"epoch": 0.4881550610193826,
"grad_norm": 0.5252697467803955,
"learning_rate": 1.5226262234413517e-05,
"loss": 1.2823,
"step": 680
},
{
"epoch": 0.48887293610911703,
"grad_norm": 0.38791197538375854,
"learning_rate": 1.5214956281382292e-05,
"loss": 1.1448,
"step": 681
},
{
"epoch": 0.4895908111988514,
"grad_norm": 0.2559528648853302,
"learning_rate": 1.5203649227139491e-05,
"loss": 1.0822,
"step": 682
},
{
"epoch": 0.49030868628858576,
"grad_norm": 0.42794185876846313,
"learning_rate": 1.519234112961066e-05,
"loss": 1.0962,
"step": 683
},
{
"epoch": 0.4910265613783202,
"grad_norm": 1.1970304250717163,
"learning_rate": 1.5181032046726674e-05,
"loss": 1.2315,
"step": 684
},
{
"epoch": 0.49174443646805455,
"grad_norm": 0.43080934882164,
"learning_rate": 1.516972203642348e-05,
"loss": 1.1181,
"step": 685
},
{
"epoch": 0.49246231155778897,
"grad_norm": 0.2969713509082794,
"learning_rate": 1.5158411156641753e-05,
"loss": 1.1076,
"step": 686
},
{
"epoch": 0.49318018664752333,
"grad_norm": 0.5752540826797485,
"learning_rate": 1.5147099465326638e-05,
"loss": 1.2848,
"step": 687
},
{
"epoch": 0.4938980617372577,
"grad_norm": 0.3028205633163452,
"learning_rate": 1.5135787020427432e-05,
"loss": 1.1198,
"step": 688
},
{
"epoch": 0.4946159368269921,
"grad_norm": 0.22824488580226898,
"learning_rate": 1.5124473879897292e-05,
"loss": 1.0493,
"step": 689
},
{
"epoch": 0.4953338119167265,
"grad_norm": 0.2328827828168869,
"learning_rate": 1.5113160101692938e-05,
"loss": 1.1029,
"step": 690
},
{
"epoch": 0.49605168700646085,
"grad_norm": 0.4405105412006378,
"learning_rate": 1.5101845743774362e-05,
"loss": 1.1827,
"step": 691
},
{
"epoch": 0.49676956209619527,
"grad_norm": 0.24878334999084473,
"learning_rate": 1.5090530864104518e-05,
"loss": 1.0473,
"step": 692
},
{
"epoch": 0.49748743718592964,
"grad_norm": 0.25342586636543274,
"learning_rate": 1.5079215520649037e-05,
"loss": 1.084,
"step": 693
},
{
"epoch": 0.49820531227566406,
"grad_norm": 0.29962748289108276,
"learning_rate": 1.5067899771375931e-05,
"loss": 1.1045,
"step": 694
},
{
"epoch": 0.4989231873653984,
"grad_norm": 0.37971824407577515,
"learning_rate": 1.5056583674255281e-05,
"loss": 1.1018,
"step": 695
},
{
"epoch": 0.4996410624551328,
"grad_norm": 0.38523370027542114,
"learning_rate": 1.5045267287258955e-05,
"loss": 1.1416,
"step": 696
},
{
"epoch": 0.5003589375448672,
"grad_norm": 0.2509319484233856,
"learning_rate": 1.5033950668360307e-05,
"loss": 1.1139,
"step": 697
},
{
"epoch": 0.5010768126346016,
"grad_norm": 0.2796666622161865,
"learning_rate": 1.5022633875533879e-05,
"loss": 1.0808,
"step": 698
},
{
"epoch": 0.501794687724336,
"grad_norm": 0.2840481102466583,
"learning_rate": 1.5011316966755103e-05,
"loss": 1.1403,
"step": 699
},
{
"epoch": 0.5025125628140703,
"grad_norm": 0.24237246811389923,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.058,
"step": 700
},
{
"epoch": 0.5032304379038047,
"grad_norm": 0.2952876687049866,
"learning_rate": 1.4988683033244903e-05,
"loss": 1.0582,
"step": 701
},
{
"epoch": 0.5039483129935391,
"grad_norm": 0.2998954951763153,
"learning_rate": 1.4977366124466123e-05,
"loss": 1.0834,
"step": 702
},
{
"epoch": 0.5046661880832735,
"grad_norm": 0.6092143654823303,
"learning_rate": 1.4966049331639695e-05,
"loss": 1.1065,
"step": 703
},
{
"epoch": 0.5053840631730079,
"grad_norm": 0.2629152536392212,
"learning_rate": 1.4954732712741047e-05,
"loss": 1.1233,
"step": 704
},
{
"epoch": 0.5061019382627423,
"grad_norm": 0.2268097698688507,
"learning_rate": 1.4943416325744725e-05,
"loss": 1.0665,
"step": 705
},
{
"epoch": 0.5068198133524767,
"grad_norm": 0.39073359966278076,
"learning_rate": 1.4932100228624073e-05,
"loss": 1.2042,
"step": 706
},
{
"epoch": 0.507537688442211,
"grad_norm": 20.153024673461914,
"learning_rate": 1.4920784479350962e-05,
"loss": 1.0943,
"step": 707
},
{
"epoch": 0.5082555635319455,
"grad_norm": 0.48045504093170166,
"learning_rate": 1.4909469135895486e-05,
"loss": 1.137,
"step": 708
},
{
"epoch": 0.5089734386216799,
"grad_norm": 0.22653381526470184,
"learning_rate": 1.4898154256225644e-05,
"loss": 1.0343,
"step": 709
},
{
"epoch": 0.5096913137114142,
"grad_norm": 0.208787739276886,
"learning_rate": 1.4886839898307065e-05,
"loss": 1.0188,
"step": 710
},
{
"epoch": 0.5104091888011486,
"grad_norm": 0.2916351556777954,
"learning_rate": 1.487552612010271e-05,
"loss": 1.1134,
"step": 711
},
{
"epoch": 0.511127063890883,
"grad_norm": 0.29213395714759827,
"learning_rate": 1.486421297957257e-05,
"loss": 1.0753,
"step": 712
},
{
"epoch": 0.5118449389806173,
"grad_norm": 0.30358755588531494,
"learning_rate": 1.4852900534673364e-05,
"loss": 1.1345,
"step": 713
},
{
"epoch": 0.5125628140703518,
"grad_norm": 0.38216859102249146,
"learning_rate": 1.484158884335825e-05,
"loss": 1.1188,
"step": 714
},
{
"epoch": 0.5132806891600862,
"grad_norm": 0.9039308428764343,
"learning_rate": 1.4830277963576525e-05,
"loss": 1.1187,
"step": 715
},
{
"epoch": 0.5139985642498205,
"grad_norm": 0.2617970108985901,
"learning_rate": 1.4818967953273328e-05,
"loss": 1.0439,
"step": 716
},
{
"epoch": 0.5147164393395549,
"grad_norm": 0.22756274044513702,
"learning_rate": 1.4807658870389346e-05,
"loss": 1.0172,
"step": 717
},
{
"epoch": 0.5154343144292893,
"grad_norm": 0.2641029357910156,
"learning_rate": 1.4796350772860511e-05,
"loss": 1.0294,
"step": 718
},
{
"epoch": 0.5161521895190236,
"grad_norm": 0.26552608609199524,
"learning_rate": 1.4785043718617713e-05,
"loss": 1.092,
"step": 719
},
{
"epoch": 0.5168700646087581,
"grad_norm": 0.5893883109092712,
"learning_rate": 1.4773737765586486e-05,
"loss": 1.2536,
"step": 720
},
{
"epoch": 0.5175879396984925,
"grad_norm": 0.6891839504241943,
"learning_rate": 1.4762432971686743e-05,
"loss": 1.237,
"step": 721
},
{
"epoch": 0.5183058147882269,
"grad_norm": 0.21907195448875427,
"learning_rate": 1.4751129394832432e-05,
"loss": 1.1225,
"step": 722
},
{
"epoch": 0.5190236898779612,
"grad_norm": 0.37343645095825195,
"learning_rate": 1.4739827092931291e-05,
"loss": 1.112,
"step": 723
},
{
"epoch": 0.5197415649676956,
"grad_norm": 0.9030300974845886,
"learning_rate": 1.472852612388451e-05,
"loss": 1.3152,
"step": 724
},
{
"epoch": 0.5204594400574301,
"grad_norm": 0.2366669923067093,
"learning_rate": 1.4717226545586454e-05,
"loss": 1.0398,
"step": 725
},
{
"epoch": 0.5211773151471644,
"grad_norm": 0.6108279824256897,
"learning_rate": 1.4705928415924372e-05,
"loss": 1.323,
"step": 726
},
{
"epoch": 0.5218951902368988,
"grad_norm": 0.29814794659614563,
"learning_rate": 1.4694631792778084e-05,
"loss": 1.071,
"step": 727
},
{
"epoch": 0.5226130653266332,
"grad_norm": 0.2940959334373474,
"learning_rate": 1.4683336734019693e-05,
"loss": 1.0795,
"step": 728
},
{
"epoch": 0.5233309404163675,
"grad_norm": 0.5142949819564819,
"learning_rate": 1.4672043297513288e-05,
"loss": 1.4383,
"step": 729
},
{
"epoch": 0.5240488155061019,
"grad_norm": 0.2306988537311554,
"learning_rate": 1.4660751541114641e-05,
"loss": 0.9869,
"step": 730
},
{
"epoch": 0.5247666905958364,
"grad_norm": 0.24959023296833038,
"learning_rate": 1.4649461522670936e-05,
"loss": 1.0652,
"step": 731
},
{
"epoch": 0.5254845656855707,
"grad_norm": 0.23822470009326935,
"learning_rate": 1.4638173300020433e-05,
"loss": 1.0703,
"step": 732
},
{
"epoch": 0.5262024407753051,
"grad_norm": 0.4634005129337311,
"learning_rate": 1.4626886930992199e-05,
"loss": 1.0548,
"step": 733
},
{
"epoch": 0.5269203158650395,
"grad_norm": 0.23906712234020233,
"learning_rate": 1.4615602473405813e-05,
"loss": 1.0999,
"step": 734
},
{
"epoch": 0.5276381909547738,
"grad_norm": 0.23547573387622833,
"learning_rate": 1.4604319985071047e-05,
"loss": 1.132,
"step": 735
},
{
"epoch": 0.5283560660445082,
"grad_norm": 0.2725988030433655,
"learning_rate": 1.45930395237876e-05,
"loss": 1.0686,
"step": 736
},
{
"epoch": 0.5290739411342427,
"grad_norm": 0.2721105217933655,
"learning_rate": 1.4581761147344776e-05,
"loss": 1.048,
"step": 737
},
{
"epoch": 0.529791816223977,
"grad_norm": 0.22710049152374268,
"learning_rate": 1.4570484913521197e-05,
"loss": 1.1312,
"step": 738
},
{
"epoch": 0.5305096913137114,
"grad_norm": 0.7076483964920044,
"learning_rate": 1.455921088008452e-05,
"loss": 1.247,
"step": 739
},
{
"epoch": 0.5312275664034458,
"grad_norm": 0.23128236830234528,
"learning_rate": 1.454793910479112e-05,
"loss": 1.0295,
"step": 740
},
{
"epoch": 0.5319454414931802,
"grad_norm": 0.3500516414642334,
"learning_rate": 1.4536669645385803e-05,
"loss": 1.0223,
"step": 741
},
{
"epoch": 0.5326633165829145,
"grad_norm": 0.21426959335803986,
"learning_rate": 1.4525402559601517e-05,
"loss": 1.1087,
"step": 742
},
{
"epoch": 0.533381191672649,
"grad_norm": 0.3150583505630493,
"learning_rate": 1.4514137905159048e-05,
"loss": 1.0764,
"step": 743
},
{
"epoch": 0.5340990667623834,
"grad_norm": 0.6355829834938049,
"learning_rate": 1.4502875739766724e-05,
"loss": 1.1481,
"step": 744
},
{
"epoch": 0.5348169418521177,
"grad_norm": 0.3957350254058838,
"learning_rate": 1.4491616121120125e-05,
"loss": 1.1127,
"step": 745
},
{
"epoch": 0.5355348169418521,
"grad_norm": 0.22465340793132782,
"learning_rate": 1.4480359106901776e-05,
"loss": 1.1038,
"step": 746
},
{
"epoch": 0.5362526920315865,
"grad_norm": 0.30386146903038025,
"learning_rate": 1.4469104754780872e-05,
"loss": 1.0665,
"step": 747
},
{
"epoch": 0.5369705671213209,
"grad_norm": 0.2654268145561218,
"learning_rate": 1.4457853122412962e-05,
"loss": 1.1224,
"step": 748
},
{
"epoch": 0.5376884422110553,
"grad_norm": 0.40003833174705505,
"learning_rate": 1.4446604267439663e-05,
"loss": 1.1331,
"step": 749
},
{
"epoch": 0.5384063173007897,
"grad_norm": 0.8957158923149109,
"learning_rate": 1.4435358247488368e-05,
"loss": 1.3964,
"step": 750
},
{
"epoch": 0.539124192390524,
"grad_norm": 0.29989439249038696,
"learning_rate": 1.4424115120171933e-05,
"loss": 1.0824,
"step": 751
},
{
"epoch": 0.5398420674802584,
"grad_norm": 0.2515528202056885,
"learning_rate": 1.4412874943088416e-05,
"loss": 1.0601,
"step": 752
},
{
"epoch": 0.5405599425699928,
"grad_norm": 0.7854285836219788,
"learning_rate": 1.4401637773820744e-05,
"loss": 1.4129,
"step": 753
},
{
"epoch": 0.5412778176597272,
"grad_norm": 0.43629977107048035,
"learning_rate": 1.4390403669936444e-05,
"loss": 1.0334,
"step": 754
},
{
"epoch": 0.5419956927494616,
"grad_norm": 0.3499593734741211,
"learning_rate": 1.4379172688987332e-05,
"loss": 1.133,
"step": 755
},
{
"epoch": 0.542713567839196,
"grad_norm": 0.22085939347743988,
"learning_rate": 1.4367944888509234e-05,
"loss": 1.043,
"step": 756
},
{
"epoch": 0.5434314429289304,
"grad_norm": 0.2278064340353012,
"learning_rate": 1.4356720326021676e-05,
"loss": 1.1319,
"step": 757
},
{
"epoch": 0.5441493180186647,
"grad_norm": 0.6754797101020813,
"learning_rate": 1.4345499059027597e-05,
"loss": 1.1694,
"step": 758
},
{
"epoch": 0.5448671931083992,
"grad_norm": 0.2929733991622925,
"learning_rate": 1.4334281145013056e-05,
"loss": 1.0546,
"step": 759
},
{
"epoch": 0.5455850681981336,
"grad_norm": 0.2639637887477875,
"learning_rate": 1.4323066641446932e-05,
"loss": 1.0479,
"step": 760
},
{
"epoch": 0.5463029432878679,
"grad_norm": 0.4230010509490967,
"learning_rate": 1.4311855605780633e-05,
"loss": 1.1826,
"step": 761
},
{
"epoch": 0.5470208183776023,
"grad_norm": 0.4472368359565735,
"learning_rate": 1.4300648095447807e-05,
"loss": 1.183,
"step": 762
},
{
"epoch": 0.5477386934673367,
"grad_norm": 0.5689032673835754,
"learning_rate": 1.4289444167864028e-05,
"loss": 1.2423,
"step": 763
},
{
"epoch": 0.548456568557071,
"grad_norm": 0.23773930966854095,
"learning_rate": 1.427824388042653e-05,
"loss": 1.0871,
"step": 764
},
{
"epoch": 0.5491744436468055,
"grad_norm": 0.29465198516845703,
"learning_rate": 1.4267047290513894e-05,
"loss": 1.0822,
"step": 765
},
{
"epoch": 0.5498923187365399,
"grad_norm": 0.5034121870994568,
"learning_rate": 1.4255854455485753e-05,
"loss": 1.1953,
"step": 766
},
{
"epoch": 0.5506101938262742,
"grad_norm": 0.317012220621109,
"learning_rate": 1.4244665432682509e-05,
"loss": 1.0673,
"step": 767
},
{
"epoch": 0.5513280689160086,
"grad_norm": 0.2964475452899933,
"learning_rate": 1.423348027942504e-05,
"loss": 1.0656,
"step": 768
},
{
"epoch": 0.552045944005743,
"grad_norm": 0.3850986063480377,
"learning_rate": 1.4222299053014388e-05,
"loss": 1.2415,
"step": 769
},
{
"epoch": 0.5527638190954773,
"grad_norm": 0.25714462995529175,
"learning_rate": 1.4211121810731484e-05,
"loss": 1.1024,
"step": 770
},
{
"epoch": 0.5534816941852118,
"grad_norm": 0.3553484082221985,
"learning_rate": 1.4199948609836855e-05,
"loss": 1.1913,
"step": 771
},
{
"epoch": 0.5541995692749462,
"grad_norm": 0.2542771100997925,
"learning_rate": 1.4188779507570312e-05,
"loss": 1.1042,
"step": 772
},
{
"epoch": 0.5549174443646806,
"grad_norm": 0.3948219120502472,
"learning_rate": 1.4177614561150674e-05,
"loss": 1.0667,
"step": 773
},
{
"epoch": 0.5556353194544149,
"grad_norm": 0.5587366223335266,
"learning_rate": 1.4166453827775474e-05,
"loss": 1.1596,
"step": 774
},
{
"epoch": 0.5563531945441493,
"grad_norm": 0.8336030840873718,
"learning_rate": 1.415529736462066e-05,
"loss": 1.0753,
"step": 775
},
{
"epoch": 0.5570710696338838,
"grad_norm": 0.28086596727371216,
"learning_rate": 1.4144145228840298e-05,
"loss": 1.0443,
"step": 776
},
{
"epoch": 0.5577889447236181,
"grad_norm": 0.4337119162082672,
"learning_rate": 1.4132997477566287e-05,
"loss": 1.0554,
"step": 777
},
{
"epoch": 0.5585068198133525,
"grad_norm": 0.8185058832168579,
"learning_rate": 1.4121854167908073e-05,
"loss": 1.5668,
"step": 778
},
{
"epoch": 0.5592246949030869,
"grad_norm": 0.4682464301586151,
"learning_rate": 1.4110715356952338e-05,
"loss": 1.1536,
"step": 779
},
{
"epoch": 0.5599425699928212,
"grad_norm": 0.33439281582832336,
"learning_rate": 1.409958110176272e-05,
"loss": 1.114,
"step": 780
},
{
"epoch": 0.5606604450825556,
"grad_norm": 1.218021035194397,
"learning_rate": 1.4088451459379522e-05,
"loss": 1.4435,
"step": 781
},
{
"epoch": 0.5613783201722901,
"grad_norm": 0.2248886376619339,
"learning_rate": 1.4077326486819404e-05,
"loss": 1.0371,
"step": 782
},
{
"epoch": 0.5620961952620244,
"grad_norm": 0.2860560417175293,
"learning_rate": 1.4066206241075124e-05,
"loss": 1.0846,
"step": 783
},
{
"epoch": 0.5628140703517588,
"grad_norm": 0.26693689823150635,
"learning_rate": 1.4055090779115204e-05,
"loss": 1.0495,
"step": 784
},
{
"epoch": 0.5635319454414932,
"grad_norm": 0.45003026723861694,
"learning_rate": 1.4043980157883665e-05,
"loss": 1.2092,
"step": 785
},
{
"epoch": 0.5642498205312275,
"grad_norm": 0.295254647731781,
"learning_rate": 1.4032874434299736e-05,
"loss": 1.1434,
"step": 786
},
{
"epoch": 0.5649676956209619,
"grad_norm": 0.2593385875225067,
"learning_rate": 1.4021773665257547e-05,
"loss": 1.0443,
"step": 787
},
{
"epoch": 0.5656855707106964,
"grad_norm": 0.2153460830450058,
"learning_rate": 1.4010677907625852e-05,
"loss": 0.9911,
"step": 788
},
{
"epoch": 0.5664034458004307,
"grad_norm": 0.2601267397403717,
"learning_rate": 1.399958721824773e-05,
"loss": 1.0849,
"step": 789
},
{
"epoch": 0.5671213208901651,
"grad_norm": 0.37321093678474426,
"learning_rate": 1.3988501653940292e-05,
"loss": 1.1383,
"step": 790
},
{
"epoch": 0.5678391959798995,
"grad_norm": 0.3128059506416321,
"learning_rate": 1.3977421271494395e-05,
"loss": 1.0725,
"step": 791
},
{
"epoch": 0.5685570710696339,
"grad_norm": 0.2780809700489044,
"learning_rate": 1.3966346127674357e-05,
"loss": 1.0723,
"step": 792
},
{
"epoch": 0.5692749461593682,
"grad_norm": 0.27749887108802795,
"learning_rate": 1.395527627921765e-05,
"loss": 1.1419,
"step": 793
},
{
"epoch": 0.5699928212491027,
"grad_norm": 0.8342905044555664,
"learning_rate": 1.3944211782834617e-05,
"loss": 1.0857,
"step": 794
},
{
"epoch": 0.5707106963388371,
"grad_norm": 0.3646883964538574,
"learning_rate": 1.3933152695208195e-05,
"loss": 1.0302,
"step": 795
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.5998344421386719,
"learning_rate": 1.3922099072993595e-05,
"loss": 1.2071,
"step": 796
},
{
"epoch": 0.5721464465183058,
"grad_norm": 0.44291579723358154,
"learning_rate": 1.391105097281804e-05,
"loss": 1.0039,
"step": 797
},
{
"epoch": 0.5728643216080402,
"grad_norm": 0.4438372254371643,
"learning_rate": 1.3900008451280463e-05,
"loss": 1.0657,
"step": 798
},
{
"epoch": 0.5735821966977745,
"grad_norm": 0.2658810615539551,
"learning_rate": 1.3888971564951214e-05,
"loss": 1.128,
"step": 799
},
{
"epoch": 0.574300071787509,
"grad_norm": 0.24561521410942078,
"learning_rate": 1.387794037037178e-05,
"loss": 1.0229,
"step": 800
},
{
"epoch": 0.5750179468772434,
"grad_norm": 0.24019256234169006,
"learning_rate": 1.3866914924054484e-05,
"loss": 1.0851,
"step": 801
},
{
"epoch": 0.5757358219669777,
"grad_norm": 0.2257530838251114,
"learning_rate": 1.3855895282482202e-05,
"loss": 1.0273,
"step": 802
},
{
"epoch": 0.5764536970567121,
"grad_norm": 0.36407381296157837,
"learning_rate": 1.3844881502108068e-05,
"loss": 1.0881,
"step": 803
},
{
"epoch": 0.5771715721464465,
"grad_norm": 0.23725035786628723,
"learning_rate": 1.3833873639355205e-05,
"loss": 1.0479,
"step": 804
},
{
"epoch": 0.5778894472361809,
"grad_norm": 0.28036853671073914,
"learning_rate": 1.3822871750616402e-05,
"loss": 1.0658,
"step": 805
},
{
"epoch": 0.5786073223259153,
"grad_norm": 0.2925416827201843,
"learning_rate": 1.3811875892253855e-05,
"loss": 1.0781,
"step": 806
},
{
"epoch": 0.5793251974156497,
"grad_norm": 0.4388463795185089,
"learning_rate": 1.3800886120598859e-05,
"loss": 1.1809,
"step": 807
},
{
"epoch": 0.5800430725053841,
"grad_norm": 0.23365306854248047,
"learning_rate": 1.3789902491951535e-05,
"loss": 1.1173,
"step": 808
},
{
"epoch": 0.5807609475951184,
"grad_norm": 0.3360154330730438,
"learning_rate": 1.3778925062580528e-05,
"loss": 1.0896,
"step": 809
},
{
"epoch": 0.5814788226848528,
"grad_norm": 0.2643462121486664,
"learning_rate": 1.3767953888722726e-05,
"loss": 1.0878,
"step": 810
},
{
"epoch": 0.5821966977745873,
"grad_norm": 0.21491171419620514,
"learning_rate": 1.3756989026582967e-05,
"loss": 1.1152,
"step": 811
},
{
"epoch": 0.5829145728643216,
"grad_norm": 0.9810214638710022,
"learning_rate": 1.3746030532333765e-05,
"loss": 1.2416,
"step": 812
},
{
"epoch": 0.583632447954056,
"grad_norm": 0.3351687788963318,
"learning_rate": 1.3735078462114994e-05,
"loss": 1.0991,
"step": 813
},
{
"epoch": 0.5843503230437904,
"grad_norm": 0.5390233397483826,
"learning_rate": 1.3724132872033637e-05,
"loss": 1.1722,
"step": 814
},
{
"epoch": 0.5850681981335247,
"grad_norm": 0.20129382610321045,
"learning_rate": 1.3713193818163468e-05,
"loss": 1.0588,
"step": 815
},
{
"epoch": 0.5857860732232592,
"grad_norm": 0.2601025700569153,
"learning_rate": 1.3702261356544778e-05,
"loss": 1.0887,
"step": 816
},
{
"epoch": 0.5865039483129936,
"grad_norm": 0.3282659351825714,
"learning_rate": 1.3691335543184087e-05,
"loss": 1.1,
"step": 817
},
{
"epoch": 0.5872218234027279,
"grad_norm": 0.22936835885047913,
"learning_rate": 1.3680416434053854e-05,
"loss": 1.0494,
"step": 818
},
{
"epoch": 0.5879396984924623,
"grad_norm": 0.6817067861557007,
"learning_rate": 1.3669504085092201e-05,
"loss": 1.1669,
"step": 819
},
{
"epoch": 0.5886575735821967,
"grad_norm": 0.25036540627479553,
"learning_rate": 1.365859855220261e-05,
"loss": 1.0679,
"step": 820
},
{
"epoch": 0.589375448671931,
"grad_norm": 0.29915371537208557,
"learning_rate": 1.364769989125364e-05,
"loss": 1.0822,
"step": 821
},
{
"epoch": 0.5900933237616655,
"grad_norm": 0.2721613049507141,
"learning_rate": 1.3636808158078659e-05,
"loss": 1.1265,
"step": 822
},
{
"epoch": 0.5908111988513999,
"grad_norm": 0.26752978563308716,
"learning_rate": 1.3625923408475532e-05,
"loss": 1.0853,
"step": 823
},
{
"epoch": 0.5915290739411343,
"grad_norm": 0.7966066598892212,
"learning_rate": 1.3615045698206357e-05,
"loss": 1.1633,
"step": 824
},
{
"epoch": 0.5922469490308686,
"grad_norm": 0.29881155490875244,
"learning_rate": 1.3604175082997161e-05,
"loss": 1.0575,
"step": 825
},
{
"epoch": 0.592964824120603,
"grad_norm": 0.3675658702850342,
"learning_rate": 1.3593311618537635e-05,
"loss": 1.0024,
"step": 826
},
{
"epoch": 0.5936826992103375,
"grad_norm": 0.25546130537986755,
"learning_rate": 1.3582455360480821e-05,
"loss": 1.1442,
"step": 827
},
{
"epoch": 0.5944005743000718,
"grad_norm": 0.42129403352737427,
"learning_rate": 1.3571606364442858e-05,
"loss": 1.1836,
"step": 828
},
{
"epoch": 0.5951184493898062,
"grad_norm": 0.2958780825138092,
"learning_rate": 1.356076468600267e-05,
"loss": 1.0958,
"step": 829
},
{
"epoch": 0.5958363244795406,
"grad_norm": 0.27691933512687683,
"learning_rate": 1.3549930380701702e-05,
"loss": 1.0744,
"step": 830
},
{
"epoch": 0.5965541995692749,
"grad_norm": 0.5291327834129333,
"learning_rate": 1.3539103504043625e-05,
"loss": 1.0502,
"step": 831
},
{
"epoch": 0.5972720746590093,
"grad_norm": 0.22101576626300812,
"learning_rate": 1.352828411149405e-05,
"loss": 1.0619,
"step": 832
},
{
"epoch": 0.5979899497487438,
"grad_norm": 0.5621711611747742,
"learning_rate": 1.3517472258480251e-05,
"loss": 1.2351,
"step": 833
},
{
"epoch": 0.5987078248384781,
"grad_norm": 0.3979596495628357,
"learning_rate": 1.350666800039087e-05,
"loss": 1.1566,
"step": 834
},
{
"epoch": 0.5994256999282125,
"grad_norm": 0.28108641505241394,
"learning_rate": 1.3495871392575652e-05,
"loss": 1.0411,
"step": 835
},
{
"epoch": 0.6001435750179469,
"grad_norm": 0.3246525228023529,
"learning_rate": 1.3485082490345144e-05,
"loss": 1.0516,
"step": 836
},
{
"epoch": 0.6008614501076812,
"grad_norm": 0.21024064719676971,
"learning_rate": 1.3474301348970415e-05,
"loss": 1.0598,
"step": 837
},
{
"epoch": 0.6015793251974156,
"grad_norm": 0.46584293246269226,
"learning_rate": 1.346352802368278e-05,
"loss": 1.2195,
"step": 838
},
{
"epoch": 0.6022972002871501,
"grad_norm": 0.2288856953382492,
"learning_rate": 1.3452762569673508e-05,
"loss": 1.1058,
"step": 839
},
{
"epoch": 0.6030150753768844,
"grad_norm": 0.23454317450523376,
"learning_rate": 1.3442005042093546e-05,
"loss": 1.0777,
"step": 840
},
{
"epoch": 0.6037329504666188,
"grad_norm": 0.35311251878738403,
"learning_rate": 1.3431255496053241e-05,
"loss": 1.1497,
"step": 841
},
{
"epoch": 0.6044508255563532,
"grad_norm": 0.3204866051673889,
"learning_rate": 1.342051398662204e-05,
"loss": 1.065,
"step": 842
},
{
"epoch": 0.6051687006460876,
"grad_norm": 0.2498190850019455,
"learning_rate": 1.3409780568828223e-05,
"loss": 1.0621,
"step": 843
},
{
"epoch": 0.6058865757358219,
"grad_norm": 0.31979426741600037,
"learning_rate": 1.3399055297658615e-05,
"loss": 1.0562,
"step": 844
},
{
"epoch": 0.6066044508255564,
"grad_norm": 0.5402187705039978,
"learning_rate": 1.3388338228058314e-05,
"loss": 1.0896,
"step": 845
},
{
"epoch": 0.6073223259152908,
"grad_norm": 0.2143259346485138,
"learning_rate": 1.3377629414930397e-05,
"loss": 1.0851,
"step": 846
},
{
"epoch": 0.6080402010050251,
"grad_norm": 0.41969624161720276,
"learning_rate": 1.3366928913135638e-05,
"loss": 1.1066,
"step": 847
},
{
"epoch": 0.6087580760947595,
"grad_norm": 0.20470081269741058,
"learning_rate": 1.3356236777492238e-05,
"loss": 1.1321,
"step": 848
},
{
"epoch": 0.6094759511844939,
"grad_norm": 0.7368186116218567,
"learning_rate": 1.3345553062775536e-05,
"loss": 1.1907,
"step": 849
},
{
"epoch": 0.6101938262742282,
"grad_norm": 0.32906386256217957,
"learning_rate": 1.3334877823717737e-05,
"loss": 1.042,
"step": 850
},
{
"epoch": 0.6109117013639627,
"grad_norm": 0.24521546065807343,
"learning_rate": 1.3324211115007622e-05,
"loss": 1.0404,
"step": 851
},
{
"epoch": 0.6116295764536971,
"grad_norm": 0.2039506733417511,
"learning_rate": 1.3313552991290264e-05,
"loss": 1.0194,
"step": 852
},
{
"epoch": 0.6123474515434314,
"grad_norm": 0.25647541880607605,
"learning_rate": 1.3302903507166768e-05,
"loss": 1.0275,
"step": 853
},
{
"epoch": 0.6130653266331658,
"grad_norm": 0.253371000289917,
"learning_rate": 1.3292262717193973e-05,
"loss": 1.061,
"step": 854
},
{
"epoch": 0.6137832017229002,
"grad_norm": 0.41155293583869934,
"learning_rate": 1.3281630675884172e-05,
"loss": 1.1474,
"step": 855
},
{
"epoch": 0.6145010768126346,
"grad_norm": 0.2841004729270935,
"learning_rate": 1.3271007437704853e-05,
"loss": 1.0809,
"step": 856
},
{
"epoch": 0.615218951902369,
"grad_norm": 0.36953848600387573,
"learning_rate": 1.3260393057078391e-05,
"loss": 0.9836,
"step": 857
},
{
"epoch": 0.6159368269921034,
"grad_norm": 8.750887870788574,
"learning_rate": 1.3249787588381797e-05,
"loss": 1.1018,
"step": 858
},
{
"epoch": 0.6166547020818378,
"grad_norm": 0.30933064222335815,
"learning_rate": 1.3239191085946416e-05,
"loss": 1.0642,
"step": 859
},
{
"epoch": 0.6173725771715721,
"grad_norm": 0.2562330961227417,
"learning_rate": 1.3228603604057666e-05,
"loss": 1.0809,
"step": 860
},
{
"epoch": 0.6180904522613065,
"grad_norm": 0.6192391514778137,
"learning_rate": 1.3218025196954752e-05,
"loss": 1.0715,
"step": 861
},
{
"epoch": 0.618808327351041,
"grad_norm": 0.4535701274871826,
"learning_rate": 1.3207455918830386e-05,
"loss": 1.1654,
"step": 862
},
{
"epoch": 0.6195262024407753,
"grad_norm": 0.2728825807571411,
"learning_rate": 1.3196895823830516e-05,
"loss": 1.0756,
"step": 863
},
{
"epoch": 0.6202440775305097,
"grad_norm": 0.2439497709274292,
"learning_rate": 1.3186344966054048e-05,
"loss": 1.1135,
"step": 864
},
{
"epoch": 0.6209619526202441,
"grad_norm": 0.5989809036254883,
"learning_rate": 1.3175803399552553e-05,
"loss": 1.2831,
"step": 865
},
{
"epoch": 0.6216798277099784,
"grad_norm": 0.2545377314090729,
"learning_rate": 1.3165271178330024e-05,
"loss": 1.053,
"step": 866
},
{
"epoch": 0.6223977027997128,
"grad_norm": 0.27281951904296875,
"learning_rate": 1.3154748356342562e-05,
"loss": 1.1144,
"step": 867
},
{
"epoch": 0.6231155778894473,
"grad_norm": 0.29359978437423706,
"learning_rate": 1.3144234987498118e-05,
"loss": 1.0385,
"step": 868
},
{
"epoch": 0.6238334529791816,
"grad_norm": 0.5095093250274658,
"learning_rate": 1.3133731125656224e-05,
"loss": 1.1486,
"step": 869
},
{
"epoch": 0.624551328068916,
"grad_norm": 0.28710466623306274,
"learning_rate": 1.3123236824627696e-05,
"loss": 1.061,
"step": 870
},
{
"epoch": 0.6252692031586504,
"grad_norm": 0.22760869562625885,
"learning_rate": 1.3112752138174382e-05,
"loss": 1.0662,
"step": 871
},
{
"epoch": 0.6259870782483847,
"grad_norm": 0.298197478055954,
"learning_rate": 1.310227712000887e-05,
"loss": 1.1286,
"step": 872
},
{
"epoch": 0.6267049533381192,
"grad_norm": 0.2910802364349365,
"learning_rate": 1.3091811823794214e-05,
"loss": 1.0378,
"step": 873
},
{
"epoch": 0.6274228284278536,
"grad_norm": 0.3472384214401245,
"learning_rate": 1.3081356303143669e-05,
"loss": 1.2315,
"step": 874
},
{
"epoch": 0.628140703517588,
"grad_norm": 0.3093032240867615,
"learning_rate": 1.3070910611620402e-05,
"loss": 1.0685,
"step": 875
},
{
"epoch": 0.6288585786073223,
"grad_norm": 0.2772822976112366,
"learning_rate": 1.306047480273724e-05,
"loss": 1.042,
"step": 876
},
{
"epoch": 0.6295764536970567,
"grad_norm": 0.2791323959827423,
"learning_rate": 1.3050048929956367e-05,
"loss": 1.0863,
"step": 877
},
{
"epoch": 0.6302943287867911,
"grad_norm": 0.26562055945396423,
"learning_rate": 1.3039633046689071e-05,
"loss": 1.0598,
"step": 878
},
{
"epoch": 0.6310122038765255,
"grad_norm": 0.23874737322330475,
"learning_rate": 1.3029227206295465e-05,
"loss": 0.9481,
"step": 879
},
{
"epoch": 0.6317300789662599,
"grad_norm": 0.5026124715805054,
"learning_rate": 1.3018831462084211e-05,
"loss": 1.1357,
"step": 880
},
{
"epoch": 0.6324479540559943,
"grad_norm": 0.24282878637313843,
"learning_rate": 1.3008445867312251e-05,
"loss": 1.0439,
"step": 881
},
{
"epoch": 0.6331658291457286,
"grad_norm": 0.2905985713005066,
"learning_rate": 1.2998070475184533e-05,
"loss": 1.0526,
"step": 882
},
{
"epoch": 0.633883704235463,
"grad_norm": 0.25067904591560364,
"learning_rate": 1.2987705338853724e-05,
"loss": 1.0396,
"step": 883
},
{
"epoch": 0.6346015793251975,
"grad_norm": 0.2646895945072174,
"learning_rate": 1.2977350511419973e-05,
"loss": 1.0337,
"step": 884
},
{
"epoch": 0.6353194544149318,
"grad_norm": 0.7052388787269592,
"learning_rate": 1.2967006045930602e-05,
"loss": 1.1606,
"step": 885
},
{
"epoch": 0.6360373295046662,
"grad_norm": 0.23824433982372284,
"learning_rate": 1.2956671995379848e-05,
"loss": 1.0932,
"step": 886
},
{
"epoch": 0.6367552045944006,
"grad_norm": 0.3025887906551361,
"learning_rate": 1.2946348412708604e-05,
"loss": 1.0138,
"step": 887
},
{
"epoch": 0.6374730796841349,
"grad_norm": 0.43609052896499634,
"learning_rate": 1.2936035350804127e-05,
"loss": 1.2193,
"step": 888
},
{
"epoch": 0.6381909547738693,
"grad_norm": 0.6852173209190369,
"learning_rate": 1.292573286249978e-05,
"loss": 1.1668,
"step": 889
},
{
"epoch": 0.6389088298636038,
"grad_norm": 0.3734978437423706,
"learning_rate": 1.291544100057476e-05,
"loss": 1.0624,
"step": 890
},
{
"epoch": 0.6396267049533381,
"grad_norm": 0.22185556590557098,
"learning_rate": 1.2905159817753816e-05,
"loss": 1.009,
"step": 891
},
{
"epoch": 0.6403445800430725,
"grad_norm": 0.586806058883667,
"learning_rate": 1.2894889366707001e-05,
"loss": 1.3122,
"step": 892
},
{
"epoch": 0.6410624551328069,
"grad_norm": 0.5803049802780151,
"learning_rate": 1.2884629700049385e-05,
"loss": 1.3217,
"step": 893
},
{
"epoch": 0.6417803302225413,
"grad_norm": 0.29484057426452637,
"learning_rate": 1.2874380870340789e-05,
"loss": 1.1699,
"step": 894
},
{
"epoch": 0.6424982053122756,
"grad_norm": 0.4079907536506653,
"learning_rate": 1.286414293008551e-05,
"loss": 1.0945,
"step": 895
},
{
"epoch": 0.6432160804020101,
"grad_norm": 0.3369198441505432,
"learning_rate": 1.2853915931732073e-05,
"loss": 1.0572,
"step": 896
},
{
"epoch": 0.6439339554917445,
"grad_norm": 0.34682968258857727,
"learning_rate": 1.2843699927672941e-05,
"loss": 1.0567,
"step": 897
},
{
"epoch": 0.6446518305814788,
"grad_norm": 0.22931355237960815,
"learning_rate": 1.2833494970244249e-05,
"loss": 1.0373,
"step": 898
},
{
"epoch": 0.6453697056712132,
"grad_norm": 0.3325895369052887,
"learning_rate": 1.2823301111725547e-05,
"loss": 1.0604,
"step": 899
},
{
"epoch": 0.6460875807609476,
"grad_norm": 0.33054086565971375,
"learning_rate": 1.2813118404339526e-05,
"loss": 1.0503,
"step": 900
},
{
"epoch": 0.6468054558506819,
"grad_norm": 0.22238993644714355,
"learning_rate": 1.2802946900251743e-05,
"loss": 1.0339,
"step": 901
},
{
"epoch": 0.6475233309404164,
"grad_norm": 0.4186948835849762,
"learning_rate": 1.2792786651570373e-05,
"loss": 1.0557,
"step": 902
},
{
"epoch": 0.6482412060301508,
"grad_norm": 0.574847400188446,
"learning_rate": 1.2782637710345917e-05,
"loss": 1.2136,
"step": 903
},
{
"epoch": 0.6489590811198851,
"grad_norm": 0.2611059844493866,
"learning_rate": 1.2772500128570955e-05,
"loss": 1.046,
"step": 904
},
{
"epoch": 0.6496769562096195,
"grad_norm": 0.2530059814453125,
"learning_rate": 1.2762373958179878e-05,
"loss": 1.0891,
"step": 905
},
{
"epoch": 0.6503948312993539,
"grad_norm": 0.23121270537376404,
"learning_rate": 1.2752259251048607e-05,
"loss": 1.1423,
"step": 906
},
{
"epoch": 0.6511127063890882,
"grad_norm": 0.36725449562072754,
"learning_rate": 1.2742156058994343e-05,
"loss": 1.0788,
"step": 907
},
{
"epoch": 0.6518305814788227,
"grad_norm": 0.2668430209159851,
"learning_rate": 1.2732064433775297e-05,
"loss": 1.0675,
"step": 908
},
{
"epoch": 0.6525484565685571,
"grad_norm": 0.696319580078125,
"learning_rate": 1.272198442709042e-05,
"loss": 1.5091,
"step": 909
},
{
"epoch": 0.6532663316582915,
"grad_norm": 0.31487971544265747,
"learning_rate": 1.2711916090579137e-05,
"loss": 1.061,
"step": 910
},
{
"epoch": 0.6539842067480258,
"grad_norm": 0.36551928520202637,
"learning_rate": 1.2701859475821101e-05,
"loss": 1.2043,
"step": 911
},
{
"epoch": 0.6547020818377602,
"grad_norm": 0.22510908544063568,
"learning_rate": 1.2691814634335904e-05,
"loss": 1.0643,
"step": 912
},
{
"epoch": 0.6554199569274947,
"grad_norm": 0.2475784569978714,
"learning_rate": 1.2681781617582827e-05,
"loss": 1.0952,
"step": 913
},
{
"epoch": 0.656137832017229,
"grad_norm": 0.19480234384536743,
"learning_rate": 1.267176047696057e-05,
"loss": 1.047,
"step": 914
},
{
"epoch": 0.6568557071069634,
"grad_norm": 0.5187938213348389,
"learning_rate": 1.2661751263807004e-05,
"loss": 1.1726,
"step": 915
},
{
"epoch": 0.6575735821966978,
"grad_norm": 0.2590419054031372,
"learning_rate": 1.2651754029398884e-05,
"loss": 1.0265,
"step": 916
},
{
"epoch": 0.6582914572864321,
"grad_norm": 0.31151992082595825,
"learning_rate": 1.2641768824951599e-05,
"loss": 1.0771,
"step": 917
},
{
"epoch": 0.6590093323761665,
"grad_norm": 0.5828796625137329,
"learning_rate": 1.2631795701618916e-05,
"loss": 1.1212,
"step": 918
},
{
"epoch": 0.659727207465901,
"grad_norm": 0.28743061423301697,
"learning_rate": 1.2621834710492706e-05,
"loss": 1.0482,
"step": 919
},
{
"epoch": 0.6604450825556353,
"grad_norm": 0.2772831916809082,
"learning_rate": 1.261188590260269e-05,
"loss": 0.9182,
"step": 920
},
{
"epoch": 0.6611629576453697,
"grad_norm": 0.24524107575416565,
"learning_rate": 1.2601949328916173e-05,
"loss": 1.0694,
"step": 921
},
{
"epoch": 0.6618808327351041,
"grad_norm": 0.8398991227149963,
"learning_rate": 1.2592025040337782e-05,
"loss": 1.0917,
"step": 922
},
{
"epoch": 0.6625987078248384,
"grad_norm": 0.21522751450538635,
"learning_rate": 1.2582113087709211e-05,
"loss": 1.0471,
"step": 923
},
{
"epoch": 0.6633165829145728,
"grad_norm": 0.6712577939033508,
"learning_rate": 1.257221352180896e-05,
"loss": 1.2126,
"step": 924
},
{
"epoch": 0.6640344580043073,
"grad_norm": 0.2957814931869507,
"learning_rate": 1.2562326393352071e-05,
"loss": 1.0818,
"step": 925
},
{
"epoch": 0.6647523330940417,
"grad_norm": 0.30718493461608887,
"learning_rate": 1.2552451752989866e-05,
"loss": 1.0719,
"step": 926
},
{
"epoch": 0.665470208183776,
"grad_norm": 0.3204226791858673,
"learning_rate": 1.2542589651309692e-05,
"loss": 1.1139,
"step": 927
},
{
"epoch": 0.6661880832735104,
"grad_norm": 0.6014212965965271,
"learning_rate": 1.2532740138834667e-05,
"loss": 1.1985,
"step": 928
},
{
"epoch": 0.6669059583632448,
"grad_norm": 0.37291762232780457,
"learning_rate": 1.2522903266023402e-05,
"loss": 1.0647,
"step": 929
},
{
"epoch": 0.6676238334529792,
"grad_norm": 0.2623102068901062,
"learning_rate": 1.2513079083269774e-05,
"loss": 1.0804,
"step": 930
},
{
"epoch": 0.6683417085427136,
"grad_norm": 0.3214200437068939,
"learning_rate": 1.2503267640902634e-05,
"loss": 1.0345,
"step": 931
},
{
"epoch": 0.669059583632448,
"grad_norm": 0.32022109627723694,
"learning_rate": 1.2493468989185566e-05,
"loss": 1.1192,
"step": 932
},
{
"epoch": 0.6697774587221823,
"grad_norm": 0.20529402792453766,
"learning_rate": 1.248368317831664e-05,
"loss": 1.099,
"step": 933
},
{
"epoch": 0.6704953338119167,
"grad_norm": 0.3141288757324219,
"learning_rate": 1.2473910258428128e-05,
"loss": 0.9543,
"step": 934
},
{
"epoch": 0.6712132089016511,
"grad_norm": 0.2604081928730011,
"learning_rate": 1.2464150279586269e-05,
"loss": 1.0645,
"step": 935
},
{
"epoch": 0.6719310839913855,
"grad_norm": 0.4819868505001068,
"learning_rate": 1.2454403291791011e-05,
"loss": 1.1232,
"step": 936
},
{
"epoch": 0.6726489590811199,
"grad_norm": 0.5241478085517883,
"learning_rate": 1.2444669344975736e-05,
"loss": 1.1311,
"step": 937
},
{
"epoch": 0.6733668341708543,
"grad_norm": 0.23811385035514832,
"learning_rate": 1.2434948489007036e-05,
"loss": 1.0201,
"step": 938
},
{
"epoch": 0.6740847092605886,
"grad_norm": 0.5078403353691101,
"learning_rate": 1.2425240773684421e-05,
"loss": 1.1095,
"step": 939
},
{
"epoch": 0.674802584350323,
"grad_norm": 0.23643115162849426,
"learning_rate": 1.241554624874009e-05,
"loss": 1.0722,
"step": 940
},
{
"epoch": 0.6755204594400575,
"grad_norm": 0.2984163761138916,
"learning_rate": 1.240586496383867e-05,
"loss": 1.1243,
"step": 941
},
{
"epoch": 0.6762383345297918,
"grad_norm": 0.3374740481376648,
"learning_rate": 1.2396196968576958e-05,
"loss": 1.1053,
"step": 942
},
{
"epoch": 0.6769562096195262,
"grad_norm": 0.5124977827072144,
"learning_rate": 1.2386542312483665e-05,
"loss": 1.2845,
"step": 943
},
{
"epoch": 0.6776740847092606,
"grad_norm": 0.24151629209518433,
"learning_rate": 1.2376901045019172e-05,
"loss": 1.076,
"step": 944
},
{
"epoch": 0.678391959798995,
"grad_norm": 0.3060801029205322,
"learning_rate": 1.2367273215575268e-05,
"loss": 1.0524,
"step": 945
},
{
"epoch": 0.6791098348887293,
"grad_norm": 0.4811931848526001,
"learning_rate": 1.2357658873474902e-05,
"loss": 1.2068,
"step": 946
},
{
"epoch": 0.6798277099784638,
"grad_norm": 0.20122970640659332,
"learning_rate": 1.2348058067971924e-05,
"loss": 1.0434,
"step": 947
},
{
"epoch": 0.6805455850681982,
"grad_norm": 0.5476710200309753,
"learning_rate": 1.2338470848250838e-05,
"loss": 1.1639,
"step": 948
},
{
"epoch": 0.6812634601579325,
"grad_norm": 0.528336763381958,
"learning_rate": 1.2328897263426549e-05,
"loss": 1.0719,
"step": 949
},
{
"epoch": 0.6819813352476669,
"grad_norm": 0.2582956850528717,
"learning_rate": 1.2319337362544113e-05,
"loss": 1.0254,
"step": 950
},
{
"epoch": 0.6826992103374013,
"grad_norm": 0.48091012239456177,
"learning_rate": 1.2309791194578478e-05,
"loss": 1.0316,
"step": 951
},
{
"epoch": 0.6834170854271356,
"grad_norm": 0.2367895096540451,
"learning_rate": 1.2300258808434247e-05,
"loss": 1.0122,
"step": 952
},
{
"epoch": 0.6841349605168701,
"grad_norm": 0.3652105927467346,
"learning_rate": 1.229074025294541e-05,
"loss": 1.0398,
"step": 953
},
{
"epoch": 0.6848528356066045,
"grad_norm": 0.30559220910072327,
"learning_rate": 1.228123557687511e-05,
"loss": 1.1184,
"step": 954
},
{
"epoch": 0.6855707106963388,
"grad_norm": 0.3181764781475067,
"learning_rate": 1.227174482891539e-05,
"loss": 1.0641,
"step": 955
},
{
"epoch": 0.6862885857860732,
"grad_norm": 0.6308692693710327,
"learning_rate": 1.2262268057686925e-05,
"loss": 1.1922,
"step": 956
},
{
"epoch": 0.6870064608758076,
"grad_norm": 0.2383725345134735,
"learning_rate": 1.2252805311738807e-05,
"loss": 1.1138,
"step": 957
},
{
"epoch": 0.6877243359655419,
"grad_norm": 0.38409408926963806,
"learning_rate": 1.2243356639548258e-05,
"loss": 1.1791,
"step": 958
},
{
"epoch": 0.6884422110552764,
"grad_norm": 0.24311621487140656,
"learning_rate": 1.2233922089520419e-05,
"loss": 1.044,
"step": 959
},
{
"epoch": 0.6891600861450108,
"grad_norm": 0.4635167717933655,
"learning_rate": 1.2224501709988069e-05,
"loss": 1.162,
"step": 960
},
{
"epoch": 0.6898779612347452,
"grad_norm": 0.22711965441703796,
"learning_rate": 1.2215095549211398e-05,
"loss": 1.0656,
"step": 961
},
{
"epoch": 0.6905958363244795,
"grad_norm": 0.5316239595413208,
"learning_rate": 1.2205703655377756e-05,
"loss": 1.1734,
"step": 962
},
{
"epoch": 0.6913137114142139,
"grad_norm": 0.3557806611061096,
"learning_rate": 1.21963260766014e-05,
"loss": 1.1013,
"step": 963
},
{
"epoch": 0.6920315865039484,
"grad_norm": 5.730634689331055,
"learning_rate": 1.2186962860923259e-05,
"loss": 1.0092,
"step": 964
},
{
"epoch": 0.6927494615936827,
"grad_norm": 0.245683953166008,
"learning_rate": 1.217761405631067e-05,
"loss": 1.0494,
"step": 965
},
{
"epoch": 0.6934673366834171,
"grad_norm": 0.24788272380828857,
"learning_rate": 1.2168279710657149e-05,
"loss": 1.0461,
"step": 966
},
{
"epoch": 0.6941852117731515,
"grad_norm": 1.2116427421569824,
"learning_rate": 1.2158959871782142e-05,
"loss": 1.3337,
"step": 967
},
{
"epoch": 0.6949030868628858,
"grad_norm": 0.20844808220863342,
"learning_rate": 1.2149654587430767e-05,
"loss": 1.0395,
"step": 968
},
{
"epoch": 0.6956209619526202,
"grad_norm": 0.47707536816596985,
"learning_rate": 1.2140363905273586e-05,
"loss": 1.2071,
"step": 969
},
{
"epoch": 0.6963388370423547,
"grad_norm": 0.3193322420120239,
"learning_rate": 1.2131087872906364e-05,
"loss": 1.0471,
"step": 970
},
{
"epoch": 0.697056712132089,
"grad_norm": 0.29196181893348694,
"learning_rate": 1.2121826537849803e-05,
"loss": 1.0051,
"step": 971
},
{
"epoch": 0.6977745872218234,
"grad_norm": 0.30033227801322937,
"learning_rate": 1.2112579947549313e-05,
"loss": 1.05,
"step": 972
},
{
"epoch": 0.6984924623115578,
"grad_norm": 3.0059380531311035,
"learning_rate": 1.210334814937477e-05,
"loss": 1.2449,
"step": 973
},
{
"epoch": 0.6992103374012921,
"grad_norm": 0.24989262223243713,
"learning_rate": 1.2094131190620268e-05,
"loss": 1.123,
"step": 974
},
{
"epoch": 0.6999282124910265,
"grad_norm": 0.22237707674503326,
"learning_rate": 1.2084929118503888e-05,
"loss": 1.0222,
"step": 975
},
{
"epoch": 0.700646087580761,
"grad_norm": 0.34449517726898193,
"learning_rate": 1.2075741980167432e-05,
"loss": 1.0971,
"step": 976
},
{
"epoch": 0.7013639626704954,
"grad_norm": 0.2557564675807953,
"learning_rate": 1.2066569822676212e-05,
"loss": 1.0674,
"step": 977
},
{
"epoch": 0.7020818377602297,
"grad_norm": 0.24336853623390198,
"learning_rate": 1.2057412693018788e-05,
"loss": 1.0314,
"step": 978
},
{
"epoch": 0.7027997128499641,
"grad_norm": 0.44501352310180664,
"learning_rate": 1.2048270638106729e-05,
"loss": 1.0492,
"step": 979
},
{
"epoch": 0.7035175879396985,
"grad_norm": 0.39975109696388245,
"learning_rate": 1.2039143704774383e-05,
"loss": 1.1048,
"step": 980
},
{
"epoch": 0.7042354630294329,
"grad_norm": 0.2642357349395752,
"learning_rate": 1.2030031939778627e-05,
"loss": 1.0703,
"step": 981
},
{
"epoch": 0.7049533381191673,
"grad_norm": 0.23442108929157257,
"learning_rate": 1.202093538979863e-05,
"loss": 1.0812,
"step": 982
},
{
"epoch": 0.7056712132089017,
"grad_norm": 0.23268234729766846,
"learning_rate": 1.2011854101435621e-05,
"loss": 1.0745,
"step": 983
},
{
"epoch": 0.706389088298636,
"grad_norm": 0.49588751792907715,
"learning_rate": 1.2002788121212636e-05,
"loss": 1.2131,
"step": 984
},
{
"epoch": 0.7071069633883704,
"grad_norm": 0.5172017812728882,
"learning_rate": 1.19937374955743e-05,
"loss": 1.2266,
"step": 985
},
{
"epoch": 0.7078248384781048,
"grad_norm": 0.25582894682884216,
"learning_rate": 1.1984702270886567e-05,
"loss": 1.0799,
"step": 986
},
{
"epoch": 0.7085427135678392,
"grad_norm": 0.2163057029247284,
"learning_rate": 1.1975682493436494e-05,
"loss": 1.0636,
"step": 987
},
{
"epoch": 0.7092605886575736,
"grad_norm": 0.25939279794692993,
"learning_rate": 1.1966678209432005e-05,
"loss": 1.0804,
"step": 988
},
{
"epoch": 0.709978463747308,
"grad_norm": 0.7320495843887329,
"learning_rate": 1.1957689465001651e-05,
"loss": 0.9866,
"step": 989
},
{
"epoch": 0.7106963388370423,
"grad_norm": 0.24018999934196472,
"learning_rate": 1.1948716306194377e-05,
"loss": 1.0456,
"step": 990
},
{
"epoch": 0.7114142139267767,
"grad_norm": 0.46888262033462524,
"learning_rate": 1.1939758778979278e-05,
"loss": 1.0181,
"step": 991
},
{
"epoch": 0.7121320890165111,
"grad_norm": 0.342295378446579,
"learning_rate": 1.1930816929245372e-05,
"loss": 1.0113,
"step": 992
},
{
"epoch": 0.7128499641062455,
"grad_norm": 3.6695358753204346,
"learning_rate": 1.1921890802801366e-05,
"loss": 1.0917,
"step": 993
},
{
"epoch": 0.7135678391959799,
"grad_norm": 0.2448820322751999,
"learning_rate": 1.1912980445375407e-05,
"loss": 1.0759,
"step": 994
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.260084331035614,
"learning_rate": 1.1904085902614869e-05,
"loss": 1.1095,
"step": 995
},
{
"epoch": 0.7150035893754487,
"grad_norm": 0.3287331759929657,
"learning_rate": 1.18952072200861e-05,
"loss": 1.0174,
"step": 996
},
{
"epoch": 0.715721464465183,
"grad_norm": 0.5043054819107056,
"learning_rate": 1.18863444432742e-05,
"loss": 1.0854,
"step": 997
},
{
"epoch": 0.7164393395549175,
"grad_norm": 0.31419816613197327,
"learning_rate": 1.1877497617582789e-05,
"loss": 1.1022,
"step": 998
},
{
"epoch": 0.7171572146446519,
"grad_norm": 0.6088225841522217,
"learning_rate": 1.1868666788333765e-05,
"loss": 1.1555,
"step": 999
},
{
"epoch": 0.7178750897343862,
"grad_norm": 0.5905667543411255,
"learning_rate": 1.1859852000767077e-05,
"loss": 1.1458,
"step": 1000
},
{
"epoch": 0.7185929648241206,
"grad_norm": 0.20521029829978943,
"learning_rate": 1.1851053300040492e-05,
"loss": 1.0299,
"step": 1001
},
{
"epoch": 0.719310839913855,
"grad_norm": 0.3856906592845917,
"learning_rate": 1.1842270731229365e-05,
"loss": 1.1342,
"step": 1002
},
{
"epoch": 0.7200287150035893,
"grad_norm": 0.2432117462158203,
"learning_rate": 1.1833504339326419e-05,
"loss": 1.0733,
"step": 1003
},
{
"epoch": 0.7207465900933238,
"grad_norm": 0.20327435433864594,
"learning_rate": 1.1824754169241487e-05,
"loss": 1.0116,
"step": 1004
},
{
"epoch": 0.7214644651830582,
"grad_norm": 0.21202722191810608,
"learning_rate": 1.1816020265801305e-05,
"loss": 1.0366,
"step": 1005
},
{
"epoch": 0.7221823402727925,
"grad_norm": 0.3296683132648468,
"learning_rate": 1.180730267374928e-05,
"loss": 1.0156,
"step": 1006
},
{
"epoch": 0.7229002153625269,
"grad_norm": 0.7693004608154297,
"learning_rate": 1.1798601437745247e-05,
"loss": 1.3622,
"step": 1007
},
{
"epoch": 0.7236180904522613,
"grad_norm": 0.24518100917339325,
"learning_rate": 1.1789916602365264e-05,
"loss": 1.0569,
"step": 1008
},
{
"epoch": 0.7243359655419956,
"grad_norm": 0.4782348573207855,
"learning_rate": 1.1781248212101354e-05,
"loss": 1.1106,
"step": 1009
},
{
"epoch": 0.7250538406317301,
"grad_norm": 0.25384142994880676,
"learning_rate": 1.1772596311361299e-05,
"loss": 1.0966,
"step": 1010
},
{
"epoch": 0.7257717157214645,
"grad_norm": 0.23078513145446777,
"learning_rate": 1.1763960944468411e-05,
"loss": 0.9975,
"step": 1011
},
{
"epoch": 0.7264895908111989,
"grad_norm": 0.2533474862575531,
"learning_rate": 1.1755342155661293e-05,
"loss": 1.029,
"step": 1012
},
{
"epoch": 0.7272074659009332,
"grad_norm": 0.3993743658065796,
"learning_rate": 1.1746739989093619e-05,
"loss": 1.1461,
"step": 1013
},
{
"epoch": 0.7279253409906676,
"grad_norm": 0.3081437349319458,
"learning_rate": 1.1738154488833911e-05,
"loss": 1.0549,
"step": 1014
},
{
"epoch": 0.7286432160804021,
"grad_norm": 1.5382530689239502,
"learning_rate": 1.1729585698865308e-05,
"loss": 1.4259,
"step": 1015
},
{
"epoch": 0.7293610911701364,
"grad_norm": 0.31933000683784485,
"learning_rate": 1.1721033663085345e-05,
"loss": 1.1181,
"step": 1016
},
{
"epoch": 0.7300789662598708,
"grad_norm": 0.2206488400697708,
"learning_rate": 1.1712498425305729e-05,
"loss": 1.0774,
"step": 1017
},
{
"epoch": 0.7307968413496052,
"grad_norm": 1.0508358478546143,
"learning_rate": 1.17039800292521e-05,
"loss": 1.2551,
"step": 1018
},
{
"epoch": 0.7315147164393395,
"grad_norm": 0.2564982771873474,
"learning_rate": 1.1695478518563835e-05,
"loss": 1.0844,
"step": 1019
},
{
"epoch": 0.7322325915290739,
"grad_norm": 0.2107759267091751,
"learning_rate": 1.1686993936793792e-05,
"loss": 1.0931,
"step": 1020
},
{
"epoch": 0.7329504666188084,
"grad_norm": 0.26491451263427734,
"learning_rate": 1.1678526327408114e-05,
"loss": 1.022,
"step": 1021
},
{
"epoch": 0.7336683417085427,
"grad_norm": 0.31346040964126587,
"learning_rate": 1.1670075733785993e-05,
"loss": 0.9739,
"step": 1022
},
{
"epoch": 0.7343862167982771,
"grad_norm": 0.3513311743736267,
"learning_rate": 1.1661642199219446e-05,
"loss": 1.0528,
"step": 1023
},
{
"epoch": 0.7351040918880115,
"grad_norm": 0.2472268044948578,
"learning_rate": 1.1653225766913096e-05,
"loss": 1.0371,
"step": 1024
},
{
"epoch": 0.7358219669777458,
"grad_norm": 0.38804471492767334,
"learning_rate": 1.1644826479983964e-05,
"loss": 1.1377,
"step": 1025
},
{
"epoch": 0.7365398420674802,
"grad_norm": 0.26781803369522095,
"learning_rate": 1.1636444381461223e-05,
"loss": 1.0295,
"step": 1026
},
{
"epoch": 0.7372577171572147,
"grad_norm": 0.26370298862457275,
"learning_rate": 1.1628079514285995e-05,
"loss": 1.0728,
"step": 1027
},
{
"epoch": 0.7379755922469491,
"grad_norm": 0.29120156168937683,
"learning_rate": 1.1619731921311124e-05,
"loss": 1.077,
"step": 1028
},
{
"epoch": 0.7386934673366834,
"grad_norm": 0.4208977520465851,
"learning_rate": 1.1611401645300968e-05,
"loss": 1.2434,
"step": 1029
},
{
"epoch": 0.7394113424264178,
"grad_norm": 0.4654211401939392,
"learning_rate": 1.1603088728931162e-05,
"loss": 1.2709,
"step": 1030
},
{
"epoch": 0.7401292175161522,
"grad_norm": 0.41849827766418457,
"learning_rate": 1.1594793214788406e-05,
"loss": 1.207,
"step": 1031
},
{
"epoch": 0.7408470926058865,
"grad_norm": 0.2996397018432617,
"learning_rate": 1.1586515145370264e-05,
"loss": 1.0608,
"step": 1032
},
{
"epoch": 0.741564967695621,
"grad_norm": 0.2890361547470093,
"learning_rate": 1.1578254563084914e-05,
"loss": 1.0606,
"step": 1033
},
{
"epoch": 0.7422828427853554,
"grad_norm": 0.2434931993484497,
"learning_rate": 1.1570011510250958e-05,
"loss": 1.029,
"step": 1034
},
{
"epoch": 0.7430007178750897,
"grad_norm": 0.319659024477005,
"learning_rate": 1.1561786029097193e-05,
"loss": 1.0207,
"step": 1035
},
{
"epoch": 0.7437185929648241,
"grad_norm": 0.2164667695760727,
"learning_rate": 1.1553578161762395e-05,
"loss": 1.052,
"step": 1036
},
{
"epoch": 0.7444364680545585,
"grad_norm": 0.2233264297246933,
"learning_rate": 1.1545387950295112e-05,
"loss": 1.0718,
"step": 1037
},
{
"epoch": 0.7451543431442929,
"grad_norm": 0.5795763731002808,
"learning_rate": 1.1537215436653432e-05,
"loss": 1.133,
"step": 1038
},
{
"epoch": 0.7458722182340273,
"grad_norm": 0.46533235907554626,
"learning_rate": 1.152906066270479e-05,
"loss": 1.0287,
"step": 1039
},
{
"epoch": 0.7465900933237617,
"grad_norm": 0.27343669533729553,
"learning_rate": 1.152092367022573e-05,
"loss": 1.1057,
"step": 1040
},
{
"epoch": 0.747307968413496,
"grad_norm": 0.21523387730121613,
"learning_rate": 1.1512804500901704e-05,
"loss": 1.0251,
"step": 1041
},
{
"epoch": 0.7480258435032304,
"grad_norm": 0.20506061613559723,
"learning_rate": 1.1504703196326864e-05,
"loss": 1.0761,
"step": 1042
},
{
"epoch": 0.7487437185929648,
"grad_norm": 0.2571380138397217,
"learning_rate": 1.1496619798003836e-05,
"loss": 1.1193,
"step": 1043
},
{
"epoch": 0.7494615936826992,
"grad_norm": 0.19685836136341095,
"learning_rate": 1.148855434734351e-05,
"loss": 1.0488,
"step": 1044
},
{
"epoch": 0.7501794687724336,
"grad_norm": 0.4925697147846222,
"learning_rate": 1.1480506885664839e-05,
"loss": 1.0208,
"step": 1045
},
{
"epoch": 0.750897343862168,
"grad_norm": 0.3596290647983551,
"learning_rate": 1.1472477454194616e-05,
"loss": 1.0781,
"step": 1046
},
{
"epoch": 0.7516152189519024,
"grad_norm": 0.2801119387149811,
"learning_rate": 1.1464466094067263e-05,
"loss": 1.1354,
"step": 1047
},
{
"epoch": 0.7523330940416367,
"grad_norm": 0.32316145300865173,
"learning_rate": 1.1456472846324629e-05,
"loss": 1.1104,
"step": 1048
},
{
"epoch": 0.7530509691313712,
"grad_norm": 0.22448933124542236,
"learning_rate": 1.1448497751915766e-05,
"loss": 1.0667,
"step": 1049
},
{
"epoch": 0.7537688442211056,
"grad_norm": 0.253033310174942,
"learning_rate": 1.1440540851696734e-05,
"loss": 1.1096,
"step": 1050
},
{
"epoch": 0.7544867193108399,
"grad_norm": 0.3583433926105499,
"learning_rate": 1.1432602186430381e-05,
"loss": 1.0997,
"step": 1051
},
{
"epoch": 0.7552045944005743,
"grad_norm": 0.39129480719566345,
"learning_rate": 1.1424681796786147e-05,
"loss": 1.115,
"step": 1052
},
{
"epoch": 0.7559224694903087,
"grad_norm": 0.5168975591659546,
"learning_rate": 1.1416779723339833e-05,
"loss": 1.2004,
"step": 1053
},
{
"epoch": 0.756640344580043,
"grad_norm": 0.5105748772621155,
"learning_rate": 1.1408896006573414e-05,
"loss": 1.0845,
"step": 1054
},
{
"epoch": 0.7573582196697775,
"grad_norm": 0.23934470117092133,
"learning_rate": 1.1401030686874828e-05,
"loss": 1.0144,
"step": 1055
},
{
"epoch": 0.7580760947595119,
"grad_norm": 0.3562638461589813,
"learning_rate": 1.139318380453776e-05,
"loss": 1.0327,
"step": 1056
},
{
"epoch": 0.7587939698492462,
"grad_norm": 0.2495163232088089,
"learning_rate": 1.138535539976144e-05,
"loss": 1.028,
"step": 1057
},
{
"epoch": 0.7595118449389806,
"grad_norm": 0.2820242643356323,
"learning_rate": 1.1377545512650447e-05,
"loss": 1.0469,
"step": 1058
},
{
"epoch": 0.760229720028715,
"grad_norm": 0.22608397901058197,
"learning_rate": 1.1369754183214485e-05,
"loss": 1.0533,
"step": 1059
},
{
"epoch": 0.7609475951184493,
"grad_norm": 0.24175365269184113,
"learning_rate": 1.1361981451368196e-05,
"loss": 1.062,
"step": 1060
},
{
"epoch": 0.7616654702081838,
"grad_norm": 0.21232882142066956,
"learning_rate": 1.135422735693094e-05,
"loss": 0.9987,
"step": 1061
},
{
"epoch": 0.7623833452979182,
"grad_norm": 0.22817876935005188,
"learning_rate": 1.1346491939626602e-05,
"loss": 1.0101,
"step": 1062
},
{
"epoch": 0.7631012203876526,
"grad_norm": 0.22691325843334198,
"learning_rate": 1.1338775239083386e-05,
"loss": 1.0819,
"step": 1063
},
{
"epoch": 0.7638190954773869,
"grad_norm": 0.19690930843353271,
"learning_rate": 1.133107729483361e-05,
"loss": 1.0059,
"step": 1064
},
{
"epoch": 0.7645369705671213,
"grad_norm": 0.19578997790813446,
"learning_rate": 1.1323398146313502e-05,
"loss": 1.0486,
"step": 1065
},
{
"epoch": 0.7652548456568558,
"grad_norm": 0.2787719964981079,
"learning_rate": 1.1315737832863003e-05,
"loss": 1.0245,
"step": 1066
},
{
"epoch": 0.7659727207465901,
"grad_norm": 0.2493918538093567,
"learning_rate": 1.1308096393725561e-05,
"loss": 1.022,
"step": 1067
},
{
"epoch": 0.7666905958363245,
"grad_norm": 0.4457206130027771,
"learning_rate": 1.1300473868047937e-05,
"loss": 1.2294,
"step": 1068
},
{
"epoch": 0.7674084709260589,
"grad_norm": 0.21728824079036713,
"learning_rate": 1.129287029487999e-05,
"loss": 1.0013,
"step": 1069
},
{
"epoch": 0.7681263460157932,
"grad_norm": 0.26787298917770386,
"learning_rate": 1.128528571317449e-05,
"loss": 1.0784,
"step": 1070
},
{
"epoch": 0.7688442211055276,
"grad_norm": 0.5104184150695801,
"learning_rate": 1.127772016178692e-05,
"loss": 1.2148,
"step": 1071
},
{
"epoch": 0.7695620961952621,
"grad_norm": 0.2178836315870285,
"learning_rate": 1.1270173679475265e-05,
"loss": 1.0524,
"step": 1072
},
{
"epoch": 0.7702799712849964,
"grad_norm": 0.7400010228157043,
"learning_rate": 1.1262646304899823e-05,
"loss": 1.0788,
"step": 1073
},
{
"epoch": 0.7709978463747308,
"grad_norm": 0.30098220705986023,
"learning_rate": 1.1255138076623001e-05,
"loss": 1.0991,
"step": 1074
},
{
"epoch": 0.7717157214644652,
"grad_norm": 0.24715134501457214,
"learning_rate": 1.1247649033109123e-05,
"loss": 1.0742,
"step": 1075
},
{
"epoch": 0.7724335965541995,
"grad_norm": 0.2401043027639389,
"learning_rate": 1.124017921272423e-05,
"loss": 0.9687,
"step": 1076
},
{
"epoch": 0.7731514716439339,
"grad_norm": 0.22840212285518646,
"learning_rate": 1.123272865373588e-05,
"loss": 1.0217,
"step": 1077
},
{
"epoch": 0.7738693467336684,
"grad_norm": 0.22718718647956848,
"learning_rate": 1.1225297394312966e-05,
"loss": 1.1272,
"step": 1078
},
{
"epoch": 0.7745872218234028,
"grad_norm": 0.3085003197193146,
"learning_rate": 1.12178854725255e-05,
"loss": 1.1134,
"step": 1079
},
{
"epoch": 0.7753050969131371,
"grad_norm": 0.21174356341362,
"learning_rate": 1.1210492926344427e-05,
"loss": 0.9903,
"step": 1080
},
{
"epoch": 0.7760229720028715,
"grad_norm": 0.20416200160980225,
"learning_rate": 1.1203119793641443e-05,
"loss": 0.9685,
"step": 1081
},
{
"epoch": 0.7767408470926059,
"grad_norm": 0.9170592427253723,
"learning_rate": 1.119576611218878e-05,
"loss": 1.0274,
"step": 1082
},
{
"epoch": 0.7774587221823402,
"grad_norm": 0.44593900442123413,
"learning_rate": 1.1188431919659022e-05,
"loss": 1.1094,
"step": 1083
},
{
"epoch": 0.7781765972720747,
"grad_norm": 0.2174079269170761,
"learning_rate": 1.1181117253624917e-05,
"loss": 1.0575,
"step": 1084
},
{
"epoch": 0.7788944723618091,
"grad_norm": 2.0982675552368164,
"learning_rate": 1.1173822151559176e-05,
"loss": 1.3757,
"step": 1085
},
{
"epoch": 0.7796123474515434,
"grad_norm": 0.3535577356815338,
"learning_rate": 1.1166546650834289e-05,
"loss": 1.1288,
"step": 1086
},
{
"epoch": 0.7803302225412778,
"grad_norm": 0.23848174512386322,
"learning_rate": 1.1159290788722323e-05,
"loss": 1.0412,
"step": 1087
},
{
"epoch": 0.7810480976310122,
"grad_norm": 0.6610331535339355,
"learning_rate": 1.1152054602394742e-05,
"loss": 1.1939,
"step": 1088
},
{
"epoch": 0.7817659727207465,
"grad_norm": 0.25315794348716736,
"learning_rate": 1.1144838128922214e-05,
"loss": 1.0254,
"step": 1089
},
{
"epoch": 0.782483847810481,
"grad_norm": 0.1975483000278473,
"learning_rate": 1.1137641405274407e-05,
"loss": 0.9531,
"step": 1090
},
{
"epoch": 0.7832017229002154,
"grad_norm": 0.43651190400123596,
"learning_rate": 1.1130464468319827e-05,
"loss": 1.1897,
"step": 1091
},
{
"epoch": 0.7839195979899497,
"grad_norm": 0.2116590291261673,
"learning_rate": 1.1123307354825603e-05,
"loss": 1.0327,
"step": 1092
},
{
"epoch": 0.7846374730796841,
"grad_norm": 0.33360204100608826,
"learning_rate": 1.1116170101457313e-05,
"loss": 1.1048,
"step": 1093
},
{
"epoch": 0.7853553481694185,
"grad_norm": 0.24831229448318481,
"learning_rate": 1.1109052744778795e-05,
"loss": 1.0805,
"step": 1094
},
{
"epoch": 0.7860732232591529,
"grad_norm": 0.3498152494430542,
"learning_rate": 1.1101955321251946e-05,
"loss": 1.101,
"step": 1095
},
{
"epoch": 0.7867910983488873,
"grad_norm": 0.2755047678947449,
"learning_rate": 1.1094877867236567e-05,
"loss": 1.0784,
"step": 1096
},
{
"epoch": 0.7875089734386217,
"grad_norm": 0.30536141991615295,
"learning_rate": 1.1087820418990133e-05,
"loss": 1.1185,
"step": 1097
},
{
"epoch": 0.7882268485283561,
"grad_norm": 0.2670566737651825,
"learning_rate": 1.1080783012667645e-05,
"loss": 0.9521,
"step": 1098
},
{
"epoch": 0.7889447236180904,
"grad_norm": 0.4963916540145874,
"learning_rate": 1.1073765684321426e-05,
"loss": 1.0873,
"step": 1099
},
{
"epoch": 0.7896625987078248,
"grad_norm": 0.6098265051841736,
"learning_rate": 1.1066768469900944e-05,
"loss": 1.1082,
"step": 1100
},
{
"epoch": 0.7903804737975593,
"grad_norm": 0.2066364884376526,
"learning_rate": 1.1059791405252616e-05,
"loss": 1.0428,
"step": 1101
},
{
"epoch": 0.7910983488872936,
"grad_norm": 0.4424298405647278,
"learning_rate": 1.1052834526119638e-05,
"loss": 1.1162,
"step": 1102
},
{
"epoch": 0.791816223977028,
"grad_norm": 0.5996174812316895,
"learning_rate": 1.1045897868141797e-05,
"loss": 1.148,
"step": 1103
},
{
"epoch": 0.7925340990667624,
"grad_norm": 0.21078869700431824,
"learning_rate": 1.1038981466855287e-05,
"loss": 0.984,
"step": 1104
},
{
"epoch": 0.7932519741564967,
"grad_norm": 0.2863714098930359,
"learning_rate": 1.1032085357692526e-05,
"loss": 1.0208,
"step": 1105
},
{
"epoch": 0.7939698492462312,
"grad_norm": 0.1967342495918274,
"learning_rate": 1.102520957598198e-05,
"loss": 1.0659,
"step": 1106
},
{
"epoch": 0.7946877243359656,
"grad_norm": 0.261958509683609,
"learning_rate": 1.1018354156947975e-05,
"loss": 1.0927,
"step": 1107
},
{
"epoch": 0.7954055994256999,
"grad_norm": 0.46918782591819763,
"learning_rate": 1.101151913571052e-05,
"loss": 1.1254,
"step": 1108
},
{
"epoch": 0.7961234745154343,
"grad_norm": 0.23906533420085907,
"learning_rate": 1.1004704547285132e-05,
"loss": 1.0926,
"step": 1109
},
{
"epoch": 0.7968413496051687,
"grad_norm": 0.4308425188064575,
"learning_rate": 1.0997910426582646e-05,
"loss": 1.4434,
"step": 1110
},
{
"epoch": 0.797559224694903,
"grad_norm": 0.22563466429710388,
"learning_rate": 1.099113680840904e-05,
"loss": 0.9977,
"step": 1111
},
{
"epoch": 0.7982770997846375,
"grad_norm": 0.199822336435318,
"learning_rate": 1.098438372746527e-05,
"loss": 1.0224,
"step": 1112
},
{
"epoch": 0.7989949748743719,
"grad_norm": 0.5321314334869385,
"learning_rate": 1.0977651218347063e-05,
"loss": 1.0101,
"step": 1113
},
{
"epoch": 0.7997128499641063,
"grad_norm": 0.3771994709968567,
"learning_rate": 1.0970939315544772e-05,
"loss": 1.0763,
"step": 1114
},
{
"epoch": 0.8004307250538406,
"grad_norm": 0.254106342792511,
"learning_rate": 1.0964248053443185e-05,
"loss": 1.052,
"step": 1115
},
{
"epoch": 0.801148600143575,
"grad_norm": 0.33369532227516174,
"learning_rate": 1.0957577466321335e-05,
"loss": 1.1463,
"step": 1116
},
{
"epoch": 0.8018664752333095,
"grad_norm": 1.6308503150939941,
"learning_rate": 1.0950927588352349e-05,
"loss": 1.3882,
"step": 1117
},
{
"epoch": 0.8025843503230438,
"grad_norm": 0.2860555946826935,
"learning_rate": 1.0944298453603261e-05,
"loss": 1.0642,
"step": 1118
},
{
"epoch": 0.8033022254127782,
"grad_norm": 0.24643976986408234,
"learning_rate": 1.0937690096034837e-05,
"loss": 1.0377,
"step": 1119
},
{
"epoch": 0.8040201005025126,
"grad_norm": 0.5750579237937927,
"learning_rate": 1.09311025495014e-05,
"loss": 1.1976,
"step": 1120
},
{
"epoch": 0.8047379755922469,
"grad_norm": 0.23937703669071198,
"learning_rate": 1.0924535847750661e-05,
"loss": 1.1295,
"step": 1121
},
{
"epoch": 0.8054558506819813,
"grad_norm": 0.6504701375961304,
"learning_rate": 1.091799002442355e-05,
"loss": 1.1366,
"step": 1122
},
{
"epoch": 0.8061737257717158,
"grad_norm": 0.3447827100753784,
"learning_rate": 1.0911465113054024e-05,
"loss": 1.1147,
"step": 1123
},
{
"epoch": 0.8068916008614501,
"grad_norm": 0.29246342182159424,
"learning_rate": 1.0904961147068924e-05,
"loss": 1.0298,
"step": 1124
},
{
"epoch": 0.8076094759511845,
"grad_norm": 0.2509633004665375,
"learning_rate": 1.0898478159787777e-05,
"loss": 1.0329,
"step": 1125
},
{
"epoch": 0.8083273510409189,
"grad_norm": 0.39475613832473755,
"learning_rate": 1.0892016184422643e-05,
"loss": 1.1515,
"step": 1126
},
{
"epoch": 0.8090452261306532,
"grad_norm": 0.4760308563709259,
"learning_rate": 1.0885575254077939e-05,
"loss": 1.0339,
"step": 1127
},
{
"epoch": 0.8097631012203876,
"grad_norm": 0.6620875597000122,
"learning_rate": 1.0879155401750264e-05,
"loss": 1.2161,
"step": 1128
},
{
"epoch": 0.8104809763101221,
"grad_norm": 0.42044660449028015,
"learning_rate": 1.0872756660328242e-05,
"loss": 1.0563,
"step": 1129
},
{
"epoch": 0.8111988513998565,
"grad_norm": 0.23910948634147644,
"learning_rate": 1.0866379062592346e-05,
"loss": 1.0107,
"step": 1130
},
{
"epoch": 0.8119167264895908,
"grad_norm": 1.1729207038879395,
"learning_rate": 1.0860022641214725e-05,
"loss": 1.4796,
"step": 1131
},
{
"epoch": 0.8126346015793252,
"grad_norm": 0.2483389973640442,
"learning_rate": 1.0853687428759047e-05,
"loss": 1.0933,
"step": 1132
},
{
"epoch": 0.8133524766690596,
"grad_norm": 0.20855219662189484,
"learning_rate": 1.084737345768033e-05,
"loss": 1.0492,
"step": 1133
},
{
"epoch": 0.8140703517587939,
"grad_norm": 0.2710930109024048,
"learning_rate": 1.0841080760324767e-05,
"loss": 0.9982,
"step": 1134
},
{
"epoch": 0.8147882268485284,
"grad_norm": 0.23391245305538177,
"learning_rate": 1.0834809368929569e-05,
"loss": 1.0699,
"step": 1135
},
{
"epoch": 0.8155061019382628,
"grad_norm": 0.5731115341186523,
"learning_rate": 1.0828559315622802e-05,
"loss": 1.1237,
"step": 1136
},
{
"epoch": 0.8162239770279971,
"grad_norm": 0.27705612778663635,
"learning_rate": 1.0822330632423208e-05,
"loss": 1.0773,
"step": 1137
},
{
"epoch": 0.8169418521177315,
"grad_norm": 0.2135666012763977,
"learning_rate": 1.0816123351240065e-05,
"loss": 0.9935,
"step": 1138
},
{
"epoch": 0.8176597272074659,
"grad_norm": 0.6682529449462891,
"learning_rate": 1.0809937503872996e-05,
"loss": 1.117,
"step": 1139
},
{
"epoch": 0.8183776022972002,
"grad_norm": 0.22851616144180298,
"learning_rate": 1.0803773122011827e-05,
"loss": 1.0399,
"step": 1140
},
{
"epoch": 0.8190954773869347,
"grad_norm": 0.3621741235256195,
"learning_rate": 1.0797630237236414e-05,
"loss": 1.0938,
"step": 1141
},
{
"epoch": 0.8198133524766691,
"grad_norm": 0.20737534761428833,
"learning_rate": 1.0791508881016485e-05,
"loss": 0.9742,
"step": 1142
},
{
"epoch": 0.8205312275664034,
"grad_norm": 1.6472431421279907,
"learning_rate": 1.0785409084711485e-05,
"loss": 1.4099,
"step": 1143
},
{
"epoch": 0.8212491026561378,
"grad_norm": 0.47031158208847046,
"learning_rate": 1.0779330879570394e-05,
"loss": 1.0591,
"step": 1144
},
{
"epoch": 0.8219669777458722,
"grad_norm": 0.22210556268692017,
"learning_rate": 1.0773274296731592e-05,
"loss": 1.099,
"step": 1145
},
{
"epoch": 0.8226848528356066,
"grad_norm": 27.147884368896484,
"learning_rate": 1.076723936722269e-05,
"loss": 1.1498,
"step": 1146
},
{
"epoch": 0.823402727925341,
"grad_norm": 0.4893867075443268,
"learning_rate": 1.0761226121960365e-05,
"loss": 1.0403,
"step": 1147
},
{
"epoch": 0.8241206030150754,
"grad_norm": 0.2474132627248764,
"learning_rate": 1.0755234591750203e-05,
"loss": 1.083,
"step": 1148
},
{
"epoch": 0.8248384781048098,
"grad_norm": 0.22155289351940155,
"learning_rate": 1.0749264807286557e-05,
"loss": 1.0682,
"step": 1149
},
{
"epoch": 0.8255563531945441,
"grad_norm": 0.3100580871105194,
"learning_rate": 1.0743316799152363e-05,
"loss": 1.1295,
"step": 1150
},
{
"epoch": 0.8262742282842785,
"grad_norm": 0.23339876532554626,
"learning_rate": 1.0737390597819013e-05,
"loss": 1.078,
"step": 1151
},
{
"epoch": 0.826992103374013,
"grad_norm": 0.21114078164100647,
"learning_rate": 1.0731486233646172e-05,
"loss": 1.0884,
"step": 1152
},
{
"epoch": 0.8277099784637473,
"grad_norm": 0.21384155750274658,
"learning_rate": 1.0725603736881636e-05,
"loss": 1.0205,
"step": 1153
},
{
"epoch": 0.8284278535534817,
"grad_norm": 0.28435850143432617,
"learning_rate": 1.071974313766118e-05,
"loss": 1.0634,
"step": 1154
},
{
"epoch": 0.8291457286432161,
"grad_norm": 0.2539917528629303,
"learning_rate": 1.0713904466008397e-05,
"loss": 1.0601,
"step": 1155
},
{
"epoch": 0.8298636037329504,
"grad_norm": 0.2543018162250519,
"learning_rate": 1.070808775183454e-05,
"loss": 1.083,
"step": 1156
},
{
"epoch": 0.8305814788226848,
"grad_norm": 0.5512329936027527,
"learning_rate": 1.0702293024938383e-05,
"loss": 1.1601,
"step": 1157
},
{
"epoch": 0.8312993539124193,
"grad_norm": 0.20794658362865448,
"learning_rate": 1.069652031500606e-05,
"loss": 1.0004,
"step": 1158
},
{
"epoch": 0.8320172290021536,
"grad_norm": 0.3217058479785919,
"learning_rate": 1.0690769651610906e-05,
"loss": 1.0814,
"step": 1159
},
{
"epoch": 0.832735104091888,
"grad_norm": 0.5152641534805298,
"learning_rate": 1.0685041064213318e-05,
"loss": 1.1193,
"step": 1160
},
{
"epoch": 0.8334529791816224,
"grad_norm": 0.31184303760528564,
"learning_rate": 1.0679334582160599e-05,
"loss": 1.0972,
"step": 1161
},
{
"epoch": 0.8341708542713567,
"grad_norm": 0.2713521122932434,
"learning_rate": 1.0673650234686803e-05,
"loss": 1.0193,
"step": 1162
},
{
"epoch": 0.8348887293610912,
"grad_norm": 0.23324035108089447,
"learning_rate": 1.0667988050912591e-05,
"loss": 1.0967,
"step": 1163
},
{
"epoch": 0.8356066044508256,
"grad_norm": 0.2763028144836426,
"learning_rate": 1.0662348059845088e-05,
"loss": 1.0406,
"step": 1164
},
{
"epoch": 0.83632447954056,
"grad_norm": 0.2278720736503601,
"learning_rate": 1.0656730290377712e-05,
"loss": 1.1152,
"step": 1165
},
{
"epoch": 0.8370423546302943,
"grad_norm": 0.3886187672615051,
"learning_rate": 1.0651134771290055e-05,
"loss": 1.0825,
"step": 1166
},
{
"epoch": 0.8377602297200287,
"grad_norm": 0.589653730392456,
"learning_rate": 1.0645561531247713e-05,
"loss": 1.1862,
"step": 1167
},
{
"epoch": 0.8384781048097631,
"grad_norm": 0.2417033612728119,
"learning_rate": 1.0640010598802148e-05,
"loss": 1.0709,
"step": 1168
},
{
"epoch": 0.8391959798994975,
"grad_norm": 0.641553521156311,
"learning_rate": 1.0634482002390544e-05,
"loss": 1.0698,
"step": 1169
},
{
"epoch": 0.8399138549892319,
"grad_norm": 0.3284968435764313,
"learning_rate": 1.0628975770335662e-05,
"loss": 1.0752,
"step": 1170
},
{
"epoch": 0.8406317300789663,
"grad_norm": 0.2477286159992218,
"learning_rate": 1.0623491930845678e-05,
"loss": 1.0343,
"step": 1171
},
{
"epoch": 0.8413496051687006,
"grad_norm": 0.1869351863861084,
"learning_rate": 1.0618030512014065e-05,
"loss": 1.0395,
"step": 1172
},
{
"epoch": 0.842067480258435,
"grad_norm": 0.24983994662761688,
"learning_rate": 1.0612591541819432e-05,
"loss": 1.0368,
"step": 1173
},
{
"epoch": 0.8427853553481695,
"grad_norm": 0.23476620018482208,
"learning_rate": 1.0607175048125384e-05,
"loss": 0.9778,
"step": 1174
},
{
"epoch": 0.8435032304379038,
"grad_norm": 0.26927655935287476,
"learning_rate": 1.060178105868038e-05,
"loss": 0.9862,
"step": 1175
},
{
"epoch": 0.8442211055276382,
"grad_norm": 0.20364007353782654,
"learning_rate": 1.059640960111759e-05,
"loss": 1.1004,
"step": 1176
},
{
"epoch": 0.8449389806173726,
"grad_norm": 0.511143684387207,
"learning_rate": 1.0591060702954757e-05,
"loss": 1.0001,
"step": 1177
},
{
"epoch": 0.8456568557071069,
"grad_norm": 0.4864940941333771,
"learning_rate": 1.0585734391594045e-05,
"loss": 1.0297,
"step": 1178
},
{
"epoch": 0.8463747307968413,
"grad_norm": 0.21658964455127716,
"learning_rate": 1.0580430694321918e-05,
"loss": 0.9969,
"step": 1179
},
{
"epoch": 0.8470926058865758,
"grad_norm": 0.5197054743766785,
"learning_rate": 1.0575149638308983e-05,
"loss": 1.0246,
"step": 1180
},
{
"epoch": 0.8478104809763102,
"grad_norm": 0.44801488518714905,
"learning_rate": 1.0569891250609858e-05,
"loss": 1.1387,
"step": 1181
},
{
"epoch": 0.8485283560660445,
"grad_norm": 1.1570936441421509,
"learning_rate": 1.0564655558163032e-05,
"loss": 1.3054,
"step": 1182
},
{
"epoch": 0.8492462311557789,
"grad_norm": 0.33803439140319824,
"learning_rate": 1.0559442587790727e-05,
"loss": 1.0835,
"step": 1183
},
{
"epoch": 0.8499641062455133,
"grad_norm": 0.2663414776325226,
"learning_rate": 1.0554252366198759e-05,
"loss": 1.0631,
"step": 1184
},
{
"epoch": 0.8506819813352476,
"grad_norm": 0.8476252555847168,
"learning_rate": 1.0549084919976409e-05,
"loss": 1.3256,
"step": 1185
},
{
"epoch": 0.8513998564249821,
"grad_norm": 0.30568942427635193,
"learning_rate": 1.0543940275596274e-05,
"loss": 1.0559,
"step": 1186
},
{
"epoch": 0.8521177315147165,
"grad_norm": 2.704880714416504,
"learning_rate": 1.0538818459414139e-05,
"loss": 1.0905,
"step": 1187
},
{
"epoch": 0.8528356066044508,
"grad_norm": 0.22311435639858246,
"learning_rate": 1.0533719497668846e-05,
"loss": 1.06,
"step": 1188
},
{
"epoch": 0.8535534816941852,
"grad_norm": 0.2369927316904068,
"learning_rate": 1.0528643416482152e-05,
"loss": 1.0359,
"step": 1189
},
{
"epoch": 0.8542713567839196,
"grad_norm": 0.29164135456085205,
"learning_rate": 1.0523590241858597e-05,
"loss": 1.0999,
"step": 1190
},
{
"epoch": 0.8549892318736539,
"grad_norm": 0.23516468703746796,
"learning_rate": 1.0518559999685371e-05,
"loss": 1.0599,
"step": 1191
},
{
"epoch": 0.8557071069633884,
"grad_norm": 0.332119345664978,
"learning_rate": 1.0513552715732186e-05,
"loss": 1.1189,
"step": 1192
},
{
"epoch": 0.8564249820531228,
"grad_norm": 0.23043331503868103,
"learning_rate": 1.0508568415651135e-05,
"loss": 0.9978,
"step": 1193
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.2612220048904419,
"learning_rate": 1.0503607124976569e-05,
"loss": 1.0405,
"step": 1194
},
{
"epoch": 0.8578607322325915,
"grad_norm": 0.27752968668937683,
"learning_rate": 1.0498668869124962e-05,
"loss": 1.0965,
"step": 1195
},
{
"epoch": 0.8585786073223259,
"grad_norm": 0.22212113440036774,
"learning_rate": 1.0493753673394782e-05,
"loss": 1.0576,
"step": 1196
},
{
"epoch": 0.8592964824120602,
"grad_norm": 0.24385105073451996,
"learning_rate": 1.0488861562966361e-05,
"loss": 1.0124,
"step": 1197
},
{
"epoch": 0.8600143575017947,
"grad_norm": 0.419596403837204,
"learning_rate": 1.0483992562901765e-05,
"loss": 1.0921,
"step": 1198
},
{
"epoch": 0.8607322325915291,
"grad_norm": 0.39705750346183777,
"learning_rate": 1.0479146698144667e-05,
"loss": 1.0565,
"step": 1199
},
{
"epoch": 0.8614501076812635,
"grad_norm": 0.2379753440618515,
"learning_rate": 1.0474323993520216e-05,
"loss": 1.0736,
"step": 1200
},
{
"epoch": 0.8621679827709978,
"grad_norm": 16.06524658203125,
"learning_rate": 1.0469524473734922e-05,
"loss": 1.08,
"step": 1201
},
{
"epoch": 0.8628858578607322,
"grad_norm": 0.5211506485939026,
"learning_rate": 1.046474816337651e-05,
"loss": 1.1859,
"step": 1202
},
{
"epoch": 0.8636037329504667,
"grad_norm": 0.3241841197013855,
"learning_rate": 1.0459995086913808e-05,
"loss": 1.0459,
"step": 1203
},
{
"epoch": 0.864321608040201,
"grad_norm": 0.4102221429347992,
"learning_rate": 1.0455265268696617e-05,
"loss": 1.1504,
"step": 1204
},
{
"epoch": 0.8650394831299354,
"grad_norm": 0.48446452617645264,
"learning_rate": 1.0450558732955591e-05,
"loss": 1.2409,
"step": 1205
},
{
"epoch": 0.8657573582196698,
"grad_norm": 0.32923561334609985,
"learning_rate": 1.0445875503802103e-05,
"loss": 1.086,
"step": 1206
},
{
"epoch": 0.8664752333094041,
"grad_norm": 0.23721420764923096,
"learning_rate": 1.0441215605228133e-05,
"loss": 1.0702,
"step": 1207
},
{
"epoch": 0.8671931083991385,
"grad_norm": 0.2165788859128952,
"learning_rate": 1.043657906110613e-05,
"loss": 1.0695,
"step": 1208
},
{
"epoch": 0.867910983488873,
"grad_norm": 0.4671003520488739,
"learning_rate": 1.0431965895188912e-05,
"loss": 1.0955,
"step": 1209
},
{
"epoch": 0.8686288585786073,
"grad_norm": 0.28291141986846924,
"learning_rate": 1.0427376131109522e-05,
"loss": 1.078,
"step": 1210
},
{
"epoch": 0.8693467336683417,
"grad_norm": 1.1975141763687134,
"learning_rate": 1.0422809792381119e-05,
"loss": 1.0479,
"step": 1211
},
{
"epoch": 0.8700646087580761,
"grad_norm": 0.21140481531620026,
"learning_rate": 1.0418266902396856e-05,
"loss": 1.0283,
"step": 1212
},
{
"epoch": 0.8707824838478104,
"grad_norm": 0.22778762876987457,
"learning_rate": 1.0413747484429762e-05,
"loss": 1.0478,
"step": 1213
},
{
"epoch": 0.8715003589375449,
"grad_norm": 0.21997831761837006,
"learning_rate": 1.0409251561632614e-05,
"loss": 1.0211,
"step": 1214
},
{
"epoch": 0.8722182340272793,
"grad_norm": 0.2546640932559967,
"learning_rate": 1.0404779157037823e-05,
"loss": 0.9999,
"step": 1215
},
{
"epoch": 0.8729361091170137,
"grad_norm": 0.44324609637260437,
"learning_rate": 1.040033029355733e-05,
"loss": 1.2576,
"step": 1216
},
{
"epoch": 0.873653984206748,
"grad_norm": 0.39237719774246216,
"learning_rate": 1.0395904993982458e-05,
"loss": 1.0859,
"step": 1217
},
{
"epoch": 0.8743718592964824,
"grad_norm": 0.23994913697242737,
"learning_rate": 1.0391503280983826e-05,
"loss": 1.1024,
"step": 1218
},
{
"epoch": 0.8750897343862168,
"grad_norm": 0.19337226450443268,
"learning_rate": 1.0387125177111217e-05,
"loss": 1.1061,
"step": 1219
},
{
"epoch": 0.8758076094759512,
"grad_norm": 0.24154868721961975,
"learning_rate": 1.0382770704793464e-05,
"loss": 1.0196,
"step": 1220
},
{
"epoch": 0.8765254845656856,
"grad_norm": 0.36926552653312683,
"learning_rate": 1.0378439886338336e-05,
"loss": 1.1087,
"step": 1221
},
{
"epoch": 0.87724335965542,
"grad_norm": 0.3358927369117737,
"learning_rate": 1.0374132743932424e-05,
"loss": 1.0951,
"step": 1222
},
{
"epoch": 0.8779612347451543,
"grad_norm": 0.22058887779712677,
"learning_rate": 1.0369849299641033e-05,
"loss": 0.9991,
"step": 1223
},
{
"epoch": 0.8786791098348887,
"grad_norm": 0.6595388054847717,
"learning_rate": 1.036558957540806e-05,
"loss": 1.2939,
"step": 1224
},
{
"epoch": 0.8793969849246231,
"grad_norm": 0.2563266158103943,
"learning_rate": 1.0361353593055878e-05,
"loss": 0.9876,
"step": 1225
},
{
"epoch": 0.8801148600143575,
"grad_norm": 0.24463306367397308,
"learning_rate": 1.0357141374285248e-05,
"loss": 1.1054,
"step": 1226
},
{
"epoch": 0.8808327351040919,
"grad_norm": 0.4125424027442932,
"learning_rate": 1.0352952940675177e-05,
"loss": 1.1515,
"step": 1227
},
{
"epoch": 0.8815506101938263,
"grad_norm": 0.2915627360343933,
"learning_rate": 1.0348788313682823e-05,
"loss": 1.069,
"step": 1228
},
{
"epoch": 0.8822684852835606,
"grad_norm": 0.3884046673774719,
"learning_rate": 1.0344647514643391e-05,
"loss": 1.0746,
"step": 1229
},
{
"epoch": 0.882986360373295,
"grad_norm": 0.21901565790176392,
"learning_rate": 1.0340530564770011e-05,
"loss": 1.0689,
"step": 1230
},
{
"epoch": 0.8837042354630295,
"grad_norm": 0.24854253232479095,
"learning_rate": 1.0336437485153634e-05,
"loss": 1.0305,
"step": 1231
},
{
"epoch": 0.8844221105527639,
"grad_norm": 0.2502192556858063,
"learning_rate": 1.0332368296762933e-05,
"loss": 1.0243,
"step": 1232
},
{
"epoch": 0.8851399856424982,
"grad_norm": 0.7768297791481018,
"learning_rate": 1.0328323020444176e-05,
"loss": 1.5945,
"step": 1233
},
{
"epoch": 0.8858578607322326,
"grad_norm": 0.4446689188480377,
"learning_rate": 1.0324301676921138e-05,
"loss": 1.1623,
"step": 1234
},
{
"epoch": 0.886575735821967,
"grad_norm": 0.2070520520210266,
"learning_rate": 1.0320304286794983e-05,
"loss": 1.0624,
"step": 1235
},
{
"epoch": 0.8872936109117013,
"grad_norm": 0.5530170202255249,
"learning_rate": 1.0316330870544165e-05,
"loss": 1.1009,
"step": 1236
},
{
"epoch": 0.8880114860014358,
"grad_norm": 0.22923225164413452,
"learning_rate": 1.0312381448524325e-05,
"loss": 1.0953,
"step": 1237
},
{
"epoch": 0.8887293610911702,
"grad_norm": 0.5102125406265259,
"learning_rate": 1.030845604096817e-05,
"loss": 1.178,
"step": 1238
},
{
"epoch": 0.8894472361809045,
"grad_norm": 0.25947073101997375,
"learning_rate": 1.03045546679854e-05,
"loss": 1.0017,
"step": 1239
},
{
"epoch": 0.8901651112706389,
"grad_norm": 0.2402765452861786,
"learning_rate": 1.0300677349562569e-05,
"loss": 1.0853,
"step": 1240
},
{
"epoch": 0.8908829863603733,
"grad_norm": 0.2136669158935547,
"learning_rate": 1.0296824105563014e-05,
"loss": 1.0527,
"step": 1241
},
{
"epoch": 0.8916008614501076,
"grad_norm": 0.7674557566642761,
"learning_rate": 1.0292994955726734e-05,
"loss": 1.1384,
"step": 1242
},
{
"epoch": 0.8923187365398421,
"grad_norm": 0.20754656195640564,
"learning_rate": 1.0289189919670293e-05,
"loss": 0.9825,
"step": 1243
},
{
"epoch": 0.8930366116295765,
"grad_norm": 0.4037124216556549,
"learning_rate": 1.0285409016886723e-05,
"loss": 1.0195,
"step": 1244
},
{
"epoch": 0.8937544867193108,
"grad_norm": 0.23209604620933533,
"learning_rate": 1.0281652266745426e-05,
"loss": 1.0857,
"step": 1245
},
{
"epoch": 0.8944723618090452,
"grad_norm": 0.4080345928668976,
"learning_rate": 1.0277919688492063e-05,
"loss": 1.1215,
"step": 1246
},
{
"epoch": 0.8951902368987796,
"grad_norm": 0.6738815307617188,
"learning_rate": 1.0274211301248476e-05,
"loss": 1.0,
"step": 1247
},
{
"epoch": 0.8959081119885139,
"grad_norm": 0.4783581495285034,
"learning_rate": 1.0270527124012562e-05,
"loss": 1.2057,
"step": 1248
},
{
"epoch": 0.8966259870782484,
"grad_norm": 0.7883797287940979,
"learning_rate": 1.0266867175658202e-05,
"loss": 1.2773,
"step": 1249
},
{
"epoch": 0.8973438621679828,
"grad_norm": 0.2835233509540558,
"learning_rate": 1.026323147493515e-05,
"loss": 1.0791,
"step": 1250
},
{
"epoch": 0.8980617372577172,
"grad_norm": 0.29707083106040955,
"learning_rate": 1.0259620040468941e-05,
"loss": 1.0785,
"step": 1251
},
{
"epoch": 0.8987796123474515,
"grad_norm": 0.24541136622428894,
"learning_rate": 1.0256032890760795e-05,
"loss": 1.023,
"step": 1252
},
{
"epoch": 0.8994974874371859,
"grad_norm": 0.2777314782142639,
"learning_rate": 1.0252470044187522e-05,
"loss": 1.0646,
"step": 1253
},
{
"epoch": 0.9002153625269204,
"grad_norm": 0.21893638372421265,
"learning_rate": 1.024893151900143e-05,
"loss": 1.0038,
"step": 1254
},
{
"epoch": 0.9009332376166547,
"grad_norm": 0.2998167872428894,
"learning_rate": 1.0245417333330225e-05,
"loss": 1.0233,
"step": 1255
},
{
"epoch": 0.9016511127063891,
"grad_norm": 0.2272305190563202,
"learning_rate": 1.024192750517693e-05,
"loss": 1.0,
"step": 1256
},
{
"epoch": 0.9023689877961235,
"grad_norm": 1.0981707572937012,
"learning_rate": 1.0238462052419775e-05,
"loss": 1.0438,
"step": 1257
},
{
"epoch": 0.9030868628858578,
"grad_norm": 0.25254082679748535,
"learning_rate": 1.0235020992812128e-05,
"loss": 1.078,
"step": 1258
},
{
"epoch": 0.9038047379755922,
"grad_norm": 0.4673773944377899,
"learning_rate": 1.0231604343982382e-05,
"loss": 1.1441,
"step": 1259
},
{
"epoch": 0.9045226130653267,
"grad_norm": 0.26296040415763855,
"learning_rate": 1.0228212123433882e-05,
"loss": 0.9469,
"step": 1260
},
{
"epoch": 0.905240488155061,
"grad_norm": 0.3453957140445709,
"learning_rate": 1.022484434854482e-05,
"loss": 1.061,
"step": 1261
},
{
"epoch": 0.9059583632447954,
"grad_norm": 0.21403883397579193,
"learning_rate": 1.0221501036568164e-05,
"loss": 1.0177,
"step": 1262
},
{
"epoch": 0.9066762383345298,
"grad_norm": 0.2306700348854065,
"learning_rate": 1.0218182204631551e-05,
"loss": 0.9859,
"step": 1263
},
{
"epoch": 0.9073941134242641,
"grad_norm": 0.25937196612358093,
"learning_rate": 1.0214887869737212e-05,
"loss": 1.0946,
"step": 1264
},
{
"epoch": 0.9081119885139985,
"grad_norm": 0.3202129602432251,
"learning_rate": 1.0211618048761879e-05,
"loss": 1.1946,
"step": 1265
},
{
"epoch": 0.908829863603733,
"grad_norm": 0.24940723180770874,
"learning_rate": 1.0208372758456702e-05,
"loss": 1.025,
"step": 1266
},
{
"epoch": 0.9095477386934674,
"grad_norm": 0.22642754018306732,
"learning_rate": 1.0205152015447162e-05,
"loss": 1.0356,
"step": 1267
},
{
"epoch": 0.9102656137832017,
"grad_norm": 0.23893611133098602,
"learning_rate": 1.020195583623298e-05,
"loss": 1.0562,
"step": 1268
},
{
"epoch": 0.9109834888729361,
"grad_norm": 0.46950802206993103,
"learning_rate": 1.0198784237188042e-05,
"loss": 1.0755,
"step": 1269
},
{
"epoch": 0.9117013639626705,
"grad_norm": 0.2589515745639801,
"learning_rate": 1.0195637234560314e-05,
"loss": 1.0922,
"step": 1270
},
{
"epoch": 0.9124192390524049,
"grad_norm": 0.2856101095676422,
"learning_rate": 1.0192514844471751e-05,
"loss": 1.0005,
"step": 1271
},
{
"epoch": 0.9131371141421393,
"grad_norm": 0.25440752506256104,
"learning_rate": 1.0189417082918216e-05,
"loss": 1.039,
"step": 1272
},
{
"epoch": 0.9138549892318737,
"grad_norm": 0.39413267374038696,
"learning_rate": 1.0186343965769411e-05,
"loss": 1.0912,
"step": 1273
},
{
"epoch": 0.914572864321608,
"grad_norm": 0.27434098720550537,
"learning_rate": 1.0183295508768775e-05,
"loss": 0.9959,
"step": 1274
},
{
"epoch": 0.9152907394113424,
"grad_norm": 0.23922573029994965,
"learning_rate": 1.0180271727533424e-05,
"loss": 1.0208,
"step": 1275
},
{
"epoch": 0.9160086145010768,
"grad_norm": 0.5556219220161438,
"learning_rate": 1.0177272637554052e-05,
"loss": 1.2056,
"step": 1276
},
{
"epoch": 0.9167264895908112,
"grad_norm": 0.19860997796058655,
"learning_rate": 1.0174298254194868e-05,
"loss": 0.9879,
"step": 1277
},
{
"epoch": 0.9174443646805456,
"grad_norm": 0.2307528555393219,
"learning_rate": 1.0171348592693507e-05,
"loss": 1.0108,
"step": 1278
},
{
"epoch": 0.91816223977028,
"grad_norm": 0.5415950417518616,
"learning_rate": 1.0168423668160951e-05,
"loss": 1.1183,
"step": 1279
},
{
"epoch": 0.9188801148600143,
"grad_norm": 0.21727311611175537,
"learning_rate": 1.0165523495581465e-05,
"loss": 1.0008,
"step": 1280
},
{
"epoch": 0.9195979899497487,
"grad_norm": 0.6700248718261719,
"learning_rate": 1.0162648089812504e-05,
"loss": 1.3594,
"step": 1281
},
{
"epoch": 0.9203158650394831,
"grad_norm": 0.25267294049263,
"learning_rate": 1.0159797465584642e-05,
"loss": 1.0131,
"step": 1282
},
{
"epoch": 0.9210337401292176,
"grad_norm": 0.39105260372161865,
"learning_rate": 1.0156971637501508e-05,
"loss": 0.9928,
"step": 1283
},
{
"epoch": 0.9217516152189519,
"grad_norm": 0.32855525612831116,
"learning_rate": 1.015417062003969e-05,
"loss": 1.0152,
"step": 1284
},
{
"epoch": 0.9224694903086863,
"grad_norm": 0.21902427077293396,
"learning_rate": 1.0151394427548677e-05,
"loss": 1.1189,
"step": 1285
},
{
"epoch": 0.9231873653984207,
"grad_norm": 0.5886435508728027,
"learning_rate": 1.0148643074250783e-05,
"loss": 1.2409,
"step": 1286
},
{
"epoch": 0.923905240488155,
"grad_norm": 0.3337082862854004,
"learning_rate": 1.014591657424107e-05,
"loss": 0.9816,
"step": 1287
},
{
"epoch": 0.9246231155778895,
"grad_norm": 0.22968193888664246,
"learning_rate": 1.014321494148728e-05,
"loss": 1.0607,
"step": 1288
},
{
"epoch": 0.9253409906676239,
"grad_norm": 0.5318540334701538,
"learning_rate": 1.0140538189829754e-05,
"loss": 1.1525,
"step": 1289
},
{
"epoch": 0.9260588657573582,
"grad_norm": 0.20607882738113403,
"learning_rate": 1.0137886332981374e-05,
"loss": 1.0562,
"step": 1290
},
{
"epoch": 0.9267767408470926,
"grad_norm": 0.23362112045288086,
"learning_rate": 1.0135259384527487e-05,
"loss": 1.0274,
"step": 1291
},
{
"epoch": 0.927494615936827,
"grad_norm": 0.3322995603084564,
"learning_rate": 1.0132657357925835e-05,
"loss": 1.0506,
"step": 1292
},
{
"epoch": 0.9282124910265613,
"grad_norm": 0.23184844851493835,
"learning_rate": 1.0130080266506486e-05,
"loss": 1.074,
"step": 1293
},
{
"epoch": 0.9289303661162958,
"grad_norm": 0.2552621066570282,
"learning_rate": 1.0127528123471767e-05,
"loss": 1.0205,
"step": 1294
},
{
"epoch": 0.9296482412060302,
"grad_norm": 0.588615357875824,
"learning_rate": 1.0125000941896191e-05,
"loss": 1.077,
"step": 1295
},
{
"epoch": 0.9303661162957645,
"grad_norm": 0.2588033676147461,
"learning_rate": 1.0122498734726398e-05,
"loss": 0.9751,
"step": 1296
},
{
"epoch": 0.9310839913854989,
"grad_norm": 0.32702207565307617,
"learning_rate": 1.0120021514781091e-05,
"loss": 1.0659,
"step": 1297
},
{
"epoch": 0.9318018664752333,
"grad_norm": 0.3707316517829895,
"learning_rate": 1.0117569294750953e-05,
"loss": 1.1125,
"step": 1298
},
{
"epoch": 0.9325197415649676,
"grad_norm": 0.44583168625831604,
"learning_rate": 1.0115142087198602e-05,
"loss": 0.9946,
"step": 1299
},
{
"epoch": 0.9332376166547021,
"grad_norm": 0.4182652533054352,
"learning_rate": 1.0112739904558513e-05,
"loss": 1.0689,
"step": 1300
},
{
"epoch": 0.9339554917444365,
"grad_norm": 0.6997318267822266,
"learning_rate": 1.0110362759136967e-05,
"loss": 1.2802,
"step": 1301
},
{
"epoch": 0.9346733668341709,
"grad_norm": 0.22859402000904083,
"learning_rate": 1.0108010663111968e-05,
"loss": 1.019,
"step": 1302
},
{
"epoch": 0.9353912419239052,
"grad_norm": 0.42435020208358765,
"learning_rate": 1.010568362853321e-05,
"loss": 1.1617,
"step": 1303
},
{
"epoch": 0.9361091170136396,
"grad_norm": 0.22650395333766937,
"learning_rate": 1.010338166732198e-05,
"loss": 1.069,
"step": 1304
},
{
"epoch": 0.9368269921033741,
"grad_norm": 0.3500579595565796,
"learning_rate": 1.010110479127113e-05,
"loss": 1.1423,
"step": 1305
},
{
"epoch": 0.9375448671931084,
"grad_norm": 0.6740381121635437,
"learning_rate": 1.0098853012044994e-05,
"loss": 1.154,
"step": 1306
},
{
"epoch": 0.9382627422828428,
"grad_norm": 0.25104475021362305,
"learning_rate": 1.0096626341179337e-05,
"loss": 1.0328,
"step": 1307
},
{
"epoch": 0.9389806173725772,
"grad_norm": 0.26676034927368164,
"learning_rate": 1.0094424790081296e-05,
"loss": 1.0511,
"step": 1308
},
{
"epoch": 0.9396984924623115,
"grad_norm": 0.30099838972091675,
"learning_rate": 1.0092248370029322e-05,
"loss": 1.0397,
"step": 1309
},
{
"epoch": 0.9404163675520459,
"grad_norm": 0.30963248014450073,
"learning_rate": 1.009009709217312e-05,
"loss": 1.0699,
"step": 1310
},
{
"epoch": 0.9411342426417804,
"grad_norm": 0.22851693630218506,
"learning_rate": 1.0087970967533588e-05,
"loss": 1.0638,
"step": 1311
},
{
"epoch": 0.9418521177315147,
"grad_norm": 0.4070218503475189,
"learning_rate": 1.008587000700277e-05,
"loss": 1.0829,
"step": 1312
},
{
"epoch": 0.9425699928212491,
"grad_norm": 0.2321423441171646,
"learning_rate": 1.0083794221343794e-05,
"loss": 1.0626,
"step": 1313
},
{
"epoch": 0.9432878679109835,
"grad_norm": 0.4742386043071747,
"learning_rate": 1.0081743621190822e-05,
"loss": 1.0692,
"step": 1314
},
{
"epoch": 0.9440057430007178,
"grad_norm": 0.199398010969162,
"learning_rate": 1.0079718217048988e-05,
"loss": 1.1122,
"step": 1315
},
{
"epoch": 0.9447236180904522,
"grad_norm": 0.22731392085552216,
"learning_rate": 1.0077718019294348e-05,
"loss": 1.115,
"step": 1316
},
{
"epoch": 0.9454414931801867,
"grad_norm": 0.24799813330173492,
"learning_rate": 1.0075743038173823e-05,
"loss": 1.0634,
"step": 1317
},
{
"epoch": 0.9461593682699211,
"grad_norm": 0.4763537049293518,
"learning_rate": 1.007379328380516e-05,
"loss": 1.0495,
"step": 1318
},
{
"epoch": 0.9468772433596554,
"grad_norm": 0.28331029415130615,
"learning_rate": 1.007186876617686e-05,
"loss": 1.0004,
"step": 1319
},
{
"epoch": 0.9475951184493898,
"grad_norm": 0.32906895875930786,
"learning_rate": 1.0069969495148146e-05,
"loss": 1.0571,
"step": 1320
},
{
"epoch": 0.9483129935391242,
"grad_norm": 0.23255948722362518,
"learning_rate": 1.0068095480448896e-05,
"loss": 0.9746,
"step": 1321
},
{
"epoch": 0.9490308686288585,
"grad_norm": 0.22251664102077484,
"learning_rate": 1.0066246731679609e-05,
"loss": 1.112,
"step": 1322
},
{
"epoch": 0.949748743718593,
"grad_norm": 0.35669928789138794,
"learning_rate": 1.0064423258311345e-05,
"loss": 1.0062,
"step": 1323
},
{
"epoch": 0.9504666188083274,
"grad_norm": 0.28206053376197815,
"learning_rate": 1.0062625069685673e-05,
"loss": 1.0383,
"step": 1324
},
{
"epoch": 0.9511844938980617,
"grad_norm": 0.4400699734687805,
"learning_rate": 1.0060852175014635e-05,
"loss": 1.1769,
"step": 1325
},
{
"epoch": 0.9519023689877961,
"grad_norm": 0.3857263922691345,
"learning_rate": 1.0059104583380692e-05,
"loss": 1.1565,
"step": 1326
},
{
"epoch": 0.9526202440775305,
"grad_norm": 0.2161598801612854,
"learning_rate": 1.0057382303736683e-05,
"loss": 0.9872,
"step": 1327
},
{
"epoch": 0.9533381191672649,
"grad_norm": 1.8909188508987427,
"learning_rate": 1.005568534490577e-05,
"loss": 1.5161,
"step": 1328
},
{
"epoch": 0.9540559942569993,
"grad_norm": 0.2845224142074585,
"learning_rate": 1.0054013715581394e-05,
"loss": 1.0019,
"step": 1329
},
{
"epoch": 0.9547738693467337,
"grad_norm": 0.45807337760925293,
"learning_rate": 1.0052367424327244e-05,
"loss": 1.114,
"step": 1330
},
{
"epoch": 0.955491744436468,
"grad_norm": 0.24572716653347015,
"learning_rate": 1.0050746479577194e-05,
"loss": 1.0517,
"step": 1331
},
{
"epoch": 0.9562096195262024,
"grad_norm": 0.22645676136016846,
"learning_rate": 1.0049150889635272e-05,
"loss": 1.0494,
"step": 1332
},
{
"epoch": 0.9569274946159368,
"grad_norm": 0.6566616296768188,
"learning_rate": 1.004758066267562e-05,
"loss": 1.1728,
"step": 1333
},
{
"epoch": 0.9576453697056713,
"grad_norm": 0.20157983899116516,
"learning_rate": 1.0046035806742438e-05,
"loss": 1.0669,
"step": 1334
},
{
"epoch": 0.9583632447954056,
"grad_norm": 0.5226853489875793,
"learning_rate": 1.0044516329749954e-05,
"loss": 1.1016,
"step": 1335
},
{
"epoch": 0.95908111988514,
"grad_norm": 0.5047028660774231,
"learning_rate": 1.0043022239482385e-05,
"loss": 1.1733,
"step": 1336
},
{
"epoch": 0.9597989949748744,
"grad_norm": 0.290243536233902,
"learning_rate": 1.0041553543593887e-05,
"loss": 1.0053,
"step": 1337
},
{
"epoch": 0.9605168700646087,
"grad_norm": 0.4698556065559387,
"learning_rate": 1.0040110249608527e-05,
"loss": 1.0902,
"step": 1338
},
{
"epoch": 0.9612347451543432,
"grad_norm": 0.23319129645824432,
"learning_rate": 1.0038692364920234e-05,
"loss": 1.0855,
"step": 1339
},
{
"epoch": 0.9619526202440776,
"grad_norm": 0.37685346603393555,
"learning_rate": 1.0037299896792772e-05,
"loss": 0.9907,
"step": 1340
},
{
"epoch": 0.9626704953338119,
"grad_norm": 0.22720623016357422,
"learning_rate": 1.0035932852359691e-05,
"loss": 1.0305,
"step": 1341
},
{
"epoch": 0.9633883704235463,
"grad_norm": 0.6094557642936707,
"learning_rate": 1.00345912386243e-05,
"loss": 1.1798,
"step": 1342
},
{
"epoch": 0.9641062455132807,
"grad_norm": 0.23081520199775696,
"learning_rate": 1.003327506245963e-05,
"loss": 0.9901,
"step": 1343
},
{
"epoch": 0.964824120603015,
"grad_norm": 0.24824342131614685,
"learning_rate": 1.0031984330608389e-05,
"loss": 1.0054,
"step": 1344
},
{
"epoch": 0.9655419956927495,
"grad_norm": 0.5930601954460144,
"learning_rate": 1.0030719049682942e-05,
"loss": 1.2524,
"step": 1345
},
{
"epoch": 0.9662598707824839,
"grad_norm": 0.21535564959049225,
"learning_rate": 1.0029479226165268e-05,
"loss": 1.0293,
"step": 1346
},
{
"epoch": 0.9669777458722182,
"grad_norm": 0.26974886655807495,
"learning_rate": 1.0028264866406929e-05,
"loss": 1.0031,
"step": 1347
},
{
"epoch": 0.9676956209619526,
"grad_norm": 0.3035675287246704,
"learning_rate": 1.0027075976629035e-05,
"loss": 1.0578,
"step": 1348
},
{
"epoch": 0.968413496051687,
"grad_norm": 0.3711431324481964,
"learning_rate": 1.0025912562922216e-05,
"loss": 1.1601,
"step": 1349
},
{
"epoch": 0.9691313711414213,
"grad_norm": 0.24194128811359406,
"learning_rate": 1.002477463124659e-05,
"loss": 1.0591,
"step": 1350
},
{
"epoch": 0.9698492462311558,
"grad_norm": 0.20900464057922363,
"learning_rate": 1.0023662187431731e-05,
"loss": 1.0051,
"step": 1351
},
{
"epoch": 0.9705671213208902,
"grad_norm": 0.28793370723724365,
"learning_rate": 1.0022575237176638e-05,
"loss": 0.969,
"step": 1352
},
{
"epoch": 0.9712849964106246,
"grad_norm": 0.5335143208503723,
"learning_rate": 1.0021513786049712e-05,
"loss": 1.1496,
"step": 1353
},
{
"epoch": 0.9720028715003589,
"grad_norm": 0.23671367764472961,
"learning_rate": 1.0020477839488718e-05,
"loss": 0.9809,
"step": 1354
},
{
"epoch": 0.9727207465900933,
"grad_norm": 0.2911614775657654,
"learning_rate": 1.001946740280077e-05,
"loss": 1.0569,
"step": 1355
},
{
"epoch": 0.9734386216798278,
"grad_norm": 0.18712709844112396,
"learning_rate": 1.0018482481162282e-05,
"loss": 1.028,
"step": 1356
},
{
"epoch": 0.9741564967695621,
"grad_norm": 0.2784719467163086,
"learning_rate": 1.001752307961897e-05,
"loss": 1.0851,
"step": 1357
},
{
"epoch": 0.9748743718592965,
"grad_norm": 0.2440050095319748,
"learning_rate": 1.0016589203085805e-05,
"loss": 0.997,
"step": 1358
},
{
"epoch": 0.9755922469490309,
"grad_norm": 0.4130586087703705,
"learning_rate": 1.0015680856346996e-05,
"loss": 1.1313,
"step": 1359
},
{
"epoch": 0.9763101220387652,
"grad_norm": 0.5140645503997803,
"learning_rate": 1.0014798044055963e-05,
"loss": 1.1613,
"step": 1360
},
{
"epoch": 0.9770279971284996,
"grad_norm": 0.3041968047618866,
"learning_rate": 1.0013940770735313e-05,
"loss": 1.0595,
"step": 1361
},
{
"epoch": 0.9777458722182341,
"grad_norm": 0.47146379947662354,
"learning_rate": 1.001310904077682e-05,
"loss": 1.1859,
"step": 1362
},
{
"epoch": 0.9784637473079684,
"grad_norm": 0.2464003711938858,
"learning_rate": 1.0012302858441401e-05,
"loss": 1.0211,
"step": 1363
},
{
"epoch": 0.9791816223977028,
"grad_norm": 0.2065819799900055,
"learning_rate": 1.0011522227859094e-05,
"loss": 1.0402,
"step": 1364
},
{
"epoch": 0.9798994974874372,
"grad_norm": 0.2763268053531647,
"learning_rate": 1.001076715302903e-05,
"loss": 1.0706,
"step": 1365
},
{
"epoch": 0.9806173725771715,
"grad_norm": 0.20548270642757416,
"learning_rate": 1.0010037637819431e-05,
"loss": 1.0235,
"step": 1366
},
{
"epoch": 0.9813352476669059,
"grad_norm": 0.2394370138645172,
"learning_rate": 1.0009333685967568e-05,
"loss": 0.9968,
"step": 1367
},
{
"epoch": 0.9820531227566404,
"grad_norm": 0.2873808443546295,
"learning_rate": 1.0008655301079755e-05,
"loss": 1.1086,
"step": 1368
},
{
"epoch": 0.9827709978463748,
"grad_norm": 0.23559843003749847,
"learning_rate": 1.0008002486631328e-05,
"loss": 1.0684,
"step": 1369
},
{
"epoch": 0.9834888729361091,
"grad_norm": 0.28017380833625793,
"learning_rate": 1.0007375245966625e-05,
"loss": 1.1085,
"step": 1370
},
{
"epoch": 0.9842067480258435,
"grad_norm": 0.27574074268341064,
"learning_rate": 1.0006773582298974e-05,
"loss": 1.0373,
"step": 1371
},
{
"epoch": 0.9849246231155779,
"grad_norm": 0.2589809000492096,
"learning_rate": 1.0006197498710674e-05,
"loss": 1.0808,
"step": 1372
},
{
"epoch": 0.9856424982053122,
"grad_norm": 0.8424662351608276,
"learning_rate": 1.0005646998152973e-05,
"loss": 1.1736,
"step": 1373
},
{
"epoch": 0.9863603732950467,
"grad_norm": 0.1887986660003662,
"learning_rate": 1.0005122083446064e-05,
"loss": 1.028,
"step": 1374
},
{
"epoch": 0.9870782483847811,
"grad_norm": 0.49798160791397095,
"learning_rate": 1.0004622757279057e-05,
"loss": 1.2697,
"step": 1375
},
{
"epoch": 0.9877961234745154,
"grad_norm": 0.6020227670669556,
"learning_rate": 1.0004149022209982e-05,
"loss": 1.1899,
"step": 1376
},
{
"epoch": 0.9885139985642498,
"grad_norm": 0.3774351179599762,
"learning_rate": 1.0003700880665761e-05,
"loss": 1.101,
"step": 1377
},
{
"epoch": 0.9892318736539842,
"grad_norm": 0.22604165971279144,
"learning_rate": 1.0003278334942206e-05,
"loss": 1.0164,
"step": 1378
},
{
"epoch": 0.9899497487437185,
"grad_norm": 0.4209281802177429,
"learning_rate": 1.0002881387203995e-05,
"loss": 1.0941,
"step": 1379
},
{
"epoch": 0.990667623833453,
"grad_norm": 0.22722220420837402,
"learning_rate": 1.0002510039484682e-05,
"loss": 1.0887,
"step": 1380
},
{
"epoch": 0.9913854989231874,
"grad_norm": 0.2620321810245514,
"learning_rate": 1.0002164293686655e-05,
"loss": 1.0303,
"step": 1381
},
{
"epoch": 0.9921033740129217,
"grad_norm": 0.32566314935684204,
"learning_rate": 1.0001844151581162e-05,
"loss": 1.1484,
"step": 1382
},
{
"epoch": 0.9928212491026561,
"grad_norm": 0.2737228274345398,
"learning_rate": 1.0001549614808275e-05,
"loss": 1.1292,
"step": 1383
},
{
"epoch": 0.9935391241923905,
"grad_norm": 0.3287229835987091,
"learning_rate": 1.0001280684876891e-05,
"loss": 1.0606,
"step": 1384
},
{
"epoch": 0.994256999282125,
"grad_norm": 0.28717947006225586,
"learning_rate": 1.000103736316473e-05,
"loss": 0.9971,
"step": 1385
},
{
"epoch": 0.9949748743718593,
"grad_norm": 1.312623381614685,
"learning_rate": 1.0000819650918314e-05,
"loss": 1.1861,
"step": 1386
},
{
"epoch": 0.9956927494615937,
"grad_norm": 0.27503907680511475,
"learning_rate": 1.0000627549252978e-05,
"loss": 1.04,
"step": 1387
},
{
"epoch": 0.9964106245513281,
"grad_norm": 0.21305954456329346,
"learning_rate": 1.0000461059152846e-05,
"loss": 0.9879,
"step": 1388
},
{
"epoch": 0.9971284996410624,
"grad_norm": 0.19111517071723938,
"learning_rate": 1.0000320181470842e-05,
"loss": 1.0412,
"step": 1389
},
{
"epoch": 0.9978463747307968,
"grad_norm": 0.18863187730312347,
"learning_rate": 1.0000204916928675e-05,
"loss": 1.0658,
"step": 1390
},
{
"epoch": 0.9985642498205313,
"grad_norm": 0.560884952545166,
"learning_rate": 1.000011526611684e-05,
"loss": 1.1132,
"step": 1391
},
{
"epoch": 0.9992821249102656,
"grad_norm": 0.23772947490215302,
"learning_rate": 1.0000051229494616e-05,
"loss": 0.9555,
"step": 1392
},
{
"epoch": 1.0,
"grad_norm": 0.3461064100265503,
"learning_rate": 1.0000012807390056e-05,
"loss": 1.0363,
"step": 1393
},
{
"epoch": 1.0,
"step": 1393,
"total_flos": 2.538580367925838e+18,
"train_loss": 1.1330506813586059,
"train_runtime": 2210.1404,
"train_samples_per_second": 40.323,
"train_steps_per_second": 0.63
}
],
"logging_steps": 1.0,
"max_steps": 1393,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.538580367925838e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}