{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9017713365539453, "eval_steps": 500, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012882447665056361, "grad_norm": 5.764990329742432, "learning_rate": 0.0, "loss": 10.1729, "step": 1 }, { "epoch": 0.0025764895330112722, "grad_norm": 5.699520111083984, "learning_rate": 4e-05, "loss": 9.9457, "step": 2 }, { "epoch": 0.003864734299516908, "grad_norm": 5.760854721069336, "learning_rate": 8e-05, "loss": 9.7561, "step": 3 }, { "epoch": 0.0051529790660225444, "grad_norm": 6.234244346618652, "learning_rate": 0.00012, "loss": 9.0043, "step": 4 }, { "epoch": 0.00644122383252818, "grad_norm": 6.719285011291504, "learning_rate": 0.00016, "loss": 7.4539, "step": 5 }, { "epoch": 0.007729468599033816, "grad_norm": 5.1464948654174805, "learning_rate": 0.0002, "loss": 5.7333, "step": 6 }, { "epoch": 0.009017713365539453, "grad_norm": 3.3606348037719727, "learning_rate": 0.00019974059662775616, "loss": 3.8319, "step": 7 }, { "epoch": 0.010305958132045089, "grad_norm": 2.361740827560425, "learning_rate": 0.00019948119325551234, "loss": 3.3392, "step": 8 }, { "epoch": 0.011594202898550725, "grad_norm": 2.378281831741333, "learning_rate": 0.0001992217898832685, "loss": 2.8643, "step": 9 }, { "epoch": 0.01288244766505636, "grad_norm": 1.9084206819534302, "learning_rate": 0.00019896238651102467, "loss": 2.4692, "step": 10 }, { "epoch": 0.014170692431561997, "grad_norm": 2.3616507053375244, "learning_rate": 0.00019870298313878082, "loss": 2.2057, "step": 11 }, { "epoch": 0.015458937198067632, "grad_norm": 2.7130489349365234, "learning_rate": 0.00019844357976653697, "loss": 1.8781, "step": 12 }, { "epoch": 0.01674718196457327, "grad_norm": 5.479770183563232, "learning_rate": 0.00019818417639429315, "loss": 1.6427, "step": 13 }, { "epoch": 0.018035426731078906, "grad_norm": 2.0840210914611816, "learning_rate": 0.0001979247730220493, "loss": 1.6455, "step": 14 }, { "epoch": 0.01932367149758454, "grad_norm": 17.294357299804688, "learning_rate": 0.00019766536964980547, "loss": 1.9366, "step": 15 }, { "epoch": 0.020611916264090178, "grad_norm": 3.7959189414978027, "learning_rate": 0.00019740596627756162, "loss": 1.9272, "step": 16 }, { "epoch": 0.021900161030595812, "grad_norm": 9.078225135803223, "learning_rate": 0.00019714656290531778, "loss": 1.8734, "step": 17 }, { "epoch": 0.02318840579710145, "grad_norm": 2.7898125648498535, "learning_rate": 0.00019688715953307395, "loss": 1.8415, "step": 18 }, { "epoch": 0.024476650563607084, "grad_norm": 5.833450794219971, "learning_rate": 0.00019662775616083008, "loss": 1.6641, "step": 19 }, { "epoch": 0.02576489533011272, "grad_norm": 1.286916971206665, "learning_rate": 0.00019636835278858625, "loss": 1.6488, "step": 20 }, { "epoch": 0.02705314009661836, "grad_norm": 1.4083938598632812, "learning_rate": 0.0001961089494163424, "loss": 1.6369, "step": 21 }, { "epoch": 0.028341384863123993, "grad_norm": 11.11021900177002, "learning_rate": 0.00019584954604409858, "loss": 1.0877, "step": 22 }, { "epoch": 0.02962962962962963, "grad_norm": 4.023814678192139, "learning_rate": 0.00019559014267185473, "loss": 1.172, "step": 23 }, { "epoch": 0.030917874396135265, "grad_norm": 1.5380833148956299, "learning_rate": 0.0001953307392996109, "loss": 1.2489, "step": 24 }, { "epoch": 0.0322061191626409, "grad_norm": 3.5287179946899414, "learning_rate": 0.00019507133592736706, "loss": 1.059, "step": 25 }, { "epoch": 0.03349436392914654, "grad_norm": 0.4443202316761017, "learning_rate": 0.0001948119325551232, "loss": 1.0324, "step": 26 }, { "epoch": 0.034782608695652174, "grad_norm": 4.4658098220825195, "learning_rate": 0.0001945525291828794, "loss": 0.9281, "step": 27 }, { "epoch": 0.03607085346215781, "grad_norm": 0.6924022436141968, "learning_rate": 0.00019429312581063554, "loss": 0.7725, "step": 28 }, { "epoch": 0.03735909822866344, "grad_norm": 0.39130347967147827, "learning_rate": 0.00019403372243839172, "loss": 0.9851, "step": 29 }, { "epoch": 0.03864734299516908, "grad_norm": 0.478762149810791, "learning_rate": 0.00019377431906614787, "loss": 0.9003, "step": 30 }, { "epoch": 0.03993558776167472, "grad_norm": 0.485379695892334, "learning_rate": 0.00019351491569390402, "loss": 0.9038, "step": 31 }, { "epoch": 0.041223832528180356, "grad_norm": 0.4116724729537964, "learning_rate": 0.0001932555123216602, "loss": 0.7782, "step": 32 }, { "epoch": 0.04251207729468599, "grad_norm": 0.35044676065444946, "learning_rate": 0.00019299610894941635, "loss": 0.7145, "step": 33 }, { "epoch": 0.043800322061191624, "grad_norm": 0.34671103954315186, "learning_rate": 0.00019273670557717253, "loss": 0.838, "step": 34 }, { "epoch": 0.04508856682769726, "grad_norm": 0.3169376850128174, "learning_rate": 0.00019247730220492868, "loss": 0.9283, "step": 35 }, { "epoch": 0.0463768115942029, "grad_norm": 0.3791329860687256, "learning_rate": 0.00019221789883268483, "loss": 0.9332, "step": 36 }, { "epoch": 0.04766505636070854, "grad_norm": 0.39683282375335693, "learning_rate": 0.000191958495460441, "loss": 0.837, "step": 37 }, { "epoch": 0.04895330112721417, "grad_norm": 0.4130147099494934, "learning_rate": 0.00019169909208819716, "loss": 0.688, "step": 38 }, { "epoch": 0.050241545893719805, "grad_norm": 0.535886824131012, "learning_rate": 0.00019143968871595333, "loss": 0.819, "step": 39 }, { "epoch": 0.05152979066022544, "grad_norm": 0.41564154624938965, "learning_rate": 0.00019118028534370949, "loss": 1.0323, "step": 40 }, { "epoch": 0.05281803542673108, "grad_norm": 0.38580086827278137, "learning_rate": 0.00019092088197146564, "loss": 0.9947, "step": 41 }, { "epoch": 0.05410628019323672, "grad_norm": 0.3614998757839203, "learning_rate": 0.00019066147859922181, "loss": 0.8925, "step": 42 }, { "epoch": 0.05539452495974235, "grad_norm": 0.3364286422729492, "learning_rate": 0.00019040207522697794, "loss": 0.8473, "step": 43 }, { "epoch": 0.056682769726247986, "grad_norm": 0.3541828393936157, "learning_rate": 0.00019014267185473412, "loss": 0.8477, "step": 44 }, { "epoch": 0.057971014492753624, "grad_norm": 0.35495537519454956, "learning_rate": 0.00018988326848249027, "loss": 0.8881, "step": 45 }, { "epoch": 0.05925925925925926, "grad_norm": 0.43733540177345276, "learning_rate": 0.00018962386511024644, "loss": 0.8743, "step": 46 }, { "epoch": 0.06054750402576489, "grad_norm": 0.3078387975692749, "learning_rate": 0.0001893644617380026, "loss": 0.7479, "step": 47 }, { "epoch": 0.06183574879227053, "grad_norm": 0.36661794781684875, "learning_rate": 0.00018910505836575875, "loss": 0.8258, "step": 48 }, { "epoch": 0.06312399355877617, "grad_norm": 0.34701570868492126, "learning_rate": 0.00018884565499351492, "loss": 0.8085, "step": 49 }, { "epoch": 0.0644122383252818, "grad_norm": 0.30905681848526, "learning_rate": 0.00018858625162127107, "loss": 0.7474, "step": 50 }, { "epoch": 0.06570048309178744, "grad_norm": 0.47441986203193665, "learning_rate": 0.00018832684824902725, "loss": 1.0549, "step": 51 }, { "epoch": 0.06698872785829307, "grad_norm": 0.2966022491455078, "learning_rate": 0.0001880674448767834, "loss": 0.8517, "step": 52 }, { "epoch": 0.06827697262479872, "grad_norm": 0.33785632252693176, "learning_rate": 0.00018780804150453958, "loss": 0.8858, "step": 53 }, { "epoch": 0.06956521739130435, "grad_norm": 0.33717742562294006, "learning_rate": 0.00018754863813229573, "loss": 0.7961, "step": 54 }, { "epoch": 0.07085346215780998, "grad_norm": 0.4235801100730896, "learning_rate": 0.00018728923476005188, "loss": 0.9562, "step": 55 }, { "epoch": 0.07214170692431562, "grad_norm": 0.40099507570266724, "learning_rate": 0.00018702983138780806, "loss": 0.6817, "step": 56 }, { "epoch": 0.07342995169082125, "grad_norm": 0.3041292428970337, "learning_rate": 0.0001867704280155642, "loss": 0.6219, "step": 57 }, { "epoch": 0.07471819645732689, "grad_norm": 0.428120493888855, "learning_rate": 0.0001865110246433204, "loss": 0.7911, "step": 58 }, { "epoch": 0.07600644122383253, "grad_norm": 0.39466729760169983, "learning_rate": 0.00018625162127107654, "loss": 0.7883, "step": 59 }, { "epoch": 0.07729468599033816, "grad_norm": 0.3272225856781006, "learning_rate": 0.0001859922178988327, "loss": 0.736, "step": 60 }, { "epoch": 0.0785829307568438, "grad_norm": 0.3868604898452759, "learning_rate": 0.00018573281452658887, "loss": 0.772, "step": 61 }, { "epoch": 0.07987117552334944, "grad_norm": 0.4111652970314026, "learning_rate": 0.00018547341115434502, "loss": 0.7715, "step": 62 }, { "epoch": 0.08115942028985507, "grad_norm": 0.367587149143219, "learning_rate": 0.0001852140077821012, "loss": 0.8887, "step": 63 }, { "epoch": 0.08244766505636071, "grad_norm": 0.36358535289764404, "learning_rate": 0.00018495460440985735, "loss": 0.6756, "step": 64 }, { "epoch": 0.08373590982286634, "grad_norm": 0.3693746030330658, "learning_rate": 0.0001846952010376135, "loss": 0.7451, "step": 65 }, { "epoch": 0.08502415458937199, "grad_norm": 0.33801788091659546, "learning_rate": 0.00018443579766536967, "loss": 0.7722, "step": 66 }, { "epoch": 0.08631239935587762, "grad_norm": 0.40920770168304443, "learning_rate": 0.0001841763942931258, "loss": 0.6399, "step": 67 }, { "epoch": 0.08760064412238325, "grad_norm": 0.36758852005004883, "learning_rate": 0.00018391699092088198, "loss": 0.7686, "step": 68 }, { "epoch": 0.08888888888888889, "grad_norm": 0.37189269065856934, "learning_rate": 0.00018365758754863813, "loss": 0.8437, "step": 69 }, { "epoch": 0.09017713365539452, "grad_norm": 0.42477479577064514, "learning_rate": 0.0001833981841763943, "loss": 0.7571, "step": 70 }, { "epoch": 0.09146537842190017, "grad_norm": 0.34100234508514404, "learning_rate": 0.00018313878080415046, "loss": 0.733, "step": 71 }, { "epoch": 0.0927536231884058, "grad_norm": 0.42223483324050903, "learning_rate": 0.0001828793774319066, "loss": 0.7981, "step": 72 }, { "epoch": 0.09404186795491143, "grad_norm": 0.40469273924827576, "learning_rate": 0.00018261997405966278, "loss": 0.8633, "step": 73 }, { "epoch": 0.09533011272141707, "grad_norm": 0.35466790199279785, "learning_rate": 0.00018236057068741893, "loss": 0.7359, "step": 74 }, { "epoch": 0.0966183574879227, "grad_norm": 0.3824892044067383, "learning_rate": 0.0001821011673151751, "loss": 0.846, "step": 75 }, { "epoch": 0.09790660225442833, "grad_norm": 0.4101675748825073, "learning_rate": 0.00018184176394293126, "loss": 1.0094, "step": 76 }, { "epoch": 0.09919484702093398, "grad_norm": 0.3373378813266754, "learning_rate": 0.0001815823605706874, "loss": 0.6908, "step": 77 }, { "epoch": 0.10048309178743961, "grad_norm": 0.41473421454429626, "learning_rate": 0.0001813229571984436, "loss": 0.7753, "step": 78 }, { "epoch": 0.10177133655394525, "grad_norm": 0.3552979826927185, "learning_rate": 0.00018106355382619974, "loss": 0.883, "step": 79 }, { "epoch": 0.10305958132045089, "grad_norm": 0.3655754029750824, "learning_rate": 0.00018080415045395592, "loss": 0.7978, "step": 80 }, { "epoch": 0.10434782608695652, "grad_norm": 0.398554265499115, "learning_rate": 0.00018054474708171207, "loss": 0.9426, "step": 81 }, { "epoch": 0.10563607085346216, "grad_norm": 0.4098765552043915, "learning_rate": 0.00018028534370946825, "loss": 0.7127, "step": 82 }, { "epoch": 0.10692431561996779, "grad_norm": 0.38591381907463074, "learning_rate": 0.0001800259403372244, "loss": 0.7994, "step": 83 }, { "epoch": 0.10821256038647344, "grad_norm": 0.42177343368530273, "learning_rate": 0.00017976653696498055, "loss": 0.8032, "step": 84 }, { "epoch": 0.10950080515297907, "grad_norm": 0.38358885049819946, "learning_rate": 0.00017950713359273673, "loss": 0.8478, "step": 85 }, { "epoch": 0.1107890499194847, "grad_norm": 0.4549978971481323, "learning_rate": 0.00017924773022049288, "loss": 0.813, "step": 86 }, { "epoch": 0.11207729468599034, "grad_norm": 0.4372895359992981, "learning_rate": 0.00017898832684824906, "loss": 0.8797, "step": 87 }, { "epoch": 0.11336553945249597, "grad_norm": 0.4454326033592224, "learning_rate": 0.0001787289234760052, "loss": 0.8554, "step": 88 }, { "epoch": 0.11465378421900162, "grad_norm": 0.3808746933937073, "learning_rate": 0.00017846952010376136, "loss": 0.5919, "step": 89 }, { "epoch": 0.11594202898550725, "grad_norm": 0.4146284759044647, "learning_rate": 0.00017821011673151754, "loss": 0.8823, "step": 90 }, { "epoch": 0.11723027375201288, "grad_norm": 0.47205957770347595, "learning_rate": 0.00017795071335927366, "loss": 0.6284, "step": 91 }, { "epoch": 0.11851851851851852, "grad_norm": 0.4155535101890564, "learning_rate": 0.00017769130998702984, "loss": 0.8945, "step": 92 }, { "epoch": 0.11980676328502415, "grad_norm": 0.4152592420578003, "learning_rate": 0.000177431906614786, "loss": 0.818, "step": 93 }, { "epoch": 0.12109500805152978, "grad_norm": 0.4558146297931671, "learning_rate": 0.00017717250324254217, "loss": 0.651, "step": 94 }, { "epoch": 0.12238325281803543, "grad_norm": 0.4004950523376465, "learning_rate": 0.00017691309987029832, "loss": 0.7546, "step": 95 }, { "epoch": 0.12367149758454106, "grad_norm": 0.35895851254463196, "learning_rate": 0.00017665369649805447, "loss": 0.6174, "step": 96 }, { "epoch": 0.1249597423510467, "grad_norm": 0.4626515209674835, "learning_rate": 0.00017639429312581064, "loss": 0.716, "step": 97 }, { "epoch": 0.12624798711755233, "grad_norm": 0.47447800636291504, "learning_rate": 0.0001761348897535668, "loss": 0.9699, "step": 98 }, { "epoch": 0.12753623188405797, "grad_norm": 0.4361920654773712, "learning_rate": 0.00017587548638132297, "loss": 0.9217, "step": 99 }, { "epoch": 0.1288244766505636, "grad_norm": 0.42450228333473206, "learning_rate": 0.00017561608300907912, "loss": 0.6938, "step": 100 }, { "epoch": 0.13011272141706925, "grad_norm": 0.4310356080532074, "learning_rate": 0.00017535667963683527, "loss": 0.7263, "step": 101 }, { "epoch": 0.13140096618357489, "grad_norm": 0.5808001756668091, "learning_rate": 0.00017509727626459145, "loss": 0.9891, "step": 102 }, { "epoch": 0.13268921095008052, "grad_norm": 0.49347755312919617, "learning_rate": 0.0001748378728923476, "loss": 0.7918, "step": 103 }, { "epoch": 0.13397745571658615, "grad_norm": 0.42868706583976746, "learning_rate": 0.00017457846952010378, "loss": 0.7067, "step": 104 }, { "epoch": 0.13526570048309178, "grad_norm": 0.4322398900985718, "learning_rate": 0.00017431906614785993, "loss": 0.6705, "step": 105 }, { "epoch": 0.13655394524959744, "grad_norm": 0.41033244132995605, "learning_rate": 0.00017405966277561608, "loss": 0.6878, "step": 106 }, { "epoch": 0.13784219001610307, "grad_norm": 0.536390483379364, "learning_rate": 0.00017380025940337226, "loss": 0.6961, "step": 107 }, { "epoch": 0.1391304347826087, "grad_norm": 0.4299734830856323, "learning_rate": 0.0001735408560311284, "loss": 0.7065, "step": 108 }, { "epoch": 0.14041867954911433, "grad_norm": 0.4070943593978882, "learning_rate": 0.0001732814526588846, "loss": 0.6975, "step": 109 }, { "epoch": 0.14170692431561996, "grad_norm": 0.46637794375419617, "learning_rate": 0.00017302204928664074, "loss": 0.7583, "step": 110 }, { "epoch": 0.14299516908212562, "grad_norm": 0.38566964864730835, "learning_rate": 0.00017276264591439692, "loss": 0.7765, "step": 111 }, { "epoch": 0.14428341384863125, "grad_norm": 0.38054248690605164, "learning_rate": 0.00017250324254215307, "loss": 0.6291, "step": 112 }, { "epoch": 0.14557165861513688, "grad_norm": 0.5447641015052795, "learning_rate": 0.00017224383916990922, "loss": 1.0189, "step": 113 }, { "epoch": 0.1468599033816425, "grad_norm": 0.4753653109073639, "learning_rate": 0.0001719844357976654, "loss": 0.83, "step": 114 }, { "epoch": 0.14814814814814814, "grad_norm": 0.4890337288379669, "learning_rate": 0.00017172503242542152, "loss": 0.8586, "step": 115 }, { "epoch": 0.14943639291465377, "grad_norm": 0.42376580834388733, "learning_rate": 0.0001714656290531777, "loss": 0.805, "step": 116 }, { "epoch": 0.15072463768115943, "grad_norm": 0.4509013295173645, "learning_rate": 0.00017120622568093385, "loss": 0.8794, "step": 117 }, { "epoch": 0.15201288244766506, "grad_norm": 0.4562942385673523, "learning_rate": 0.00017094682230869003, "loss": 0.6392, "step": 118 }, { "epoch": 0.1533011272141707, "grad_norm": 0.48996442556381226, "learning_rate": 0.00017068741893644618, "loss": 0.7655, "step": 119 }, { "epoch": 0.15458937198067632, "grad_norm": 0.5451317429542542, "learning_rate": 0.00017042801556420233, "loss": 0.796, "step": 120 }, { "epoch": 0.15587761674718195, "grad_norm": 0.45719748735427856, "learning_rate": 0.0001701686121919585, "loss": 0.7704, "step": 121 }, { "epoch": 0.1571658615136876, "grad_norm": 0.5048899054527283, "learning_rate": 0.00016990920881971466, "loss": 0.7396, "step": 122 }, { "epoch": 0.15845410628019324, "grad_norm": 0.4184553921222687, "learning_rate": 0.00016964980544747083, "loss": 0.9926, "step": 123 }, { "epoch": 0.15974235104669887, "grad_norm": 0.4456348717212677, "learning_rate": 0.00016939040207522698, "loss": 0.7654, "step": 124 }, { "epoch": 0.1610305958132045, "grad_norm": 0.4423070251941681, "learning_rate": 0.00016913099870298313, "loss": 0.7832, "step": 125 }, { "epoch": 0.16231884057971013, "grad_norm": 0.5408623218536377, "learning_rate": 0.0001688715953307393, "loss": 0.9074, "step": 126 }, { "epoch": 0.1636070853462158, "grad_norm": 0.5411691665649414, "learning_rate": 0.00016861219195849546, "loss": 0.9271, "step": 127 }, { "epoch": 0.16489533011272142, "grad_norm": 0.41004684567451477, "learning_rate": 0.00016835278858625164, "loss": 0.5686, "step": 128 }, { "epoch": 0.16618357487922705, "grad_norm": 0.43191105127334595, "learning_rate": 0.0001680933852140078, "loss": 0.7841, "step": 129 }, { "epoch": 0.16747181964573268, "grad_norm": 0.46590304374694824, "learning_rate": 0.00016783398184176394, "loss": 0.6283, "step": 130 }, { "epoch": 0.16876006441223831, "grad_norm": 0.4356256425380707, "learning_rate": 0.00016757457846952012, "loss": 0.5977, "step": 131 }, { "epoch": 0.17004830917874397, "grad_norm": 0.44105201959609985, "learning_rate": 0.00016731517509727627, "loss": 0.8701, "step": 132 }, { "epoch": 0.1713365539452496, "grad_norm": 0.496669739484787, "learning_rate": 0.00016705577172503245, "loss": 0.8613, "step": 133 }, { "epoch": 0.17262479871175523, "grad_norm": 0.41839754581451416, "learning_rate": 0.0001667963683527886, "loss": 0.5693, "step": 134 }, { "epoch": 0.17391304347826086, "grad_norm": 0.42133820056915283, "learning_rate": 0.00016653696498054475, "loss": 0.7622, "step": 135 }, { "epoch": 0.1752012882447665, "grad_norm": 0.45265620946884155, "learning_rate": 0.00016627756160830093, "loss": 0.8501, "step": 136 }, { "epoch": 0.17648953301127215, "grad_norm": 0.45904725790023804, "learning_rate": 0.00016601815823605708, "loss": 0.9041, "step": 137 }, { "epoch": 0.17777777777777778, "grad_norm": 0.42884427309036255, "learning_rate": 0.00016575875486381326, "loss": 0.7386, "step": 138 }, { "epoch": 0.17906602254428342, "grad_norm": 0.44365760684013367, "learning_rate": 0.00016549935149156938, "loss": 0.7196, "step": 139 }, { "epoch": 0.18035426731078905, "grad_norm": 0.38908517360687256, "learning_rate": 0.00016523994811932556, "loss": 0.6251, "step": 140 }, { "epoch": 0.18164251207729468, "grad_norm": 0.3956596255302429, "learning_rate": 0.0001649805447470817, "loss": 0.6839, "step": 141 }, { "epoch": 0.18293075684380034, "grad_norm": 0.46725159883499146, "learning_rate": 0.00016472114137483789, "loss": 0.7637, "step": 142 }, { "epoch": 0.18421900161030597, "grad_norm": 0.4984063506126404, "learning_rate": 0.00016446173800259404, "loss": 0.818, "step": 143 }, { "epoch": 0.1855072463768116, "grad_norm": 0.40556883811950684, "learning_rate": 0.0001642023346303502, "loss": 0.5996, "step": 144 }, { "epoch": 0.18679549114331723, "grad_norm": 0.4421241581439972, "learning_rate": 0.00016394293125810637, "loss": 0.8339, "step": 145 }, { "epoch": 0.18808373590982286, "grad_norm": 0.4321085512638092, "learning_rate": 0.00016368352788586252, "loss": 0.8028, "step": 146 }, { "epoch": 0.18937198067632852, "grad_norm": 0.4498562514781952, "learning_rate": 0.0001634241245136187, "loss": 0.8976, "step": 147 }, { "epoch": 0.19066022544283415, "grad_norm": 0.45957380533218384, "learning_rate": 0.00016316472114137484, "loss": 0.9271, "step": 148 }, { "epoch": 0.19194847020933978, "grad_norm": 0.4764615595340729, "learning_rate": 0.000162905317769131, "loss": 0.9667, "step": 149 }, { "epoch": 0.1932367149758454, "grad_norm": 0.4241081774234772, "learning_rate": 0.00016264591439688717, "loss": 0.6945, "step": 150 }, { "epoch": 0.19452495974235104, "grad_norm": 0.5130481123924255, "learning_rate": 0.00016238651102464332, "loss": 0.7224, "step": 151 }, { "epoch": 0.19581320450885667, "grad_norm": 0.4727570116519928, "learning_rate": 0.0001621271076523995, "loss": 0.7714, "step": 152 }, { "epoch": 0.19710144927536233, "grad_norm": 0.5420963764190674, "learning_rate": 0.00016186770428015565, "loss": 0.9346, "step": 153 }, { "epoch": 0.19838969404186796, "grad_norm": 0.4338800013065338, "learning_rate": 0.0001616083009079118, "loss": 0.8527, "step": 154 }, { "epoch": 0.1996779388083736, "grad_norm": 0.45830976963043213, "learning_rate": 0.00016134889753566798, "loss": 0.8171, "step": 155 }, { "epoch": 0.20096618357487922, "grad_norm": 0.48107942938804626, "learning_rate": 0.00016108949416342413, "loss": 0.8537, "step": 156 }, { "epoch": 0.20225442834138485, "grad_norm": 0.4447987973690033, "learning_rate": 0.0001608300907911803, "loss": 0.7909, "step": 157 }, { "epoch": 0.2035426731078905, "grad_norm": 0.4311445653438568, "learning_rate": 0.00016057068741893646, "loss": 0.7733, "step": 158 }, { "epoch": 0.20483091787439614, "grad_norm": 0.5173223614692688, "learning_rate": 0.0001603112840466926, "loss": 0.9312, "step": 159 }, { "epoch": 0.20611916264090177, "grad_norm": 0.5143957734107971, "learning_rate": 0.0001600518806744488, "loss": 0.804, "step": 160 }, { "epoch": 0.2074074074074074, "grad_norm": 0.4494340121746063, "learning_rate": 0.00015979247730220494, "loss": 0.7741, "step": 161 }, { "epoch": 0.20869565217391303, "grad_norm": 0.5051131248474121, "learning_rate": 0.00015953307392996112, "loss": 0.8216, "step": 162 }, { "epoch": 0.2099838969404187, "grad_norm": 0.48853760957717896, "learning_rate": 0.00015927367055771724, "loss": 0.7958, "step": 163 }, { "epoch": 0.21127214170692432, "grad_norm": 0.4491981863975525, "learning_rate": 0.00015901426718547342, "loss": 0.8012, "step": 164 }, { "epoch": 0.21256038647342995, "grad_norm": 0.41452592611312866, "learning_rate": 0.00015875486381322957, "loss": 0.6653, "step": 165 }, { "epoch": 0.21384863123993558, "grad_norm": 0.4610249400138855, "learning_rate": 0.00015849546044098572, "loss": 0.7559, "step": 166 }, { "epoch": 0.2151368760064412, "grad_norm": 0.46895861625671387, "learning_rate": 0.0001582360570687419, "loss": 0.7393, "step": 167 }, { "epoch": 0.21642512077294687, "grad_norm": 0.44812971353530884, "learning_rate": 0.00015797665369649805, "loss": 0.7904, "step": 168 }, { "epoch": 0.2177133655394525, "grad_norm": 0.4483109712600708, "learning_rate": 0.00015771725032425423, "loss": 0.7503, "step": 169 }, { "epoch": 0.21900161030595813, "grad_norm": 0.4433995485305786, "learning_rate": 0.00015745784695201038, "loss": 0.7791, "step": 170 }, { "epoch": 0.22028985507246376, "grad_norm": 0.5305430889129639, "learning_rate": 0.00015719844357976655, "loss": 0.9014, "step": 171 }, { "epoch": 0.2215780998389694, "grad_norm": 0.4747445285320282, "learning_rate": 0.0001569390402075227, "loss": 0.6446, "step": 172 }, { "epoch": 0.22286634460547505, "grad_norm": 0.5174173712730408, "learning_rate": 0.00015667963683527886, "loss": 0.6938, "step": 173 }, { "epoch": 0.22415458937198068, "grad_norm": 0.5461775660514832, "learning_rate": 0.00015642023346303503, "loss": 0.9596, "step": 174 }, { "epoch": 0.22544283413848631, "grad_norm": 0.5394182205200195, "learning_rate": 0.00015616083009079118, "loss": 0.8632, "step": 175 }, { "epoch": 0.22673107890499195, "grad_norm": 0.4866770803928375, "learning_rate": 0.00015590142671854736, "loss": 0.8799, "step": 176 }, { "epoch": 0.22801932367149758, "grad_norm": 0.4386501908302307, "learning_rate": 0.0001556420233463035, "loss": 0.7341, "step": 177 }, { "epoch": 0.22930756843800323, "grad_norm": 0.5443551540374756, "learning_rate": 0.00015538261997405966, "loss": 0.771, "step": 178 }, { "epoch": 0.23059581320450886, "grad_norm": 0.45818325877189636, "learning_rate": 0.00015512321660181584, "loss": 0.8682, "step": 179 }, { "epoch": 0.2318840579710145, "grad_norm": 0.501369297504425, "learning_rate": 0.000154863813229572, "loss": 0.7586, "step": 180 }, { "epoch": 0.23317230273752013, "grad_norm": 0.4658907651901245, "learning_rate": 0.00015460440985732817, "loss": 0.6609, "step": 181 }, { "epoch": 0.23446054750402576, "grad_norm": 0.4543883800506592, "learning_rate": 0.00015434500648508432, "loss": 0.5404, "step": 182 }, { "epoch": 0.2357487922705314, "grad_norm": 0.4215242862701416, "learning_rate": 0.00015408560311284047, "loss": 0.7295, "step": 183 }, { "epoch": 0.23703703703703705, "grad_norm": 0.4865438640117645, "learning_rate": 0.00015382619974059665, "loss": 0.8251, "step": 184 }, { "epoch": 0.23832528180354268, "grad_norm": 0.4978322386741638, "learning_rate": 0.0001535667963683528, "loss": 0.9334, "step": 185 }, { "epoch": 0.2396135265700483, "grad_norm": 0.434435099363327, "learning_rate": 0.00015330739299610898, "loss": 0.9299, "step": 186 }, { "epoch": 0.24090177133655394, "grad_norm": 0.5044904947280884, "learning_rate": 0.0001530479896238651, "loss": 0.7411, "step": 187 }, { "epoch": 0.24219001610305957, "grad_norm": 0.4364910423755646, "learning_rate": 0.00015278858625162128, "loss": 0.8248, "step": 188 }, { "epoch": 0.24347826086956523, "grad_norm": 0.46096572279930115, "learning_rate": 0.00015252918287937743, "loss": 0.8211, "step": 189 }, { "epoch": 0.24476650563607086, "grad_norm": 0.4325025677680969, "learning_rate": 0.00015226977950713358, "loss": 0.7043, "step": 190 }, { "epoch": 0.2460547504025765, "grad_norm": 0.4898943305015564, "learning_rate": 0.00015201037613488976, "loss": 0.7608, "step": 191 }, { "epoch": 0.24734299516908212, "grad_norm": 0.47487872838974, "learning_rate": 0.0001517509727626459, "loss": 0.7175, "step": 192 }, { "epoch": 0.24863123993558775, "grad_norm": 0.4339347779750824, "learning_rate": 0.0001514915693904021, "loss": 0.8499, "step": 193 }, { "epoch": 0.2499194847020934, "grad_norm": 0.46825259923934937, "learning_rate": 0.00015123216601815824, "loss": 0.621, "step": 194 }, { "epoch": 0.25120772946859904, "grad_norm": 0.4948033094406128, "learning_rate": 0.0001509727626459144, "loss": 0.6888, "step": 195 }, { "epoch": 0.25249597423510467, "grad_norm": 0.4327951967716217, "learning_rate": 0.00015071335927367057, "loss": 0.6128, "step": 196 }, { "epoch": 0.2537842190016103, "grad_norm": 0.569115161895752, "learning_rate": 0.00015045395590142672, "loss": 0.8251, "step": 197 }, { "epoch": 0.25507246376811593, "grad_norm": 0.47008320689201355, "learning_rate": 0.0001501945525291829, "loss": 0.8214, "step": 198 }, { "epoch": 0.25636070853462156, "grad_norm": 0.4881947636604309, "learning_rate": 0.00014993514915693904, "loss": 0.6731, "step": 199 }, { "epoch": 0.2576489533011272, "grad_norm": 0.5395270586013794, "learning_rate": 0.00014967574578469522, "loss": 0.8095, "step": 200 }, { "epoch": 0.2589371980676328, "grad_norm": 0.44902658462524414, "learning_rate": 0.00014941634241245137, "loss": 0.7042, "step": 201 }, { "epoch": 0.2602254428341385, "grad_norm": 0.5789260268211365, "learning_rate": 0.00014915693904020752, "loss": 0.9071, "step": 202 }, { "epoch": 0.26151368760064414, "grad_norm": 0.48466676473617554, "learning_rate": 0.0001488975356679637, "loss": 0.7318, "step": 203 }, { "epoch": 0.26280193236714977, "grad_norm": 0.4419580101966858, "learning_rate": 0.00014863813229571985, "loss": 0.7128, "step": 204 }, { "epoch": 0.2640901771336554, "grad_norm": 0.4542410969734192, "learning_rate": 0.00014837872892347603, "loss": 0.7075, "step": 205 }, { "epoch": 0.26537842190016103, "grad_norm": 0.49915802478790283, "learning_rate": 0.00014811932555123218, "loss": 0.8091, "step": 206 }, { "epoch": 0.26666666666666666, "grad_norm": 0.39728543162345886, "learning_rate": 0.00014785992217898833, "loss": 0.6258, "step": 207 }, { "epoch": 0.2679549114331723, "grad_norm": 0.524169385433197, "learning_rate": 0.0001476005188067445, "loss": 0.73, "step": 208 }, { "epoch": 0.2692431561996779, "grad_norm": 0.4486137330532074, "learning_rate": 0.00014734111543450066, "loss": 0.7607, "step": 209 }, { "epoch": 0.27053140096618356, "grad_norm": 0.5274791717529297, "learning_rate": 0.00014708171206225684, "loss": 0.6731, "step": 210 }, { "epoch": 0.2718196457326892, "grad_norm": 0.44794782996177673, "learning_rate": 0.00014682230869001296, "loss": 0.5291, "step": 211 }, { "epoch": 0.27310789049919487, "grad_norm": 0.48657894134521484, "learning_rate": 0.00014656290531776914, "loss": 0.6754, "step": 212 }, { "epoch": 0.2743961352657005, "grad_norm": 0.49806416034698486, "learning_rate": 0.0001463035019455253, "loss": 0.7096, "step": 213 }, { "epoch": 0.27568438003220613, "grad_norm": 0.49381333589553833, "learning_rate": 0.00014604409857328144, "loss": 0.5939, "step": 214 }, { "epoch": 0.27697262479871176, "grad_norm": 0.4638739824295044, "learning_rate": 0.00014578469520103762, "loss": 0.6444, "step": 215 }, { "epoch": 0.2782608695652174, "grad_norm": 0.5256271362304688, "learning_rate": 0.00014552529182879377, "loss": 0.7595, "step": 216 }, { "epoch": 0.279549114331723, "grad_norm": 0.47106048464775085, "learning_rate": 0.00014526588845654995, "loss": 0.6394, "step": 217 }, { "epoch": 0.28083735909822866, "grad_norm": 0.5482437610626221, "learning_rate": 0.0001450064850843061, "loss": 0.7181, "step": 218 }, { "epoch": 0.2821256038647343, "grad_norm": 0.4711976945400238, "learning_rate": 0.00014474708171206225, "loss": 0.7207, "step": 219 }, { "epoch": 0.2834138486312399, "grad_norm": 0.5149180293083191, "learning_rate": 0.00014448767833981843, "loss": 0.8199, "step": 220 }, { "epoch": 0.28470209339774555, "grad_norm": 0.452908992767334, "learning_rate": 0.00014422827496757458, "loss": 0.6987, "step": 221 }, { "epoch": 0.28599033816425123, "grad_norm": 0.5486910343170166, "learning_rate": 0.00014396887159533075, "loss": 0.7726, "step": 222 }, { "epoch": 0.28727858293075687, "grad_norm": 0.5290431380271912, "learning_rate": 0.0001437094682230869, "loss": 0.8298, "step": 223 }, { "epoch": 0.2885668276972625, "grad_norm": 0.49307680130004883, "learning_rate": 0.00014345006485084306, "loss": 0.7525, "step": 224 }, { "epoch": 0.2898550724637681, "grad_norm": 0.5979593396186829, "learning_rate": 0.00014319066147859923, "loss": 0.8451, "step": 225 }, { "epoch": 0.29114331723027376, "grad_norm": 0.49994269013404846, "learning_rate": 0.00014293125810635538, "loss": 0.6975, "step": 226 }, { "epoch": 0.2924315619967794, "grad_norm": 0.5523327589035034, "learning_rate": 0.00014267185473411156, "loss": 0.7264, "step": 227 }, { "epoch": 0.293719806763285, "grad_norm": 0.5106574296951294, "learning_rate": 0.0001424124513618677, "loss": 0.7794, "step": 228 }, { "epoch": 0.29500805152979065, "grad_norm": 0.458646297454834, "learning_rate": 0.0001421530479896239, "loss": 0.8118, "step": 229 }, { "epoch": 0.2962962962962963, "grad_norm": 0.5162986516952515, "learning_rate": 0.00014189364461738004, "loss": 0.8167, "step": 230 }, { "epoch": 0.2975845410628019, "grad_norm": 0.47405433654785156, "learning_rate": 0.0001416342412451362, "loss": 0.7852, "step": 231 }, { "epoch": 0.29887278582930754, "grad_norm": 0.5881102681159973, "learning_rate": 0.00014137483787289237, "loss": 0.9897, "step": 232 }, { "epoch": 0.3001610305958132, "grad_norm": 0.4673059582710266, "learning_rate": 0.00014111543450064852, "loss": 0.7341, "step": 233 }, { "epoch": 0.30144927536231886, "grad_norm": 0.48171284794807434, "learning_rate": 0.0001408560311284047, "loss": 0.7156, "step": 234 }, { "epoch": 0.3027375201288245, "grad_norm": 0.43746286630630493, "learning_rate": 0.00014059662775616082, "loss": 0.6003, "step": 235 }, { "epoch": 0.3040257648953301, "grad_norm": 0.46966665983200073, "learning_rate": 0.000140337224383917, "loss": 0.718, "step": 236 }, { "epoch": 0.30531400966183575, "grad_norm": 0.4956988990306854, "learning_rate": 0.00014007782101167315, "loss": 0.6542, "step": 237 }, { "epoch": 0.3066022544283414, "grad_norm": 0.5336653590202332, "learning_rate": 0.0001398184176394293, "loss": 0.7719, "step": 238 }, { "epoch": 0.307890499194847, "grad_norm": 0.510515034198761, "learning_rate": 0.00013955901426718548, "loss": 0.8369, "step": 239 }, { "epoch": 0.30917874396135264, "grad_norm": 0.4901074469089508, "learning_rate": 0.00013929961089494163, "loss": 0.7973, "step": 240 }, { "epoch": 0.3104669887278583, "grad_norm": 0.5074118375778198, "learning_rate": 0.0001390402075226978, "loss": 0.8418, "step": 241 }, { "epoch": 0.3117552334943639, "grad_norm": 0.48613104224205017, "learning_rate": 0.00013878080415045396, "loss": 0.7661, "step": 242 }, { "epoch": 0.3130434782608696, "grad_norm": 0.527791440486908, "learning_rate": 0.0001385214007782101, "loss": 0.6461, "step": 243 }, { "epoch": 0.3143317230273752, "grad_norm": 0.539172887802124, "learning_rate": 0.0001382619974059663, "loss": 0.7588, "step": 244 }, { "epoch": 0.31561996779388085, "grad_norm": 0.4465171694755554, "learning_rate": 0.00013800259403372244, "loss": 0.6897, "step": 245 }, { "epoch": 0.3169082125603865, "grad_norm": 0.44620922207832336, "learning_rate": 0.00013774319066147862, "loss": 0.591, "step": 246 }, { "epoch": 0.3181964573268921, "grad_norm": 0.44383737444877625, "learning_rate": 0.00013748378728923477, "loss": 0.822, "step": 247 }, { "epoch": 0.31948470209339774, "grad_norm": 0.5062816739082336, "learning_rate": 0.00013722438391699092, "loss": 0.7657, "step": 248 }, { "epoch": 0.3207729468599034, "grad_norm": 0.4794199764728546, "learning_rate": 0.0001369649805447471, "loss": 0.6533, "step": 249 }, { "epoch": 0.322061191626409, "grad_norm": 0.506678581237793, "learning_rate": 0.00013670557717250325, "loss": 0.6881, "step": 250 }, { "epoch": 0.32334943639291464, "grad_norm": 0.5363421440124512, "learning_rate": 0.00013644617380025942, "loss": 0.7263, "step": 251 }, { "epoch": 0.32463768115942027, "grad_norm": 0.4600725769996643, "learning_rate": 0.00013618677042801557, "loss": 0.6522, "step": 252 }, { "epoch": 0.32592592592592595, "grad_norm": 0.4250006377696991, "learning_rate": 0.00013592736705577172, "loss": 0.5492, "step": 253 }, { "epoch": 0.3272141706924316, "grad_norm": 0.5984755158424377, "learning_rate": 0.0001356679636835279, "loss": 0.7152, "step": 254 }, { "epoch": 0.3285024154589372, "grad_norm": 0.4653768241405487, "learning_rate": 0.00013540856031128405, "loss": 0.6651, "step": 255 }, { "epoch": 0.32979066022544284, "grad_norm": 0.5344521403312683, "learning_rate": 0.00013514915693904023, "loss": 0.736, "step": 256 }, { "epoch": 0.3310789049919485, "grad_norm": 0.469061017036438, "learning_rate": 0.00013488975356679638, "loss": 0.5771, "step": 257 }, { "epoch": 0.3323671497584541, "grad_norm": 0.46232855319976807, "learning_rate": 0.00013463035019455256, "loss": 0.6887, "step": 258 }, { "epoch": 0.33365539452495974, "grad_norm": 0.4812975525856018, "learning_rate": 0.00013437094682230868, "loss": 0.8316, "step": 259 }, { "epoch": 0.33494363929146537, "grad_norm": 0.5068632960319519, "learning_rate": 0.00013411154345006486, "loss": 0.7372, "step": 260 }, { "epoch": 0.336231884057971, "grad_norm": 0.42497095465660095, "learning_rate": 0.000133852140077821, "loss": 0.7469, "step": 261 }, { "epoch": 0.33752012882447663, "grad_norm": 0.49439537525177, "learning_rate": 0.00013359273670557716, "loss": 0.6429, "step": 262 }, { "epoch": 0.33880837359098226, "grad_norm": 0.4804583787918091, "learning_rate": 0.00013333333333333334, "loss": 0.7772, "step": 263 }, { "epoch": 0.34009661835748795, "grad_norm": 0.46911564469337463, "learning_rate": 0.0001330739299610895, "loss": 0.5994, "step": 264 }, { "epoch": 0.3413848631239936, "grad_norm": 0.5286073088645935, "learning_rate": 0.00013281452658884567, "loss": 0.6459, "step": 265 }, { "epoch": 0.3426731078904992, "grad_norm": 0.48704788088798523, "learning_rate": 0.00013255512321660182, "loss": 0.6466, "step": 266 }, { "epoch": 0.34396135265700484, "grad_norm": 0.5040203332901001, "learning_rate": 0.00013229571984435797, "loss": 0.7436, "step": 267 }, { "epoch": 0.34524959742351047, "grad_norm": 0.48882773518562317, "learning_rate": 0.00013203631647211415, "loss": 0.7009, "step": 268 }, { "epoch": 0.3465378421900161, "grad_norm": 0.5158678889274597, "learning_rate": 0.0001317769130998703, "loss": 0.6862, "step": 269 }, { "epoch": 0.34782608695652173, "grad_norm": 0.489501416683197, "learning_rate": 0.00013151750972762648, "loss": 0.6378, "step": 270 }, { "epoch": 0.34911433172302736, "grad_norm": 0.42305371165275574, "learning_rate": 0.00013125810635538263, "loss": 0.593, "step": 271 }, { "epoch": 0.350402576489533, "grad_norm": 0.5226255059242249, "learning_rate": 0.00013099870298313878, "loss": 0.7828, "step": 272 }, { "epoch": 0.3516908212560386, "grad_norm": 0.4217074513435364, "learning_rate": 0.00013073929961089496, "loss": 0.6397, "step": 273 }, { "epoch": 0.3529790660225443, "grad_norm": 0.46896272897720337, "learning_rate": 0.0001304798962386511, "loss": 0.614, "step": 274 }, { "epoch": 0.35426731078904994, "grad_norm": 0.47062304615974426, "learning_rate": 0.00013022049286640728, "loss": 0.6892, "step": 275 }, { "epoch": 0.35555555555555557, "grad_norm": 0.4669751822948456, "learning_rate": 0.00012996108949416343, "loss": 0.8675, "step": 276 }, { "epoch": 0.3568438003220612, "grad_norm": 0.4246136546134949, "learning_rate": 0.00012970168612191958, "loss": 0.6467, "step": 277 }, { "epoch": 0.35813204508856683, "grad_norm": 0.42293113470077515, "learning_rate": 0.00012944228274967576, "loss": 0.6006, "step": 278 }, { "epoch": 0.35942028985507246, "grad_norm": 0.44599637389183044, "learning_rate": 0.0001291828793774319, "loss": 0.6241, "step": 279 }, { "epoch": 0.3607085346215781, "grad_norm": 0.4490668773651123, "learning_rate": 0.0001289234760051881, "loss": 0.5644, "step": 280 }, { "epoch": 0.3619967793880837, "grad_norm": 0.5100782513618469, "learning_rate": 0.00012866407263294424, "loss": 0.7378, "step": 281 }, { "epoch": 0.36328502415458935, "grad_norm": 0.4394833445549011, "learning_rate": 0.0001284046692607004, "loss": 0.6662, "step": 282 }, { "epoch": 0.364573268921095, "grad_norm": 0.49244457483291626, "learning_rate": 0.00012814526588845657, "loss": 0.7512, "step": 283 }, { "epoch": 0.36586151368760067, "grad_norm": 0.4558521807193756, "learning_rate": 0.0001278858625162127, "loss": 0.7213, "step": 284 }, { "epoch": 0.3671497584541063, "grad_norm": 0.6079721450805664, "learning_rate": 0.00012762645914396887, "loss": 0.7615, "step": 285 }, { "epoch": 0.36843800322061193, "grad_norm": 0.5249935984611511, "learning_rate": 0.00012736705577172502, "loss": 0.8172, "step": 286 }, { "epoch": 0.36972624798711756, "grad_norm": 0.5798977613449097, "learning_rate": 0.0001271076523994812, "loss": 0.8244, "step": 287 }, { "epoch": 0.3710144927536232, "grad_norm": 0.496056467294693, "learning_rate": 0.00012684824902723735, "loss": 0.8799, "step": 288 }, { "epoch": 0.3723027375201288, "grad_norm": 0.47068995237350464, "learning_rate": 0.00012658884565499353, "loss": 0.8069, "step": 289 }, { "epoch": 0.37359098228663445, "grad_norm": 0.5302271842956543, "learning_rate": 0.00012632944228274968, "loss": 0.7593, "step": 290 }, { "epoch": 0.3748792270531401, "grad_norm": 0.5044103860855103, "learning_rate": 0.00012607003891050583, "loss": 0.7462, "step": 291 }, { "epoch": 0.3761674718196457, "grad_norm": 0.4707060158252716, "learning_rate": 0.000125810635538262, "loss": 0.6593, "step": 292 }, { "epoch": 0.37745571658615135, "grad_norm": 0.5337527394294739, "learning_rate": 0.00012555123216601816, "loss": 0.6138, "step": 293 }, { "epoch": 0.37874396135265703, "grad_norm": 0.5467652082443237, "learning_rate": 0.00012529182879377434, "loss": 0.8375, "step": 294 }, { "epoch": 0.38003220611916266, "grad_norm": 0.48266416788101196, "learning_rate": 0.0001250324254215305, "loss": 0.6897, "step": 295 }, { "epoch": 0.3813204508856683, "grad_norm": 0.49726054072380066, "learning_rate": 0.00012477302204928664, "loss": 0.8202, "step": 296 }, { "epoch": 0.3826086956521739, "grad_norm": 0.5109860301017761, "learning_rate": 0.00012451361867704282, "loss": 0.7937, "step": 297 }, { "epoch": 0.38389694041867956, "grad_norm": 0.44613054394721985, "learning_rate": 0.00012425421530479897, "loss": 0.7205, "step": 298 }, { "epoch": 0.3851851851851852, "grad_norm": 0.5678048729896545, "learning_rate": 0.00012399481193255514, "loss": 0.8164, "step": 299 }, { "epoch": 0.3864734299516908, "grad_norm": 0.4355293810367584, "learning_rate": 0.0001237354085603113, "loss": 0.5967, "step": 300 }, { "epoch": 0.38776167471819645, "grad_norm": 0.5225346088409424, "learning_rate": 0.00012347600518806745, "loss": 0.7688, "step": 301 }, { "epoch": 0.3890499194847021, "grad_norm": 0.47630950808525085, "learning_rate": 0.00012321660181582362, "loss": 0.6535, "step": 302 }, { "epoch": 0.3903381642512077, "grad_norm": 0.48992452025413513, "learning_rate": 0.00012295719844357977, "loss": 0.652, "step": 303 }, { "epoch": 0.39162640901771334, "grad_norm": 0.4927466809749603, "learning_rate": 0.00012269779507133595, "loss": 0.6116, "step": 304 }, { "epoch": 0.392914653784219, "grad_norm": 0.4766499400138855, "learning_rate": 0.0001224383916990921, "loss": 0.6457, "step": 305 }, { "epoch": 0.39420289855072466, "grad_norm": 0.49338245391845703, "learning_rate": 0.00012217898832684825, "loss": 0.6211, "step": 306 }, { "epoch": 0.3954911433172303, "grad_norm": 0.5238732099533081, "learning_rate": 0.00012191958495460443, "loss": 0.7313, "step": 307 }, { "epoch": 0.3967793880837359, "grad_norm": 0.494093656539917, "learning_rate": 0.00012166018158236057, "loss": 0.7583, "step": 308 }, { "epoch": 0.39806763285024155, "grad_norm": 0.46139660477638245, "learning_rate": 0.00012140077821011673, "loss": 0.6841, "step": 309 }, { "epoch": 0.3993558776167472, "grad_norm": 0.4901793897151947, "learning_rate": 0.00012114137483787288, "loss": 0.6862, "step": 310 }, { "epoch": 0.4006441223832528, "grad_norm": 0.4695977568626404, "learning_rate": 0.00012088197146562905, "loss": 0.6428, "step": 311 }, { "epoch": 0.40193236714975844, "grad_norm": 0.4964921772480011, "learning_rate": 0.00012062256809338521, "loss": 0.6061, "step": 312 }, { "epoch": 0.40322061191626407, "grad_norm": 0.5101466178894043, "learning_rate": 0.00012036316472114138, "loss": 0.8195, "step": 313 }, { "epoch": 0.4045088566827697, "grad_norm": 0.470225989818573, "learning_rate": 0.00012010376134889754, "loss": 0.681, "step": 314 }, { "epoch": 0.4057971014492754, "grad_norm": 0.4532884955406189, "learning_rate": 0.0001198443579766537, "loss": 0.7239, "step": 315 }, { "epoch": 0.407085346215781, "grad_norm": 0.4604836106300354, "learning_rate": 0.00011958495460440985, "loss": 0.7433, "step": 316 }, { "epoch": 0.40837359098228665, "grad_norm": 0.4511779546737671, "learning_rate": 0.00011932555123216602, "loss": 0.7404, "step": 317 }, { "epoch": 0.4096618357487923, "grad_norm": 0.5277577042579651, "learning_rate": 0.00011906614785992218, "loss": 0.8757, "step": 318 }, { "epoch": 0.4109500805152979, "grad_norm": 0.444564551115036, "learning_rate": 0.00011880674448767835, "loss": 0.6465, "step": 319 }, { "epoch": 0.41223832528180354, "grad_norm": 0.4861951470375061, "learning_rate": 0.00011854734111543451, "loss": 0.8336, "step": 320 }, { "epoch": 0.41352657004830917, "grad_norm": 0.4412696957588196, "learning_rate": 0.00011828793774319066, "loss": 0.7586, "step": 321 }, { "epoch": 0.4148148148148148, "grad_norm": 0.5230206251144409, "learning_rate": 0.00011802853437094683, "loss": 0.7423, "step": 322 }, { "epoch": 0.41610305958132043, "grad_norm": 0.4539431631565094, "learning_rate": 0.00011776913099870299, "loss": 0.6849, "step": 323 }, { "epoch": 0.41739130434782606, "grad_norm": 0.5001434683799744, "learning_rate": 0.00011750972762645916, "loss": 0.6527, "step": 324 }, { "epoch": 0.41867954911433175, "grad_norm": 0.5230083465576172, "learning_rate": 0.00011725032425421532, "loss": 0.6829, "step": 325 }, { "epoch": 0.4199677938808374, "grad_norm": 0.5428875684738159, "learning_rate": 0.00011699092088197148, "loss": 0.6075, "step": 326 }, { "epoch": 0.421256038647343, "grad_norm": 0.49785757064819336, "learning_rate": 0.00011673151750972763, "loss": 0.6696, "step": 327 }, { "epoch": 0.42254428341384864, "grad_norm": 0.5448641180992126, "learning_rate": 0.0001164721141374838, "loss": 0.7752, "step": 328 }, { "epoch": 0.4238325281803543, "grad_norm": 0.6280490159988403, "learning_rate": 0.00011621271076523996, "loss": 0.8681, "step": 329 }, { "epoch": 0.4251207729468599, "grad_norm": 0.5525287389755249, "learning_rate": 0.00011595330739299613, "loss": 0.8434, "step": 330 }, { "epoch": 0.42640901771336553, "grad_norm": 0.4954991042613983, "learning_rate": 0.00011569390402075229, "loss": 0.7923, "step": 331 }, { "epoch": 0.42769726247987117, "grad_norm": 0.46500164270401, "learning_rate": 0.00011543450064850843, "loss": 0.7084, "step": 332 }, { "epoch": 0.4289855072463768, "grad_norm": 0.5183458924293518, "learning_rate": 0.00011517509727626459, "loss": 0.754, "step": 333 }, { "epoch": 0.4302737520128824, "grad_norm": 0.521300733089447, "learning_rate": 0.00011491569390402074, "loss": 0.7481, "step": 334 }, { "epoch": 0.43156199677938806, "grad_norm": 0.46088019013404846, "learning_rate": 0.00011465629053177691, "loss": 0.5601, "step": 335 }, { "epoch": 0.43285024154589374, "grad_norm": 0.5142108798027039, "learning_rate": 0.00011439688715953307, "loss": 0.8001, "step": 336 }, { "epoch": 0.4341384863123994, "grad_norm": 0.41947636008262634, "learning_rate": 0.00011413748378728924, "loss": 0.6669, "step": 337 }, { "epoch": 0.435426731078905, "grad_norm": 0.4584703743457794, "learning_rate": 0.0001138780804150454, "loss": 0.702, "step": 338 }, { "epoch": 0.43671497584541064, "grad_norm": 0.4480314254760742, "learning_rate": 0.00011361867704280155, "loss": 0.6379, "step": 339 }, { "epoch": 0.43800322061191627, "grad_norm": 0.49402984976768494, "learning_rate": 0.00011335927367055772, "loss": 0.7751, "step": 340 }, { "epoch": 0.4392914653784219, "grad_norm": 0.5001116991043091, "learning_rate": 0.00011309987029831388, "loss": 0.7157, "step": 341 }, { "epoch": 0.4405797101449275, "grad_norm": 0.4650849401950836, "learning_rate": 0.00011284046692607004, "loss": 0.5801, "step": 342 }, { "epoch": 0.44186795491143316, "grad_norm": 0.5000032186508179, "learning_rate": 0.00011258106355382621, "loss": 0.8127, "step": 343 }, { "epoch": 0.4431561996779388, "grad_norm": 0.5941475033760071, "learning_rate": 0.00011232166018158237, "loss": 0.8227, "step": 344 }, { "epoch": 0.4444444444444444, "grad_norm": 0.49535176157951355, "learning_rate": 0.00011206225680933852, "loss": 0.6376, "step": 345 }, { "epoch": 0.4457326892109501, "grad_norm": 0.46945926547050476, "learning_rate": 0.00011180285343709469, "loss": 0.6801, "step": 346 }, { "epoch": 0.44702093397745574, "grad_norm": 0.47991520166397095, "learning_rate": 0.00011154345006485085, "loss": 0.6071, "step": 347 }, { "epoch": 0.44830917874396137, "grad_norm": 0.45372679829597473, "learning_rate": 0.00011128404669260702, "loss": 0.7388, "step": 348 }, { "epoch": 0.449597423510467, "grad_norm": 0.5295307636260986, "learning_rate": 0.00011102464332036318, "loss": 0.776, "step": 349 }, { "epoch": 0.45088566827697263, "grad_norm": 0.516298770904541, "learning_rate": 0.00011076523994811933, "loss": 0.7546, "step": 350 }, { "epoch": 0.45217391304347826, "grad_norm": 0.4629455804824829, "learning_rate": 0.0001105058365758755, "loss": 0.5773, "step": 351 }, { "epoch": 0.4534621578099839, "grad_norm": 0.4974667727947235, "learning_rate": 0.00011024643320363166, "loss": 0.6464, "step": 352 }, { "epoch": 0.4547504025764895, "grad_norm": 0.47429102659225464, "learning_rate": 0.00010998702983138782, "loss": 0.7503, "step": 353 }, { "epoch": 0.45603864734299515, "grad_norm": 0.5169098377227783, "learning_rate": 0.00010972762645914399, "loss": 0.7697, "step": 354 }, { "epoch": 0.4573268921095008, "grad_norm": 0.6083032488822937, "learning_rate": 0.00010946822308690015, "loss": 0.7145, "step": 355 }, { "epoch": 0.45861513687600647, "grad_norm": 0.6092599034309387, "learning_rate": 0.00010920881971465629, "loss": 0.9839, "step": 356 }, { "epoch": 0.4599033816425121, "grad_norm": 0.47699296474456787, "learning_rate": 0.00010894941634241245, "loss": 0.7013, "step": 357 }, { "epoch": 0.46119162640901773, "grad_norm": 0.44026511907577515, "learning_rate": 0.0001086900129701686, "loss": 0.7314, "step": 358 }, { "epoch": 0.46247987117552336, "grad_norm": 0.5326471328735352, "learning_rate": 0.00010843060959792477, "loss": 0.8708, "step": 359 }, { "epoch": 0.463768115942029, "grad_norm": 0.5188657641410828, "learning_rate": 0.00010817120622568093, "loss": 0.6573, "step": 360 }, { "epoch": 0.4650563607085346, "grad_norm": 0.5846801400184631, "learning_rate": 0.0001079118028534371, "loss": 0.7319, "step": 361 }, { "epoch": 0.46634460547504025, "grad_norm": 0.5272177457809448, "learning_rate": 0.00010765239948119326, "loss": 0.6251, "step": 362 }, { "epoch": 0.4676328502415459, "grad_norm": 0.5060721635818481, "learning_rate": 0.00010739299610894941, "loss": 0.6675, "step": 363 }, { "epoch": 0.4689210950080515, "grad_norm": 0.5200803279876709, "learning_rate": 0.00010713359273670558, "loss": 0.6373, "step": 364 }, { "epoch": 0.47020933977455714, "grad_norm": 0.5527567863464355, "learning_rate": 0.00010687418936446174, "loss": 0.9144, "step": 365 }, { "epoch": 0.4714975845410628, "grad_norm": 0.5247730016708374, "learning_rate": 0.0001066147859922179, "loss": 0.7283, "step": 366 }, { "epoch": 0.47278582930756846, "grad_norm": 0.482681006193161, "learning_rate": 0.00010635538261997407, "loss": 0.8382, "step": 367 }, { "epoch": 0.4740740740740741, "grad_norm": 0.5045844316482544, "learning_rate": 0.00010609597924773022, "loss": 0.8324, "step": 368 }, { "epoch": 0.4753623188405797, "grad_norm": 0.500696063041687, "learning_rate": 0.00010583657587548638, "loss": 0.6749, "step": 369 }, { "epoch": 0.47665056360708535, "grad_norm": 0.49296805262565613, "learning_rate": 0.00010557717250324255, "loss": 0.8396, "step": 370 }, { "epoch": 0.477938808373591, "grad_norm": 0.5083613395690918, "learning_rate": 0.00010531776913099871, "loss": 0.596, "step": 371 }, { "epoch": 0.4792270531400966, "grad_norm": 0.6000961065292358, "learning_rate": 0.00010505836575875488, "loss": 0.7097, "step": 372 }, { "epoch": 0.48051529790660225, "grad_norm": 0.47504574060440063, "learning_rate": 0.00010479896238651104, "loss": 0.676, "step": 373 }, { "epoch": 0.4818035426731079, "grad_norm": 0.4866791069507599, "learning_rate": 0.00010453955901426719, "loss": 0.6026, "step": 374 }, { "epoch": 0.4830917874396135, "grad_norm": 0.5388527512550354, "learning_rate": 0.00010428015564202336, "loss": 0.78, "step": 375 }, { "epoch": 0.48438003220611914, "grad_norm": 0.5430642366409302, "learning_rate": 0.00010402075226977952, "loss": 0.7898, "step": 376 }, { "epoch": 0.4856682769726248, "grad_norm": 0.5378901362419128, "learning_rate": 0.00010376134889753568, "loss": 0.8215, "step": 377 }, { "epoch": 0.48695652173913045, "grad_norm": 0.46278834342956543, "learning_rate": 0.00010350194552529185, "loss": 0.786, "step": 378 }, { "epoch": 0.4882447665056361, "grad_norm": 0.5695458650588989, "learning_rate": 0.000103242542153048, "loss": 0.7929, "step": 379 }, { "epoch": 0.4895330112721417, "grad_norm": 0.5052254796028137, "learning_rate": 0.00010298313878080415, "loss": 0.8047, "step": 380 }, { "epoch": 0.49082125603864735, "grad_norm": 0.45410144329071045, "learning_rate": 0.0001027237354085603, "loss": 0.6309, "step": 381 }, { "epoch": 0.492109500805153, "grad_norm": 0.5507941842079163, "learning_rate": 0.00010246433203631646, "loss": 0.7374, "step": 382 }, { "epoch": 0.4933977455716586, "grad_norm": 0.4703005850315094, "learning_rate": 0.00010220492866407263, "loss": 0.5724, "step": 383 }, { "epoch": 0.49468599033816424, "grad_norm": 0.5034976601600647, "learning_rate": 0.0001019455252918288, "loss": 0.6829, "step": 384 }, { "epoch": 0.49597423510466987, "grad_norm": 0.5183707475662231, "learning_rate": 0.00010168612191958496, "loss": 0.6716, "step": 385 }, { "epoch": 0.4972624798711755, "grad_norm": 0.5549296736717224, "learning_rate": 0.00010142671854734112, "loss": 0.7464, "step": 386 }, { "epoch": 0.4985507246376812, "grad_norm": 0.48852047324180603, "learning_rate": 0.00010116731517509727, "loss": 0.649, "step": 387 }, { "epoch": 0.4998389694041868, "grad_norm": 0.5118862986564636, "learning_rate": 0.00010090791180285344, "loss": 0.6043, "step": 388 }, { "epoch": 0.5011272141706924, "grad_norm": 0.5366110801696777, "learning_rate": 0.0001006485084306096, "loss": 0.7139, "step": 389 }, { "epoch": 0.5024154589371981, "grad_norm": 0.5275729894638062, "learning_rate": 0.00010038910505836577, "loss": 0.7035, "step": 390 }, { "epoch": 0.5037037037037037, "grad_norm": 0.5201203227043152, "learning_rate": 0.00010012970168612193, "loss": 0.716, "step": 391 }, { "epoch": 0.5049919484702093, "grad_norm": 0.5168887376785278, "learning_rate": 9.987029831387808e-05, "loss": 0.8432, "step": 392 }, { "epoch": 0.506280193236715, "grad_norm": 0.5083385109901428, "learning_rate": 9.961089494163424e-05, "loss": 0.7078, "step": 393 }, { "epoch": 0.5075684380032206, "grad_norm": 0.5033498406410217, "learning_rate": 9.935149156939041e-05, "loss": 0.6846, "step": 394 }, { "epoch": 0.5088566827697263, "grad_norm": 0.5229712128639221, "learning_rate": 9.909208819714657e-05, "loss": 0.7517, "step": 395 }, { "epoch": 0.5101449275362319, "grad_norm": 0.4493921399116516, "learning_rate": 9.883268482490274e-05, "loss": 0.5185, "step": 396 }, { "epoch": 0.5114331723027375, "grad_norm": 0.4618862569332123, "learning_rate": 9.857328145265889e-05, "loss": 0.6914, "step": 397 }, { "epoch": 0.5127214170692431, "grad_norm": 0.5105440020561218, "learning_rate": 9.831387808041504e-05, "loss": 0.7408, "step": 398 }, { "epoch": 0.5140096618357488, "grad_norm": 0.4876827001571655, "learning_rate": 9.80544747081712e-05, "loss": 0.7642, "step": 399 }, { "epoch": 0.5152979066022544, "grad_norm": 0.5248561501502991, "learning_rate": 9.779507133592737e-05, "loss": 0.6578, "step": 400 }, { "epoch": 0.5165861513687601, "grad_norm": 0.4495491087436676, "learning_rate": 9.753566796368353e-05, "loss": 0.6296, "step": 401 }, { "epoch": 0.5178743961352656, "grad_norm": 0.4628872573375702, "learning_rate": 9.72762645914397e-05, "loss": 0.5686, "step": 402 }, { "epoch": 0.5191626409017713, "grad_norm": 0.5524469017982483, "learning_rate": 9.701686121919586e-05, "loss": 0.8243, "step": 403 }, { "epoch": 0.520450885668277, "grad_norm": 0.5526472926139832, "learning_rate": 9.675745784695201e-05, "loss": 0.7644, "step": 404 }, { "epoch": 0.5217391304347826, "grad_norm": 0.5220494270324707, "learning_rate": 9.649805447470817e-05, "loss": 0.7113, "step": 405 }, { "epoch": 0.5230273752012883, "grad_norm": 0.4727495610713959, "learning_rate": 9.623865110246434e-05, "loss": 0.5613, "step": 406 }, { "epoch": 0.5243156199677939, "grad_norm": 0.440445214509964, "learning_rate": 9.59792477302205e-05, "loss": 0.5719, "step": 407 }, { "epoch": 0.5256038647342995, "grad_norm": 0.520539402961731, "learning_rate": 9.571984435797667e-05, "loss": 0.6716, "step": 408 }, { "epoch": 0.5268921095008051, "grad_norm": 0.5473395586013794, "learning_rate": 9.546044098573282e-05, "loss": 0.6881, "step": 409 }, { "epoch": 0.5281803542673108, "grad_norm": 0.5728646516799927, "learning_rate": 9.520103761348897e-05, "loss": 0.694, "step": 410 }, { "epoch": 0.5294685990338164, "grad_norm": 0.5672905445098877, "learning_rate": 9.494163424124513e-05, "loss": 0.7893, "step": 411 }, { "epoch": 0.5307568438003221, "grad_norm": 0.5057477355003357, "learning_rate": 9.46822308690013e-05, "loss": 0.7957, "step": 412 }, { "epoch": 0.5320450885668278, "grad_norm": 0.5638203620910645, "learning_rate": 9.442282749675746e-05, "loss": 0.8208, "step": 413 }, { "epoch": 0.5333333333333333, "grad_norm": 0.4758095145225525, "learning_rate": 9.416342412451363e-05, "loss": 0.6516, "step": 414 }, { "epoch": 0.534621578099839, "grad_norm": 0.5819146037101746, "learning_rate": 9.390402075226979e-05, "loss": 0.7839, "step": 415 }, { "epoch": 0.5359098228663446, "grad_norm": 0.5698294639587402, "learning_rate": 9.364461738002594e-05, "loss": 0.6847, "step": 416 }, { "epoch": 0.5371980676328503, "grad_norm": 0.5539317727088928, "learning_rate": 9.33852140077821e-05, "loss": 0.6868, "step": 417 }, { "epoch": 0.5384863123993558, "grad_norm": 0.5531253218650818, "learning_rate": 9.312581063553827e-05, "loss": 0.6416, "step": 418 }, { "epoch": 0.5397745571658615, "grad_norm": 0.5280610918998718, "learning_rate": 9.286640726329443e-05, "loss": 0.6711, "step": 419 }, { "epoch": 0.5410628019323671, "grad_norm": 0.5485169291496277, "learning_rate": 9.26070038910506e-05, "loss": 0.67, "step": 420 }, { "epoch": 0.5423510466988728, "grad_norm": 0.632940948009491, "learning_rate": 9.234760051880675e-05, "loss": 1.0122, "step": 421 }, { "epoch": 0.5436392914653784, "grad_norm": 0.5226237177848816, "learning_rate": 9.20881971465629e-05, "loss": 0.6517, "step": 422 }, { "epoch": 0.5449275362318841, "grad_norm": 0.5100864768028259, "learning_rate": 9.182879377431906e-05, "loss": 0.5886, "step": 423 }, { "epoch": 0.5462157809983897, "grad_norm": 0.5091288685798645, "learning_rate": 9.156939040207523e-05, "loss": 0.759, "step": 424 }, { "epoch": 0.5475040257648953, "grad_norm": 0.5094250440597534, "learning_rate": 9.130998702983139e-05, "loss": 0.6407, "step": 425 }, { "epoch": 0.548792270531401, "grad_norm": 0.4518897533416748, "learning_rate": 9.105058365758756e-05, "loss": 0.5463, "step": 426 }, { "epoch": 0.5500805152979066, "grad_norm": 0.5876538753509521, "learning_rate": 9.07911802853437e-05, "loss": 0.8909, "step": 427 }, { "epoch": 0.5513687600644123, "grad_norm": 0.5553408265113831, "learning_rate": 9.053177691309987e-05, "loss": 0.9111, "step": 428 }, { "epoch": 0.5526570048309178, "grad_norm": 0.6221159100532532, "learning_rate": 9.027237354085604e-05, "loss": 0.8558, "step": 429 }, { "epoch": 0.5539452495974235, "grad_norm": 0.5404058694839478, "learning_rate": 9.00129701686122e-05, "loss": 0.7609, "step": 430 }, { "epoch": 0.5552334943639291, "grad_norm": 0.43805137276649475, "learning_rate": 8.975356679636836e-05, "loss": 0.5286, "step": 431 }, { "epoch": 0.5565217391304348, "grad_norm": 0.493563175201416, "learning_rate": 8.949416342412453e-05, "loss": 0.5727, "step": 432 }, { "epoch": 0.5578099838969404, "grad_norm": 0.5368430614471436, "learning_rate": 8.923476005188068e-05, "loss": 0.7623, "step": 433 }, { "epoch": 0.559098228663446, "grad_norm": 0.4323422312736511, "learning_rate": 8.897535667963683e-05, "loss": 0.5958, "step": 434 }, { "epoch": 0.5603864734299517, "grad_norm": 0.49179717898368835, "learning_rate": 8.8715953307393e-05, "loss": 0.681, "step": 435 }, { "epoch": 0.5616747181964573, "grad_norm": 0.40715619921684265, "learning_rate": 8.845654993514916e-05, "loss": 0.6298, "step": 436 }, { "epoch": 0.562962962962963, "grad_norm": 0.6095149517059326, "learning_rate": 8.819714656290532e-05, "loss": 1.0039, "step": 437 }, { "epoch": 0.5642512077294686, "grad_norm": 0.5469616055488586, "learning_rate": 8.793774319066149e-05, "loss": 0.7941, "step": 438 }, { "epoch": 0.5655394524959743, "grad_norm": 0.5149989128112793, "learning_rate": 8.767833981841764e-05, "loss": 0.7819, "step": 439 }, { "epoch": 0.5668276972624798, "grad_norm": 0.479438453912735, "learning_rate": 8.74189364461738e-05, "loss": 0.4806, "step": 440 }, { "epoch": 0.5681159420289855, "grad_norm": 0.562567412853241, "learning_rate": 8.715953307392997e-05, "loss": 0.7942, "step": 441 }, { "epoch": 0.5694041867954911, "grad_norm": 0.5192587375640869, "learning_rate": 8.690012970168613e-05, "loss": 0.6479, "step": 442 }, { "epoch": 0.5706924315619968, "grad_norm": 0.4897756576538086, "learning_rate": 8.66407263294423e-05, "loss": 0.6539, "step": 443 }, { "epoch": 0.5719806763285025, "grad_norm": 0.45649632811546326, "learning_rate": 8.638132295719846e-05, "loss": 0.657, "step": 444 }, { "epoch": 0.573268921095008, "grad_norm": 0.5581417679786682, "learning_rate": 8.612191958495461e-05, "loss": 0.7415, "step": 445 }, { "epoch": 0.5745571658615137, "grad_norm": 0.4822051525115967, "learning_rate": 8.586251621271076e-05, "loss": 0.7249, "step": 446 }, { "epoch": 0.5758454106280193, "grad_norm": 0.6398015022277832, "learning_rate": 8.560311284046692e-05, "loss": 0.7328, "step": 447 }, { "epoch": 0.577133655394525, "grad_norm": 0.5618659257888794, "learning_rate": 8.534370946822309e-05, "loss": 0.8104, "step": 448 }, { "epoch": 0.5784219001610306, "grad_norm": 0.49202972650527954, "learning_rate": 8.508430609597925e-05, "loss": 0.6797, "step": 449 }, { "epoch": 0.5797101449275363, "grad_norm": 0.5291930437088013, "learning_rate": 8.482490272373542e-05, "loss": 0.6015, "step": 450 }, { "epoch": 0.5809983896940418, "grad_norm": 0.5322192907333374, "learning_rate": 8.456549935149157e-05, "loss": 0.7246, "step": 451 }, { "epoch": 0.5822866344605475, "grad_norm": 0.5172200798988342, "learning_rate": 8.430609597924773e-05, "loss": 0.6873, "step": 452 }, { "epoch": 0.5835748792270531, "grad_norm": 0.5367067456245422, "learning_rate": 8.40466926070039e-05, "loss": 0.7349, "step": 453 }, { "epoch": 0.5848631239935588, "grad_norm": 0.5243058204650879, "learning_rate": 8.378728923476006e-05, "loss": 0.6441, "step": 454 }, { "epoch": 0.5861513687600645, "grad_norm": 0.5509822964668274, "learning_rate": 8.352788586251622e-05, "loss": 0.7456, "step": 455 }, { "epoch": 0.58743961352657, "grad_norm": 0.5376744866371155, "learning_rate": 8.326848249027238e-05, "loss": 0.6808, "step": 456 }, { "epoch": 0.5887278582930757, "grad_norm": 0.5412257313728333, "learning_rate": 8.300907911802854e-05, "loss": 0.6135, "step": 457 }, { "epoch": 0.5900161030595813, "grad_norm": 0.5956122279167175, "learning_rate": 8.274967574578469e-05, "loss": 0.7419, "step": 458 }, { "epoch": 0.591304347826087, "grad_norm": 0.5524086952209473, "learning_rate": 8.249027237354085e-05, "loss": 0.5655, "step": 459 }, { "epoch": 0.5925925925925926, "grad_norm": 0.5783061981201172, "learning_rate": 8.223086900129702e-05, "loss": 0.6528, "step": 460 }, { "epoch": 0.5938808373590982, "grad_norm": 0.5542893409729004, "learning_rate": 8.197146562905318e-05, "loss": 0.6988, "step": 461 }, { "epoch": 0.5951690821256038, "grad_norm": 0.5710337162017822, "learning_rate": 8.171206225680935e-05, "loss": 0.7465, "step": 462 }, { "epoch": 0.5964573268921095, "grad_norm": 0.5694112181663513, "learning_rate": 8.14526588845655e-05, "loss": 0.7005, "step": 463 }, { "epoch": 0.5977455716586151, "grad_norm": 0.5017877221107483, "learning_rate": 8.119325551232166e-05, "loss": 0.5978, "step": 464 }, { "epoch": 0.5990338164251208, "grad_norm": 0.5797461271286011, "learning_rate": 8.093385214007783e-05, "loss": 0.8, "step": 465 }, { "epoch": 0.6003220611916265, "grad_norm": 0.597811222076416, "learning_rate": 8.067444876783399e-05, "loss": 0.8356, "step": 466 }, { "epoch": 0.601610305958132, "grad_norm": 0.5971367955207825, "learning_rate": 8.041504539559015e-05, "loss": 1.0245, "step": 467 }, { "epoch": 0.6028985507246377, "grad_norm": 0.5506448745727539, "learning_rate": 8.01556420233463e-05, "loss": 0.6065, "step": 468 }, { "epoch": 0.6041867954911433, "grad_norm": 0.5866613984107971, "learning_rate": 7.989623865110247e-05, "loss": 0.7841, "step": 469 }, { "epoch": 0.605475040257649, "grad_norm": 0.5632089376449585, "learning_rate": 7.963683527885862e-05, "loss": 0.8046, "step": 470 }, { "epoch": 0.6067632850241546, "grad_norm": 0.5145373940467834, "learning_rate": 7.937743190661478e-05, "loss": 0.5978, "step": 471 }, { "epoch": 0.6080515297906602, "grad_norm": 0.48332056403160095, "learning_rate": 7.911802853437095e-05, "loss": 0.6119, "step": 472 }, { "epoch": 0.6093397745571658, "grad_norm": 0.522520899772644, "learning_rate": 7.885862516212711e-05, "loss": 0.6623, "step": 473 }, { "epoch": 0.6106280193236715, "grad_norm": 0.5305100679397583, "learning_rate": 7.859922178988328e-05, "loss": 0.7882, "step": 474 }, { "epoch": 0.6119162640901772, "grad_norm": 0.4909839630126953, "learning_rate": 7.833981841763943e-05, "loss": 0.625, "step": 475 }, { "epoch": 0.6132045088566828, "grad_norm": 0.5770312547683716, "learning_rate": 7.808041504539559e-05, "loss": 0.7479, "step": 476 }, { "epoch": 0.6144927536231884, "grad_norm": 0.556817889213562, "learning_rate": 7.782101167315176e-05, "loss": 0.8317, "step": 477 }, { "epoch": 0.615780998389694, "grad_norm": 0.5197098255157471, "learning_rate": 7.756160830090792e-05, "loss": 0.761, "step": 478 }, { "epoch": 0.6170692431561997, "grad_norm": 0.5032650828361511, "learning_rate": 7.730220492866408e-05, "loss": 0.7149, "step": 479 }, { "epoch": 0.6183574879227053, "grad_norm": 0.5901761651039124, "learning_rate": 7.704280155642024e-05, "loss": 0.723, "step": 480 }, { "epoch": 0.619645732689211, "grad_norm": 0.5224949717521667, "learning_rate": 7.67833981841764e-05, "loss": 0.7275, "step": 481 }, { "epoch": 0.6209339774557165, "grad_norm": 0.47279688715934753, "learning_rate": 7.652399481193255e-05, "loss": 0.5791, "step": 482 }, { "epoch": 0.6222222222222222, "grad_norm": 0.49582868814468384, "learning_rate": 7.626459143968871e-05, "loss": 0.6728, "step": 483 }, { "epoch": 0.6235104669887278, "grad_norm": 0.4722840189933777, "learning_rate": 7.600518806744488e-05, "loss": 0.5838, "step": 484 }, { "epoch": 0.6247987117552335, "grad_norm": 0.5800105333328247, "learning_rate": 7.574578469520104e-05, "loss": 0.595, "step": 485 }, { "epoch": 0.6260869565217392, "grad_norm": 0.5518195033073425, "learning_rate": 7.54863813229572e-05, "loss": 0.7302, "step": 486 }, { "epoch": 0.6273752012882448, "grad_norm": 0.44109973311424255, "learning_rate": 7.522697795071336e-05, "loss": 0.5433, "step": 487 }, { "epoch": 0.6286634460547504, "grad_norm": 0.5839915871620178, "learning_rate": 7.496757457846952e-05, "loss": 0.7484, "step": 488 }, { "epoch": 0.629951690821256, "grad_norm": 0.6299886107444763, "learning_rate": 7.470817120622569e-05, "loss": 0.7771, "step": 489 }, { "epoch": 0.6312399355877617, "grad_norm": 0.48367929458618164, "learning_rate": 7.444876783398185e-05, "loss": 0.5674, "step": 490 }, { "epoch": 0.6325281803542673, "grad_norm": 0.5867652893066406, "learning_rate": 7.418936446173802e-05, "loss": 0.7733, "step": 491 }, { "epoch": 0.633816425120773, "grad_norm": 0.4677927494049072, "learning_rate": 7.392996108949417e-05, "loss": 0.6418, "step": 492 }, { "epoch": 0.6351046698872785, "grad_norm": 0.5139054656028748, "learning_rate": 7.367055771725033e-05, "loss": 0.7922, "step": 493 }, { "epoch": 0.6363929146537842, "grad_norm": 0.4561646282672882, "learning_rate": 7.341115434500648e-05, "loss": 0.5885, "step": 494 }, { "epoch": 0.6376811594202898, "grad_norm": 0.5079929828643799, "learning_rate": 7.315175097276265e-05, "loss": 0.6827, "step": 495 }, { "epoch": 0.6389694041867955, "grad_norm": 0.5590360164642334, "learning_rate": 7.289234760051881e-05, "loss": 0.6681, "step": 496 }, { "epoch": 0.6402576489533012, "grad_norm": 0.585269033908844, "learning_rate": 7.263294422827497e-05, "loss": 0.7604, "step": 497 }, { "epoch": 0.6415458937198067, "grad_norm": 0.5380440950393677, "learning_rate": 7.237354085603112e-05, "loss": 0.7946, "step": 498 }, { "epoch": 0.6428341384863124, "grad_norm": 0.4413246214389801, "learning_rate": 7.211413748378729e-05, "loss": 0.5847, "step": 499 }, { "epoch": 0.644122383252818, "grad_norm": 0.536934494972229, "learning_rate": 7.185473411154345e-05, "loss": 0.745, "step": 500 }, { "epoch": 0.6454106280193237, "grad_norm": 0.46904176473617554, "learning_rate": 7.159533073929962e-05, "loss": 0.6846, "step": 501 }, { "epoch": 0.6466988727858293, "grad_norm": 0.5345873832702637, "learning_rate": 7.133592736705578e-05, "loss": 0.7499, "step": 502 }, { "epoch": 0.647987117552335, "grad_norm": 0.5083842873573303, "learning_rate": 7.107652399481195e-05, "loss": 0.7829, "step": 503 }, { "epoch": 0.6492753623188405, "grad_norm": 0.49629780650138855, "learning_rate": 7.08171206225681e-05, "loss": 0.6308, "step": 504 }, { "epoch": 0.6505636070853462, "grad_norm": 0.5113663077354431, "learning_rate": 7.055771725032426e-05, "loss": 0.7062, "step": 505 }, { "epoch": 0.6518518518518519, "grad_norm": 0.5348049402236938, "learning_rate": 7.029831387808041e-05, "loss": 0.7495, "step": 506 }, { "epoch": 0.6531400966183575, "grad_norm": 0.5834509134292603, "learning_rate": 7.003891050583658e-05, "loss": 0.81, "step": 507 }, { "epoch": 0.6544283413848632, "grad_norm": 0.5517732501029968, "learning_rate": 6.977950713359274e-05, "loss": 0.7376, "step": 508 }, { "epoch": 0.6557165861513687, "grad_norm": 0.5555460453033447, "learning_rate": 6.95201037613489e-05, "loss": 0.6707, "step": 509 }, { "epoch": 0.6570048309178744, "grad_norm": 0.5952188968658447, "learning_rate": 6.926070038910505e-05, "loss": 0.8308, "step": 510 }, { "epoch": 0.65829307568438, "grad_norm": 0.46281638741493225, "learning_rate": 6.900129701686122e-05, "loss": 0.5855, "step": 511 }, { "epoch": 0.6595813204508857, "grad_norm": 0.5051981210708618, "learning_rate": 6.874189364461738e-05, "loss": 0.7197, "step": 512 }, { "epoch": 0.6608695652173913, "grad_norm": 0.5460030436515808, "learning_rate": 6.848249027237355e-05, "loss": 0.6863, "step": 513 }, { "epoch": 0.662157809983897, "grad_norm": 0.504718542098999, "learning_rate": 6.822308690012971e-05, "loss": 0.5749, "step": 514 }, { "epoch": 0.6634460547504025, "grad_norm": 0.5503727793693542, "learning_rate": 6.796368352788586e-05, "loss": 0.6802, "step": 515 }, { "epoch": 0.6647342995169082, "grad_norm": 0.559354305267334, "learning_rate": 6.770428015564203e-05, "loss": 0.6774, "step": 516 }, { "epoch": 0.6660225442834139, "grad_norm": 0.5191950798034668, "learning_rate": 6.744487678339819e-05, "loss": 0.5853, "step": 517 }, { "epoch": 0.6673107890499195, "grad_norm": 0.5837051868438721, "learning_rate": 6.718547341115434e-05, "loss": 0.7629, "step": 518 }, { "epoch": 0.6685990338164252, "grad_norm": 0.49824637174606323, "learning_rate": 6.69260700389105e-05, "loss": 0.6449, "step": 519 }, { "epoch": 0.6698872785829307, "grad_norm": 0.5827267169952393, "learning_rate": 6.666666666666667e-05, "loss": 0.6425, "step": 520 }, { "epoch": 0.6711755233494364, "grad_norm": 0.5547000169754028, "learning_rate": 6.640726329442283e-05, "loss": 0.8105, "step": 521 }, { "epoch": 0.672463768115942, "grad_norm": 0.5251694321632385, "learning_rate": 6.614785992217898e-05, "loss": 0.6392, "step": 522 }, { "epoch": 0.6737520128824477, "grad_norm": 0.577367901802063, "learning_rate": 6.588845654993515e-05, "loss": 0.7776, "step": 523 }, { "epoch": 0.6750402576489533, "grad_norm": 0.5495286583900452, "learning_rate": 6.562905317769131e-05, "loss": 0.6673, "step": 524 }, { "epoch": 0.6763285024154589, "grad_norm": 0.6513116955757141, "learning_rate": 6.536964980544748e-05, "loss": 0.8314, "step": 525 }, { "epoch": 0.6776167471819645, "grad_norm": 0.5346915125846863, "learning_rate": 6.511024643320364e-05, "loss": 0.703, "step": 526 }, { "epoch": 0.6789049919484702, "grad_norm": 0.5663869380950928, "learning_rate": 6.485084306095979e-05, "loss": 0.6595, "step": 527 }, { "epoch": 0.6801932367149759, "grad_norm": 0.5390554070472717, "learning_rate": 6.459143968871596e-05, "loss": 0.7031, "step": 528 }, { "epoch": 0.6814814814814815, "grad_norm": 0.5291828513145447, "learning_rate": 6.433203631647212e-05, "loss": 0.7384, "step": 529 }, { "epoch": 0.6827697262479872, "grad_norm": 0.507726788520813, "learning_rate": 6.407263294422829e-05, "loss": 0.6169, "step": 530 }, { "epoch": 0.6840579710144927, "grad_norm": 0.524138331413269, "learning_rate": 6.381322957198444e-05, "loss": 0.6784, "step": 531 }, { "epoch": 0.6853462157809984, "grad_norm": 0.5644485950469971, "learning_rate": 6.35538261997406e-05, "loss": 0.8255, "step": 532 }, { "epoch": 0.686634460547504, "grad_norm": 0.5468744039535522, "learning_rate": 6.329442282749676e-05, "loss": 0.7893, "step": 533 }, { "epoch": 0.6879227053140097, "grad_norm": 0.4952101409435272, "learning_rate": 6.303501945525292e-05, "loss": 0.6192, "step": 534 }, { "epoch": 0.6892109500805152, "grad_norm": 0.5614569187164307, "learning_rate": 6.277561608300908e-05, "loss": 0.7055, "step": 535 }, { "epoch": 0.6904991948470209, "grad_norm": 0.5651270151138306, "learning_rate": 6.251621271076524e-05, "loss": 0.7327, "step": 536 }, { "epoch": 0.6917874396135266, "grad_norm": 0.5416032075881958, "learning_rate": 6.225680933852141e-05, "loss": 0.64, "step": 537 }, { "epoch": 0.6930756843800322, "grad_norm": 0.6302821636199951, "learning_rate": 6.199740596627757e-05, "loss": 0.8542, "step": 538 }, { "epoch": 0.6943639291465379, "grad_norm": 0.5361074805259705, "learning_rate": 6.173800259403372e-05, "loss": 0.6282, "step": 539 }, { "epoch": 0.6956521739130435, "grad_norm": 0.5210204124450684, "learning_rate": 6.147859922178989e-05, "loss": 0.8354, "step": 540 }, { "epoch": 0.6969404186795491, "grad_norm": 0.5401708483695984, "learning_rate": 6.121919584954605e-05, "loss": 0.6165, "step": 541 }, { "epoch": 0.6982286634460547, "grad_norm": 0.516559362411499, "learning_rate": 6.0959792477302215e-05, "loss": 0.5498, "step": 542 }, { "epoch": 0.6995169082125604, "grad_norm": 0.5983400344848633, "learning_rate": 6.0700389105058366e-05, "loss": 0.7538, "step": 543 }, { "epoch": 0.700805152979066, "grad_norm": 0.5111982226371765, "learning_rate": 6.0440985732814524e-05, "loss": 0.6156, "step": 544 }, { "epoch": 0.7020933977455717, "grad_norm": 0.5821353793144226, "learning_rate": 6.018158236057069e-05, "loss": 0.6417, "step": 545 }, { "epoch": 0.7033816425120772, "grad_norm": 0.4738411009311676, "learning_rate": 5.992217898832685e-05, "loss": 0.6541, "step": 546 }, { "epoch": 0.7046698872785829, "grad_norm": 0.6165397763252258, "learning_rate": 5.966277561608301e-05, "loss": 0.7366, "step": 547 }, { "epoch": 0.7059581320450886, "grad_norm": 0.5883972644805908, "learning_rate": 5.9403372243839174e-05, "loss": 0.7371, "step": 548 }, { "epoch": 0.7072463768115942, "grad_norm": 0.5415938496589661, "learning_rate": 5.914396887159533e-05, "loss": 0.6334, "step": 549 }, { "epoch": 0.7085346215780999, "grad_norm": 0.5565886497497559, "learning_rate": 5.8884565499351496e-05, "loss": 0.7425, "step": 550 }, { "epoch": 0.7098228663446055, "grad_norm": 0.6447110772132874, "learning_rate": 5.862516212710766e-05, "loss": 0.8405, "step": 551 }, { "epoch": 0.7111111111111111, "grad_norm": 0.6419034004211426, "learning_rate": 5.836575875486382e-05, "loss": 0.8779, "step": 552 }, { "epoch": 0.7123993558776167, "grad_norm": 0.4611152708530426, "learning_rate": 5.810635538261998e-05, "loss": 0.5832, "step": 553 }, { "epoch": 0.7136876006441224, "grad_norm": 0.6436396837234497, "learning_rate": 5.7846952010376146e-05, "loss": 0.8172, "step": 554 }, { "epoch": 0.714975845410628, "grad_norm": 0.5647209286689758, "learning_rate": 5.7587548638132296e-05, "loss": 0.6637, "step": 555 }, { "epoch": 0.7162640901771337, "grad_norm": 0.5272210240364075, "learning_rate": 5.7328145265888454e-05, "loss": 0.6103, "step": 556 }, { "epoch": 0.7175523349436392, "grad_norm": 0.5229634046554565, "learning_rate": 5.706874189364462e-05, "loss": 0.5514, "step": 557 }, { "epoch": 0.7188405797101449, "grad_norm": 0.6116520166397095, "learning_rate": 5.6809338521400776e-05, "loss": 0.8535, "step": 558 }, { "epoch": 0.7201288244766506, "grad_norm": 0.5706294178962708, "learning_rate": 5.654993514915694e-05, "loss": 0.7606, "step": 559 }, { "epoch": 0.7214170692431562, "grad_norm": 0.6013360619544983, "learning_rate": 5.6290531776913104e-05, "loss": 0.7887, "step": 560 }, { "epoch": 0.7227053140096619, "grad_norm": 0.5661988258361816, "learning_rate": 5.603112840466926e-05, "loss": 0.9302, "step": 561 }, { "epoch": 0.7239935587761674, "grad_norm": 0.5267884135246277, "learning_rate": 5.5771725032425426e-05, "loss": 0.7534, "step": 562 }, { "epoch": 0.7252818035426731, "grad_norm": 0.4822220504283905, "learning_rate": 5.551232166018159e-05, "loss": 0.5666, "step": 563 }, { "epoch": 0.7265700483091787, "grad_norm": 0.5841349363327026, "learning_rate": 5.525291828793775e-05, "loss": 0.7755, "step": 564 }, { "epoch": 0.7278582930756844, "grad_norm": 0.5259692072868347, "learning_rate": 5.499351491569391e-05, "loss": 0.7599, "step": 565 }, { "epoch": 0.72914653784219, "grad_norm": 0.5511097311973572, "learning_rate": 5.4734111543450076e-05, "loss": 0.6198, "step": 566 }, { "epoch": 0.7304347826086957, "grad_norm": 0.5707940459251404, "learning_rate": 5.447470817120623e-05, "loss": 0.7669, "step": 567 }, { "epoch": 0.7317230273752013, "grad_norm": 0.6099474430084229, "learning_rate": 5.4215304798962384e-05, "loss": 0.7636, "step": 568 }, { "epoch": 0.7330112721417069, "grad_norm": 0.4825986623764038, "learning_rate": 5.395590142671855e-05, "loss": 0.5758, "step": 569 }, { "epoch": 0.7342995169082126, "grad_norm": 0.457233190536499, "learning_rate": 5.3696498054474706e-05, "loss": 0.5534, "step": 570 }, { "epoch": 0.7355877616747182, "grad_norm": 0.5602165460586548, "learning_rate": 5.343709468223087e-05, "loss": 0.5802, "step": 571 }, { "epoch": 0.7368760064412239, "grad_norm": 0.6400203108787537, "learning_rate": 5.3177691309987034e-05, "loss": 0.9281, "step": 572 }, { "epoch": 0.7381642512077294, "grad_norm": 0.4856846332550049, "learning_rate": 5.291828793774319e-05, "loss": 0.6179, "step": 573 }, { "epoch": 0.7394524959742351, "grad_norm": 0.5459800958633423, "learning_rate": 5.2658884565499356e-05, "loss": 0.7225, "step": 574 }, { "epoch": 0.7407407407407407, "grad_norm": 0.5425988435745239, "learning_rate": 5.239948119325552e-05, "loss": 0.7091, "step": 575 }, { "epoch": 0.7420289855072464, "grad_norm": 0.6519750356674194, "learning_rate": 5.214007782101168e-05, "loss": 0.7549, "step": 576 }, { "epoch": 0.743317230273752, "grad_norm": 0.5276802778244019, "learning_rate": 5.188067444876784e-05, "loss": 0.5972, "step": 577 }, { "epoch": 0.7446054750402576, "grad_norm": 0.5245682001113892, "learning_rate": 5.1621271076524e-05, "loss": 0.631, "step": 578 }, { "epoch": 0.7458937198067633, "grad_norm": 0.574123203754425, "learning_rate": 5.136186770428015e-05, "loss": 0.8617, "step": 579 }, { "epoch": 0.7471819645732689, "grad_norm": 0.5302646160125732, "learning_rate": 5.1102464332036315e-05, "loss": 0.682, "step": 580 }, { "epoch": 0.7484702093397746, "grad_norm": 0.528005063533783, "learning_rate": 5.084306095979248e-05, "loss": 0.5872, "step": 581 }, { "epoch": 0.7497584541062802, "grad_norm": 0.5490335822105408, "learning_rate": 5.0583657587548636e-05, "loss": 0.571, "step": 582 }, { "epoch": 0.7510466988727859, "grad_norm": 0.5383925437927246, "learning_rate": 5.03242542153048e-05, "loss": 0.5694, "step": 583 }, { "epoch": 0.7523349436392914, "grad_norm": 0.5377727150917053, "learning_rate": 5.0064850843060965e-05, "loss": 0.7569, "step": 584 }, { "epoch": 0.7536231884057971, "grad_norm": 0.5144835710525513, "learning_rate": 4.980544747081712e-05, "loss": 0.6205, "step": 585 }, { "epoch": 0.7549114331723027, "grad_norm": 0.5208680033683777, "learning_rate": 4.9546044098573286e-05, "loss": 0.6205, "step": 586 }, { "epoch": 0.7561996779388084, "grad_norm": 0.6118026375770569, "learning_rate": 4.9286640726329444e-05, "loss": 0.8551, "step": 587 }, { "epoch": 0.7574879227053141, "grad_norm": 0.5483299493789673, "learning_rate": 4.90272373540856e-05, "loss": 0.6421, "step": 588 }, { "epoch": 0.7587761674718196, "grad_norm": 0.5909067392349243, "learning_rate": 4.8767833981841766e-05, "loss": 0.7131, "step": 589 }, { "epoch": 0.7600644122383253, "grad_norm": 0.46308815479278564, "learning_rate": 4.850843060959793e-05, "loss": 0.472, "step": 590 }, { "epoch": 0.7613526570048309, "grad_norm": 0.6142207384109497, "learning_rate": 4.824902723735409e-05, "loss": 0.6328, "step": 591 }, { "epoch": 0.7626409017713366, "grad_norm": 0.6880104541778564, "learning_rate": 4.798962386511025e-05, "loss": 0.838, "step": 592 }, { "epoch": 0.7639291465378422, "grad_norm": 0.5680267214775085, "learning_rate": 4.773022049286641e-05, "loss": 0.7676, "step": 593 }, { "epoch": 0.7652173913043478, "grad_norm": 0.5472928881645203, "learning_rate": 4.7470817120622567e-05, "loss": 0.7345, "step": 594 }, { "epoch": 0.7665056360708534, "grad_norm": 0.6231438517570496, "learning_rate": 4.721141374837873e-05, "loss": 0.7582, "step": 595 }, { "epoch": 0.7677938808373591, "grad_norm": 0.5030277371406555, "learning_rate": 4.6952010376134895e-05, "loss": 0.5928, "step": 596 }, { "epoch": 0.7690821256038647, "grad_norm": 0.6041036248207092, "learning_rate": 4.669260700389105e-05, "loss": 0.6439, "step": 597 }, { "epoch": 0.7703703703703704, "grad_norm": 0.5044084191322327, "learning_rate": 4.643320363164722e-05, "loss": 0.5618, "step": 598 }, { "epoch": 0.7716586151368761, "grad_norm": 0.5399697422981262, "learning_rate": 4.6173800259403374e-05, "loss": 0.6366, "step": 599 }, { "epoch": 0.7729468599033816, "grad_norm": 0.496896892786026, "learning_rate": 4.591439688715953e-05, "loss": 0.5864, "step": 600 }, { "epoch": 0.7742351046698873, "grad_norm": 0.46158090233802795, "learning_rate": 4.5654993514915696e-05, "loss": 0.609, "step": 601 }, { "epoch": 0.7755233494363929, "grad_norm": 0.5886946320533752, "learning_rate": 4.539559014267185e-05, "loss": 0.7699, "step": 602 }, { "epoch": 0.7768115942028986, "grad_norm": 0.5680760145187378, "learning_rate": 4.513618677042802e-05, "loss": 0.8665, "step": 603 }, { "epoch": 0.7780998389694042, "grad_norm": 0.5787962675094604, "learning_rate": 4.487678339818418e-05, "loss": 0.6942, "step": 604 }, { "epoch": 0.7793880837359098, "grad_norm": 0.6179983615875244, "learning_rate": 4.461738002594034e-05, "loss": 0.7403, "step": 605 }, { "epoch": 0.7806763285024154, "grad_norm": 0.5327017903327942, "learning_rate": 4.43579766536965e-05, "loss": 0.6714, "step": 606 }, { "epoch": 0.7819645732689211, "grad_norm": 0.5620171427726746, "learning_rate": 4.409857328145266e-05, "loss": 0.6706, "step": 607 }, { "epoch": 0.7832528180354267, "grad_norm": 0.5355799794197083, "learning_rate": 4.383916990920882e-05, "loss": 0.6042, "step": 608 }, { "epoch": 0.7845410628019324, "grad_norm": 0.692477285861969, "learning_rate": 4.357976653696498e-05, "loss": 0.7384, "step": 609 }, { "epoch": 0.785829307568438, "grad_norm": 0.5491352081298828, "learning_rate": 4.332036316472115e-05, "loss": 0.6315, "step": 610 }, { "epoch": 0.7871175523349436, "grad_norm": 0.6350588202476501, "learning_rate": 4.3060959792477304e-05, "loss": 0.7889, "step": 611 }, { "epoch": 0.7884057971014493, "grad_norm": 0.5784136652946472, "learning_rate": 4.280155642023346e-05, "loss": 0.7421, "step": 612 }, { "epoch": 0.7896940418679549, "grad_norm": 0.55226069688797, "learning_rate": 4.2542153047989626e-05, "loss": 0.7752, "step": 613 }, { "epoch": 0.7909822866344606, "grad_norm": 0.541728138923645, "learning_rate": 4.2282749675745784e-05, "loss": 0.6681, "step": 614 }, { "epoch": 0.7922705314009661, "grad_norm": 0.4921126067638397, "learning_rate": 4.202334630350195e-05, "loss": 0.5087, "step": 615 }, { "epoch": 0.7935587761674718, "grad_norm": 0.5723814368247986, "learning_rate": 4.176394293125811e-05, "loss": 0.7275, "step": 616 }, { "epoch": 0.7948470209339774, "grad_norm": 0.5064358115196228, "learning_rate": 4.150453955901427e-05, "loss": 0.6857, "step": 617 }, { "epoch": 0.7961352657004831, "grad_norm": 0.495473176240921, "learning_rate": 4.124513618677043e-05, "loss": 0.5766, "step": 618 }, { "epoch": 0.7974235104669888, "grad_norm": 0.47758999466896057, "learning_rate": 4.098573281452659e-05, "loss": 0.5152, "step": 619 }, { "epoch": 0.7987117552334944, "grad_norm": 0.5546131730079651, "learning_rate": 4.072632944228275e-05, "loss": 0.5477, "step": 620 }, { "epoch": 0.8, "grad_norm": 0.637289822101593, "learning_rate": 4.046692607003891e-05, "loss": 0.8712, "step": 621 }, { "epoch": 0.8012882447665056, "grad_norm": 0.6441432237625122, "learning_rate": 4.020752269779508e-05, "loss": 0.751, "step": 622 }, { "epoch": 0.8025764895330113, "grad_norm": 0.5846959352493286, "learning_rate": 3.9948119325551235e-05, "loss": 0.764, "step": 623 }, { "epoch": 0.8038647342995169, "grad_norm": 0.5156934261322021, "learning_rate": 3.968871595330739e-05, "loss": 0.6254, "step": 624 }, { "epoch": 0.8051529790660226, "grad_norm": 0.5897034406661987, "learning_rate": 3.9429312581063556e-05, "loss": 0.6232, "step": 625 }, { "epoch": 0.8064412238325281, "grad_norm": 0.6254003643989563, "learning_rate": 3.9169909208819714e-05, "loss": 0.7375, "step": 626 }, { "epoch": 0.8077294685990338, "grad_norm": 0.5816264152526855, "learning_rate": 3.891050583657588e-05, "loss": 0.752, "step": 627 }, { "epoch": 0.8090177133655394, "grad_norm": 0.570949912071228, "learning_rate": 3.865110246433204e-05, "loss": 0.6546, "step": 628 }, { "epoch": 0.8103059581320451, "grad_norm": 0.5094951391220093, "learning_rate": 3.83916990920882e-05, "loss": 0.5009, "step": 629 }, { "epoch": 0.8115942028985508, "grad_norm": 0.6055474281311035, "learning_rate": 3.813229571984436e-05, "loss": 0.7338, "step": 630 }, { "epoch": 0.8128824476650564, "grad_norm": 0.5392929911613464, "learning_rate": 3.787289234760052e-05, "loss": 0.563, "step": 631 }, { "epoch": 0.814170692431562, "grad_norm": 0.6130269765853882, "learning_rate": 3.761348897535668e-05, "loss": 0.8099, "step": 632 }, { "epoch": 0.8154589371980676, "grad_norm": 0.6383023262023926, "learning_rate": 3.735408560311284e-05, "loss": 0.5776, "step": 633 }, { "epoch": 0.8167471819645733, "grad_norm": 0.4606671929359436, "learning_rate": 3.709468223086901e-05, "loss": 0.5352, "step": 634 }, { "epoch": 0.8180354267310789, "grad_norm": 0.5078199505805969, "learning_rate": 3.6835278858625165e-05, "loss": 0.5113, "step": 635 }, { "epoch": 0.8193236714975846, "grad_norm": 0.5447183847427368, "learning_rate": 3.657587548638132e-05, "loss": 0.6092, "step": 636 }, { "epoch": 0.8206119162640901, "grad_norm": 0.632024884223938, "learning_rate": 3.631647211413749e-05, "loss": 0.7237, "step": 637 }, { "epoch": 0.8219001610305958, "grad_norm": 0.5381270051002502, "learning_rate": 3.6057068741893644e-05, "loss": 0.6391, "step": 638 }, { "epoch": 0.8231884057971014, "grad_norm": 0.5342917442321777, "learning_rate": 3.579766536964981e-05, "loss": 0.672, "step": 639 }, { "epoch": 0.8244766505636071, "grad_norm": 0.6121646761894226, "learning_rate": 3.553826199740597e-05, "loss": 0.8898, "step": 640 }, { "epoch": 0.8257648953301128, "grad_norm": 0.6056507229804993, "learning_rate": 3.527885862516213e-05, "loss": 0.585, "step": 641 }, { "epoch": 0.8270531400966183, "grad_norm": 0.5895273685455322, "learning_rate": 3.501945525291829e-05, "loss": 0.6135, "step": 642 }, { "epoch": 0.828341384863124, "grad_norm": 0.5063283443450928, "learning_rate": 3.476005188067445e-05, "loss": 0.4939, "step": 643 }, { "epoch": 0.8296296296296296, "grad_norm": 0.5781770348548889, "learning_rate": 3.450064850843061e-05, "loss": 0.7322, "step": 644 }, { "epoch": 0.8309178743961353, "grad_norm": 0.5424814820289612, "learning_rate": 3.4241245136186774e-05, "loss": 0.6702, "step": 645 }, { "epoch": 0.8322061191626409, "grad_norm": 0.5998700857162476, "learning_rate": 3.398184176394293e-05, "loss": 0.6788, "step": 646 }, { "epoch": 0.8334943639291466, "grad_norm": 0.614637017250061, "learning_rate": 3.3722438391699095e-05, "loss": 0.7391, "step": 647 }, { "epoch": 0.8347826086956521, "grad_norm": 0.6503768563270569, "learning_rate": 3.346303501945525e-05, "loss": 0.7648, "step": 648 }, { "epoch": 0.8360708534621578, "grad_norm": 0.5270184874534607, "learning_rate": 3.320363164721142e-05, "loss": 0.504, "step": 649 }, { "epoch": 0.8373590982286635, "grad_norm": 0.5014241337776184, "learning_rate": 3.2944228274967575e-05, "loss": 0.5252, "step": 650 }, { "epoch": 0.8386473429951691, "grad_norm": 0.5668673515319824, "learning_rate": 3.268482490272374e-05, "loss": 0.6537, "step": 651 }, { "epoch": 0.8399355877616748, "grad_norm": 0.5789865255355835, "learning_rate": 3.2425421530479896e-05, "loss": 0.8012, "step": 652 }, { "epoch": 0.8412238325281803, "grad_norm": 0.6261132955551147, "learning_rate": 3.216601815823606e-05, "loss": 0.6832, "step": 653 }, { "epoch": 0.842512077294686, "grad_norm": 0.5914183855056763, "learning_rate": 3.190661478599222e-05, "loss": 0.5653, "step": 654 }, { "epoch": 0.8438003220611916, "grad_norm": 0.5597856044769287, "learning_rate": 3.164721141374838e-05, "loss": 0.557, "step": 655 }, { "epoch": 0.8450885668276973, "grad_norm": 0.5060226917266846, "learning_rate": 3.138780804150454e-05, "loss": 0.6559, "step": 656 }, { "epoch": 0.8463768115942029, "grad_norm": 0.6236748695373535, "learning_rate": 3.1128404669260704e-05, "loss": 0.7372, "step": 657 }, { "epoch": 0.8476650563607085, "grad_norm": 0.5138527750968933, "learning_rate": 3.086900129701686e-05, "loss": 0.6031, "step": 658 }, { "epoch": 0.8489533011272141, "grad_norm": 0.5962822437286377, "learning_rate": 3.0609597924773026e-05, "loss": 0.7374, "step": 659 }, { "epoch": 0.8502415458937198, "grad_norm": 0.4833110272884369, "learning_rate": 3.0350194552529183e-05, "loss": 0.4742, "step": 660 }, { "epoch": 0.8515297906602255, "grad_norm": 0.5980967283248901, "learning_rate": 3.0090791180285344e-05, "loss": 0.7025, "step": 661 }, { "epoch": 0.8528180354267311, "grad_norm": 0.6031454801559448, "learning_rate": 2.9831387808041505e-05, "loss": 0.8479, "step": 662 }, { "epoch": 0.8541062801932368, "grad_norm": 0.5824582576751709, "learning_rate": 2.9571984435797666e-05, "loss": 0.7073, "step": 663 }, { "epoch": 0.8553945249597423, "grad_norm": 0.6369014978408813, "learning_rate": 2.931258106355383e-05, "loss": 0.864, "step": 664 }, { "epoch": 0.856682769726248, "grad_norm": 0.554784893989563, "learning_rate": 2.905317769130999e-05, "loss": 0.6705, "step": 665 }, { "epoch": 0.8579710144927536, "grad_norm": 0.5656050443649292, "learning_rate": 2.8793774319066148e-05, "loss": 0.6375, "step": 666 }, { "epoch": 0.8592592592592593, "grad_norm": 0.6191110014915466, "learning_rate": 2.853437094682231e-05, "loss": 0.7277, "step": 667 }, { "epoch": 0.8605475040257649, "grad_norm": 0.6224331855773926, "learning_rate": 2.827496757457847e-05, "loss": 0.6809, "step": 668 }, { "epoch": 0.8618357487922705, "grad_norm": 0.6049439311027527, "learning_rate": 2.801556420233463e-05, "loss": 0.8089, "step": 669 }, { "epoch": 0.8631239935587761, "grad_norm": 0.5969856977462769, "learning_rate": 2.7756160830090795e-05, "loss": 0.8102, "step": 670 }, { "epoch": 0.8644122383252818, "grad_norm": 0.6787256002426147, "learning_rate": 2.7496757457846956e-05, "loss": 0.7602, "step": 671 }, { "epoch": 0.8657004830917875, "grad_norm": 0.6535263061523438, "learning_rate": 2.7237354085603113e-05, "loss": 0.6816, "step": 672 }, { "epoch": 0.8669887278582931, "grad_norm": 0.6893251538276672, "learning_rate": 2.6977950713359274e-05, "loss": 0.7271, "step": 673 }, { "epoch": 0.8682769726247987, "grad_norm": 0.6239253282546997, "learning_rate": 2.6718547341115435e-05, "loss": 0.7995, "step": 674 }, { "epoch": 0.8695652173913043, "grad_norm": 0.5103424787521362, "learning_rate": 2.6459143968871596e-05, "loss": 0.5603, "step": 675 }, { "epoch": 0.87085346215781, "grad_norm": 0.558005690574646, "learning_rate": 2.619974059662776e-05, "loss": 0.5951, "step": 676 }, { "epoch": 0.8721417069243156, "grad_norm": 0.5600588917732239, "learning_rate": 2.594033722438392e-05, "loss": 0.7646, "step": 677 }, { "epoch": 0.8734299516908213, "grad_norm": 0.48512476682662964, "learning_rate": 2.5680933852140075e-05, "loss": 0.5015, "step": 678 }, { "epoch": 0.8747181964573268, "grad_norm": 0.6091920733451843, "learning_rate": 2.542153047989624e-05, "loss": 0.8244, "step": 679 }, { "epoch": 0.8760064412238325, "grad_norm": 0.49356287717819214, "learning_rate": 2.51621271076524e-05, "loss": 0.4808, "step": 680 }, { "epoch": 0.8772946859903382, "grad_norm": 0.5376326441764832, "learning_rate": 2.490272373540856e-05, "loss": 0.5296, "step": 681 }, { "epoch": 0.8785829307568438, "grad_norm": 0.611382782459259, "learning_rate": 2.4643320363164722e-05, "loss": 0.5765, "step": 682 }, { "epoch": 0.8798711755233495, "grad_norm": 0.5653994083404541, "learning_rate": 2.4383916990920883e-05, "loss": 0.5864, "step": 683 }, { "epoch": 0.881159420289855, "grad_norm": 0.48044463992118835, "learning_rate": 2.4124513618677044e-05, "loss": 0.6073, "step": 684 }, { "epoch": 0.8824476650563607, "grad_norm": 0.6067565679550171, "learning_rate": 2.3865110246433205e-05, "loss": 0.6312, "step": 685 }, { "epoch": 0.8837359098228663, "grad_norm": 0.5126189589500427, "learning_rate": 2.3605706874189365e-05, "loss": 0.5501, "step": 686 }, { "epoch": 0.885024154589372, "grad_norm": 0.551137387752533, "learning_rate": 2.3346303501945526e-05, "loss": 0.578, "step": 687 }, { "epoch": 0.8863123993558776, "grad_norm": 0.7072709202766418, "learning_rate": 2.3086900129701687e-05, "loss": 0.614, "step": 688 }, { "epoch": 0.8876006441223833, "grad_norm": 0.6444385051727295, "learning_rate": 2.2827496757457848e-05, "loss": 0.6824, "step": 689 }, { "epoch": 0.8888888888888888, "grad_norm": 0.5593189597129822, "learning_rate": 2.256809338521401e-05, "loss": 0.5942, "step": 690 }, { "epoch": 0.8901771336553945, "grad_norm": 0.6002535223960876, "learning_rate": 2.230869001297017e-05, "loss": 0.7655, "step": 691 }, { "epoch": 0.8914653784219002, "grad_norm": 0.6385635137557983, "learning_rate": 2.204928664072633e-05, "loss": 0.749, "step": 692 }, { "epoch": 0.8927536231884058, "grad_norm": 0.5951741337776184, "learning_rate": 2.178988326848249e-05, "loss": 0.6708, "step": 693 }, { "epoch": 0.8940418679549115, "grad_norm": 0.6050885915756226, "learning_rate": 2.1530479896238652e-05, "loss": 0.7473, "step": 694 }, { "epoch": 0.895330112721417, "grad_norm": 0.570475161075592, "learning_rate": 2.1271076523994813e-05, "loss": 0.5881, "step": 695 }, { "epoch": 0.8966183574879227, "grad_norm": 0.5623670816421509, "learning_rate": 2.1011673151750974e-05, "loss": 0.6613, "step": 696 }, { "epoch": 0.8979066022544283, "grad_norm": 0.6884156465530396, "learning_rate": 2.0752269779507135e-05, "loss": 0.7917, "step": 697 }, { "epoch": 0.899194847020934, "grad_norm": 0.6603716611862183, "learning_rate": 2.0492866407263296e-05, "loss": 0.6958, "step": 698 }, { "epoch": 0.9004830917874396, "grad_norm": 0.6588467359542847, "learning_rate": 2.0233463035019457e-05, "loss": 0.5612, "step": 699 }, { "epoch": 0.9017713365539453, "grad_norm": 0.5613631010055542, "learning_rate": 1.9974059662775617e-05, "loss": 0.6064, "step": 700 } ], "logging_steps": 1, "max_steps": 776, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.98827660663808e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }