diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4934 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9017713365539453, + "eval_steps": 500, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012882447665056361, + "grad_norm": 5.764990329742432, + "learning_rate": 0.0, + "loss": 10.1729, + "step": 1 + }, + { + "epoch": 0.0025764895330112722, + "grad_norm": 5.699520111083984, + "learning_rate": 4e-05, + "loss": 9.9457, + "step": 2 + }, + { + "epoch": 0.003864734299516908, + "grad_norm": 5.760854721069336, + "learning_rate": 8e-05, + "loss": 9.7561, + "step": 3 + }, + { + "epoch": 0.0051529790660225444, + "grad_norm": 6.234244346618652, + "learning_rate": 0.00012, + "loss": 9.0043, + "step": 4 + }, + { + "epoch": 0.00644122383252818, + "grad_norm": 6.719285011291504, + "learning_rate": 0.00016, + "loss": 7.4539, + "step": 5 + }, + { + "epoch": 0.007729468599033816, + "grad_norm": 5.1464948654174805, + "learning_rate": 0.0002, + "loss": 5.7333, + "step": 6 + }, + { + "epoch": 0.009017713365539453, + "grad_norm": 3.3606348037719727, + "learning_rate": 0.00019974059662775616, + "loss": 3.8319, + "step": 7 + }, + { + "epoch": 0.010305958132045089, + "grad_norm": 2.361740827560425, + "learning_rate": 0.00019948119325551234, + "loss": 3.3392, + "step": 8 + }, + { + "epoch": 0.011594202898550725, + "grad_norm": 2.378281831741333, + "learning_rate": 0.0001992217898832685, + "loss": 2.8643, + "step": 9 + }, + { + "epoch": 0.01288244766505636, + "grad_norm": 1.9084206819534302, + "learning_rate": 0.00019896238651102467, + "loss": 2.4692, + "step": 10 + }, + { + "epoch": 0.014170692431561997, + "grad_norm": 2.3616507053375244, + "learning_rate": 0.00019870298313878082, + "loss": 2.2057, + "step": 11 + }, + { + "epoch": 0.015458937198067632, + "grad_norm": 2.7130489349365234, + "learning_rate": 0.00019844357976653697, + "loss": 1.8781, + "step": 12 + }, + { + "epoch": 0.01674718196457327, + "grad_norm": 5.479770183563232, + "learning_rate": 0.00019818417639429315, + "loss": 1.6427, + "step": 13 + }, + { + "epoch": 0.018035426731078906, + "grad_norm": 2.0840210914611816, + "learning_rate": 0.0001979247730220493, + "loss": 1.6455, + "step": 14 + }, + { + "epoch": 0.01932367149758454, + "grad_norm": 17.294357299804688, + "learning_rate": 0.00019766536964980547, + "loss": 1.9366, + "step": 15 + }, + { + "epoch": 0.020611916264090178, + "grad_norm": 3.7959189414978027, + "learning_rate": 0.00019740596627756162, + "loss": 1.9272, + "step": 16 + }, + { + "epoch": 0.021900161030595812, + "grad_norm": 9.078225135803223, + "learning_rate": 0.00019714656290531778, + "loss": 1.8734, + "step": 17 + }, + { + "epoch": 0.02318840579710145, + "grad_norm": 2.7898125648498535, + "learning_rate": 0.00019688715953307395, + "loss": 1.8415, + "step": 18 + }, + { + "epoch": 0.024476650563607084, + "grad_norm": 5.833450794219971, + "learning_rate": 0.00019662775616083008, + "loss": 1.6641, + "step": 19 + }, + { + "epoch": 0.02576489533011272, + "grad_norm": 1.286916971206665, + "learning_rate": 0.00019636835278858625, + "loss": 1.6488, + "step": 20 + }, + { + "epoch": 0.02705314009661836, + "grad_norm": 1.4083938598632812, + "learning_rate": 0.0001961089494163424, + "loss": 1.6369, + "step": 21 + }, + { + "epoch": 0.028341384863123993, + "grad_norm": 11.11021900177002, + "learning_rate": 0.00019584954604409858, + "loss": 1.0877, + "step": 22 + }, + { + "epoch": 0.02962962962962963, + "grad_norm": 4.023814678192139, + "learning_rate": 0.00019559014267185473, + "loss": 1.172, + "step": 23 + }, + { + "epoch": 0.030917874396135265, + "grad_norm": 1.5380833148956299, + "learning_rate": 0.0001953307392996109, + "loss": 1.2489, + "step": 24 + }, + { + "epoch": 0.0322061191626409, + "grad_norm": 3.5287179946899414, + "learning_rate": 0.00019507133592736706, + "loss": 1.059, + "step": 25 + }, + { + "epoch": 0.03349436392914654, + "grad_norm": 0.4443202316761017, + "learning_rate": 0.0001948119325551232, + "loss": 1.0324, + "step": 26 + }, + { + "epoch": 0.034782608695652174, + "grad_norm": 4.4658098220825195, + "learning_rate": 0.0001945525291828794, + "loss": 0.9281, + "step": 27 + }, + { + "epoch": 0.03607085346215781, + "grad_norm": 0.6924022436141968, + "learning_rate": 0.00019429312581063554, + "loss": 0.7725, + "step": 28 + }, + { + "epoch": 0.03735909822866344, + "grad_norm": 0.39130347967147827, + "learning_rate": 0.00019403372243839172, + "loss": 0.9851, + "step": 29 + }, + { + "epoch": 0.03864734299516908, + "grad_norm": 0.478762149810791, + "learning_rate": 0.00019377431906614787, + "loss": 0.9003, + "step": 30 + }, + { + "epoch": 0.03993558776167472, + "grad_norm": 0.485379695892334, + "learning_rate": 0.00019351491569390402, + "loss": 0.9038, + "step": 31 + }, + { + "epoch": 0.041223832528180356, + "grad_norm": 0.4116724729537964, + "learning_rate": 0.0001932555123216602, + "loss": 0.7782, + "step": 32 + }, + { + "epoch": 0.04251207729468599, + "grad_norm": 0.35044676065444946, + "learning_rate": 0.00019299610894941635, + "loss": 0.7145, + "step": 33 + }, + { + "epoch": 0.043800322061191624, + "grad_norm": 0.34671103954315186, + "learning_rate": 0.00019273670557717253, + "loss": 0.838, + "step": 34 + }, + { + "epoch": 0.04508856682769726, + "grad_norm": 0.3169376850128174, + "learning_rate": 0.00019247730220492868, + "loss": 0.9283, + "step": 35 + }, + { + "epoch": 0.0463768115942029, + "grad_norm": 0.3791329860687256, + "learning_rate": 0.00019221789883268483, + "loss": 0.9332, + "step": 36 + }, + { + "epoch": 0.04766505636070854, + "grad_norm": 0.39683282375335693, + "learning_rate": 0.000191958495460441, + "loss": 0.837, + "step": 37 + }, + { + "epoch": 0.04895330112721417, + "grad_norm": 0.4130147099494934, + "learning_rate": 0.00019169909208819716, + "loss": 0.688, + "step": 38 + }, + { + "epoch": 0.050241545893719805, + "grad_norm": 0.535886824131012, + "learning_rate": 0.00019143968871595333, + "loss": 0.819, + "step": 39 + }, + { + "epoch": 0.05152979066022544, + "grad_norm": 0.41564154624938965, + "learning_rate": 0.00019118028534370949, + "loss": 1.0323, + "step": 40 + }, + { + "epoch": 0.05281803542673108, + "grad_norm": 0.38580086827278137, + "learning_rate": 0.00019092088197146564, + "loss": 0.9947, + "step": 41 + }, + { + "epoch": 0.05410628019323672, + "grad_norm": 0.3614998757839203, + "learning_rate": 0.00019066147859922181, + "loss": 0.8925, + "step": 42 + }, + { + "epoch": 0.05539452495974235, + "grad_norm": 0.3364286422729492, + "learning_rate": 0.00019040207522697794, + "loss": 0.8473, + "step": 43 + }, + { + "epoch": 0.056682769726247986, + "grad_norm": 0.3541828393936157, + "learning_rate": 0.00019014267185473412, + "loss": 0.8477, + "step": 44 + }, + { + "epoch": 0.057971014492753624, + "grad_norm": 0.35495537519454956, + "learning_rate": 0.00018988326848249027, + "loss": 0.8881, + "step": 45 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 0.43733540177345276, + "learning_rate": 0.00018962386511024644, + "loss": 0.8743, + "step": 46 + }, + { + "epoch": 0.06054750402576489, + "grad_norm": 0.3078387975692749, + "learning_rate": 0.0001893644617380026, + "loss": 0.7479, + "step": 47 + }, + { + "epoch": 0.06183574879227053, + "grad_norm": 0.36661794781684875, + "learning_rate": 0.00018910505836575875, + "loss": 0.8258, + "step": 48 + }, + { + "epoch": 0.06312399355877617, + "grad_norm": 0.34701570868492126, + "learning_rate": 0.00018884565499351492, + "loss": 0.8085, + "step": 49 + }, + { + "epoch": 0.0644122383252818, + "grad_norm": 0.30905681848526, + "learning_rate": 0.00018858625162127107, + "loss": 0.7474, + "step": 50 + }, + { + "epoch": 0.06570048309178744, + "grad_norm": 0.47441986203193665, + "learning_rate": 0.00018832684824902725, + "loss": 1.0549, + "step": 51 + }, + { + "epoch": 0.06698872785829307, + "grad_norm": 0.2966022491455078, + "learning_rate": 0.0001880674448767834, + "loss": 0.8517, + "step": 52 + }, + { + "epoch": 0.06827697262479872, + "grad_norm": 0.33785632252693176, + "learning_rate": 0.00018780804150453958, + "loss": 0.8858, + "step": 53 + }, + { + "epoch": 0.06956521739130435, + "grad_norm": 0.33717742562294006, + "learning_rate": 0.00018754863813229573, + "loss": 0.7961, + "step": 54 + }, + { + "epoch": 0.07085346215780998, + "grad_norm": 0.4235801100730896, + "learning_rate": 0.00018728923476005188, + "loss": 0.9562, + "step": 55 + }, + { + "epoch": 0.07214170692431562, + "grad_norm": 0.40099507570266724, + "learning_rate": 0.00018702983138780806, + "loss": 0.6817, + "step": 56 + }, + { + "epoch": 0.07342995169082125, + "grad_norm": 0.3041292428970337, + "learning_rate": 0.0001867704280155642, + "loss": 0.6219, + "step": 57 + }, + { + "epoch": 0.07471819645732689, + "grad_norm": 0.428120493888855, + "learning_rate": 0.0001865110246433204, + "loss": 0.7911, + "step": 58 + }, + { + "epoch": 0.07600644122383253, + "grad_norm": 0.39466729760169983, + "learning_rate": 0.00018625162127107654, + "loss": 0.7883, + "step": 59 + }, + { + "epoch": 0.07729468599033816, + "grad_norm": 0.3272225856781006, + "learning_rate": 0.0001859922178988327, + "loss": 0.736, + "step": 60 + }, + { + "epoch": 0.0785829307568438, + "grad_norm": 0.3868604898452759, + "learning_rate": 0.00018573281452658887, + "loss": 0.772, + "step": 61 + }, + { + "epoch": 0.07987117552334944, + "grad_norm": 0.4111652970314026, + "learning_rate": 0.00018547341115434502, + "loss": 0.7715, + "step": 62 + }, + { + "epoch": 0.08115942028985507, + "grad_norm": 0.367587149143219, + "learning_rate": 0.0001852140077821012, + "loss": 0.8887, + "step": 63 + }, + { + "epoch": 0.08244766505636071, + "grad_norm": 0.36358535289764404, + "learning_rate": 0.00018495460440985735, + "loss": 0.6756, + "step": 64 + }, + { + "epoch": 0.08373590982286634, + "grad_norm": 0.3693746030330658, + "learning_rate": 0.0001846952010376135, + "loss": 0.7451, + "step": 65 + }, + { + "epoch": 0.08502415458937199, + "grad_norm": 0.33801788091659546, + "learning_rate": 0.00018443579766536967, + "loss": 0.7722, + "step": 66 + }, + { + "epoch": 0.08631239935587762, + "grad_norm": 0.40920770168304443, + "learning_rate": 0.0001841763942931258, + "loss": 0.6399, + "step": 67 + }, + { + "epoch": 0.08760064412238325, + "grad_norm": 0.36758852005004883, + "learning_rate": 0.00018391699092088198, + "loss": 0.7686, + "step": 68 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.37189269065856934, + "learning_rate": 0.00018365758754863813, + "loss": 0.8437, + "step": 69 + }, + { + "epoch": 0.09017713365539452, + "grad_norm": 0.42477479577064514, + "learning_rate": 0.0001833981841763943, + "loss": 0.7571, + "step": 70 + }, + { + "epoch": 0.09146537842190017, + "grad_norm": 0.34100234508514404, + "learning_rate": 0.00018313878080415046, + "loss": 0.733, + "step": 71 + }, + { + "epoch": 0.0927536231884058, + "grad_norm": 0.42223483324050903, + "learning_rate": 0.0001828793774319066, + "loss": 0.7981, + "step": 72 + }, + { + "epoch": 0.09404186795491143, + "grad_norm": 0.40469273924827576, + "learning_rate": 0.00018261997405966278, + "loss": 0.8633, + "step": 73 + }, + { + "epoch": 0.09533011272141707, + "grad_norm": 0.35466790199279785, + "learning_rate": 0.00018236057068741893, + "loss": 0.7359, + "step": 74 + }, + { + "epoch": 0.0966183574879227, + "grad_norm": 0.3824892044067383, + "learning_rate": 0.0001821011673151751, + "loss": 0.846, + "step": 75 + }, + { + "epoch": 0.09790660225442833, + "grad_norm": 0.4101675748825073, + "learning_rate": 0.00018184176394293126, + "loss": 1.0094, + "step": 76 + }, + { + "epoch": 0.09919484702093398, + "grad_norm": 0.3373378813266754, + "learning_rate": 0.0001815823605706874, + "loss": 0.6908, + "step": 77 + }, + { + "epoch": 0.10048309178743961, + "grad_norm": 0.41473421454429626, + "learning_rate": 0.0001813229571984436, + "loss": 0.7753, + "step": 78 + }, + { + "epoch": 0.10177133655394525, + "grad_norm": 0.3552979826927185, + "learning_rate": 0.00018106355382619974, + "loss": 0.883, + "step": 79 + }, + { + "epoch": 0.10305958132045089, + "grad_norm": 0.3655754029750824, + "learning_rate": 0.00018080415045395592, + "loss": 0.7978, + "step": 80 + }, + { + "epoch": 0.10434782608695652, + "grad_norm": 0.398554265499115, + "learning_rate": 0.00018054474708171207, + "loss": 0.9426, + "step": 81 + }, + { + "epoch": 0.10563607085346216, + "grad_norm": 0.4098765552043915, + "learning_rate": 0.00018028534370946825, + "loss": 0.7127, + "step": 82 + }, + { + "epoch": 0.10692431561996779, + "grad_norm": 0.38591381907463074, + "learning_rate": 0.0001800259403372244, + "loss": 0.7994, + "step": 83 + }, + { + "epoch": 0.10821256038647344, + "grad_norm": 0.42177343368530273, + "learning_rate": 0.00017976653696498055, + "loss": 0.8032, + "step": 84 + }, + { + "epoch": 0.10950080515297907, + "grad_norm": 0.38358885049819946, + "learning_rate": 0.00017950713359273673, + "loss": 0.8478, + "step": 85 + }, + { + "epoch": 0.1107890499194847, + "grad_norm": 0.4549978971481323, + "learning_rate": 0.00017924773022049288, + "loss": 0.813, + "step": 86 + }, + { + "epoch": 0.11207729468599034, + "grad_norm": 0.4372895359992981, + "learning_rate": 0.00017898832684824906, + "loss": 0.8797, + "step": 87 + }, + { + "epoch": 0.11336553945249597, + "grad_norm": 0.4454326033592224, + "learning_rate": 0.0001787289234760052, + "loss": 0.8554, + "step": 88 + }, + { + "epoch": 0.11465378421900162, + "grad_norm": 0.3808746933937073, + "learning_rate": 0.00017846952010376136, + "loss": 0.5919, + "step": 89 + }, + { + "epoch": 0.11594202898550725, + "grad_norm": 0.4146284759044647, + "learning_rate": 0.00017821011673151754, + "loss": 0.8823, + "step": 90 + }, + { + "epoch": 0.11723027375201288, + "grad_norm": 0.47205957770347595, + "learning_rate": 0.00017795071335927366, + "loss": 0.6284, + "step": 91 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 0.4155535101890564, + "learning_rate": 0.00017769130998702984, + "loss": 0.8945, + "step": 92 + }, + { + "epoch": 0.11980676328502415, + "grad_norm": 0.4152592420578003, + "learning_rate": 0.000177431906614786, + "loss": 0.818, + "step": 93 + }, + { + "epoch": 0.12109500805152978, + "grad_norm": 0.4558146297931671, + "learning_rate": 0.00017717250324254217, + "loss": 0.651, + "step": 94 + }, + { + "epoch": 0.12238325281803543, + "grad_norm": 0.4004950523376465, + "learning_rate": 0.00017691309987029832, + "loss": 0.7546, + "step": 95 + }, + { + "epoch": 0.12367149758454106, + "grad_norm": 0.35895851254463196, + "learning_rate": 0.00017665369649805447, + "loss": 0.6174, + "step": 96 + }, + { + "epoch": 0.1249597423510467, + "grad_norm": 0.4626515209674835, + "learning_rate": 0.00017639429312581064, + "loss": 0.716, + "step": 97 + }, + { + "epoch": 0.12624798711755233, + "grad_norm": 0.47447800636291504, + "learning_rate": 0.0001761348897535668, + "loss": 0.9699, + "step": 98 + }, + { + "epoch": 0.12753623188405797, + "grad_norm": 0.4361920654773712, + "learning_rate": 0.00017587548638132297, + "loss": 0.9217, + "step": 99 + }, + { + "epoch": 0.1288244766505636, + "grad_norm": 0.42450228333473206, + "learning_rate": 0.00017561608300907912, + "loss": 0.6938, + "step": 100 + }, + { + "epoch": 0.13011272141706925, + "grad_norm": 0.4310356080532074, + "learning_rate": 0.00017535667963683527, + "loss": 0.7263, + "step": 101 + }, + { + "epoch": 0.13140096618357489, + "grad_norm": 0.5808001756668091, + "learning_rate": 0.00017509727626459145, + "loss": 0.9891, + "step": 102 + }, + { + "epoch": 0.13268921095008052, + "grad_norm": 0.49347755312919617, + "learning_rate": 0.0001748378728923476, + "loss": 0.7918, + "step": 103 + }, + { + "epoch": 0.13397745571658615, + "grad_norm": 0.42868706583976746, + "learning_rate": 0.00017457846952010378, + "loss": 0.7067, + "step": 104 + }, + { + "epoch": 0.13526570048309178, + "grad_norm": 0.4322398900985718, + "learning_rate": 0.00017431906614785993, + "loss": 0.6705, + "step": 105 + }, + { + "epoch": 0.13655394524959744, + "grad_norm": 0.41033244132995605, + "learning_rate": 0.00017405966277561608, + "loss": 0.6878, + "step": 106 + }, + { + "epoch": 0.13784219001610307, + "grad_norm": 0.536390483379364, + "learning_rate": 0.00017380025940337226, + "loss": 0.6961, + "step": 107 + }, + { + "epoch": 0.1391304347826087, + "grad_norm": 0.4299734830856323, + "learning_rate": 0.0001735408560311284, + "loss": 0.7065, + "step": 108 + }, + { + "epoch": 0.14041867954911433, + "grad_norm": 0.4070943593978882, + "learning_rate": 0.0001732814526588846, + "loss": 0.6975, + "step": 109 + }, + { + "epoch": 0.14170692431561996, + "grad_norm": 0.46637794375419617, + "learning_rate": 0.00017302204928664074, + "loss": 0.7583, + "step": 110 + }, + { + "epoch": 0.14299516908212562, + "grad_norm": 0.38566964864730835, + "learning_rate": 0.00017276264591439692, + "loss": 0.7765, + "step": 111 + }, + { + "epoch": 0.14428341384863125, + "grad_norm": 0.38054248690605164, + "learning_rate": 0.00017250324254215307, + "loss": 0.6291, + "step": 112 + }, + { + "epoch": 0.14557165861513688, + "grad_norm": 0.5447641015052795, + "learning_rate": 0.00017224383916990922, + "loss": 1.0189, + "step": 113 + }, + { + "epoch": 0.1468599033816425, + "grad_norm": 0.4753653109073639, + "learning_rate": 0.0001719844357976654, + "loss": 0.83, + "step": 114 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.4890337288379669, + "learning_rate": 0.00017172503242542152, + "loss": 0.8586, + "step": 115 + }, + { + "epoch": 0.14943639291465377, + "grad_norm": 0.42376580834388733, + "learning_rate": 0.0001714656290531777, + "loss": 0.805, + "step": 116 + }, + { + "epoch": 0.15072463768115943, + "grad_norm": 0.4509013295173645, + "learning_rate": 0.00017120622568093385, + "loss": 0.8794, + "step": 117 + }, + { + "epoch": 0.15201288244766506, + "grad_norm": 0.4562942385673523, + "learning_rate": 0.00017094682230869003, + "loss": 0.6392, + "step": 118 + }, + { + "epoch": 0.1533011272141707, + "grad_norm": 0.48996442556381226, + "learning_rate": 0.00017068741893644618, + "loss": 0.7655, + "step": 119 + }, + { + "epoch": 0.15458937198067632, + "grad_norm": 0.5451317429542542, + "learning_rate": 0.00017042801556420233, + "loss": 0.796, + "step": 120 + }, + { + "epoch": 0.15587761674718195, + "grad_norm": 0.45719748735427856, + "learning_rate": 0.0001701686121919585, + "loss": 0.7704, + "step": 121 + }, + { + "epoch": 0.1571658615136876, + "grad_norm": 0.5048899054527283, + "learning_rate": 0.00016990920881971466, + "loss": 0.7396, + "step": 122 + }, + { + "epoch": 0.15845410628019324, + "grad_norm": 0.4184553921222687, + "learning_rate": 0.00016964980544747083, + "loss": 0.9926, + "step": 123 + }, + { + "epoch": 0.15974235104669887, + "grad_norm": 0.4456348717212677, + "learning_rate": 0.00016939040207522698, + "loss": 0.7654, + "step": 124 + }, + { + "epoch": 0.1610305958132045, + "grad_norm": 0.4423070251941681, + "learning_rate": 0.00016913099870298313, + "loss": 0.7832, + "step": 125 + }, + { + "epoch": 0.16231884057971013, + "grad_norm": 0.5408623218536377, + "learning_rate": 0.0001688715953307393, + "loss": 0.9074, + "step": 126 + }, + { + "epoch": 0.1636070853462158, + "grad_norm": 0.5411691665649414, + "learning_rate": 0.00016861219195849546, + "loss": 0.9271, + "step": 127 + }, + { + "epoch": 0.16489533011272142, + "grad_norm": 0.41004684567451477, + "learning_rate": 0.00016835278858625164, + "loss": 0.5686, + "step": 128 + }, + { + "epoch": 0.16618357487922705, + "grad_norm": 0.43191105127334595, + "learning_rate": 0.0001680933852140078, + "loss": 0.7841, + "step": 129 + }, + { + "epoch": 0.16747181964573268, + "grad_norm": 0.46590304374694824, + "learning_rate": 0.00016783398184176394, + "loss": 0.6283, + "step": 130 + }, + { + "epoch": 0.16876006441223831, + "grad_norm": 0.4356256425380707, + "learning_rate": 0.00016757457846952012, + "loss": 0.5977, + "step": 131 + }, + { + "epoch": 0.17004830917874397, + "grad_norm": 0.44105201959609985, + "learning_rate": 0.00016731517509727627, + "loss": 0.8701, + "step": 132 + }, + { + "epoch": 0.1713365539452496, + "grad_norm": 0.496669739484787, + "learning_rate": 0.00016705577172503245, + "loss": 0.8613, + "step": 133 + }, + { + "epoch": 0.17262479871175523, + "grad_norm": 0.41839754581451416, + "learning_rate": 0.0001667963683527886, + "loss": 0.5693, + "step": 134 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.42133820056915283, + "learning_rate": 0.00016653696498054475, + "loss": 0.7622, + "step": 135 + }, + { + "epoch": 0.1752012882447665, + "grad_norm": 0.45265620946884155, + "learning_rate": 0.00016627756160830093, + "loss": 0.8501, + "step": 136 + }, + { + "epoch": 0.17648953301127215, + "grad_norm": 0.45904725790023804, + "learning_rate": 0.00016601815823605708, + "loss": 0.9041, + "step": 137 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.42884427309036255, + "learning_rate": 0.00016575875486381326, + "loss": 0.7386, + "step": 138 + }, + { + "epoch": 0.17906602254428342, + "grad_norm": 0.44365760684013367, + "learning_rate": 0.00016549935149156938, + "loss": 0.7196, + "step": 139 + }, + { + "epoch": 0.18035426731078905, + "grad_norm": 0.38908517360687256, + "learning_rate": 0.00016523994811932556, + "loss": 0.6251, + "step": 140 + }, + { + "epoch": 0.18164251207729468, + "grad_norm": 0.3956596255302429, + "learning_rate": 0.0001649805447470817, + "loss": 0.6839, + "step": 141 + }, + { + "epoch": 0.18293075684380034, + "grad_norm": 0.46725159883499146, + "learning_rate": 0.00016472114137483789, + "loss": 0.7637, + "step": 142 + }, + { + "epoch": 0.18421900161030597, + "grad_norm": 0.4984063506126404, + "learning_rate": 0.00016446173800259404, + "loss": 0.818, + "step": 143 + }, + { + "epoch": 0.1855072463768116, + "grad_norm": 0.40556883811950684, + "learning_rate": 0.0001642023346303502, + "loss": 0.5996, + "step": 144 + }, + { + "epoch": 0.18679549114331723, + "grad_norm": 0.4421241581439972, + "learning_rate": 0.00016394293125810637, + "loss": 0.8339, + "step": 145 + }, + { + "epoch": 0.18808373590982286, + "grad_norm": 0.4321085512638092, + "learning_rate": 0.00016368352788586252, + "loss": 0.8028, + "step": 146 + }, + { + "epoch": 0.18937198067632852, + "grad_norm": 0.4498562514781952, + "learning_rate": 0.0001634241245136187, + "loss": 0.8976, + "step": 147 + }, + { + "epoch": 0.19066022544283415, + "grad_norm": 0.45957380533218384, + "learning_rate": 0.00016316472114137484, + "loss": 0.9271, + "step": 148 + }, + { + "epoch": 0.19194847020933978, + "grad_norm": 0.4764615595340729, + "learning_rate": 0.000162905317769131, + "loss": 0.9667, + "step": 149 + }, + { + "epoch": 0.1932367149758454, + "grad_norm": 0.4241081774234772, + "learning_rate": 0.00016264591439688717, + "loss": 0.6945, + "step": 150 + }, + { + "epoch": 0.19452495974235104, + "grad_norm": 0.5130481123924255, + "learning_rate": 0.00016238651102464332, + "loss": 0.7224, + "step": 151 + }, + { + "epoch": 0.19581320450885667, + "grad_norm": 0.4727570116519928, + "learning_rate": 0.0001621271076523995, + "loss": 0.7714, + "step": 152 + }, + { + "epoch": 0.19710144927536233, + "grad_norm": 0.5420963764190674, + "learning_rate": 0.00016186770428015565, + "loss": 0.9346, + "step": 153 + }, + { + "epoch": 0.19838969404186796, + "grad_norm": 0.4338800013065338, + "learning_rate": 0.0001616083009079118, + "loss": 0.8527, + "step": 154 + }, + { + "epoch": 0.1996779388083736, + "grad_norm": 0.45830976963043213, + "learning_rate": 0.00016134889753566798, + "loss": 0.8171, + "step": 155 + }, + { + "epoch": 0.20096618357487922, + "grad_norm": 0.48107942938804626, + "learning_rate": 0.00016108949416342413, + "loss": 0.8537, + "step": 156 + }, + { + "epoch": 0.20225442834138485, + "grad_norm": 0.4447987973690033, + "learning_rate": 0.0001608300907911803, + "loss": 0.7909, + "step": 157 + }, + { + "epoch": 0.2035426731078905, + "grad_norm": 0.4311445653438568, + "learning_rate": 0.00016057068741893646, + "loss": 0.7733, + "step": 158 + }, + { + "epoch": 0.20483091787439614, + "grad_norm": 0.5173223614692688, + "learning_rate": 0.0001603112840466926, + "loss": 0.9312, + "step": 159 + }, + { + "epoch": 0.20611916264090177, + "grad_norm": 0.5143957734107971, + "learning_rate": 0.0001600518806744488, + "loss": 0.804, + "step": 160 + }, + { + "epoch": 0.2074074074074074, + "grad_norm": 0.4494340121746063, + "learning_rate": 0.00015979247730220494, + "loss": 0.7741, + "step": 161 + }, + { + "epoch": 0.20869565217391303, + "grad_norm": 0.5051131248474121, + "learning_rate": 0.00015953307392996112, + "loss": 0.8216, + "step": 162 + }, + { + "epoch": 0.2099838969404187, + "grad_norm": 0.48853760957717896, + "learning_rate": 0.00015927367055771724, + "loss": 0.7958, + "step": 163 + }, + { + "epoch": 0.21127214170692432, + "grad_norm": 0.4491981863975525, + "learning_rate": 0.00015901426718547342, + "loss": 0.8012, + "step": 164 + }, + { + "epoch": 0.21256038647342995, + "grad_norm": 0.41452592611312866, + "learning_rate": 0.00015875486381322957, + "loss": 0.6653, + "step": 165 + }, + { + "epoch": 0.21384863123993558, + "grad_norm": 0.4610249400138855, + "learning_rate": 0.00015849546044098572, + "loss": 0.7559, + "step": 166 + }, + { + "epoch": 0.2151368760064412, + "grad_norm": 0.46895861625671387, + "learning_rate": 0.0001582360570687419, + "loss": 0.7393, + "step": 167 + }, + { + "epoch": 0.21642512077294687, + "grad_norm": 0.44812971353530884, + "learning_rate": 0.00015797665369649805, + "loss": 0.7904, + "step": 168 + }, + { + "epoch": 0.2177133655394525, + "grad_norm": 0.4483109712600708, + "learning_rate": 0.00015771725032425423, + "loss": 0.7503, + "step": 169 + }, + { + "epoch": 0.21900161030595813, + "grad_norm": 0.4433995485305786, + "learning_rate": 0.00015745784695201038, + "loss": 0.7791, + "step": 170 + }, + { + "epoch": 0.22028985507246376, + "grad_norm": 0.5305430889129639, + "learning_rate": 0.00015719844357976655, + "loss": 0.9014, + "step": 171 + }, + { + "epoch": 0.2215780998389694, + "grad_norm": 0.4747445285320282, + "learning_rate": 0.0001569390402075227, + "loss": 0.6446, + "step": 172 + }, + { + "epoch": 0.22286634460547505, + "grad_norm": 0.5174173712730408, + "learning_rate": 0.00015667963683527886, + "loss": 0.6938, + "step": 173 + }, + { + "epoch": 0.22415458937198068, + "grad_norm": 0.5461775660514832, + "learning_rate": 0.00015642023346303503, + "loss": 0.9596, + "step": 174 + }, + { + "epoch": 0.22544283413848631, + "grad_norm": 0.5394182205200195, + "learning_rate": 0.00015616083009079118, + "loss": 0.8632, + "step": 175 + }, + { + "epoch": 0.22673107890499195, + "grad_norm": 0.4866770803928375, + "learning_rate": 0.00015590142671854736, + "loss": 0.8799, + "step": 176 + }, + { + "epoch": 0.22801932367149758, + "grad_norm": 0.4386501908302307, + "learning_rate": 0.0001556420233463035, + "loss": 0.7341, + "step": 177 + }, + { + "epoch": 0.22930756843800323, + "grad_norm": 0.5443551540374756, + "learning_rate": 0.00015538261997405966, + "loss": 0.771, + "step": 178 + }, + { + "epoch": 0.23059581320450886, + "grad_norm": 0.45818325877189636, + "learning_rate": 0.00015512321660181584, + "loss": 0.8682, + "step": 179 + }, + { + "epoch": 0.2318840579710145, + "grad_norm": 0.501369297504425, + "learning_rate": 0.000154863813229572, + "loss": 0.7586, + "step": 180 + }, + { + "epoch": 0.23317230273752013, + "grad_norm": 0.4658907651901245, + "learning_rate": 0.00015460440985732817, + "loss": 0.6609, + "step": 181 + }, + { + "epoch": 0.23446054750402576, + "grad_norm": 0.4543883800506592, + "learning_rate": 0.00015434500648508432, + "loss": 0.5404, + "step": 182 + }, + { + "epoch": 0.2357487922705314, + "grad_norm": 0.4215242862701416, + "learning_rate": 0.00015408560311284047, + "loss": 0.7295, + "step": 183 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 0.4865438640117645, + "learning_rate": 0.00015382619974059665, + "loss": 0.8251, + "step": 184 + }, + { + "epoch": 0.23832528180354268, + "grad_norm": 0.4978322386741638, + "learning_rate": 0.0001535667963683528, + "loss": 0.9334, + "step": 185 + }, + { + "epoch": 0.2396135265700483, + "grad_norm": 0.434435099363327, + "learning_rate": 0.00015330739299610898, + "loss": 0.9299, + "step": 186 + }, + { + "epoch": 0.24090177133655394, + "grad_norm": 0.5044904947280884, + "learning_rate": 0.0001530479896238651, + "loss": 0.7411, + "step": 187 + }, + { + "epoch": 0.24219001610305957, + "grad_norm": 0.4364910423755646, + "learning_rate": 0.00015278858625162128, + "loss": 0.8248, + "step": 188 + }, + { + "epoch": 0.24347826086956523, + "grad_norm": 0.46096572279930115, + "learning_rate": 0.00015252918287937743, + "loss": 0.8211, + "step": 189 + }, + { + "epoch": 0.24476650563607086, + "grad_norm": 0.4325025677680969, + "learning_rate": 0.00015226977950713358, + "loss": 0.7043, + "step": 190 + }, + { + "epoch": 0.2460547504025765, + "grad_norm": 0.4898943305015564, + "learning_rate": 0.00015201037613488976, + "loss": 0.7608, + "step": 191 + }, + { + "epoch": 0.24734299516908212, + "grad_norm": 0.47487872838974, + "learning_rate": 0.0001517509727626459, + "loss": 0.7175, + "step": 192 + }, + { + "epoch": 0.24863123993558775, + "grad_norm": 0.4339347779750824, + "learning_rate": 0.0001514915693904021, + "loss": 0.8499, + "step": 193 + }, + { + "epoch": 0.2499194847020934, + "grad_norm": 0.46825259923934937, + "learning_rate": 0.00015123216601815824, + "loss": 0.621, + "step": 194 + }, + { + "epoch": 0.25120772946859904, + "grad_norm": 0.4948033094406128, + "learning_rate": 0.0001509727626459144, + "loss": 0.6888, + "step": 195 + }, + { + "epoch": 0.25249597423510467, + "grad_norm": 0.4327951967716217, + "learning_rate": 0.00015071335927367057, + "loss": 0.6128, + "step": 196 + }, + { + "epoch": 0.2537842190016103, + "grad_norm": 0.569115161895752, + "learning_rate": 0.00015045395590142672, + "loss": 0.8251, + "step": 197 + }, + { + "epoch": 0.25507246376811593, + "grad_norm": 0.47008320689201355, + "learning_rate": 0.0001501945525291829, + "loss": 0.8214, + "step": 198 + }, + { + "epoch": 0.25636070853462156, + "grad_norm": 0.4881947636604309, + "learning_rate": 0.00014993514915693904, + "loss": 0.6731, + "step": 199 + }, + { + "epoch": 0.2576489533011272, + "grad_norm": 0.5395270586013794, + "learning_rate": 0.00014967574578469522, + "loss": 0.8095, + "step": 200 + }, + { + "epoch": 0.2589371980676328, + "grad_norm": 0.44902658462524414, + "learning_rate": 0.00014941634241245137, + "loss": 0.7042, + "step": 201 + }, + { + "epoch": 0.2602254428341385, + "grad_norm": 0.5789260268211365, + "learning_rate": 0.00014915693904020752, + "loss": 0.9071, + "step": 202 + }, + { + "epoch": 0.26151368760064414, + "grad_norm": 0.48466676473617554, + "learning_rate": 0.0001488975356679637, + "loss": 0.7318, + "step": 203 + }, + { + "epoch": 0.26280193236714977, + "grad_norm": 0.4419580101966858, + "learning_rate": 0.00014863813229571985, + "loss": 0.7128, + "step": 204 + }, + { + "epoch": 0.2640901771336554, + "grad_norm": 0.4542410969734192, + "learning_rate": 0.00014837872892347603, + "loss": 0.7075, + "step": 205 + }, + { + "epoch": 0.26537842190016103, + "grad_norm": 0.49915802478790283, + "learning_rate": 0.00014811932555123218, + "loss": 0.8091, + "step": 206 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.39728543162345886, + "learning_rate": 0.00014785992217898833, + "loss": 0.6258, + "step": 207 + }, + { + "epoch": 0.2679549114331723, + "grad_norm": 0.524169385433197, + "learning_rate": 0.0001476005188067445, + "loss": 0.73, + "step": 208 + }, + { + "epoch": 0.2692431561996779, + "grad_norm": 0.4486137330532074, + "learning_rate": 0.00014734111543450066, + "loss": 0.7607, + "step": 209 + }, + { + "epoch": 0.27053140096618356, + "grad_norm": 0.5274791717529297, + "learning_rate": 0.00014708171206225684, + "loss": 0.6731, + "step": 210 + }, + { + "epoch": 0.2718196457326892, + "grad_norm": 0.44794782996177673, + "learning_rate": 0.00014682230869001296, + "loss": 0.5291, + "step": 211 + }, + { + "epoch": 0.27310789049919487, + "grad_norm": 0.48657894134521484, + "learning_rate": 0.00014656290531776914, + "loss": 0.6754, + "step": 212 + }, + { + "epoch": 0.2743961352657005, + "grad_norm": 0.49806416034698486, + "learning_rate": 0.0001463035019455253, + "loss": 0.7096, + "step": 213 + }, + { + "epoch": 0.27568438003220613, + "grad_norm": 0.49381333589553833, + "learning_rate": 0.00014604409857328144, + "loss": 0.5939, + "step": 214 + }, + { + "epoch": 0.27697262479871176, + "grad_norm": 0.4638739824295044, + "learning_rate": 0.00014578469520103762, + "loss": 0.6444, + "step": 215 + }, + { + "epoch": 0.2782608695652174, + "grad_norm": 0.5256271362304688, + "learning_rate": 0.00014552529182879377, + "loss": 0.7595, + "step": 216 + }, + { + "epoch": 0.279549114331723, + "grad_norm": 0.47106048464775085, + "learning_rate": 0.00014526588845654995, + "loss": 0.6394, + "step": 217 + }, + { + "epoch": 0.28083735909822866, + "grad_norm": 0.5482437610626221, + "learning_rate": 0.0001450064850843061, + "loss": 0.7181, + "step": 218 + }, + { + "epoch": 0.2821256038647343, + "grad_norm": 0.4711976945400238, + "learning_rate": 0.00014474708171206225, + "loss": 0.7207, + "step": 219 + }, + { + "epoch": 0.2834138486312399, + "grad_norm": 0.5149180293083191, + "learning_rate": 0.00014448767833981843, + "loss": 0.8199, + "step": 220 + }, + { + "epoch": 0.28470209339774555, + "grad_norm": 0.452908992767334, + "learning_rate": 0.00014422827496757458, + "loss": 0.6987, + "step": 221 + }, + { + "epoch": 0.28599033816425123, + "grad_norm": 0.5486910343170166, + "learning_rate": 0.00014396887159533075, + "loss": 0.7726, + "step": 222 + }, + { + "epoch": 0.28727858293075687, + "grad_norm": 0.5290431380271912, + "learning_rate": 0.0001437094682230869, + "loss": 0.8298, + "step": 223 + }, + { + "epoch": 0.2885668276972625, + "grad_norm": 0.49307680130004883, + "learning_rate": 0.00014345006485084306, + "loss": 0.7525, + "step": 224 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.5979593396186829, + "learning_rate": 0.00014319066147859923, + "loss": 0.8451, + "step": 225 + }, + { + "epoch": 0.29114331723027376, + "grad_norm": 0.49994269013404846, + "learning_rate": 0.00014293125810635538, + "loss": 0.6975, + "step": 226 + }, + { + "epoch": 0.2924315619967794, + "grad_norm": 0.5523327589035034, + "learning_rate": 0.00014267185473411156, + "loss": 0.7264, + "step": 227 + }, + { + "epoch": 0.293719806763285, + "grad_norm": 0.5106574296951294, + "learning_rate": 0.0001424124513618677, + "loss": 0.7794, + "step": 228 + }, + { + "epoch": 0.29500805152979065, + "grad_norm": 0.458646297454834, + "learning_rate": 0.0001421530479896239, + "loss": 0.8118, + "step": 229 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.5162986516952515, + "learning_rate": 0.00014189364461738004, + "loss": 0.8167, + "step": 230 + }, + { + "epoch": 0.2975845410628019, + "grad_norm": 0.47405433654785156, + "learning_rate": 0.0001416342412451362, + "loss": 0.7852, + "step": 231 + }, + { + "epoch": 0.29887278582930754, + "grad_norm": 0.5881102681159973, + "learning_rate": 0.00014137483787289237, + "loss": 0.9897, + "step": 232 + }, + { + "epoch": 0.3001610305958132, + "grad_norm": 0.4673059582710266, + "learning_rate": 0.00014111543450064852, + "loss": 0.7341, + "step": 233 + }, + { + "epoch": 0.30144927536231886, + "grad_norm": 0.48171284794807434, + "learning_rate": 0.0001408560311284047, + "loss": 0.7156, + "step": 234 + }, + { + "epoch": 0.3027375201288245, + "grad_norm": 0.43746286630630493, + "learning_rate": 0.00014059662775616082, + "loss": 0.6003, + "step": 235 + }, + { + "epoch": 0.3040257648953301, + "grad_norm": 0.46966665983200073, + "learning_rate": 0.000140337224383917, + "loss": 0.718, + "step": 236 + }, + { + "epoch": 0.30531400966183575, + "grad_norm": 0.4956988990306854, + "learning_rate": 0.00014007782101167315, + "loss": 0.6542, + "step": 237 + }, + { + "epoch": 0.3066022544283414, + "grad_norm": 0.5336653590202332, + "learning_rate": 0.0001398184176394293, + "loss": 0.7719, + "step": 238 + }, + { + "epoch": 0.307890499194847, + "grad_norm": 0.510515034198761, + "learning_rate": 0.00013955901426718548, + "loss": 0.8369, + "step": 239 + }, + { + "epoch": 0.30917874396135264, + "grad_norm": 0.4901074469089508, + "learning_rate": 0.00013929961089494163, + "loss": 0.7973, + "step": 240 + }, + { + "epoch": 0.3104669887278583, + "grad_norm": 0.5074118375778198, + "learning_rate": 0.0001390402075226978, + "loss": 0.8418, + "step": 241 + }, + { + "epoch": 0.3117552334943639, + "grad_norm": 0.48613104224205017, + "learning_rate": 0.00013878080415045396, + "loss": 0.7661, + "step": 242 + }, + { + "epoch": 0.3130434782608696, + "grad_norm": 0.527791440486908, + "learning_rate": 0.0001385214007782101, + "loss": 0.6461, + "step": 243 + }, + { + "epoch": 0.3143317230273752, + "grad_norm": 0.539172887802124, + "learning_rate": 0.0001382619974059663, + "loss": 0.7588, + "step": 244 + }, + { + "epoch": 0.31561996779388085, + "grad_norm": 0.4465171694755554, + "learning_rate": 0.00013800259403372244, + "loss": 0.6897, + "step": 245 + }, + { + "epoch": 0.3169082125603865, + "grad_norm": 0.44620922207832336, + "learning_rate": 0.00013774319066147862, + "loss": 0.591, + "step": 246 + }, + { + "epoch": 0.3181964573268921, + "grad_norm": 0.44383737444877625, + "learning_rate": 0.00013748378728923477, + "loss": 0.822, + "step": 247 + }, + { + "epoch": 0.31948470209339774, + "grad_norm": 0.5062816739082336, + "learning_rate": 0.00013722438391699092, + "loss": 0.7657, + "step": 248 + }, + { + "epoch": 0.3207729468599034, + "grad_norm": 0.4794199764728546, + "learning_rate": 0.0001369649805447471, + "loss": 0.6533, + "step": 249 + }, + { + "epoch": 0.322061191626409, + "grad_norm": 0.506678581237793, + "learning_rate": 0.00013670557717250325, + "loss": 0.6881, + "step": 250 + }, + { + "epoch": 0.32334943639291464, + "grad_norm": 0.5363421440124512, + "learning_rate": 0.00013644617380025942, + "loss": 0.7263, + "step": 251 + }, + { + "epoch": 0.32463768115942027, + "grad_norm": 0.4600725769996643, + "learning_rate": 0.00013618677042801557, + "loss": 0.6522, + "step": 252 + }, + { + "epoch": 0.32592592592592595, + "grad_norm": 0.4250006377696991, + "learning_rate": 0.00013592736705577172, + "loss": 0.5492, + "step": 253 + }, + { + "epoch": 0.3272141706924316, + "grad_norm": 0.5984755158424377, + "learning_rate": 0.0001356679636835279, + "loss": 0.7152, + "step": 254 + }, + { + "epoch": 0.3285024154589372, + "grad_norm": 0.4653768241405487, + "learning_rate": 0.00013540856031128405, + "loss": 0.6651, + "step": 255 + }, + { + "epoch": 0.32979066022544284, + "grad_norm": 0.5344521403312683, + "learning_rate": 0.00013514915693904023, + "loss": 0.736, + "step": 256 + }, + { + "epoch": 0.3310789049919485, + "grad_norm": 0.469061017036438, + "learning_rate": 0.00013488975356679638, + "loss": 0.5771, + "step": 257 + }, + { + "epoch": 0.3323671497584541, + "grad_norm": 0.46232855319976807, + "learning_rate": 0.00013463035019455256, + "loss": 0.6887, + "step": 258 + }, + { + "epoch": 0.33365539452495974, + "grad_norm": 0.4812975525856018, + "learning_rate": 0.00013437094682230868, + "loss": 0.8316, + "step": 259 + }, + { + "epoch": 0.33494363929146537, + "grad_norm": 0.5068632960319519, + "learning_rate": 0.00013411154345006486, + "loss": 0.7372, + "step": 260 + }, + { + "epoch": 0.336231884057971, + "grad_norm": 0.42497095465660095, + "learning_rate": 0.000133852140077821, + "loss": 0.7469, + "step": 261 + }, + { + "epoch": 0.33752012882447663, + "grad_norm": 0.49439537525177, + "learning_rate": 0.00013359273670557716, + "loss": 0.6429, + "step": 262 + }, + { + "epoch": 0.33880837359098226, + "grad_norm": 0.4804583787918091, + "learning_rate": 0.00013333333333333334, + "loss": 0.7772, + "step": 263 + }, + { + "epoch": 0.34009661835748795, + "grad_norm": 0.46911564469337463, + "learning_rate": 0.0001330739299610895, + "loss": 0.5994, + "step": 264 + }, + { + "epoch": 0.3413848631239936, + "grad_norm": 0.5286073088645935, + "learning_rate": 0.00013281452658884567, + "loss": 0.6459, + "step": 265 + }, + { + "epoch": 0.3426731078904992, + "grad_norm": 0.48704788088798523, + "learning_rate": 0.00013255512321660182, + "loss": 0.6466, + "step": 266 + }, + { + "epoch": 0.34396135265700484, + "grad_norm": 0.5040203332901001, + "learning_rate": 0.00013229571984435797, + "loss": 0.7436, + "step": 267 + }, + { + "epoch": 0.34524959742351047, + "grad_norm": 0.48882773518562317, + "learning_rate": 0.00013203631647211415, + "loss": 0.7009, + "step": 268 + }, + { + "epoch": 0.3465378421900161, + "grad_norm": 0.5158678889274597, + "learning_rate": 0.0001317769130998703, + "loss": 0.6862, + "step": 269 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.489501416683197, + "learning_rate": 0.00013151750972762648, + "loss": 0.6378, + "step": 270 + }, + { + "epoch": 0.34911433172302736, + "grad_norm": 0.42305371165275574, + "learning_rate": 0.00013125810635538263, + "loss": 0.593, + "step": 271 + }, + { + "epoch": 0.350402576489533, + "grad_norm": 0.5226255059242249, + "learning_rate": 0.00013099870298313878, + "loss": 0.7828, + "step": 272 + }, + { + "epoch": 0.3516908212560386, + "grad_norm": 0.4217074513435364, + "learning_rate": 0.00013073929961089496, + "loss": 0.6397, + "step": 273 + }, + { + "epoch": 0.3529790660225443, + "grad_norm": 0.46896272897720337, + "learning_rate": 0.0001304798962386511, + "loss": 0.614, + "step": 274 + }, + { + "epoch": 0.35426731078904994, + "grad_norm": 0.47062304615974426, + "learning_rate": 0.00013022049286640728, + "loss": 0.6892, + "step": 275 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.4669751822948456, + "learning_rate": 0.00012996108949416343, + "loss": 0.8675, + "step": 276 + }, + { + "epoch": 0.3568438003220612, + "grad_norm": 0.4246136546134949, + "learning_rate": 0.00012970168612191958, + "loss": 0.6467, + "step": 277 + }, + { + "epoch": 0.35813204508856683, + "grad_norm": 0.42293113470077515, + "learning_rate": 0.00012944228274967576, + "loss": 0.6006, + "step": 278 + }, + { + "epoch": 0.35942028985507246, + "grad_norm": 0.44599637389183044, + "learning_rate": 0.0001291828793774319, + "loss": 0.6241, + "step": 279 + }, + { + "epoch": 0.3607085346215781, + "grad_norm": 0.4490668773651123, + "learning_rate": 0.0001289234760051881, + "loss": 0.5644, + "step": 280 + }, + { + "epoch": 0.3619967793880837, + "grad_norm": 0.5100782513618469, + "learning_rate": 0.00012866407263294424, + "loss": 0.7378, + "step": 281 + }, + { + "epoch": 0.36328502415458935, + "grad_norm": 0.4394833445549011, + "learning_rate": 0.0001284046692607004, + "loss": 0.6662, + "step": 282 + }, + { + "epoch": 0.364573268921095, + "grad_norm": 0.49244457483291626, + "learning_rate": 0.00012814526588845657, + "loss": 0.7512, + "step": 283 + }, + { + "epoch": 0.36586151368760067, + "grad_norm": 0.4558521807193756, + "learning_rate": 0.0001278858625162127, + "loss": 0.7213, + "step": 284 + }, + { + "epoch": 0.3671497584541063, + "grad_norm": 0.6079721450805664, + "learning_rate": 0.00012762645914396887, + "loss": 0.7615, + "step": 285 + }, + { + "epoch": 0.36843800322061193, + "grad_norm": 0.5249935984611511, + "learning_rate": 0.00012736705577172502, + "loss": 0.8172, + "step": 286 + }, + { + "epoch": 0.36972624798711756, + "grad_norm": 0.5798977613449097, + "learning_rate": 0.0001271076523994812, + "loss": 0.8244, + "step": 287 + }, + { + "epoch": 0.3710144927536232, + "grad_norm": 0.496056467294693, + "learning_rate": 0.00012684824902723735, + "loss": 0.8799, + "step": 288 + }, + { + "epoch": 0.3723027375201288, + "grad_norm": 0.47068995237350464, + "learning_rate": 0.00012658884565499353, + "loss": 0.8069, + "step": 289 + }, + { + "epoch": 0.37359098228663445, + "grad_norm": 0.5302271842956543, + "learning_rate": 0.00012632944228274968, + "loss": 0.7593, + "step": 290 + }, + { + "epoch": 0.3748792270531401, + "grad_norm": 0.5044103860855103, + "learning_rate": 0.00012607003891050583, + "loss": 0.7462, + "step": 291 + }, + { + "epoch": 0.3761674718196457, + "grad_norm": 0.4707060158252716, + "learning_rate": 0.000125810635538262, + "loss": 0.6593, + "step": 292 + }, + { + "epoch": 0.37745571658615135, + "grad_norm": 0.5337527394294739, + "learning_rate": 0.00012555123216601816, + "loss": 0.6138, + "step": 293 + }, + { + "epoch": 0.37874396135265703, + "grad_norm": 0.5467652082443237, + "learning_rate": 0.00012529182879377434, + "loss": 0.8375, + "step": 294 + }, + { + "epoch": 0.38003220611916266, + "grad_norm": 0.48266416788101196, + "learning_rate": 0.0001250324254215305, + "loss": 0.6897, + "step": 295 + }, + { + "epoch": 0.3813204508856683, + "grad_norm": 0.49726054072380066, + "learning_rate": 0.00012477302204928664, + "loss": 0.8202, + "step": 296 + }, + { + "epoch": 0.3826086956521739, + "grad_norm": 0.5109860301017761, + "learning_rate": 0.00012451361867704282, + "loss": 0.7937, + "step": 297 + }, + { + "epoch": 0.38389694041867956, + "grad_norm": 0.44613054394721985, + "learning_rate": 0.00012425421530479897, + "loss": 0.7205, + "step": 298 + }, + { + "epoch": 0.3851851851851852, + "grad_norm": 0.5678048729896545, + "learning_rate": 0.00012399481193255514, + "loss": 0.8164, + "step": 299 + }, + { + "epoch": 0.3864734299516908, + "grad_norm": 0.4355293810367584, + "learning_rate": 0.0001237354085603113, + "loss": 0.5967, + "step": 300 + }, + { + "epoch": 0.38776167471819645, + "grad_norm": 0.5225346088409424, + "learning_rate": 0.00012347600518806745, + "loss": 0.7688, + "step": 301 + }, + { + "epoch": 0.3890499194847021, + "grad_norm": 0.47630950808525085, + "learning_rate": 0.00012321660181582362, + "loss": 0.6535, + "step": 302 + }, + { + "epoch": 0.3903381642512077, + "grad_norm": 0.48992452025413513, + "learning_rate": 0.00012295719844357977, + "loss": 0.652, + "step": 303 + }, + { + "epoch": 0.39162640901771334, + "grad_norm": 0.4927466809749603, + "learning_rate": 0.00012269779507133595, + "loss": 0.6116, + "step": 304 + }, + { + "epoch": 0.392914653784219, + "grad_norm": 0.4766499400138855, + "learning_rate": 0.0001224383916990921, + "loss": 0.6457, + "step": 305 + }, + { + "epoch": 0.39420289855072466, + "grad_norm": 0.49338245391845703, + "learning_rate": 0.00012217898832684825, + "loss": 0.6211, + "step": 306 + }, + { + "epoch": 0.3954911433172303, + "grad_norm": 0.5238732099533081, + "learning_rate": 0.00012191958495460443, + "loss": 0.7313, + "step": 307 + }, + { + "epoch": 0.3967793880837359, + "grad_norm": 0.494093656539917, + "learning_rate": 0.00012166018158236057, + "loss": 0.7583, + "step": 308 + }, + { + "epoch": 0.39806763285024155, + "grad_norm": 0.46139660477638245, + "learning_rate": 0.00012140077821011673, + "loss": 0.6841, + "step": 309 + }, + { + "epoch": 0.3993558776167472, + "grad_norm": 0.4901793897151947, + "learning_rate": 0.00012114137483787288, + "loss": 0.6862, + "step": 310 + }, + { + "epoch": 0.4006441223832528, + "grad_norm": 0.4695977568626404, + "learning_rate": 0.00012088197146562905, + "loss": 0.6428, + "step": 311 + }, + { + "epoch": 0.40193236714975844, + "grad_norm": 0.4964921772480011, + "learning_rate": 0.00012062256809338521, + "loss": 0.6061, + "step": 312 + }, + { + "epoch": 0.40322061191626407, + "grad_norm": 0.5101466178894043, + "learning_rate": 0.00012036316472114138, + "loss": 0.8195, + "step": 313 + }, + { + "epoch": 0.4045088566827697, + "grad_norm": 0.470225989818573, + "learning_rate": 0.00012010376134889754, + "loss": 0.681, + "step": 314 + }, + { + "epoch": 0.4057971014492754, + "grad_norm": 0.4532884955406189, + "learning_rate": 0.0001198443579766537, + "loss": 0.7239, + "step": 315 + }, + { + "epoch": 0.407085346215781, + "grad_norm": 0.4604836106300354, + "learning_rate": 0.00011958495460440985, + "loss": 0.7433, + "step": 316 + }, + { + "epoch": 0.40837359098228665, + "grad_norm": 0.4511779546737671, + "learning_rate": 0.00011932555123216602, + "loss": 0.7404, + "step": 317 + }, + { + "epoch": 0.4096618357487923, + "grad_norm": 0.5277577042579651, + "learning_rate": 0.00011906614785992218, + "loss": 0.8757, + "step": 318 + }, + { + "epoch": 0.4109500805152979, + "grad_norm": 0.444564551115036, + "learning_rate": 0.00011880674448767835, + "loss": 0.6465, + "step": 319 + }, + { + "epoch": 0.41223832528180354, + "grad_norm": 0.4861951470375061, + "learning_rate": 0.00011854734111543451, + "loss": 0.8336, + "step": 320 + }, + { + "epoch": 0.41352657004830917, + "grad_norm": 0.4412696957588196, + "learning_rate": 0.00011828793774319066, + "loss": 0.7586, + "step": 321 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 0.5230206251144409, + "learning_rate": 0.00011802853437094683, + "loss": 0.7423, + "step": 322 + }, + { + "epoch": 0.41610305958132043, + "grad_norm": 0.4539431631565094, + "learning_rate": 0.00011776913099870299, + "loss": 0.6849, + "step": 323 + }, + { + "epoch": 0.41739130434782606, + "grad_norm": 0.5001434683799744, + "learning_rate": 0.00011750972762645916, + "loss": 0.6527, + "step": 324 + }, + { + "epoch": 0.41867954911433175, + "grad_norm": 0.5230083465576172, + "learning_rate": 0.00011725032425421532, + "loss": 0.6829, + "step": 325 + }, + { + "epoch": 0.4199677938808374, + "grad_norm": 0.5428875684738159, + "learning_rate": 0.00011699092088197148, + "loss": 0.6075, + "step": 326 + }, + { + "epoch": 0.421256038647343, + "grad_norm": 0.49785757064819336, + "learning_rate": 0.00011673151750972763, + "loss": 0.6696, + "step": 327 + }, + { + "epoch": 0.42254428341384864, + "grad_norm": 0.5448641180992126, + "learning_rate": 0.0001164721141374838, + "loss": 0.7752, + "step": 328 + }, + { + "epoch": 0.4238325281803543, + "grad_norm": 0.6280490159988403, + "learning_rate": 0.00011621271076523996, + "loss": 0.8681, + "step": 329 + }, + { + "epoch": 0.4251207729468599, + "grad_norm": 0.5525287389755249, + "learning_rate": 0.00011595330739299613, + "loss": 0.8434, + "step": 330 + }, + { + "epoch": 0.42640901771336553, + "grad_norm": 0.4954991042613983, + "learning_rate": 0.00011569390402075229, + "loss": 0.7923, + "step": 331 + }, + { + "epoch": 0.42769726247987117, + "grad_norm": 0.46500164270401, + "learning_rate": 0.00011543450064850843, + "loss": 0.7084, + "step": 332 + }, + { + "epoch": 0.4289855072463768, + "grad_norm": 0.5183458924293518, + "learning_rate": 0.00011517509727626459, + "loss": 0.754, + "step": 333 + }, + { + "epoch": 0.4302737520128824, + "grad_norm": 0.521300733089447, + "learning_rate": 0.00011491569390402074, + "loss": 0.7481, + "step": 334 + }, + { + "epoch": 0.43156199677938806, + "grad_norm": 0.46088019013404846, + "learning_rate": 0.00011465629053177691, + "loss": 0.5601, + "step": 335 + }, + { + "epoch": 0.43285024154589374, + "grad_norm": 0.5142108798027039, + "learning_rate": 0.00011439688715953307, + "loss": 0.8001, + "step": 336 + }, + { + "epoch": 0.4341384863123994, + "grad_norm": 0.41947636008262634, + "learning_rate": 0.00011413748378728924, + "loss": 0.6669, + "step": 337 + }, + { + "epoch": 0.435426731078905, + "grad_norm": 0.4584703743457794, + "learning_rate": 0.0001138780804150454, + "loss": 0.702, + "step": 338 + }, + { + "epoch": 0.43671497584541064, + "grad_norm": 0.4480314254760742, + "learning_rate": 0.00011361867704280155, + "loss": 0.6379, + "step": 339 + }, + { + "epoch": 0.43800322061191627, + "grad_norm": 0.49402984976768494, + "learning_rate": 0.00011335927367055772, + "loss": 0.7751, + "step": 340 + }, + { + "epoch": 0.4392914653784219, + "grad_norm": 0.5001116991043091, + "learning_rate": 0.00011309987029831388, + "loss": 0.7157, + "step": 341 + }, + { + "epoch": 0.4405797101449275, + "grad_norm": 0.4650849401950836, + "learning_rate": 0.00011284046692607004, + "loss": 0.5801, + "step": 342 + }, + { + "epoch": 0.44186795491143316, + "grad_norm": 0.5000032186508179, + "learning_rate": 0.00011258106355382621, + "loss": 0.8127, + "step": 343 + }, + { + "epoch": 0.4431561996779388, + "grad_norm": 0.5941475033760071, + "learning_rate": 0.00011232166018158237, + "loss": 0.8227, + "step": 344 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.49535176157951355, + "learning_rate": 0.00011206225680933852, + "loss": 0.6376, + "step": 345 + }, + { + "epoch": 0.4457326892109501, + "grad_norm": 0.46945926547050476, + "learning_rate": 0.00011180285343709469, + "loss": 0.6801, + "step": 346 + }, + { + "epoch": 0.44702093397745574, + "grad_norm": 0.47991520166397095, + "learning_rate": 0.00011154345006485085, + "loss": 0.6071, + "step": 347 + }, + { + "epoch": 0.44830917874396137, + "grad_norm": 0.45372679829597473, + "learning_rate": 0.00011128404669260702, + "loss": 0.7388, + "step": 348 + }, + { + "epoch": 0.449597423510467, + "grad_norm": 0.5295307636260986, + "learning_rate": 0.00011102464332036318, + "loss": 0.776, + "step": 349 + }, + { + "epoch": 0.45088566827697263, + "grad_norm": 0.516298770904541, + "learning_rate": 0.00011076523994811933, + "loss": 0.7546, + "step": 350 + }, + { + "epoch": 0.45217391304347826, + "grad_norm": 0.4629455804824829, + "learning_rate": 0.0001105058365758755, + "loss": 0.5773, + "step": 351 + }, + { + "epoch": 0.4534621578099839, + "grad_norm": 0.4974667727947235, + "learning_rate": 0.00011024643320363166, + "loss": 0.6464, + "step": 352 + }, + { + "epoch": 0.4547504025764895, + "grad_norm": 0.47429102659225464, + "learning_rate": 0.00010998702983138782, + "loss": 0.7503, + "step": 353 + }, + { + "epoch": 0.45603864734299515, + "grad_norm": 0.5169098377227783, + "learning_rate": 0.00010972762645914399, + "loss": 0.7697, + "step": 354 + }, + { + "epoch": 0.4573268921095008, + "grad_norm": 0.6083032488822937, + "learning_rate": 0.00010946822308690015, + "loss": 0.7145, + "step": 355 + }, + { + "epoch": 0.45861513687600647, + "grad_norm": 0.6092599034309387, + "learning_rate": 0.00010920881971465629, + "loss": 0.9839, + "step": 356 + }, + { + "epoch": 0.4599033816425121, + "grad_norm": 0.47699296474456787, + "learning_rate": 0.00010894941634241245, + "loss": 0.7013, + "step": 357 + }, + { + "epoch": 0.46119162640901773, + "grad_norm": 0.44026511907577515, + "learning_rate": 0.0001086900129701686, + "loss": 0.7314, + "step": 358 + }, + { + "epoch": 0.46247987117552336, + "grad_norm": 0.5326471328735352, + "learning_rate": 0.00010843060959792477, + "loss": 0.8708, + "step": 359 + }, + { + "epoch": 0.463768115942029, + "grad_norm": 0.5188657641410828, + "learning_rate": 0.00010817120622568093, + "loss": 0.6573, + "step": 360 + }, + { + "epoch": 0.4650563607085346, + "grad_norm": 0.5846801400184631, + "learning_rate": 0.0001079118028534371, + "loss": 0.7319, + "step": 361 + }, + { + "epoch": 0.46634460547504025, + "grad_norm": 0.5272177457809448, + "learning_rate": 0.00010765239948119326, + "loss": 0.6251, + "step": 362 + }, + { + "epoch": 0.4676328502415459, + "grad_norm": 0.5060721635818481, + "learning_rate": 0.00010739299610894941, + "loss": 0.6675, + "step": 363 + }, + { + "epoch": 0.4689210950080515, + "grad_norm": 0.5200803279876709, + "learning_rate": 0.00010713359273670558, + "loss": 0.6373, + "step": 364 + }, + { + "epoch": 0.47020933977455714, + "grad_norm": 0.5527567863464355, + "learning_rate": 0.00010687418936446174, + "loss": 0.9144, + "step": 365 + }, + { + "epoch": 0.4714975845410628, + "grad_norm": 0.5247730016708374, + "learning_rate": 0.0001066147859922179, + "loss": 0.7283, + "step": 366 + }, + { + "epoch": 0.47278582930756846, + "grad_norm": 0.482681006193161, + "learning_rate": 0.00010635538261997407, + "loss": 0.8382, + "step": 367 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 0.5045844316482544, + "learning_rate": 0.00010609597924773022, + "loss": 0.8324, + "step": 368 + }, + { + "epoch": 0.4753623188405797, + "grad_norm": 0.500696063041687, + "learning_rate": 0.00010583657587548638, + "loss": 0.6749, + "step": 369 + }, + { + "epoch": 0.47665056360708535, + "grad_norm": 0.49296805262565613, + "learning_rate": 0.00010557717250324255, + "loss": 0.8396, + "step": 370 + }, + { + "epoch": 0.477938808373591, + "grad_norm": 0.5083613395690918, + "learning_rate": 0.00010531776913099871, + "loss": 0.596, + "step": 371 + }, + { + "epoch": 0.4792270531400966, + "grad_norm": 0.6000961065292358, + "learning_rate": 0.00010505836575875488, + "loss": 0.7097, + "step": 372 + }, + { + "epoch": 0.48051529790660225, + "grad_norm": 0.47504574060440063, + "learning_rate": 0.00010479896238651104, + "loss": 0.676, + "step": 373 + }, + { + "epoch": 0.4818035426731079, + "grad_norm": 0.4866791069507599, + "learning_rate": 0.00010453955901426719, + "loss": 0.6026, + "step": 374 + }, + { + "epoch": 0.4830917874396135, + "grad_norm": 0.5388527512550354, + "learning_rate": 0.00010428015564202336, + "loss": 0.78, + "step": 375 + }, + { + "epoch": 0.48438003220611914, + "grad_norm": 0.5430642366409302, + "learning_rate": 0.00010402075226977952, + "loss": 0.7898, + "step": 376 + }, + { + "epoch": 0.4856682769726248, + "grad_norm": 0.5378901362419128, + "learning_rate": 0.00010376134889753568, + "loss": 0.8215, + "step": 377 + }, + { + "epoch": 0.48695652173913045, + "grad_norm": 0.46278834342956543, + "learning_rate": 0.00010350194552529185, + "loss": 0.786, + "step": 378 + }, + { + "epoch": 0.4882447665056361, + "grad_norm": 0.5695458650588989, + "learning_rate": 0.000103242542153048, + "loss": 0.7929, + "step": 379 + }, + { + "epoch": 0.4895330112721417, + "grad_norm": 0.5052254796028137, + "learning_rate": 0.00010298313878080415, + "loss": 0.8047, + "step": 380 + }, + { + "epoch": 0.49082125603864735, + "grad_norm": 0.45410144329071045, + "learning_rate": 0.0001027237354085603, + "loss": 0.6309, + "step": 381 + }, + { + "epoch": 0.492109500805153, + "grad_norm": 0.5507941842079163, + "learning_rate": 0.00010246433203631646, + "loss": 0.7374, + "step": 382 + }, + { + "epoch": 0.4933977455716586, + "grad_norm": 0.4703005850315094, + "learning_rate": 0.00010220492866407263, + "loss": 0.5724, + "step": 383 + }, + { + "epoch": 0.49468599033816424, + "grad_norm": 0.5034976601600647, + "learning_rate": 0.0001019455252918288, + "loss": 0.6829, + "step": 384 + }, + { + "epoch": 0.49597423510466987, + "grad_norm": 0.5183707475662231, + "learning_rate": 0.00010168612191958496, + "loss": 0.6716, + "step": 385 + }, + { + "epoch": 0.4972624798711755, + "grad_norm": 0.5549296736717224, + "learning_rate": 0.00010142671854734112, + "loss": 0.7464, + "step": 386 + }, + { + "epoch": 0.4985507246376812, + "grad_norm": 0.48852047324180603, + "learning_rate": 0.00010116731517509727, + "loss": 0.649, + "step": 387 + }, + { + "epoch": 0.4998389694041868, + "grad_norm": 0.5118862986564636, + "learning_rate": 0.00010090791180285344, + "loss": 0.6043, + "step": 388 + }, + { + "epoch": 0.5011272141706924, + "grad_norm": 0.5366110801696777, + "learning_rate": 0.0001006485084306096, + "loss": 0.7139, + "step": 389 + }, + { + "epoch": 0.5024154589371981, + "grad_norm": 0.5275729894638062, + "learning_rate": 0.00010038910505836577, + "loss": 0.7035, + "step": 390 + }, + { + "epoch": 0.5037037037037037, + "grad_norm": 0.5201203227043152, + "learning_rate": 0.00010012970168612193, + "loss": 0.716, + "step": 391 + }, + { + "epoch": 0.5049919484702093, + "grad_norm": 0.5168887376785278, + "learning_rate": 9.987029831387808e-05, + "loss": 0.8432, + "step": 392 + }, + { + "epoch": 0.506280193236715, + "grad_norm": 0.5083385109901428, + "learning_rate": 9.961089494163424e-05, + "loss": 0.7078, + "step": 393 + }, + { + "epoch": 0.5075684380032206, + "grad_norm": 0.5033498406410217, + "learning_rate": 9.935149156939041e-05, + "loss": 0.6846, + "step": 394 + }, + { + "epoch": 0.5088566827697263, + "grad_norm": 0.5229712128639221, + "learning_rate": 9.909208819714657e-05, + "loss": 0.7517, + "step": 395 + }, + { + "epoch": 0.5101449275362319, + "grad_norm": 0.4493921399116516, + "learning_rate": 9.883268482490274e-05, + "loss": 0.5185, + "step": 396 + }, + { + "epoch": 0.5114331723027375, + "grad_norm": 0.4618862569332123, + "learning_rate": 9.857328145265889e-05, + "loss": 0.6914, + "step": 397 + }, + { + "epoch": 0.5127214170692431, + "grad_norm": 0.5105440020561218, + "learning_rate": 9.831387808041504e-05, + "loss": 0.7408, + "step": 398 + }, + { + "epoch": 0.5140096618357488, + "grad_norm": 0.4876827001571655, + "learning_rate": 9.80544747081712e-05, + "loss": 0.7642, + "step": 399 + }, + { + "epoch": 0.5152979066022544, + "grad_norm": 0.5248561501502991, + "learning_rate": 9.779507133592737e-05, + "loss": 0.6578, + "step": 400 + }, + { + "epoch": 0.5165861513687601, + "grad_norm": 0.4495491087436676, + "learning_rate": 9.753566796368353e-05, + "loss": 0.6296, + "step": 401 + }, + { + "epoch": 0.5178743961352656, + "grad_norm": 0.4628872573375702, + "learning_rate": 9.72762645914397e-05, + "loss": 0.5686, + "step": 402 + }, + { + "epoch": 0.5191626409017713, + "grad_norm": 0.5524469017982483, + "learning_rate": 9.701686121919586e-05, + "loss": 0.8243, + "step": 403 + }, + { + "epoch": 0.520450885668277, + "grad_norm": 0.5526472926139832, + "learning_rate": 9.675745784695201e-05, + "loss": 0.7644, + "step": 404 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.5220494270324707, + "learning_rate": 9.649805447470817e-05, + "loss": 0.7113, + "step": 405 + }, + { + "epoch": 0.5230273752012883, + "grad_norm": 0.4727495610713959, + "learning_rate": 9.623865110246434e-05, + "loss": 0.5613, + "step": 406 + }, + { + "epoch": 0.5243156199677939, + "grad_norm": 0.440445214509964, + "learning_rate": 9.59792477302205e-05, + "loss": 0.5719, + "step": 407 + }, + { + "epoch": 0.5256038647342995, + "grad_norm": 0.520539402961731, + "learning_rate": 9.571984435797667e-05, + "loss": 0.6716, + "step": 408 + }, + { + "epoch": 0.5268921095008051, + "grad_norm": 0.5473395586013794, + "learning_rate": 9.546044098573282e-05, + "loss": 0.6881, + "step": 409 + }, + { + "epoch": 0.5281803542673108, + "grad_norm": 0.5728646516799927, + "learning_rate": 9.520103761348897e-05, + "loss": 0.694, + "step": 410 + }, + { + "epoch": 0.5294685990338164, + "grad_norm": 0.5672905445098877, + "learning_rate": 9.494163424124513e-05, + "loss": 0.7893, + "step": 411 + }, + { + "epoch": 0.5307568438003221, + "grad_norm": 0.5057477355003357, + "learning_rate": 9.46822308690013e-05, + "loss": 0.7957, + "step": 412 + }, + { + "epoch": 0.5320450885668278, + "grad_norm": 0.5638203620910645, + "learning_rate": 9.442282749675746e-05, + "loss": 0.8208, + "step": 413 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4758095145225525, + "learning_rate": 9.416342412451363e-05, + "loss": 0.6516, + "step": 414 + }, + { + "epoch": 0.534621578099839, + "grad_norm": 0.5819146037101746, + "learning_rate": 9.390402075226979e-05, + "loss": 0.7839, + "step": 415 + }, + { + "epoch": 0.5359098228663446, + "grad_norm": 0.5698294639587402, + "learning_rate": 9.364461738002594e-05, + "loss": 0.6847, + "step": 416 + }, + { + "epoch": 0.5371980676328503, + "grad_norm": 0.5539317727088928, + "learning_rate": 9.33852140077821e-05, + "loss": 0.6868, + "step": 417 + }, + { + "epoch": 0.5384863123993558, + "grad_norm": 0.5531253218650818, + "learning_rate": 9.312581063553827e-05, + "loss": 0.6416, + "step": 418 + }, + { + "epoch": 0.5397745571658615, + "grad_norm": 0.5280610918998718, + "learning_rate": 9.286640726329443e-05, + "loss": 0.6711, + "step": 419 + }, + { + "epoch": 0.5410628019323671, + "grad_norm": 0.5485169291496277, + "learning_rate": 9.26070038910506e-05, + "loss": 0.67, + "step": 420 + }, + { + "epoch": 0.5423510466988728, + "grad_norm": 0.632940948009491, + "learning_rate": 9.234760051880675e-05, + "loss": 1.0122, + "step": 421 + }, + { + "epoch": 0.5436392914653784, + "grad_norm": 0.5226237177848816, + "learning_rate": 9.20881971465629e-05, + "loss": 0.6517, + "step": 422 + }, + { + "epoch": 0.5449275362318841, + "grad_norm": 0.5100864768028259, + "learning_rate": 9.182879377431906e-05, + "loss": 0.5886, + "step": 423 + }, + { + "epoch": 0.5462157809983897, + "grad_norm": 0.5091288685798645, + "learning_rate": 9.156939040207523e-05, + "loss": 0.759, + "step": 424 + }, + { + "epoch": 0.5475040257648953, + "grad_norm": 0.5094250440597534, + "learning_rate": 9.130998702983139e-05, + "loss": 0.6407, + "step": 425 + }, + { + "epoch": 0.548792270531401, + "grad_norm": 0.4518897533416748, + "learning_rate": 9.105058365758756e-05, + "loss": 0.5463, + "step": 426 + }, + { + "epoch": 0.5500805152979066, + "grad_norm": 0.5876538753509521, + "learning_rate": 9.07911802853437e-05, + "loss": 0.8909, + "step": 427 + }, + { + "epoch": 0.5513687600644123, + "grad_norm": 0.5553408265113831, + "learning_rate": 9.053177691309987e-05, + "loss": 0.9111, + "step": 428 + }, + { + "epoch": 0.5526570048309178, + "grad_norm": 0.6221159100532532, + "learning_rate": 9.027237354085604e-05, + "loss": 0.8558, + "step": 429 + }, + { + "epoch": 0.5539452495974235, + "grad_norm": 0.5404058694839478, + "learning_rate": 9.00129701686122e-05, + "loss": 0.7609, + "step": 430 + }, + { + "epoch": 0.5552334943639291, + "grad_norm": 0.43805137276649475, + "learning_rate": 8.975356679636836e-05, + "loss": 0.5286, + "step": 431 + }, + { + "epoch": 0.5565217391304348, + "grad_norm": 0.493563175201416, + "learning_rate": 8.949416342412453e-05, + "loss": 0.5727, + "step": 432 + }, + { + "epoch": 0.5578099838969404, + "grad_norm": 0.5368430614471436, + "learning_rate": 8.923476005188068e-05, + "loss": 0.7623, + "step": 433 + }, + { + "epoch": 0.559098228663446, + "grad_norm": 0.4323422312736511, + "learning_rate": 8.897535667963683e-05, + "loss": 0.5958, + "step": 434 + }, + { + "epoch": 0.5603864734299517, + "grad_norm": 0.49179717898368835, + "learning_rate": 8.8715953307393e-05, + "loss": 0.681, + "step": 435 + }, + { + "epoch": 0.5616747181964573, + "grad_norm": 0.40715619921684265, + "learning_rate": 8.845654993514916e-05, + "loss": 0.6298, + "step": 436 + }, + { + "epoch": 0.562962962962963, + "grad_norm": 0.6095149517059326, + "learning_rate": 8.819714656290532e-05, + "loss": 1.0039, + "step": 437 + }, + { + "epoch": 0.5642512077294686, + "grad_norm": 0.5469616055488586, + "learning_rate": 8.793774319066149e-05, + "loss": 0.7941, + "step": 438 + }, + { + "epoch": 0.5655394524959743, + "grad_norm": 0.5149989128112793, + "learning_rate": 8.767833981841764e-05, + "loss": 0.7819, + "step": 439 + }, + { + "epoch": 0.5668276972624798, + "grad_norm": 0.479438453912735, + "learning_rate": 8.74189364461738e-05, + "loss": 0.4806, + "step": 440 + }, + { + "epoch": 0.5681159420289855, + "grad_norm": 0.562567412853241, + "learning_rate": 8.715953307392997e-05, + "loss": 0.7942, + "step": 441 + }, + { + "epoch": 0.5694041867954911, + "grad_norm": 0.5192587375640869, + "learning_rate": 8.690012970168613e-05, + "loss": 0.6479, + "step": 442 + }, + { + "epoch": 0.5706924315619968, + "grad_norm": 0.4897756576538086, + "learning_rate": 8.66407263294423e-05, + "loss": 0.6539, + "step": 443 + }, + { + "epoch": 0.5719806763285025, + "grad_norm": 0.45649632811546326, + "learning_rate": 8.638132295719846e-05, + "loss": 0.657, + "step": 444 + }, + { + "epoch": 0.573268921095008, + "grad_norm": 0.5581417679786682, + "learning_rate": 8.612191958495461e-05, + "loss": 0.7415, + "step": 445 + }, + { + "epoch": 0.5745571658615137, + "grad_norm": 0.4822051525115967, + "learning_rate": 8.586251621271076e-05, + "loss": 0.7249, + "step": 446 + }, + { + "epoch": 0.5758454106280193, + "grad_norm": 0.6398015022277832, + "learning_rate": 8.560311284046692e-05, + "loss": 0.7328, + "step": 447 + }, + { + "epoch": 0.577133655394525, + "grad_norm": 0.5618659257888794, + "learning_rate": 8.534370946822309e-05, + "loss": 0.8104, + "step": 448 + }, + { + "epoch": 0.5784219001610306, + "grad_norm": 0.49202972650527954, + "learning_rate": 8.508430609597925e-05, + "loss": 0.6797, + "step": 449 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.5291930437088013, + "learning_rate": 8.482490272373542e-05, + "loss": 0.6015, + "step": 450 + }, + { + "epoch": 0.5809983896940418, + "grad_norm": 0.5322192907333374, + "learning_rate": 8.456549935149157e-05, + "loss": 0.7246, + "step": 451 + }, + { + "epoch": 0.5822866344605475, + "grad_norm": 0.5172200798988342, + "learning_rate": 8.430609597924773e-05, + "loss": 0.6873, + "step": 452 + }, + { + "epoch": 0.5835748792270531, + "grad_norm": 0.5367067456245422, + "learning_rate": 8.40466926070039e-05, + "loss": 0.7349, + "step": 453 + }, + { + "epoch": 0.5848631239935588, + "grad_norm": 0.5243058204650879, + "learning_rate": 8.378728923476006e-05, + "loss": 0.6441, + "step": 454 + }, + { + "epoch": 0.5861513687600645, + "grad_norm": 0.5509822964668274, + "learning_rate": 8.352788586251622e-05, + "loss": 0.7456, + "step": 455 + }, + { + "epoch": 0.58743961352657, + "grad_norm": 0.5376744866371155, + "learning_rate": 8.326848249027238e-05, + "loss": 0.6808, + "step": 456 + }, + { + "epoch": 0.5887278582930757, + "grad_norm": 0.5412257313728333, + "learning_rate": 8.300907911802854e-05, + "loss": 0.6135, + "step": 457 + }, + { + "epoch": 0.5900161030595813, + "grad_norm": 0.5956122279167175, + "learning_rate": 8.274967574578469e-05, + "loss": 0.7419, + "step": 458 + }, + { + "epoch": 0.591304347826087, + "grad_norm": 0.5524086952209473, + "learning_rate": 8.249027237354085e-05, + "loss": 0.5655, + "step": 459 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.5783061981201172, + "learning_rate": 8.223086900129702e-05, + "loss": 0.6528, + "step": 460 + }, + { + "epoch": 0.5938808373590982, + "grad_norm": 0.5542893409729004, + "learning_rate": 8.197146562905318e-05, + "loss": 0.6988, + "step": 461 + }, + { + "epoch": 0.5951690821256038, + "grad_norm": 0.5710337162017822, + "learning_rate": 8.171206225680935e-05, + "loss": 0.7465, + "step": 462 + }, + { + "epoch": 0.5964573268921095, + "grad_norm": 0.5694112181663513, + "learning_rate": 8.14526588845655e-05, + "loss": 0.7005, + "step": 463 + }, + { + "epoch": 0.5977455716586151, + "grad_norm": 0.5017877221107483, + "learning_rate": 8.119325551232166e-05, + "loss": 0.5978, + "step": 464 + }, + { + "epoch": 0.5990338164251208, + "grad_norm": 0.5797461271286011, + "learning_rate": 8.093385214007783e-05, + "loss": 0.8, + "step": 465 + }, + { + "epoch": 0.6003220611916265, + "grad_norm": 0.597811222076416, + "learning_rate": 8.067444876783399e-05, + "loss": 0.8356, + "step": 466 + }, + { + "epoch": 0.601610305958132, + "grad_norm": 0.5971367955207825, + "learning_rate": 8.041504539559015e-05, + "loss": 1.0245, + "step": 467 + }, + { + "epoch": 0.6028985507246377, + "grad_norm": 0.5506448745727539, + "learning_rate": 8.01556420233463e-05, + "loss": 0.6065, + "step": 468 + }, + { + "epoch": 0.6041867954911433, + "grad_norm": 0.5866613984107971, + "learning_rate": 7.989623865110247e-05, + "loss": 0.7841, + "step": 469 + }, + { + "epoch": 0.605475040257649, + "grad_norm": 0.5632089376449585, + "learning_rate": 7.963683527885862e-05, + "loss": 0.8046, + "step": 470 + }, + { + "epoch": 0.6067632850241546, + "grad_norm": 0.5145373940467834, + "learning_rate": 7.937743190661478e-05, + "loss": 0.5978, + "step": 471 + }, + { + "epoch": 0.6080515297906602, + "grad_norm": 0.48332056403160095, + "learning_rate": 7.911802853437095e-05, + "loss": 0.6119, + "step": 472 + }, + { + "epoch": 0.6093397745571658, + "grad_norm": 0.522520899772644, + "learning_rate": 7.885862516212711e-05, + "loss": 0.6623, + "step": 473 + }, + { + "epoch": 0.6106280193236715, + "grad_norm": 0.5305100679397583, + "learning_rate": 7.859922178988328e-05, + "loss": 0.7882, + "step": 474 + }, + { + "epoch": 0.6119162640901772, + "grad_norm": 0.4909839630126953, + "learning_rate": 7.833981841763943e-05, + "loss": 0.625, + "step": 475 + }, + { + "epoch": 0.6132045088566828, + "grad_norm": 0.5770312547683716, + "learning_rate": 7.808041504539559e-05, + "loss": 0.7479, + "step": 476 + }, + { + "epoch": 0.6144927536231884, + "grad_norm": 0.556817889213562, + "learning_rate": 7.782101167315176e-05, + "loss": 0.8317, + "step": 477 + }, + { + "epoch": 0.615780998389694, + "grad_norm": 0.5197098255157471, + "learning_rate": 7.756160830090792e-05, + "loss": 0.761, + "step": 478 + }, + { + "epoch": 0.6170692431561997, + "grad_norm": 0.5032650828361511, + "learning_rate": 7.730220492866408e-05, + "loss": 0.7149, + "step": 479 + }, + { + "epoch": 0.6183574879227053, + "grad_norm": 0.5901761651039124, + "learning_rate": 7.704280155642024e-05, + "loss": 0.723, + "step": 480 + }, + { + "epoch": 0.619645732689211, + "grad_norm": 0.5224949717521667, + "learning_rate": 7.67833981841764e-05, + "loss": 0.7275, + "step": 481 + }, + { + "epoch": 0.6209339774557165, + "grad_norm": 0.47279688715934753, + "learning_rate": 7.652399481193255e-05, + "loss": 0.5791, + "step": 482 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.49582868814468384, + "learning_rate": 7.626459143968871e-05, + "loss": 0.6728, + "step": 483 + }, + { + "epoch": 0.6235104669887278, + "grad_norm": 0.4722840189933777, + "learning_rate": 7.600518806744488e-05, + "loss": 0.5838, + "step": 484 + }, + { + "epoch": 0.6247987117552335, + "grad_norm": 0.5800105333328247, + "learning_rate": 7.574578469520104e-05, + "loss": 0.595, + "step": 485 + }, + { + "epoch": 0.6260869565217392, + "grad_norm": 0.5518195033073425, + "learning_rate": 7.54863813229572e-05, + "loss": 0.7302, + "step": 486 + }, + { + "epoch": 0.6273752012882448, + "grad_norm": 0.44109973311424255, + "learning_rate": 7.522697795071336e-05, + "loss": 0.5433, + "step": 487 + }, + { + "epoch": 0.6286634460547504, + "grad_norm": 0.5839915871620178, + "learning_rate": 7.496757457846952e-05, + "loss": 0.7484, + "step": 488 + }, + { + "epoch": 0.629951690821256, + "grad_norm": 0.6299886107444763, + "learning_rate": 7.470817120622569e-05, + "loss": 0.7771, + "step": 489 + }, + { + "epoch": 0.6312399355877617, + "grad_norm": 0.48367929458618164, + "learning_rate": 7.444876783398185e-05, + "loss": 0.5674, + "step": 490 + }, + { + "epoch": 0.6325281803542673, + "grad_norm": 0.5867652893066406, + "learning_rate": 7.418936446173802e-05, + "loss": 0.7733, + "step": 491 + }, + { + "epoch": 0.633816425120773, + "grad_norm": 0.4677927494049072, + "learning_rate": 7.392996108949417e-05, + "loss": 0.6418, + "step": 492 + }, + { + "epoch": 0.6351046698872785, + "grad_norm": 0.5139054656028748, + "learning_rate": 7.367055771725033e-05, + "loss": 0.7922, + "step": 493 + }, + { + "epoch": 0.6363929146537842, + "grad_norm": 0.4561646282672882, + "learning_rate": 7.341115434500648e-05, + "loss": 0.5885, + "step": 494 + }, + { + "epoch": 0.6376811594202898, + "grad_norm": 0.5079929828643799, + "learning_rate": 7.315175097276265e-05, + "loss": 0.6827, + "step": 495 + }, + { + "epoch": 0.6389694041867955, + "grad_norm": 0.5590360164642334, + "learning_rate": 7.289234760051881e-05, + "loss": 0.6681, + "step": 496 + }, + { + "epoch": 0.6402576489533012, + "grad_norm": 0.585269033908844, + "learning_rate": 7.263294422827497e-05, + "loss": 0.7604, + "step": 497 + }, + { + "epoch": 0.6415458937198067, + "grad_norm": 0.5380440950393677, + "learning_rate": 7.237354085603112e-05, + "loss": 0.7946, + "step": 498 + }, + { + "epoch": 0.6428341384863124, + "grad_norm": 0.4413246214389801, + "learning_rate": 7.211413748378729e-05, + "loss": 0.5847, + "step": 499 + }, + { + "epoch": 0.644122383252818, + "grad_norm": 0.536934494972229, + "learning_rate": 7.185473411154345e-05, + "loss": 0.745, + "step": 500 + }, + { + "epoch": 0.6454106280193237, + "grad_norm": 0.46904176473617554, + "learning_rate": 7.159533073929962e-05, + "loss": 0.6846, + "step": 501 + }, + { + "epoch": 0.6466988727858293, + "grad_norm": 0.5345873832702637, + "learning_rate": 7.133592736705578e-05, + "loss": 0.7499, + "step": 502 + }, + { + "epoch": 0.647987117552335, + "grad_norm": 0.5083842873573303, + "learning_rate": 7.107652399481195e-05, + "loss": 0.7829, + "step": 503 + }, + { + "epoch": 0.6492753623188405, + "grad_norm": 0.49629780650138855, + "learning_rate": 7.08171206225681e-05, + "loss": 0.6308, + "step": 504 + }, + { + "epoch": 0.6505636070853462, + "grad_norm": 0.5113663077354431, + "learning_rate": 7.055771725032426e-05, + "loss": 0.7062, + "step": 505 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 0.5348049402236938, + "learning_rate": 7.029831387808041e-05, + "loss": 0.7495, + "step": 506 + }, + { + "epoch": 0.6531400966183575, + "grad_norm": 0.5834509134292603, + "learning_rate": 7.003891050583658e-05, + "loss": 0.81, + "step": 507 + }, + { + "epoch": 0.6544283413848632, + "grad_norm": 0.5517732501029968, + "learning_rate": 6.977950713359274e-05, + "loss": 0.7376, + "step": 508 + }, + { + "epoch": 0.6557165861513687, + "grad_norm": 0.5555460453033447, + "learning_rate": 6.95201037613489e-05, + "loss": 0.6707, + "step": 509 + }, + { + "epoch": 0.6570048309178744, + "grad_norm": 0.5952188968658447, + "learning_rate": 6.926070038910505e-05, + "loss": 0.8308, + "step": 510 + }, + { + "epoch": 0.65829307568438, + "grad_norm": 0.46281638741493225, + "learning_rate": 6.900129701686122e-05, + "loss": 0.5855, + "step": 511 + }, + { + "epoch": 0.6595813204508857, + "grad_norm": 0.5051981210708618, + "learning_rate": 6.874189364461738e-05, + "loss": 0.7197, + "step": 512 + }, + { + "epoch": 0.6608695652173913, + "grad_norm": 0.5460030436515808, + "learning_rate": 6.848249027237355e-05, + "loss": 0.6863, + "step": 513 + }, + { + "epoch": 0.662157809983897, + "grad_norm": 0.504718542098999, + "learning_rate": 6.822308690012971e-05, + "loss": 0.5749, + "step": 514 + }, + { + "epoch": 0.6634460547504025, + "grad_norm": 0.5503727793693542, + "learning_rate": 6.796368352788586e-05, + "loss": 0.6802, + "step": 515 + }, + { + "epoch": 0.6647342995169082, + "grad_norm": 0.559354305267334, + "learning_rate": 6.770428015564203e-05, + "loss": 0.6774, + "step": 516 + }, + { + "epoch": 0.6660225442834139, + "grad_norm": 0.5191950798034668, + "learning_rate": 6.744487678339819e-05, + "loss": 0.5853, + "step": 517 + }, + { + "epoch": 0.6673107890499195, + "grad_norm": 0.5837051868438721, + "learning_rate": 6.718547341115434e-05, + "loss": 0.7629, + "step": 518 + }, + { + "epoch": 0.6685990338164252, + "grad_norm": 0.49824637174606323, + "learning_rate": 6.69260700389105e-05, + "loss": 0.6449, + "step": 519 + }, + { + "epoch": 0.6698872785829307, + "grad_norm": 0.5827267169952393, + "learning_rate": 6.666666666666667e-05, + "loss": 0.6425, + "step": 520 + }, + { + "epoch": 0.6711755233494364, + "grad_norm": 0.5547000169754028, + "learning_rate": 6.640726329442283e-05, + "loss": 0.8105, + "step": 521 + }, + { + "epoch": 0.672463768115942, + "grad_norm": 0.5251694321632385, + "learning_rate": 6.614785992217898e-05, + "loss": 0.6392, + "step": 522 + }, + { + "epoch": 0.6737520128824477, + "grad_norm": 0.577367901802063, + "learning_rate": 6.588845654993515e-05, + "loss": 0.7776, + "step": 523 + }, + { + "epoch": 0.6750402576489533, + "grad_norm": 0.5495286583900452, + "learning_rate": 6.562905317769131e-05, + "loss": 0.6673, + "step": 524 + }, + { + "epoch": 0.6763285024154589, + "grad_norm": 0.6513116955757141, + "learning_rate": 6.536964980544748e-05, + "loss": 0.8314, + "step": 525 + }, + { + "epoch": 0.6776167471819645, + "grad_norm": 0.5346915125846863, + "learning_rate": 6.511024643320364e-05, + "loss": 0.703, + "step": 526 + }, + { + "epoch": 0.6789049919484702, + "grad_norm": 0.5663869380950928, + "learning_rate": 6.485084306095979e-05, + "loss": 0.6595, + "step": 527 + }, + { + "epoch": 0.6801932367149759, + "grad_norm": 0.5390554070472717, + "learning_rate": 6.459143968871596e-05, + "loss": 0.7031, + "step": 528 + }, + { + "epoch": 0.6814814814814815, + "grad_norm": 0.5291828513145447, + "learning_rate": 6.433203631647212e-05, + "loss": 0.7384, + "step": 529 + }, + { + "epoch": 0.6827697262479872, + "grad_norm": 0.507726788520813, + "learning_rate": 6.407263294422829e-05, + "loss": 0.6169, + "step": 530 + }, + { + "epoch": 0.6840579710144927, + "grad_norm": 0.524138331413269, + "learning_rate": 6.381322957198444e-05, + "loss": 0.6784, + "step": 531 + }, + { + "epoch": 0.6853462157809984, + "grad_norm": 0.5644485950469971, + "learning_rate": 6.35538261997406e-05, + "loss": 0.8255, + "step": 532 + }, + { + "epoch": 0.686634460547504, + "grad_norm": 0.5468744039535522, + "learning_rate": 6.329442282749676e-05, + "loss": 0.7893, + "step": 533 + }, + { + "epoch": 0.6879227053140097, + "grad_norm": 0.4952101409435272, + "learning_rate": 6.303501945525292e-05, + "loss": 0.6192, + "step": 534 + }, + { + "epoch": 0.6892109500805152, + "grad_norm": 0.5614569187164307, + "learning_rate": 6.277561608300908e-05, + "loss": 0.7055, + "step": 535 + }, + { + "epoch": 0.6904991948470209, + "grad_norm": 0.5651270151138306, + "learning_rate": 6.251621271076524e-05, + "loss": 0.7327, + "step": 536 + }, + { + "epoch": 0.6917874396135266, + "grad_norm": 0.5416032075881958, + "learning_rate": 6.225680933852141e-05, + "loss": 0.64, + "step": 537 + }, + { + "epoch": 0.6930756843800322, + "grad_norm": 0.6302821636199951, + "learning_rate": 6.199740596627757e-05, + "loss": 0.8542, + "step": 538 + }, + { + "epoch": 0.6943639291465379, + "grad_norm": 0.5361074805259705, + "learning_rate": 6.173800259403372e-05, + "loss": 0.6282, + "step": 539 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.5210204124450684, + "learning_rate": 6.147859922178989e-05, + "loss": 0.8354, + "step": 540 + }, + { + "epoch": 0.6969404186795491, + "grad_norm": 0.5401708483695984, + "learning_rate": 6.121919584954605e-05, + "loss": 0.6165, + "step": 541 + }, + { + "epoch": 0.6982286634460547, + "grad_norm": 0.516559362411499, + "learning_rate": 6.0959792477302215e-05, + "loss": 0.5498, + "step": 542 + }, + { + "epoch": 0.6995169082125604, + "grad_norm": 0.5983400344848633, + "learning_rate": 6.0700389105058366e-05, + "loss": 0.7538, + "step": 543 + }, + { + "epoch": 0.700805152979066, + "grad_norm": 0.5111982226371765, + "learning_rate": 6.0440985732814524e-05, + "loss": 0.6156, + "step": 544 + }, + { + "epoch": 0.7020933977455717, + "grad_norm": 0.5821353793144226, + "learning_rate": 6.018158236057069e-05, + "loss": 0.6417, + "step": 545 + }, + { + "epoch": 0.7033816425120772, + "grad_norm": 0.4738411009311676, + "learning_rate": 5.992217898832685e-05, + "loss": 0.6541, + "step": 546 + }, + { + "epoch": 0.7046698872785829, + "grad_norm": 0.6165397763252258, + "learning_rate": 5.966277561608301e-05, + "loss": 0.7366, + "step": 547 + }, + { + "epoch": 0.7059581320450886, + "grad_norm": 0.5883972644805908, + "learning_rate": 5.9403372243839174e-05, + "loss": 0.7371, + "step": 548 + }, + { + "epoch": 0.7072463768115942, + "grad_norm": 0.5415938496589661, + "learning_rate": 5.914396887159533e-05, + "loss": 0.6334, + "step": 549 + }, + { + "epoch": 0.7085346215780999, + "grad_norm": 0.5565886497497559, + "learning_rate": 5.8884565499351496e-05, + "loss": 0.7425, + "step": 550 + }, + { + "epoch": 0.7098228663446055, + "grad_norm": 0.6447110772132874, + "learning_rate": 5.862516212710766e-05, + "loss": 0.8405, + "step": 551 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.6419034004211426, + "learning_rate": 5.836575875486382e-05, + "loss": 0.8779, + "step": 552 + }, + { + "epoch": 0.7123993558776167, + "grad_norm": 0.4611152708530426, + "learning_rate": 5.810635538261998e-05, + "loss": 0.5832, + "step": 553 + }, + { + "epoch": 0.7136876006441224, + "grad_norm": 0.6436396837234497, + "learning_rate": 5.7846952010376146e-05, + "loss": 0.8172, + "step": 554 + }, + { + "epoch": 0.714975845410628, + "grad_norm": 0.5647209286689758, + "learning_rate": 5.7587548638132296e-05, + "loss": 0.6637, + "step": 555 + }, + { + "epoch": 0.7162640901771337, + "grad_norm": 0.5272210240364075, + "learning_rate": 5.7328145265888454e-05, + "loss": 0.6103, + "step": 556 + }, + { + "epoch": 0.7175523349436392, + "grad_norm": 0.5229634046554565, + "learning_rate": 5.706874189364462e-05, + "loss": 0.5514, + "step": 557 + }, + { + "epoch": 0.7188405797101449, + "grad_norm": 0.6116520166397095, + "learning_rate": 5.6809338521400776e-05, + "loss": 0.8535, + "step": 558 + }, + { + "epoch": 0.7201288244766506, + "grad_norm": 0.5706294178962708, + "learning_rate": 5.654993514915694e-05, + "loss": 0.7606, + "step": 559 + }, + { + "epoch": 0.7214170692431562, + "grad_norm": 0.6013360619544983, + "learning_rate": 5.6290531776913104e-05, + "loss": 0.7887, + "step": 560 + }, + { + "epoch": 0.7227053140096619, + "grad_norm": 0.5661988258361816, + "learning_rate": 5.603112840466926e-05, + "loss": 0.9302, + "step": 561 + }, + { + "epoch": 0.7239935587761674, + "grad_norm": 0.5267884135246277, + "learning_rate": 5.5771725032425426e-05, + "loss": 0.7534, + "step": 562 + }, + { + "epoch": 0.7252818035426731, + "grad_norm": 0.4822220504283905, + "learning_rate": 5.551232166018159e-05, + "loss": 0.5666, + "step": 563 + }, + { + "epoch": 0.7265700483091787, + "grad_norm": 0.5841349363327026, + "learning_rate": 5.525291828793775e-05, + "loss": 0.7755, + "step": 564 + }, + { + "epoch": 0.7278582930756844, + "grad_norm": 0.5259692072868347, + "learning_rate": 5.499351491569391e-05, + "loss": 0.7599, + "step": 565 + }, + { + "epoch": 0.72914653784219, + "grad_norm": 0.5511097311973572, + "learning_rate": 5.4734111543450076e-05, + "loss": 0.6198, + "step": 566 + }, + { + "epoch": 0.7304347826086957, + "grad_norm": 0.5707940459251404, + "learning_rate": 5.447470817120623e-05, + "loss": 0.7669, + "step": 567 + }, + { + "epoch": 0.7317230273752013, + "grad_norm": 0.6099474430084229, + "learning_rate": 5.4215304798962384e-05, + "loss": 0.7636, + "step": 568 + }, + { + "epoch": 0.7330112721417069, + "grad_norm": 0.4825986623764038, + "learning_rate": 5.395590142671855e-05, + "loss": 0.5758, + "step": 569 + }, + { + "epoch": 0.7342995169082126, + "grad_norm": 0.457233190536499, + "learning_rate": 5.3696498054474706e-05, + "loss": 0.5534, + "step": 570 + }, + { + "epoch": 0.7355877616747182, + "grad_norm": 0.5602165460586548, + "learning_rate": 5.343709468223087e-05, + "loss": 0.5802, + "step": 571 + }, + { + "epoch": 0.7368760064412239, + "grad_norm": 0.6400203108787537, + "learning_rate": 5.3177691309987034e-05, + "loss": 0.9281, + "step": 572 + }, + { + "epoch": 0.7381642512077294, + "grad_norm": 0.4856846332550049, + "learning_rate": 5.291828793774319e-05, + "loss": 0.6179, + "step": 573 + }, + { + "epoch": 0.7394524959742351, + "grad_norm": 0.5459800958633423, + "learning_rate": 5.2658884565499356e-05, + "loss": 0.7225, + "step": 574 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.5425988435745239, + "learning_rate": 5.239948119325552e-05, + "loss": 0.7091, + "step": 575 + }, + { + "epoch": 0.7420289855072464, + "grad_norm": 0.6519750356674194, + "learning_rate": 5.214007782101168e-05, + "loss": 0.7549, + "step": 576 + }, + { + "epoch": 0.743317230273752, + "grad_norm": 0.5276802778244019, + "learning_rate": 5.188067444876784e-05, + "loss": 0.5972, + "step": 577 + }, + { + "epoch": 0.7446054750402576, + "grad_norm": 0.5245682001113892, + "learning_rate": 5.1621271076524e-05, + "loss": 0.631, + "step": 578 + }, + { + "epoch": 0.7458937198067633, + "grad_norm": 0.574123203754425, + "learning_rate": 5.136186770428015e-05, + "loss": 0.8617, + "step": 579 + }, + { + "epoch": 0.7471819645732689, + "grad_norm": 0.5302646160125732, + "learning_rate": 5.1102464332036315e-05, + "loss": 0.682, + "step": 580 + }, + { + "epoch": 0.7484702093397746, + "grad_norm": 0.528005063533783, + "learning_rate": 5.084306095979248e-05, + "loss": 0.5872, + "step": 581 + }, + { + "epoch": 0.7497584541062802, + "grad_norm": 0.5490335822105408, + "learning_rate": 5.0583657587548636e-05, + "loss": 0.571, + "step": 582 + }, + { + "epoch": 0.7510466988727859, + "grad_norm": 0.5383925437927246, + "learning_rate": 5.03242542153048e-05, + "loss": 0.5694, + "step": 583 + }, + { + "epoch": 0.7523349436392914, + "grad_norm": 0.5377727150917053, + "learning_rate": 5.0064850843060965e-05, + "loss": 0.7569, + "step": 584 + }, + { + "epoch": 0.7536231884057971, + "grad_norm": 0.5144835710525513, + "learning_rate": 4.980544747081712e-05, + "loss": 0.6205, + "step": 585 + }, + { + "epoch": 0.7549114331723027, + "grad_norm": 0.5208680033683777, + "learning_rate": 4.9546044098573286e-05, + "loss": 0.6205, + "step": 586 + }, + { + "epoch": 0.7561996779388084, + "grad_norm": 0.6118026375770569, + "learning_rate": 4.9286640726329444e-05, + "loss": 0.8551, + "step": 587 + }, + { + "epoch": 0.7574879227053141, + "grad_norm": 0.5483299493789673, + "learning_rate": 4.90272373540856e-05, + "loss": 0.6421, + "step": 588 + }, + { + "epoch": 0.7587761674718196, + "grad_norm": 0.5909067392349243, + "learning_rate": 4.8767833981841766e-05, + "loss": 0.7131, + "step": 589 + }, + { + "epoch": 0.7600644122383253, + "grad_norm": 0.46308815479278564, + "learning_rate": 4.850843060959793e-05, + "loss": 0.472, + "step": 590 + }, + { + "epoch": 0.7613526570048309, + "grad_norm": 0.6142207384109497, + "learning_rate": 4.824902723735409e-05, + "loss": 0.6328, + "step": 591 + }, + { + "epoch": 0.7626409017713366, + "grad_norm": 0.6880104541778564, + "learning_rate": 4.798962386511025e-05, + "loss": 0.838, + "step": 592 + }, + { + "epoch": 0.7639291465378422, + "grad_norm": 0.5680267214775085, + "learning_rate": 4.773022049286641e-05, + "loss": 0.7676, + "step": 593 + }, + { + "epoch": 0.7652173913043478, + "grad_norm": 0.5472928881645203, + "learning_rate": 4.7470817120622567e-05, + "loss": 0.7345, + "step": 594 + }, + { + "epoch": 0.7665056360708534, + "grad_norm": 0.6231438517570496, + "learning_rate": 4.721141374837873e-05, + "loss": 0.7582, + "step": 595 + }, + { + "epoch": 0.7677938808373591, + "grad_norm": 0.5030277371406555, + "learning_rate": 4.6952010376134895e-05, + "loss": 0.5928, + "step": 596 + }, + { + "epoch": 0.7690821256038647, + "grad_norm": 0.6041036248207092, + "learning_rate": 4.669260700389105e-05, + "loss": 0.6439, + "step": 597 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 0.5044084191322327, + "learning_rate": 4.643320363164722e-05, + "loss": 0.5618, + "step": 598 + }, + { + "epoch": 0.7716586151368761, + "grad_norm": 0.5399697422981262, + "learning_rate": 4.6173800259403374e-05, + "loss": 0.6366, + "step": 599 + }, + { + "epoch": 0.7729468599033816, + "grad_norm": 0.496896892786026, + "learning_rate": 4.591439688715953e-05, + "loss": 0.5864, + "step": 600 + }, + { + "epoch": 0.7742351046698873, + "grad_norm": 0.46158090233802795, + "learning_rate": 4.5654993514915696e-05, + "loss": 0.609, + "step": 601 + }, + { + "epoch": 0.7755233494363929, + "grad_norm": 0.5886946320533752, + "learning_rate": 4.539559014267185e-05, + "loss": 0.7699, + "step": 602 + }, + { + "epoch": 0.7768115942028986, + "grad_norm": 0.5680760145187378, + "learning_rate": 4.513618677042802e-05, + "loss": 0.8665, + "step": 603 + }, + { + "epoch": 0.7780998389694042, + "grad_norm": 0.5787962675094604, + "learning_rate": 4.487678339818418e-05, + "loss": 0.6942, + "step": 604 + }, + { + "epoch": 0.7793880837359098, + "grad_norm": 0.6179983615875244, + "learning_rate": 4.461738002594034e-05, + "loss": 0.7403, + "step": 605 + }, + { + "epoch": 0.7806763285024154, + "grad_norm": 0.5327017903327942, + "learning_rate": 4.43579766536965e-05, + "loss": 0.6714, + "step": 606 + }, + { + "epoch": 0.7819645732689211, + "grad_norm": 0.5620171427726746, + "learning_rate": 4.409857328145266e-05, + "loss": 0.6706, + "step": 607 + }, + { + "epoch": 0.7832528180354267, + "grad_norm": 0.5355799794197083, + "learning_rate": 4.383916990920882e-05, + "loss": 0.6042, + "step": 608 + }, + { + "epoch": 0.7845410628019324, + "grad_norm": 0.692477285861969, + "learning_rate": 4.357976653696498e-05, + "loss": 0.7384, + "step": 609 + }, + { + "epoch": 0.785829307568438, + "grad_norm": 0.5491352081298828, + "learning_rate": 4.332036316472115e-05, + "loss": 0.6315, + "step": 610 + }, + { + "epoch": 0.7871175523349436, + "grad_norm": 0.6350588202476501, + "learning_rate": 4.3060959792477304e-05, + "loss": 0.7889, + "step": 611 + }, + { + "epoch": 0.7884057971014493, + "grad_norm": 0.5784136652946472, + "learning_rate": 4.280155642023346e-05, + "loss": 0.7421, + "step": 612 + }, + { + "epoch": 0.7896940418679549, + "grad_norm": 0.55226069688797, + "learning_rate": 4.2542153047989626e-05, + "loss": 0.7752, + "step": 613 + }, + { + "epoch": 0.7909822866344606, + "grad_norm": 0.541728138923645, + "learning_rate": 4.2282749675745784e-05, + "loss": 0.6681, + "step": 614 + }, + { + "epoch": 0.7922705314009661, + "grad_norm": 0.4921126067638397, + "learning_rate": 4.202334630350195e-05, + "loss": 0.5087, + "step": 615 + }, + { + "epoch": 0.7935587761674718, + "grad_norm": 0.5723814368247986, + "learning_rate": 4.176394293125811e-05, + "loss": 0.7275, + "step": 616 + }, + { + "epoch": 0.7948470209339774, + "grad_norm": 0.5064358115196228, + "learning_rate": 4.150453955901427e-05, + "loss": 0.6857, + "step": 617 + }, + { + "epoch": 0.7961352657004831, + "grad_norm": 0.495473176240921, + "learning_rate": 4.124513618677043e-05, + "loss": 0.5766, + "step": 618 + }, + { + "epoch": 0.7974235104669888, + "grad_norm": 0.47758999466896057, + "learning_rate": 4.098573281452659e-05, + "loss": 0.5152, + "step": 619 + }, + { + "epoch": 0.7987117552334944, + "grad_norm": 0.5546131730079651, + "learning_rate": 4.072632944228275e-05, + "loss": 0.5477, + "step": 620 + }, + { + "epoch": 0.8, + "grad_norm": 0.637289822101593, + "learning_rate": 4.046692607003891e-05, + "loss": 0.8712, + "step": 621 + }, + { + "epoch": 0.8012882447665056, + "grad_norm": 0.6441432237625122, + "learning_rate": 4.020752269779508e-05, + "loss": 0.751, + "step": 622 + }, + { + "epoch": 0.8025764895330113, + "grad_norm": 0.5846959352493286, + "learning_rate": 3.9948119325551235e-05, + "loss": 0.764, + "step": 623 + }, + { + "epoch": 0.8038647342995169, + "grad_norm": 0.5156934261322021, + "learning_rate": 3.968871595330739e-05, + "loss": 0.6254, + "step": 624 + }, + { + "epoch": 0.8051529790660226, + "grad_norm": 0.5897034406661987, + "learning_rate": 3.9429312581063556e-05, + "loss": 0.6232, + "step": 625 + }, + { + "epoch": 0.8064412238325281, + "grad_norm": 0.6254003643989563, + "learning_rate": 3.9169909208819714e-05, + "loss": 0.7375, + "step": 626 + }, + { + "epoch": 0.8077294685990338, + "grad_norm": 0.5816264152526855, + "learning_rate": 3.891050583657588e-05, + "loss": 0.752, + "step": 627 + }, + { + "epoch": 0.8090177133655394, + "grad_norm": 0.570949912071228, + "learning_rate": 3.865110246433204e-05, + "loss": 0.6546, + "step": 628 + }, + { + "epoch": 0.8103059581320451, + "grad_norm": 0.5094951391220093, + "learning_rate": 3.83916990920882e-05, + "loss": 0.5009, + "step": 629 + }, + { + "epoch": 0.8115942028985508, + "grad_norm": 0.6055474281311035, + "learning_rate": 3.813229571984436e-05, + "loss": 0.7338, + "step": 630 + }, + { + "epoch": 0.8128824476650564, + "grad_norm": 0.5392929911613464, + "learning_rate": 3.787289234760052e-05, + "loss": 0.563, + "step": 631 + }, + { + "epoch": 0.814170692431562, + "grad_norm": 0.6130269765853882, + "learning_rate": 3.761348897535668e-05, + "loss": 0.8099, + "step": 632 + }, + { + "epoch": 0.8154589371980676, + "grad_norm": 0.6383023262023926, + "learning_rate": 3.735408560311284e-05, + "loss": 0.5776, + "step": 633 + }, + { + "epoch": 0.8167471819645733, + "grad_norm": 0.4606671929359436, + "learning_rate": 3.709468223086901e-05, + "loss": 0.5352, + "step": 634 + }, + { + "epoch": 0.8180354267310789, + "grad_norm": 0.5078199505805969, + "learning_rate": 3.6835278858625165e-05, + "loss": 0.5113, + "step": 635 + }, + { + "epoch": 0.8193236714975846, + "grad_norm": 0.5447183847427368, + "learning_rate": 3.657587548638132e-05, + "loss": 0.6092, + "step": 636 + }, + { + "epoch": 0.8206119162640901, + "grad_norm": 0.632024884223938, + "learning_rate": 3.631647211413749e-05, + "loss": 0.7237, + "step": 637 + }, + { + "epoch": 0.8219001610305958, + "grad_norm": 0.5381270051002502, + "learning_rate": 3.6057068741893644e-05, + "loss": 0.6391, + "step": 638 + }, + { + "epoch": 0.8231884057971014, + "grad_norm": 0.5342917442321777, + "learning_rate": 3.579766536964981e-05, + "loss": 0.672, + "step": 639 + }, + { + "epoch": 0.8244766505636071, + "grad_norm": 0.6121646761894226, + "learning_rate": 3.553826199740597e-05, + "loss": 0.8898, + "step": 640 + }, + { + "epoch": 0.8257648953301128, + "grad_norm": 0.6056507229804993, + "learning_rate": 3.527885862516213e-05, + "loss": 0.585, + "step": 641 + }, + { + "epoch": 0.8270531400966183, + "grad_norm": 0.5895273685455322, + "learning_rate": 3.501945525291829e-05, + "loss": 0.6135, + "step": 642 + }, + { + "epoch": 0.828341384863124, + "grad_norm": 0.5063283443450928, + "learning_rate": 3.476005188067445e-05, + "loss": 0.4939, + "step": 643 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 0.5781770348548889, + "learning_rate": 3.450064850843061e-05, + "loss": 0.7322, + "step": 644 + }, + { + "epoch": 0.8309178743961353, + "grad_norm": 0.5424814820289612, + "learning_rate": 3.4241245136186774e-05, + "loss": 0.6702, + "step": 645 + }, + { + "epoch": 0.8322061191626409, + "grad_norm": 0.5998700857162476, + "learning_rate": 3.398184176394293e-05, + "loss": 0.6788, + "step": 646 + }, + { + "epoch": 0.8334943639291466, + "grad_norm": 0.614637017250061, + "learning_rate": 3.3722438391699095e-05, + "loss": 0.7391, + "step": 647 + }, + { + "epoch": 0.8347826086956521, + "grad_norm": 0.6503768563270569, + "learning_rate": 3.346303501945525e-05, + "loss": 0.7648, + "step": 648 + }, + { + "epoch": 0.8360708534621578, + "grad_norm": 0.5270184874534607, + "learning_rate": 3.320363164721142e-05, + "loss": 0.504, + "step": 649 + }, + { + "epoch": 0.8373590982286635, + "grad_norm": 0.5014241337776184, + "learning_rate": 3.2944228274967575e-05, + "loss": 0.5252, + "step": 650 + }, + { + "epoch": 0.8386473429951691, + "grad_norm": 0.5668673515319824, + "learning_rate": 3.268482490272374e-05, + "loss": 0.6537, + "step": 651 + }, + { + "epoch": 0.8399355877616748, + "grad_norm": 0.5789865255355835, + "learning_rate": 3.2425421530479896e-05, + "loss": 0.8012, + "step": 652 + }, + { + "epoch": 0.8412238325281803, + "grad_norm": 0.6261132955551147, + "learning_rate": 3.216601815823606e-05, + "loss": 0.6832, + "step": 653 + }, + { + "epoch": 0.842512077294686, + "grad_norm": 0.5914183855056763, + "learning_rate": 3.190661478599222e-05, + "loss": 0.5653, + "step": 654 + }, + { + "epoch": 0.8438003220611916, + "grad_norm": 0.5597856044769287, + "learning_rate": 3.164721141374838e-05, + "loss": 0.557, + "step": 655 + }, + { + "epoch": 0.8450885668276973, + "grad_norm": 0.5060226917266846, + "learning_rate": 3.138780804150454e-05, + "loss": 0.6559, + "step": 656 + }, + { + "epoch": 0.8463768115942029, + "grad_norm": 0.6236748695373535, + "learning_rate": 3.1128404669260704e-05, + "loss": 0.7372, + "step": 657 + }, + { + "epoch": 0.8476650563607085, + "grad_norm": 0.5138527750968933, + "learning_rate": 3.086900129701686e-05, + "loss": 0.6031, + "step": 658 + }, + { + "epoch": 0.8489533011272141, + "grad_norm": 0.5962822437286377, + "learning_rate": 3.0609597924773026e-05, + "loss": 0.7374, + "step": 659 + }, + { + "epoch": 0.8502415458937198, + "grad_norm": 0.4833110272884369, + "learning_rate": 3.0350194552529183e-05, + "loss": 0.4742, + "step": 660 + }, + { + "epoch": 0.8515297906602255, + "grad_norm": 0.5980967283248901, + "learning_rate": 3.0090791180285344e-05, + "loss": 0.7025, + "step": 661 + }, + { + "epoch": 0.8528180354267311, + "grad_norm": 0.6031454801559448, + "learning_rate": 2.9831387808041505e-05, + "loss": 0.8479, + "step": 662 + }, + { + "epoch": 0.8541062801932368, + "grad_norm": 0.5824582576751709, + "learning_rate": 2.9571984435797666e-05, + "loss": 0.7073, + "step": 663 + }, + { + "epoch": 0.8553945249597423, + "grad_norm": 0.6369014978408813, + "learning_rate": 2.931258106355383e-05, + "loss": 0.864, + "step": 664 + }, + { + "epoch": 0.856682769726248, + "grad_norm": 0.554784893989563, + "learning_rate": 2.905317769130999e-05, + "loss": 0.6705, + "step": 665 + }, + { + "epoch": 0.8579710144927536, + "grad_norm": 0.5656050443649292, + "learning_rate": 2.8793774319066148e-05, + "loss": 0.6375, + "step": 666 + }, + { + "epoch": 0.8592592592592593, + "grad_norm": 0.6191110014915466, + "learning_rate": 2.853437094682231e-05, + "loss": 0.7277, + "step": 667 + }, + { + "epoch": 0.8605475040257649, + "grad_norm": 0.6224331855773926, + "learning_rate": 2.827496757457847e-05, + "loss": 0.6809, + "step": 668 + }, + { + "epoch": 0.8618357487922705, + "grad_norm": 0.6049439311027527, + "learning_rate": 2.801556420233463e-05, + "loss": 0.8089, + "step": 669 + }, + { + "epoch": 0.8631239935587761, + "grad_norm": 0.5969856977462769, + "learning_rate": 2.7756160830090795e-05, + "loss": 0.8102, + "step": 670 + }, + { + "epoch": 0.8644122383252818, + "grad_norm": 0.6787256002426147, + "learning_rate": 2.7496757457846956e-05, + "loss": 0.7602, + "step": 671 + }, + { + "epoch": 0.8657004830917875, + "grad_norm": 0.6535263061523438, + "learning_rate": 2.7237354085603113e-05, + "loss": 0.6816, + "step": 672 + }, + { + "epoch": 0.8669887278582931, + "grad_norm": 0.6893251538276672, + "learning_rate": 2.6977950713359274e-05, + "loss": 0.7271, + "step": 673 + }, + { + "epoch": 0.8682769726247987, + "grad_norm": 0.6239253282546997, + "learning_rate": 2.6718547341115435e-05, + "loss": 0.7995, + "step": 674 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.5103424787521362, + "learning_rate": 2.6459143968871596e-05, + "loss": 0.5603, + "step": 675 + }, + { + "epoch": 0.87085346215781, + "grad_norm": 0.558005690574646, + "learning_rate": 2.619974059662776e-05, + "loss": 0.5951, + "step": 676 + }, + { + "epoch": 0.8721417069243156, + "grad_norm": 0.5600588917732239, + "learning_rate": 2.594033722438392e-05, + "loss": 0.7646, + "step": 677 + }, + { + "epoch": 0.8734299516908213, + "grad_norm": 0.48512476682662964, + "learning_rate": 2.5680933852140075e-05, + "loss": 0.5015, + "step": 678 + }, + { + "epoch": 0.8747181964573268, + "grad_norm": 0.6091920733451843, + "learning_rate": 2.542153047989624e-05, + "loss": 0.8244, + "step": 679 + }, + { + "epoch": 0.8760064412238325, + "grad_norm": 0.49356287717819214, + "learning_rate": 2.51621271076524e-05, + "loss": 0.4808, + "step": 680 + }, + { + "epoch": 0.8772946859903382, + "grad_norm": 0.5376326441764832, + "learning_rate": 2.490272373540856e-05, + "loss": 0.5296, + "step": 681 + }, + { + "epoch": 0.8785829307568438, + "grad_norm": 0.611382782459259, + "learning_rate": 2.4643320363164722e-05, + "loss": 0.5765, + "step": 682 + }, + { + "epoch": 0.8798711755233495, + "grad_norm": 0.5653994083404541, + "learning_rate": 2.4383916990920883e-05, + "loss": 0.5864, + "step": 683 + }, + { + "epoch": 0.881159420289855, + "grad_norm": 0.48044463992118835, + "learning_rate": 2.4124513618677044e-05, + "loss": 0.6073, + "step": 684 + }, + { + "epoch": 0.8824476650563607, + "grad_norm": 0.6067565679550171, + "learning_rate": 2.3865110246433205e-05, + "loss": 0.6312, + "step": 685 + }, + { + "epoch": 0.8837359098228663, + "grad_norm": 0.5126189589500427, + "learning_rate": 2.3605706874189365e-05, + "loss": 0.5501, + "step": 686 + }, + { + "epoch": 0.885024154589372, + "grad_norm": 0.551137387752533, + "learning_rate": 2.3346303501945526e-05, + "loss": 0.578, + "step": 687 + }, + { + "epoch": 0.8863123993558776, + "grad_norm": 0.7072709202766418, + "learning_rate": 2.3086900129701687e-05, + "loss": 0.614, + "step": 688 + }, + { + "epoch": 0.8876006441223833, + "grad_norm": 0.6444385051727295, + "learning_rate": 2.2827496757457848e-05, + "loss": 0.6824, + "step": 689 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.5593189597129822, + "learning_rate": 2.256809338521401e-05, + "loss": 0.5942, + "step": 690 + }, + { + "epoch": 0.8901771336553945, + "grad_norm": 0.6002535223960876, + "learning_rate": 2.230869001297017e-05, + "loss": 0.7655, + "step": 691 + }, + { + "epoch": 0.8914653784219002, + "grad_norm": 0.6385635137557983, + "learning_rate": 2.204928664072633e-05, + "loss": 0.749, + "step": 692 + }, + { + "epoch": 0.8927536231884058, + "grad_norm": 0.5951741337776184, + "learning_rate": 2.178988326848249e-05, + "loss": 0.6708, + "step": 693 + }, + { + "epoch": 0.8940418679549115, + "grad_norm": 0.6050885915756226, + "learning_rate": 2.1530479896238652e-05, + "loss": 0.7473, + "step": 694 + }, + { + "epoch": 0.895330112721417, + "grad_norm": 0.570475161075592, + "learning_rate": 2.1271076523994813e-05, + "loss": 0.5881, + "step": 695 + }, + { + "epoch": 0.8966183574879227, + "grad_norm": 0.5623670816421509, + "learning_rate": 2.1011673151750974e-05, + "loss": 0.6613, + "step": 696 + }, + { + "epoch": 0.8979066022544283, + "grad_norm": 0.6884156465530396, + "learning_rate": 2.0752269779507135e-05, + "loss": 0.7917, + "step": 697 + }, + { + "epoch": 0.899194847020934, + "grad_norm": 0.6603716611862183, + "learning_rate": 2.0492866407263296e-05, + "loss": 0.6958, + "step": 698 + }, + { + "epoch": 0.9004830917874396, + "grad_norm": 0.6588467359542847, + "learning_rate": 2.0233463035019457e-05, + "loss": 0.5612, + "step": 699 + }, + { + "epoch": 0.9017713365539453, + "grad_norm": 0.5613631010055542, + "learning_rate": 1.9974059662775617e-05, + "loss": 0.6064, + "step": 700 + } + ], + "logging_steps": 1, + "max_steps": 776, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.98827660663808e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}