diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,91532 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 13070, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001530221882172915, + "grad_norm": 13.516160750382397, + "learning_rate": 5.0890585241730285e-08, + "loss": 1.0816, + "step": 1 + }, + { + "epoch": 0.000306044376434583, + "grad_norm": 13.656487119130613, + "learning_rate": 1.0178117048346057e-07, + "loss": 1.0528, + "step": 2 + }, + { + "epoch": 0.0004590665646518745, + "grad_norm": 16.846354425859488, + "learning_rate": 1.5267175572519085e-07, + "loss": 1.1708, + "step": 3 + }, + { + "epoch": 0.000612088752869166, + "grad_norm": 13.046572036423532, + "learning_rate": 2.0356234096692114e-07, + "loss": 1.0517, + "step": 4 + }, + { + "epoch": 0.0007651109410864575, + "grad_norm": 12.49233347364335, + "learning_rate": 2.544529262086514e-07, + "loss": 1.0145, + "step": 5 + }, + { + "epoch": 0.000918133129303749, + "grad_norm": 14.751771565656806, + "learning_rate": 3.053435114503817e-07, + "loss": 1.0758, + "step": 6 + }, + { + "epoch": 0.0010711553175210406, + "grad_norm": 11.653404038618469, + "learning_rate": 3.56234096692112e-07, + "loss": 1.0636, + "step": 7 + }, + { + "epoch": 0.001224177505738332, + "grad_norm": 13.668951288685138, + "learning_rate": 4.071246819338423e-07, + "loss": 1.1952, + "step": 8 + }, + { + "epoch": 0.0013771996939556236, + "grad_norm": 12.046531599063504, + "learning_rate": 4.5801526717557257e-07, + "loss": 1.0257, + "step": 9 + }, + { + "epoch": 0.001530221882172915, + "grad_norm": 13.723654296994994, + "learning_rate": 5.089058524173028e-07, + "loss": 1.059, + "step": 10 + }, + { + "epoch": 0.0016832440703902067, + "grad_norm": 13.902265225322944, + "learning_rate": 5.597964376590332e-07, + "loss": 1.0858, + "step": 11 + }, + { + "epoch": 0.001836266258607498, + "grad_norm": 12.528149564606679, + "learning_rate": 6.106870229007634e-07, + "loss": 1.0277, + "step": 12 + }, + { + "epoch": 0.0019892884468247895, + "grad_norm": 13.1123093332078, + "learning_rate": 6.615776081424936e-07, + "loss": 0.9474, + "step": 13 + }, + { + "epoch": 0.002142310635042081, + "grad_norm": 11.039557385133689, + "learning_rate": 7.12468193384224e-07, + "loss": 1.0024, + "step": 14 + }, + { + "epoch": 0.0022953328232593728, + "grad_norm": 8.906284572299981, + "learning_rate": 7.633587786259543e-07, + "loss": 0.9381, + "step": 15 + }, + { + "epoch": 0.002448355011476664, + "grad_norm": 10.05716027554367, + "learning_rate": 8.142493638676846e-07, + "loss": 0.9707, + "step": 16 + }, + { + "epoch": 0.0026013771996939556, + "grad_norm": 9.006177898036631, + "learning_rate": 8.651399491094148e-07, + "loss": 0.9394, + "step": 17 + }, + { + "epoch": 0.0027543993879112472, + "grad_norm": 21.183255574001187, + "learning_rate": 9.160305343511451e-07, + "loss": 1.0931, + "step": 18 + }, + { + "epoch": 0.002907421576128539, + "grad_norm": 10.995986966246456, + "learning_rate": 9.669211195928755e-07, + "loss": 0.984, + "step": 19 + }, + { + "epoch": 0.00306044376434583, + "grad_norm": 11.307646984532525, + "learning_rate": 1.0178117048346056e-06, + "loss": 0.9666, + "step": 20 + }, + { + "epoch": 0.0032134659525631217, + "grad_norm": 14.964095843531805, + "learning_rate": 1.068702290076336e-06, + "loss": 1.0085, + "step": 21 + }, + { + "epoch": 
0.0033664881407804133, + "grad_norm": 9.3514966763225, + "learning_rate": 1.1195928753180663e-06, + "loss": 0.9647, + "step": 22 + }, + { + "epoch": 0.0035195103289977045, + "grad_norm": 7.7748996347946, + "learning_rate": 1.1704834605597967e-06, + "loss": 1.0044, + "step": 23 + }, + { + "epoch": 0.003672532517214996, + "grad_norm": 9.478805120179091, + "learning_rate": 1.2213740458015268e-06, + "loss": 0.9446, + "step": 24 + }, + { + "epoch": 0.0038255547054322878, + "grad_norm": 6.758033563189933, + "learning_rate": 1.2722646310432571e-06, + "loss": 0.9163, + "step": 25 + }, + { + "epoch": 0.003978576893649579, + "grad_norm": 5.984168999533934, + "learning_rate": 1.3231552162849873e-06, + "loss": 0.8452, + "step": 26 + }, + { + "epoch": 0.004131599081866871, + "grad_norm": 6.141293673107684, + "learning_rate": 1.3740458015267178e-06, + "loss": 0.9215, + "step": 27 + }, + { + "epoch": 0.004284621270084162, + "grad_norm": 8.443582458602195, + "learning_rate": 1.424936386768448e-06, + "loss": 0.9543, + "step": 28 + }, + { + "epoch": 0.004437643458301454, + "grad_norm": 6.219529039996243, + "learning_rate": 1.475826972010178e-06, + "loss": 1.1055, + "step": 29 + }, + { + "epoch": 0.0045906656465187455, + "grad_norm": 6.723218647037651, + "learning_rate": 1.5267175572519086e-06, + "loss": 0.8657, + "step": 30 + }, + { + "epoch": 0.004743687834736037, + "grad_norm": 6.925216418396402, + "learning_rate": 1.5776081424936388e-06, + "loss": 1.0145, + "step": 31 + }, + { + "epoch": 0.004896710022953328, + "grad_norm": 5.817861415648484, + "learning_rate": 1.6284987277353691e-06, + "loss": 0.9494, + "step": 32 + }, + { + "epoch": 0.0050497322111706195, + "grad_norm": 6.517661419074156, + "learning_rate": 1.6793893129770995e-06, + "loss": 1.0573, + "step": 33 + }, + { + "epoch": 0.005202754399387911, + "grad_norm": 7.507479750968244, + "learning_rate": 1.7302798982188296e-06, + "loss": 0.8793, + "step": 34 + }, + { + "epoch": 0.005355776587605203, + "grad_norm": 6.961283303110031, + "learning_rate": 1.78117048346056e-06, + "loss": 0.9334, + "step": 35 + }, + { + "epoch": 0.0055087987758224944, + "grad_norm": 7.154082162667194, + "learning_rate": 1.8320610687022903e-06, + "loss": 0.8061, + "step": 36 + }, + { + "epoch": 0.005661820964039786, + "grad_norm": 6.110703133559988, + "learning_rate": 1.8829516539440204e-06, + "loss": 0.8045, + "step": 37 + }, + { + "epoch": 0.005814843152257078, + "grad_norm": 6.144648736565948, + "learning_rate": 1.933842239185751e-06, + "loss": 0.7255, + "step": 38 + }, + { + "epoch": 0.0059678653404743685, + "grad_norm": 6.104408552237334, + "learning_rate": 1.984732824427481e-06, + "loss": 0.8799, + "step": 39 + }, + { + "epoch": 0.00612088752869166, + "grad_norm": 6.224216203164156, + "learning_rate": 2.0356234096692112e-06, + "loss": 0.8629, + "step": 40 + }, + { + "epoch": 0.006273909716908952, + "grad_norm": 5.354540996861285, + "learning_rate": 2.0865139949109416e-06, + "loss": 0.9332, + "step": 41 + }, + { + "epoch": 0.006426931905126243, + "grad_norm": 7.248232082302233, + "learning_rate": 2.137404580152672e-06, + "loss": 0.879, + "step": 42 + }, + { + "epoch": 0.006579954093343535, + "grad_norm": 6.526634157834574, + "learning_rate": 2.1882951653944023e-06, + "loss": 0.8931, + "step": 43 + }, + { + "epoch": 0.006732976281560827, + "grad_norm": 5.558985637856043, + "learning_rate": 2.2391857506361326e-06, + "loss": 0.9254, + "step": 44 + }, + { + "epoch": 0.006885998469778117, + "grad_norm": 5.143083416175365, + "learning_rate": 2.2900763358778625e-06, + "loss": 
0.7931, + "step": 45 + }, + { + "epoch": 0.007039020657995409, + "grad_norm": 6.230004131099689, + "learning_rate": 2.3409669211195933e-06, + "loss": 0.8783, + "step": 46 + }, + { + "epoch": 0.007192042846212701, + "grad_norm": 6.96270555642169, + "learning_rate": 2.3918575063613232e-06, + "loss": 0.8966, + "step": 47 + }, + { + "epoch": 0.007345065034429992, + "grad_norm": 4.725287444763657, + "learning_rate": 2.4427480916030536e-06, + "loss": 0.792, + "step": 48 + }, + { + "epoch": 0.007498087222647284, + "grad_norm": 4.516169085236729, + "learning_rate": 2.493638676844784e-06, + "loss": 0.8188, + "step": 49 + }, + { + "epoch": 0.0076511094108645756, + "grad_norm": 6.169099136800104, + "learning_rate": 2.5445292620865143e-06, + "loss": 0.9132, + "step": 50 + }, + { + "epoch": 0.007804131599081867, + "grad_norm": 5.030562592462142, + "learning_rate": 2.595419847328244e-06, + "loss": 0.9125, + "step": 51 + }, + { + "epoch": 0.007957153787299158, + "grad_norm": 5.741056522914908, + "learning_rate": 2.6463104325699745e-06, + "loss": 0.8294, + "step": 52 + }, + { + "epoch": 0.00811017597551645, + "grad_norm": 5.3502817029143515, + "learning_rate": 2.6972010178117053e-06, + "loss": 0.8005, + "step": 53 + }, + { + "epoch": 0.008263198163733741, + "grad_norm": 5.153661369992958, + "learning_rate": 2.7480916030534356e-06, + "loss": 0.8175, + "step": 54 + }, + { + "epoch": 0.008416220351951033, + "grad_norm": 6.046120010370828, + "learning_rate": 2.7989821882951656e-06, + "loss": 0.8508, + "step": 55 + }, + { + "epoch": 0.008569242540168324, + "grad_norm": 6.292867560774823, + "learning_rate": 2.849872773536896e-06, + "loss": 0.9253, + "step": 56 + }, + { + "epoch": 0.008722264728385616, + "grad_norm": 6.105008240723325, + "learning_rate": 2.900763358778626e-06, + "loss": 0.9678, + "step": 57 + }, + { + "epoch": 0.008875286916602908, + "grad_norm": 4.459519530250151, + "learning_rate": 2.951653944020356e-06, + "loss": 0.857, + "step": 58 + }, + { + "epoch": 0.0090283091048202, + "grad_norm": 4.966961705007526, + "learning_rate": 3.002544529262087e-06, + "loss": 0.8162, + "step": 59 + }, + { + "epoch": 0.009181331293037491, + "grad_norm": 5.179614147649331, + "learning_rate": 3.0534351145038173e-06, + "loss": 0.7892, + "step": 60 + }, + { + "epoch": 0.009334353481254783, + "grad_norm": 6.585863543513201, + "learning_rate": 3.104325699745547e-06, + "loss": 0.81, + "step": 61 + }, + { + "epoch": 0.009487375669472074, + "grad_norm": 5.684014842731852, + "learning_rate": 3.1552162849872775e-06, + "loss": 0.9146, + "step": 62 + }, + { + "epoch": 0.009640397857689364, + "grad_norm": 4.941334064996768, + "learning_rate": 3.206106870229008e-06, + "loss": 0.8218, + "step": 63 + }, + { + "epoch": 0.009793420045906656, + "grad_norm": 5.436505901693147, + "learning_rate": 3.2569974554707382e-06, + "loss": 0.7455, + "step": 64 + }, + { + "epoch": 0.009946442234123947, + "grad_norm": 4.827550970928569, + "learning_rate": 3.3078880407124686e-06, + "loss": 0.7799, + "step": 65 + }, + { + "epoch": 0.010099464422341239, + "grad_norm": 5.1126560035818125, + "learning_rate": 3.358778625954199e-06, + "loss": 0.8829, + "step": 66 + }, + { + "epoch": 0.01025248661055853, + "grad_norm": 4.734944949714936, + "learning_rate": 3.409669211195929e-06, + "loss": 0.8362, + "step": 67 + }, + { + "epoch": 0.010405508798775822, + "grad_norm": 5.1152605564609726, + "learning_rate": 3.460559796437659e-06, + "loss": 0.806, + "step": 68 + }, + { + "epoch": 0.010558530986993114, + "grad_norm": 5.501335018085708, + "learning_rate": 
3.5114503816793895e-06, + "loss": 0.8836, + "step": 69 + }, + { + "epoch": 0.010711553175210406, + "grad_norm": 5.6907042929238, + "learning_rate": 3.56234096692112e-06, + "loss": 0.7729, + "step": 70 + }, + { + "epoch": 0.010864575363427697, + "grad_norm": 4.7874920288700755, + "learning_rate": 3.6132315521628502e-06, + "loss": 0.7401, + "step": 71 + }, + { + "epoch": 0.011017597551644989, + "grad_norm": 5.34946991015153, + "learning_rate": 3.6641221374045806e-06, + "loss": 0.804, + "step": 72 + }, + { + "epoch": 0.01117061973986228, + "grad_norm": 6.0799857986381, + "learning_rate": 3.7150127226463105e-06, + "loss": 0.845, + "step": 73 + }, + { + "epoch": 0.011323641928079572, + "grad_norm": 6.075178425573325, + "learning_rate": 3.765903307888041e-06, + "loss": 0.7568, + "step": 74 + }, + { + "epoch": 0.011476664116296864, + "grad_norm": 4.759854629433448, + "learning_rate": 3.816793893129772e-06, + "loss": 0.7735, + "step": 75 + }, + { + "epoch": 0.011629686304514155, + "grad_norm": 5.486398129731232, + "learning_rate": 3.867684478371502e-06, + "loss": 0.788, + "step": 76 + }, + { + "epoch": 0.011782708492731445, + "grad_norm": 5.097255872801085, + "learning_rate": 3.918575063613232e-06, + "loss": 0.7627, + "step": 77 + }, + { + "epoch": 0.011935730680948737, + "grad_norm": 4.242028635114688, + "learning_rate": 3.969465648854962e-06, + "loss": 0.7489, + "step": 78 + }, + { + "epoch": 0.012088752869166029, + "grad_norm": 5.948491901588452, + "learning_rate": 4.020356234096692e-06, + "loss": 0.8135, + "step": 79 + }, + { + "epoch": 0.01224177505738332, + "grad_norm": 6.895223832667884, + "learning_rate": 4.0712468193384225e-06, + "loss": 0.84, + "step": 80 + }, + { + "epoch": 0.012394797245600612, + "grad_norm": 4.672739171576903, + "learning_rate": 4.122137404580153e-06, + "loss": 0.7619, + "step": 81 + }, + { + "epoch": 0.012547819433817903, + "grad_norm": 4.6442577318217815, + "learning_rate": 4.173027989821883e-06, + "loss": 0.8648, + "step": 82 + }, + { + "epoch": 0.012700841622035195, + "grad_norm": 5.053558792090741, + "learning_rate": 4.2239185750636135e-06, + "loss": 0.8448, + "step": 83 + }, + { + "epoch": 0.012853863810252487, + "grad_norm": 4.530911050956989, + "learning_rate": 4.274809160305344e-06, + "loss": 0.8076, + "step": 84 + }, + { + "epoch": 0.013006885998469778, + "grad_norm": 6.262686478254499, + "learning_rate": 4.325699745547074e-06, + "loss": 0.9201, + "step": 85 + }, + { + "epoch": 0.01315990818668707, + "grad_norm": 4.640895988619081, + "learning_rate": 4.3765903307888045e-06, + "loss": 0.7195, + "step": 86 + }, + { + "epoch": 0.013312930374904362, + "grad_norm": 5.0565684924592755, + "learning_rate": 4.427480916030535e-06, + "loss": 0.8627, + "step": 87 + }, + { + "epoch": 0.013465952563121653, + "grad_norm": 5.283088934487553, + "learning_rate": 4.478371501272265e-06, + "loss": 0.7922, + "step": 88 + }, + { + "epoch": 0.013618974751338945, + "grad_norm": 5.026171701957064, + "learning_rate": 4.5292620865139956e-06, + "loss": 0.8261, + "step": 89 + }, + { + "epoch": 0.013771996939556235, + "grad_norm": 5.884463834573161, + "learning_rate": 4.580152671755725e-06, + "loss": 0.8093, + "step": 90 + }, + { + "epoch": 0.013925019127773526, + "grad_norm": 4.438561387505246, + "learning_rate": 4.631043256997455e-06, + "loss": 0.7371, + "step": 91 + }, + { + "epoch": 0.014078041315990818, + "grad_norm": 5.305169117726312, + "learning_rate": 4.681933842239187e-06, + "loss": 0.8009, + "step": 92 + }, + { + "epoch": 0.01423106350420811, + "grad_norm": 5.419418332275794, + 
"learning_rate": 4.732824427480917e-06, + "loss": 0.8181, + "step": 93 + }, + { + "epoch": 0.014384085692425401, + "grad_norm": 5.040887136301689, + "learning_rate": 4.7837150127226464e-06, + "loss": 0.7963, + "step": 94 + }, + { + "epoch": 0.014537107880642693, + "grad_norm": 6.45075012480531, + "learning_rate": 4.834605597964377e-06, + "loss": 0.6717, + "step": 95 + }, + { + "epoch": 0.014690130068859985, + "grad_norm": 5.015628494483557, + "learning_rate": 4.885496183206107e-06, + "loss": 0.7777, + "step": 96 + }, + { + "epoch": 0.014843152257077276, + "grad_norm": 4.517147075856258, + "learning_rate": 4.9363867684478375e-06, + "loss": 0.7682, + "step": 97 + }, + { + "epoch": 0.014996174445294568, + "grad_norm": 5.015648995454229, + "learning_rate": 4.987277353689568e-06, + "loss": 0.793, + "step": 98 + }, + { + "epoch": 0.01514919663351186, + "grad_norm": 4.91359741535642, + "learning_rate": 5.038167938931297e-06, + "loss": 0.7853, + "step": 99 + }, + { + "epoch": 0.015302218821729151, + "grad_norm": 4.88280014065132, + "learning_rate": 5.0890585241730285e-06, + "loss": 0.7867, + "step": 100 + }, + { + "epoch": 0.015455241009946443, + "grad_norm": 5.310203953123644, + "learning_rate": 5.139949109414759e-06, + "loss": 0.8291, + "step": 101 + }, + { + "epoch": 0.015608263198163734, + "grad_norm": 5.063524860125988, + "learning_rate": 5.190839694656488e-06, + "loss": 0.8207, + "step": 102 + }, + { + "epoch": 0.015761285386381024, + "grad_norm": 5.476946259775138, + "learning_rate": 5.2417302798982195e-06, + "loss": 0.7438, + "step": 103 + }, + { + "epoch": 0.015914307574598316, + "grad_norm": 4.563268298277021, + "learning_rate": 5.292620865139949e-06, + "loss": 0.8014, + "step": 104 + }, + { + "epoch": 0.016067329762815608, + "grad_norm": 4.766384946361356, + "learning_rate": 5.34351145038168e-06, + "loss": 0.7385, + "step": 105 + }, + { + "epoch": 0.0162203519510329, + "grad_norm": 4.959052123725921, + "learning_rate": 5.394402035623411e-06, + "loss": 0.6631, + "step": 106 + }, + { + "epoch": 0.01637337413925019, + "grad_norm": 4.660517289576387, + "learning_rate": 5.44529262086514e-06, + "loss": 0.7539, + "step": 107 + }, + { + "epoch": 0.016526396327467482, + "grad_norm": 4.760332379137938, + "learning_rate": 5.496183206106871e-06, + "loss": 0.8044, + "step": 108 + }, + { + "epoch": 0.016679418515684774, + "grad_norm": 5.213877297259624, + "learning_rate": 5.547073791348601e-06, + "loss": 0.8032, + "step": 109 + }, + { + "epoch": 0.016832440703902066, + "grad_norm": 4.500414357173698, + "learning_rate": 5.597964376590331e-06, + "loss": 0.851, + "step": 110 + }, + { + "epoch": 0.016985462892119357, + "grad_norm": 5.20758083198332, + "learning_rate": 5.648854961832062e-06, + "loss": 0.8376, + "step": 111 + }, + { + "epoch": 0.01713848508033665, + "grad_norm": 4.313432158821755, + "learning_rate": 5.699745547073792e-06, + "loss": 0.7425, + "step": 112 + }, + { + "epoch": 0.01729150726855394, + "grad_norm": 4.384745758633966, + "learning_rate": 5.750636132315522e-06, + "loss": 0.9117, + "step": 113 + }, + { + "epoch": 0.017444529456771232, + "grad_norm": 5.289188278919928, + "learning_rate": 5.801526717557252e-06, + "loss": 0.7709, + "step": 114 + }, + { + "epoch": 0.017597551644988524, + "grad_norm": 4.718040596929263, + "learning_rate": 5.852417302798983e-06, + "loss": 0.825, + "step": 115 + }, + { + "epoch": 0.017750573833205815, + "grad_norm": 4.4740329916884045, + "learning_rate": 5.903307888040712e-06, + "loss": 0.8031, + "step": 116 + }, + { + "epoch": 0.017903596021423107, + 
"grad_norm": 3.8294489939326657, + "learning_rate": 5.9541984732824435e-06, + "loss": 0.7537, + "step": 117 + }, + { + "epoch": 0.0180566182096404, + "grad_norm": 5.3921477281799906, + "learning_rate": 6.005089058524174e-06, + "loss": 0.8343, + "step": 118 + }, + { + "epoch": 0.01820964039785769, + "grad_norm": 4.585186066396906, + "learning_rate": 6.055979643765903e-06, + "loss": 0.8059, + "step": 119 + }, + { + "epoch": 0.018362662586074982, + "grad_norm": 4.164993298962277, + "learning_rate": 6.1068702290076346e-06, + "loss": 0.7707, + "step": 120 + }, + { + "epoch": 0.018515684774292274, + "grad_norm": 4.7470966220472715, + "learning_rate": 6.157760814249364e-06, + "loss": 0.7635, + "step": 121 + }, + { + "epoch": 0.018668706962509565, + "grad_norm": 4.57053122827436, + "learning_rate": 6.208651399491094e-06, + "loss": 0.7802, + "step": 122 + }, + { + "epoch": 0.018821729150726857, + "grad_norm": 4.764792340656931, + "learning_rate": 6.259541984732826e-06, + "loss": 0.7754, + "step": 123 + }, + { + "epoch": 0.01897475133894415, + "grad_norm": 4.673906229696043, + "learning_rate": 6.310432569974555e-06, + "loss": 0.7227, + "step": 124 + }, + { + "epoch": 0.019127773527161437, + "grad_norm": 4.254889278087384, + "learning_rate": 6.3613231552162854e-06, + "loss": 0.7967, + "step": 125 + }, + { + "epoch": 0.01928079571537873, + "grad_norm": 3.9781988510098314, + "learning_rate": 6.412213740458016e-06, + "loss": 0.6657, + "step": 126 + }, + { + "epoch": 0.01943381790359602, + "grad_norm": 4.294430988412905, + "learning_rate": 6.463104325699746e-06, + "loss": 0.8075, + "step": 127 + }, + { + "epoch": 0.01958684009181331, + "grad_norm": 4.562471061393905, + "learning_rate": 6.5139949109414765e-06, + "loss": 0.8503, + "step": 128 + }, + { + "epoch": 0.019739862280030603, + "grad_norm": 4.549436052521284, + "learning_rate": 6.564885496183207e-06, + "loss": 0.8009, + "step": 129 + }, + { + "epoch": 0.019892884468247895, + "grad_norm": 3.835829653833588, + "learning_rate": 6.615776081424937e-06, + "loss": 0.7992, + "step": 130 + }, + { + "epoch": 0.020045906656465187, + "grad_norm": 4.267030813968025, + "learning_rate": 6.666666666666667e-06, + "loss": 0.7283, + "step": 131 + }, + { + "epoch": 0.020198928844682478, + "grad_norm": 3.839774979418662, + "learning_rate": 6.717557251908398e-06, + "loss": 0.8088, + "step": 132 + }, + { + "epoch": 0.02035195103289977, + "grad_norm": 4.05482208608787, + "learning_rate": 6.768447837150128e-06, + "loss": 0.691, + "step": 133 + }, + { + "epoch": 0.02050497322111706, + "grad_norm": 4.39540221039343, + "learning_rate": 6.819338422391858e-06, + "loss": 0.769, + "step": 134 + }, + { + "epoch": 0.020657995409334353, + "grad_norm": 4.889279260558998, + "learning_rate": 6.870229007633589e-06, + "loss": 0.7327, + "step": 135 + }, + { + "epoch": 0.020811017597551645, + "grad_norm": 4.256993693098611, + "learning_rate": 6.921119592875318e-06, + "loss": 0.8079, + "step": 136 + }, + { + "epoch": 0.020964039785768936, + "grad_norm": 3.907777183976378, + "learning_rate": 6.972010178117049e-06, + "loss": 0.7274, + "step": 137 + }, + { + "epoch": 0.021117061973986228, + "grad_norm": 5.169921684229104, + "learning_rate": 7.022900763358779e-06, + "loss": 0.7762, + "step": 138 + }, + { + "epoch": 0.02127008416220352, + "grad_norm": 3.705954278589657, + "learning_rate": 7.073791348600509e-06, + "loss": 0.7769, + "step": 139 + }, + { + "epoch": 0.02142310635042081, + "grad_norm": 4.415407291310754, + "learning_rate": 7.12468193384224e-06, + "loss": 0.7743, + "step": 140 + }, + 
{ + "epoch": 0.021576128538638103, + "grad_norm": 3.9001765524312577, + "learning_rate": 7.17557251908397e-06, + "loss": 0.7702, + "step": 141 + }, + { + "epoch": 0.021729150726855394, + "grad_norm": 4.010966551248577, + "learning_rate": 7.2264631043257004e-06, + "loss": 0.794, + "step": 142 + }, + { + "epoch": 0.021882172915072686, + "grad_norm": 4.547219140713333, + "learning_rate": 7.27735368956743e-06, + "loss": 0.8035, + "step": 143 + }, + { + "epoch": 0.022035195103289978, + "grad_norm": 3.895437930156407, + "learning_rate": 7.328244274809161e-06, + "loss": 0.7342, + "step": 144 + }, + { + "epoch": 0.02218821729150727, + "grad_norm": 3.7831765310187784, + "learning_rate": 7.3791348600508915e-06, + "loss": 0.7207, + "step": 145 + }, + { + "epoch": 0.02234123947972456, + "grad_norm": 4.02944403589167, + "learning_rate": 7.430025445292621e-06, + "loss": 0.6893, + "step": 146 + }, + { + "epoch": 0.022494261667941853, + "grad_norm": 3.955758005563262, + "learning_rate": 7.480916030534352e-06, + "loss": 0.744, + "step": 147 + }, + { + "epoch": 0.022647283856159144, + "grad_norm": 4.067883709052432, + "learning_rate": 7.531806615776082e-06, + "loss": 0.755, + "step": 148 + }, + { + "epoch": 0.022800306044376436, + "grad_norm": 4.2402099809259, + "learning_rate": 7.582697201017812e-06, + "loss": 0.7422, + "step": 149 + }, + { + "epoch": 0.022953328232593728, + "grad_norm": 4.262612871583228, + "learning_rate": 7.633587786259543e-06, + "loss": 0.8447, + "step": 150 + }, + { + "epoch": 0.02310635042081102, + "grad_norm": 3.7947232149122043, + "learning_rate": 7.684478371501274e-06, + "loss": 0.745, + "step": 151 + }, + { + "epoch": 0.02325937260902831, + "grad_norm": 4.143133247827625, + "learning_rate": 7.735368956743004e-06, + "loss": 0.8246, + "step": 152 + }, + { + "epoch": 0.0234123947972456, + "grad_norm": 3.9840463660552916, + "learning_rate": 7.786259541984733e-06, + "loss": 0.817, + "step": 153 + }, + { + "epoch": 0.02356541698546289, + "grad_norm": 4.101939658807749, + "learning_rate": 7.837150127226465e-06, + "loss": 0.8084, + "step": 154 + }, + { + "epoch": 0.023718439173680182, + "grad_norm": 4.061501850729827, + "learning_rate": 7.888040712468195e-06, + "loss": 0.8093, + "step": 155 + }, + { + "epoch": 0.023871461361897474, + "grad_norm": 4.822404469329433, + "learning_rate": 7.938931297709924e-06, + "loss": 0.8627, + "step": 156 + }, + { + "epoch": 0.024024483550114765, + "grad_norm": 4.64280647954761, + "learning_rate": 7.989821882951656e-06, + "loss": 0.7448, + "step": 157 + }, + { + "epoch": 0.024177505738332057, + "grad_norm": 4.03108430157329, + "learning_rate": 8.040712468193384e-06, + "loss": 0.7671, + "step": 158 + }, + { + "epoch": 0.02433052792654935, + "grad_norm": 4.4845780168203575, + "learning_rate": 8.091603053435115e-06, + "loss": 0.813, + "step": 159 + }, + { + "epoch": 0.02448355011476664, + "grad_norm": 4.345621286114522, + "learning_rate": 8.142493638676845e-06, + "loss": 0.7408, + "step": 160 + }, + { + "epoch": 0.024636572302983932, + "grad_norm": 4.6919797838381605, + "learning_rate": 8.193384223918575e-06, + "loss": 0.8106, + "step": 161 + }, + { + "epoch": 0.024789594491201224, + "grad_norm": 4.333359656214333, + "learning_rate": 8.244274809160306e-06, + "loss": 0.8115, + "step": 162 + }, + { + "epoch": 0.024942616679418515, + "grad_norm": 3.81217702704086, + "learning_rate": 8.295165394402036e-06, + "loss": 0.7482, + "step": 163 + }, + { + "epoch": 0.025095638867635807, + "grad_norm": 4.44804919658539, + "learning_rate": 8.346055979643766e-06, + "loss": 
0.8544, + "step": 164 + }, + { + "epoch": 0.0252486610558531, + "grad_norm": 4.131700271663872, + "learning_rate": 8.396946564885497e-06, + "loss": 0.7262, + "step": 165 + }, + { + "epoch": 0.02540168324407039, + "grad_norm": 4.951792235136878, + "learning_rate": 8.447837150127227e-06, + "loss": 0.853, + "step": 166 + }, + { + "epoch": 0.025554705432287682, + "grad_norm": 4.326303123701613, + "learning_rate": 8.498727735368957e-06, + "loss": 0.7414, + "step": 167 + }, + { + "epoch": 0.025707727620504973, + "grad_norm": 3.955742466785765, + "learning_rate": 8.549618320610688e-06, + "loss": 0.6963, + "step": 168 + }, + { + "epoch": 0.025860749808722265, + "grad_norm": 3.9376734300269995, + "learning_rate": 8.600508905852418e-06, + "loss": 0.655, + "step": 169 + }, + { + "epoch": 0.026013771996939557, + "grad_norm": 5.053922872384057, + "learning_rate": 8.651399491094148e-06, + "loss": 0.8598, + "step": 170 + }, + { + "epoch": 0.02616679418515685, + "grad_norm": 3.940496090207759, + "learning_rate": 8.702290076335879e-06, + "loss": 0.7512, + "step": 171 + }, + { + "epoch": 0.02631981637337414, + "grad_norm": 4.22917739285267, + "learning_rate": 8.753180661577609e-06, + "loss": 0.9123, + "step": 172 + }, + { + "epoch": 0.02647283856159143, + "grad_norm": 4.508693336682513, + "learning_rate": 8.80407124681934e-06, + "loss": 0.8793, + "step": 173 + }, + { + "epoch": 0.026625860749808723, + "grad_norm": 4.233775278965739, + "learning_rate": 8.85496183206107e-06, + "loss": 0.8819, + "step": 174 + }, + { + "epoch": 0.026778882938026015, + "grad_norm": 3.7511210956873544, + "learning_rate": 8.9058524173028e-06, + "loss": 0.7657, + "step": 175 + }, + { + "epoch": 0.026931905126243307, + "grad_norm": 4.143781255150889, + "learning_rate": 8.95674300254453e-06, + "loss": 0.7149, + "step": 176 + }, + { + "epoch": 0.027084927314460598, + "grad_norm": 4.671966201492468, + "learning_rate": 9.007633587786259e-06, + "loss": 0.9099, + "step": 177 + }, + { + "epoch": 0.02723794950267789, + "grad_norm": 3.8699485585110627, + "learning_rate": 9.058524173027991e-06, + "loss": 0.7838, + "step": 178 + }, + { + "epoch": 0.02739097169089518, + "grad_norm": 3.7570884060183305, + "learning_rate": 9.109414758269721e-06, + "loss": 0.8138, + "step": 179 + }, + { + "epoch": 0.02754399387911247, + "grad_norm": 4.294913936841194, + "learning_rate": 9.16030534351145e-06, + "loss": 0.8253, + "step": 180 + }, + { + "epoch": 0.02769701606732976, + "grad_norm": 4.424497607816517, + "learning_rate": 9.211195928753182e-06, + "loss": 0.7881, + "step": 181 + }, + { + "epoch": 0.027850038255547053, + "grad_norm": 3.9878850555273604, + "learning_rate": 9.26208651399491e-06, + "loss": 0.7683, + "step": 182 + }, + { + "epoch": 0.028003060443764344, + "grad_norm": 4.214766952361433, + "learning_rate": 9.312977099236641e-06, + "loss": 0.811, + "step": 183 + }, + { + "epoch": 0.028156082631981636, + "grad_norm": 4.299160854226878, + "learning_rate": 9.363867684478373e-06, + "loss": 0.8099, + "step": 184 + }, + { + "epoch": 0.028309104820198928, + "grad_norm": 4.261601197119955, + "learning_rate": 9.414758269720102e-06, + "loss": 0.8327, + "step": 185 + }, + { + "epoch": 0.02846212700841622, + "grad_norm": 4.348418153113838, + "learning_rate": 9.465648854961834e-06, + "loss": 0.8351, + "step": 186 + }, + { + "epoch": 0.02861514919663351, + "grad_norm": 4.159462607341161, + "learning_rate": 9.516539440203563e-06, + "loss": 0.8109, + "step": 187 + }, + { + "epoch": 0.028768171384850803, + "grad_norm": 4.139615856283388, + "learning_rate": 
9.567430025445293e-06, + "loss": 0.808, + "step": 188 + }, + { + "epoch": 0.028921193573068094, + "grad_norm": 3.966142369811451, + "learning_rate": 9.618320610687025e-06, + "loss": 0.8221, + "step": 189 + }, + { + "epoch": 0.029074215761285386, + "grad_norm": 4.2883136258405585, + "learning_rate": 9.669211195928754e-06, + "loss": 0.7766, + "step": 190 + }, + { + "epoch": 0.029227237949502678, + "grad_norm": 3.5038128496057706, + "learning_rate": 9.720101781170484e-06, + "loss": 0.802, + "step": 191 + }, + { + "epoch": 0.02938026013771997, + "grad_norm": 4.018460389006565, + "learning_rate": 9.770992366412214e-06, + "loss": 0.8793, + "step": 192 + }, + { + "epoch": 0.02953328232593726, + "grad_norm": 4.197653542836699, + "learning_rate": 9.821882951653945e-06, + "loss": 0.7636, + "step": 193 + }, + { + "epoch": 0.029686304514154552, + "grad_norm": 4.3491719420032275, + "learning_rate": 9.872773536895675e-06, + "loss": 0.8217, + "step": 194 + }, + { + "epoch": 0.029839326702371844, + "grad_norm": 3.896319127881109, + "learning_rate": 9.923664122137405e-06, + "loss": 0.6789, + "step": 195 + }, + { + "epoch": 0.029992348890589136, + "grad_norm": 4.135458574765948, + "learning_rate": 9.974554707379136e-06, + "loss": 0.7865, + "step": 196 + }, + { + "epoch": 0.030145371078806427, + "grad_norm": 4.497645124133756, + "learning_rate": 1.0025445292620866e-05, + "loss": 0.8283, + "step": 197 + }, + { + "epoch": 0.03029839326702372, + "grad_norm": 3.5421499415575246, + "learning_rate": 1.0076335877862595e-05, + "loss": 0.9806, + "step": 198 + }, + { + "epoch": 0.03045141545524101, + "grad_norm": 3.5582689056601446, + "learning_rate": 1.0127226463104327e-05, + "loss": 0.7229, + "step": 199 + }, + { + "epoch": 0.030604437643458302, + "grad_norm": 4.505265157757878, + "learning_rate": 1.0178117048346057e-05, + "loss": 0.8239, + "step": 200 + }, + { + "epoch": 0.030757459831675594, + "grad_norm": 3.785002672226262, + "learning_rate": 1.0229007633587786e-05, + "loss": 0.8285, + "step": 201 + }, + { + "epoch": 0.030910482019892885, + "grad_norm": 4.056563880467215, + "learning_rate": 1.0279898218829518e-05, + "loss": 0.8093, + "step": 202 + }, + { + "epoch": 0.031063504208110177, + "grad_norm": 3.8406342164711114, + "learning_rate": 1.0330788804071248e-05, + "loss": 0.8487, + "step": 203 + }, + { + "epoch": 0.03121652639632747, + "grad_norm": 4.52064054660571, + "learning_rate": 1.0381679389312977e-05, + "loss": 0.7581, + "step": 204 + }, + { + "epoch": 0.03136954858454476, + "grad_norm": 3.6506482696933977, + "learning_rate": 1.0432569974554709e-05, + "loss": 0.7612, + "step": 205 + }, + { + "epoch": 0.03152257077276205, + "grad_norm": 4.383431897980942, + "learning_rate": 1.0483460559796439e-05, + "loss": 0.8776, + "step": 206 + }, + { + "epoch": 0.03167559296097934, + "grad_norm": 3.741616408368712, + "learning_rate": 1.0534351145038168e-05, + "loss": 0.7634, + "step": 207 + }, + { + "epoch": 0.03182861514919663, + "grad_norm": 4.448598087029719, + "learning_rate": 1.0585241730279898e-05, + "loss": 0.8928, + "step": 208 + }, + { + "epoch": 0.03198163733741392, + "grad_norm": 3.7260572298800136, + "learning_rate": 1.063613231552163e-05, + "loss": 0.7409, + "step": 209 + }, + { + "epoch": 0.032134659525631215, + "grad_norm": 4.082446929366836, + "learning_rate": 1.068702290076336e-05, + "loss": 0.7702, + "step": 210 + }, + { + "epoch": 0.03228768171384851, + "grad_norm": 4.276111120838432, + "learning_rate": 1.0737913486005089e-05, + "loss": 0.8451, + "step": 211 + }, + { + "epoch": 0.0324407039020658, + 
"grad_norm": 3.5982082879998303, + "learning_rate": 1.0788804071246821e-05, + "loss": 0.7412, + "step": 212 + }, + { + "epoch": 0.03259372609028309, + "grad_norm": 3.7182399171777427, + "learning_rate": 1.0839694656488552e-05, + "loss": 0.7734, + "step": 213 + }, + { + "epoch": 0.03274674827850038, + "grad_norm": 4.45090671467595, + "learning_rate": 1.089058524173028e-05, + "loss": 0.7803, + "step": 214 + }, + { + "epoch": 0.03289977046671767, + "grad_norm": 4.10877525262828, + "learning_rate": 1.094147582697201e-05, + "loss": 0.8259, + "step": 215 + }, + { + "epoch": 0.033052792654934965, + "grad_norm": 3.6500837791600693, + "learning_rate": 1.0992366412213743e-05, + "loss": 0.789, + "step": 216 + }, + { + "epoch": 0.033205814843152257, + "grad_norm": 3.5491002374247267, + "learning_rate": 1.1043256997455471e-05, + "loss": 0.7059, + "step": 217 + }, + { + "epoch": 0.03335883703136955, + "grad_norm": 3.4521743275023624, + "learning_rate": 1.1094147582697202e-05, + "loss": 0.7247, + "step": 218 + }, + { + "epoch": 0.03351185921958684, + "grad_norm": 3.8737746868152994, + "learning_rate": 1.1145038167938934e-05, + "loss": 0.7646, + "step": 219 + }, + { + "epoch": 0.03366488140780413, + "grad_norm": 3.8562641801906192, + "learning_rate": 1.1195928753180662e-05, + "loss": 0.8318, + "step": 220 + }, + { + "epoch": 0.03381790359602142, + "grad_norm": 3.912581720585126, + "learning_rate": 1.1246819338422393e-05, + "loss": 0.8093, + "step": 221 + }, + { + "epoch": 0.033970925784238715, + "grad_norm": 4.065770061675511, + "learning_rate": 1.1297709923664125e-05, + "loss": 0.7937, + "step": 222 + }, + { + "epoch": 0.034123947972456006, + "grad_norm": 3.6237024117778986, + "learning_rate": 1.1348600508905853e-05, + "loss": 0.7211, + "step": 223 + }, + { + "epoch": 0.0342769701606733, + "grad_norm": 3.199937206085619, + "learning_rate": 1.1399491094147584e-05, + "loss": 0.6572, + "step": 224 + }, + { + "epoch": 0.03442999234889059, + "grad_norm": 3.9340874687404392, + "learning_rate": 1.1450381679389312e-05, + "loss": 0.7167, + "step": 225 + }, + { + "epoch": 0.03458301453710788, + "grad_norm": 4.280709448985111, + "learning_rate": 1.1501272264631044e-05, + "loss": 0.8466, + "step": 226 + }, + { + "epoch": 0.03473603672532517, + "grad_norm": 4.079222952395199, + "learning_rate": 1.1552162849872775e-05, + "loss": 0.7506, + "step": 227 + }, + { + "epoch": 0.034889058913542464, + "grad_norm": 3.5818500329635308, + "learning_rate": 1.1603053435114503e-05, + "loss": 0.8351, + "step": 228 + }, + { + "epoch": 0.035042081101759756, + "grad_norm": 3.683692326325265, + "learning_rate": 1.1653944020356235e-05, + "loss": 0.8308, + "step": 229 + }, + { + "epoch": 0.03519510328997705, + "grad_norm": 4.314842659628774, + "learning_rate": 1.1704834605597966e-05, + "loss": 0.863, + "step": 230 + }, + { + "epoch": 0.03534812547819434, + "grad_norm": 3.8577935082344057, + "learning_rate": 1.1755725190839696e-05, + "loss": 0.7947, + "step": 231 + }, + { + "epoch": 0.03550114766641163, + "grad_norm": 4.001739000653889, + "learning_rate": 1.1806615776081425e-05, + "loss": 0.8155, + "step": 232 + }, + { + "epoch": 0.03565416985462892, + "grad_norm": 4.413074080050747, + "learning_rate": 1.1857506361323157e-05, + "loss": 0.8612, + "step": 233 + }, + { + "epoch": 0.035807192042846214, + "grad_norm": 3.7562800982675433, + "learning_rate": 1.1908396946564887e-05, + "loss": 0.7496, + "step": 234 + }, + { + "epoch": 0.035960214231063506, + "grad_norm": 4.676164618927764, + "learning_rate": 1.1959287531806616e-05, + "loss": 
0.7961, + "step": 235 + }, + { + "epoch": 0.0361132364192808, + "grad_norm": 4.1865326091807376, + "learning_rate": 1.2010178117048348e-05, + "loss": 0.7285, + "step": 236 + }, + { + "epoch": 0.03626625860749809, + "grad_norm": 4.330640395701582, + "learning_rate": 1.2061068702290078e-05, + "loss": 0.86, + "step": 237 + }, + { + "epoch": 0.03641928079571538, + "grad_norm": 4.221924703378774, + "learning_rate": 1.2111959287531807e-05, + "loss": 0.8088, + "step": 238 + }, + { + "epoch": 0.03657230298393267, + "grad_norm": 4.08774124838906, + "learning_rate": 1.2162849872773539e-05, + "loss": 0.7768, + "step": 239 + }, + { + "epoch": 0.036725325172149964, + "grad_norm": 3.6681049073966387, + "learning_rate": 1.2213740458015269e-05, + "loss": 0.8089, + "step": 240 + }, + { + "epoch": 0.036878347360367256, + "grad_norm": 4.339040545843134, + "learning_rate": 1.2264631043256998e-05, + "loss": 0.9708, + "step": 241 + }, + { + "epoch": 0.03703136954858455, + "grad_norm": 4.090377047572124, + "learning_rate": 1.2315521628498728e-05, + "loss": 0.7973, + "step": 242 + }, + { + "epoch": 0.03718439173680184, + "grad_norm": 3.8573977126801324, + "learning_rate": 1.236641221374046e-05, + "loss": 0.8493, + "step": 243 + }, + { + "epoch": 0.03733741392501913, + "grad_norm": 3.8417101718993036, + "learning_rate": 1.2417302798982189e-05, + "loss": 0.7898, + "step": 244 + }, + { + "epoch": 0.03749043611323642, + "grad_norm": 4.167929367567143, + "learning_rate": 1.2468193384223919e-05, + "loss": 0.8347, + "step": 245 + }, + { + "epoch": 0.037643458301453714, + "grad_norm": 4.304125499094029, + "learning_rate": 1.2519083969465651e-05, + "loss": 0.9077, + "step": 246 + }, + { + "epoch": 0.037796480489671005, + "grad_norm": 3.7732211137563523, + "learning_rate": 1.256997455470738e-05, + "loss": 0.7202, + "step": 247 + }, + { + "epoch": 0.0379495026778883, + "grad_norm": 4.028839146378367, + "learning_rate": 1.262086513994911e-05, + "loss": 0.9032, + "step": 248 + }, + { + "epoch": 0.03810252486610559, + "grad_norm": 3.571061110080041, + "learning_rate": 1.2671755725190839e-05, + "loss": 0.8451, + "step": 249 + }, + { + "epoch": 0.03825554705432287, + "grad_norm": 4.231210681726749, + "learning_rate": 1.2722646310432571e-05, + "loss": 0.8744, + "step": 250 + }, + { + "epoch": 0.038408569242540165, + "grad_norm": 3.8963900051843092, + "learning_rate": 1.2773536895674301e-05, + "loss": 0.8427, + "step": 251 + }, + { + "epoch": 0.03856159143075746, + "grad_norm": 3.983079418182341, + "learning_rate": 1.2824427480916032e-05, + "loss": 0.7799, + "step": 252 + }, + { + "epoch": 0.03871461361897475, + "grad_norm": 3.3692269874643967, + "learning_rate": 1.2875318066157762e-05, + "loss": 0.7349, + "step": 253 + }, + { + "epoch": 0.03886763580719204, + "grad_norm": 4.081444797385999, + "learning_rate": 1.2926208651399492e-05, + "loss": 0.8248, + "step": 254 + }, + { + "epoch": 0.03902065799540933, + "grad_norm": 3.5071517749127143, + "learning_rate": 1.2977099236641223e-05, + "loss": 0.7729, + "step": 255 + }, + { + "epoch": 0.03917368018362662, + "grad_norm": 3.983554583470299, + "learning_rate": 1.3027989821882953e-05, + "loss": 0.7358, + "step": 256 + }, + { + "epoch": 0.039326702371843915, + "grad_norm": 4.156162140200395, + "learning_rate": 1.3078880407124683e-05, + "loss": 0.7827, + "step": 257 + }, + { + "epoch": 0.039479724560061207, + "grad_norm": 3.7356062298023005, + "learning_rate": 1.3129770992366414e-05, + "loss": 0.8187, + "step": 258 + }, + { + "epoch": 0.0396327467482785, + "grad_norm": 4.41491523535067, + 
"learning_rate": 1.3180661577608142e-05, + "loss": 0.9266, + "step": 259 + }, + { + "epoch": 0.03978576893649579, + "grad_norm": 3.823408041477082, + "learning_rate": 1.3231552162849874e-05, + "loss": 0.7646, + "step": 260 + }, + { + "epoch": 0.03993879112471308, + "grad_norm": 4.187964678105524, + "learning_rate": 1.3282442748091605e-05, + "loss": 0.7775, + "step": 261 + }, + { + "epoch": 0.04009181331293037, + "grad_norm": 3.677752698776951, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.7735, + "step": 262 + }, + { + "epoch": 0.040244835501147665, + "grad_norm": 4.163618150951032, + "learning_rate": 1.3384223918575065e-05, + "loss": 0.8633, + "step": 263 + }, + { + "epoch": 0.040397857689364956, + "grad_norm": 4.340339901550398, + "learning_rate": 1.3435114503816796e-05, + "loss": 0.8143, + "step": 264 + }, + { + "epoch": 0.04055087987758225, + "grad_norm": 3.779345498739458, + "learning_rate": 1.3486005089058524e-05, + "loss": 0.7737, + "step": 265 + }, + { + "epoch": 0.04070390206579954, + "grad_norm": 3.4660216974591944, + "learning_rate": 1.3536895674300256e-05, + "loss": 0.7818, + "step": 266 + }, + { + "epoch": 0.04085692425401683, + "grad_norm": 3.460945209038202, + "learning_rate": 1.3587786259541987e-05, + "loss": 0.8819, + "step": 267 + }, + { + "epoch": 0.04100994644223412, + "grad_norm": 3.174432254917918, + "learning_rate": 1.3638676844783715e-05, + "loss": 0.8147, + "step": 268 + }, + { + "epoch": 0.041162968630451414, + "grad_norm": 3.6606958430717436, + "learning_rate": 1.3689567430025446e-05, + "loss": 0.7882, + "step": 269 + }, + { + "epoch": 0.041315990818668706, + "grad_norm": 3.4598901409929566, + "learning_rate": 1.3740458015267178e-05, + "loss": 0.7396, + "step": 270 + }, + { + "epoch": 0.041469013006886, + "grad_norm": 3.837680859754261, + "learning_rate": 1.3791348600508906e-05, + "loss": 0.9517, + "step": 271 + }, + { + "epoch": 0.04162203519510329, + "grad_norm": 4.205620824295019, + "learning_rate": 1.3842239185750637e-05, + "loss": 0.8136, + "step": 272 + }, + { + "epoch": 0.04177505738332058, + "grad_norm": 4.593353976664115, + "learning_rate": 1.3893129770992369e-05, + "loss": 0.8764, + "step": 273 + }, + { + "epoch": 0.04192807957153787, + "grad_norm": 3.674973105300058, + "learning_rate": 1.3944020356234097e-05, + "loss": 0.7451, + "step": 274 + }, + { + "epoch": 0.042081101759755164, + "grad_norm": 4.080257867421216, + "learning_rate": 1.3994910941475828e-05, + "loss": 0.938, + "step": 275 + }, + { + "epoch": 0.042234123947972456, + "grad_norm": 3.6825330496591513, + "learning_rate": 1.4045801526717558e-05, + "loss": 0.9145, + "step": 276 + }, + { + "epoch": 0.04238714613618975, + "grad_norm": 3.6676110960298263, + "learning_rate": 1.4096692111959288e-05, + "loss": 0.8281, + "step": 277 + }, + { + "epoch": 0.04254016832440704, + "grad_norm": 3.9692318784638876, + "learning_rate": 1.4147582697201019e-05, + "loss": 0.8391, + "step": 278 + }, + { + "epoch": 0.04269319051262433, + "grad_norm": 4.003302405983541, + "learning_rate": 1.4198473282442749e-05, + "loss": 0.8807, + "step": 279 + }, + { + "epoch": 0.04284621270084162, + "grad_norm": 3.6051890207501125, + "learning_rate": 1.424936386768448e-05, + "loss": 0.8002, + "step": 280 + }, + { + "epoch": 0.042999234889058914, + "grad_norm": 3.741199905910224, + "learning_rate": 1.430025445292621e-05, + "loss": 0.9184, + "step": 281 + }, + { + "epoch": 0.043152257077276206, + "grad_norm": 3.989340278328752, + "learning_rate": 1.435114503816794e-05, + "loss": 0.8947, + "step": 282 + }, + { + "epoch": 
0.0433052792654935, + "grad_norm": 3.64277436441139, + "learning_rate": 1.4402035623409672e-05, + "loss": 0.8931, + "step": 283 + }, + { + "epoch": 0.04345830145371079, + "grad_norm": 4.1014237450789635, + "learning_rate": 1.4452926208651401e-05, + "loss": 0.8706, + "step": 284 + }, + { + "epoch": 0.04361132364192808, + "grad_norm": 3.4366912250691297, + "learning_rate": 1.4503816793893131e-05, + "loss": 0.8406, + "step": 285 + }, + { + "epoch": 0.04376434583014537, + "grad_norm": 3.580030889617023, + "learning_rate": 1.455470737913486e-05, + "loss": 0.8221, + "step": 286 + }, + { + "epoch": 0.043917368018362664, + "grad_norm": 3.298085781868641, + "learning_rate": 1.4605597964376592e-05, + "loss": 0.7627, + "step": 287 + }, + { + "epoch": 0.044070390206579955, + "grad_norm": 3.6026238179183316, + "learning_rate": 1.4656488549618322e-05, + "loss": 0.8868, + "step": 288 + }, + { + "epoch": 0.04422341239479725, + "grad_norm": 3.5628106762524663, + "learning_rate": 1.4707379134860051e-05, + "loss": 0.8383, + "step": 289 + }, + { + "epoch": 0.04437643458301454, + "grad_norm": 4.098960888954263, + "learning_rate": 1.4758269720101783e-05, + "loss": 0.8245, + "step": 290 + }, + { + "epoch": 0.04452945677123183, + "grad_norm": 4.019679742440305, + "learning_rate": 1.4809160305343513e-05, + "loss": 0.9617, + "step": 291 + }, + { + "epoch": 0.04468247895944912, + "grad_norm": 3.870786617215219, + "learning_rate": 1.4860050890585242e-05, + "loss": 0.7536, + "step": 292 + }, + { + "epoch": 0.044835501147666414, + "grad_norm": 3.624983481704036, + "learning_rate": 1.4910941475826972e-05, + "loss": 0.7665, + "step": 293 + }, + { + "epoch": 0.044988523335883705, + "grad_norm": 3.4579342237989863, + "learning_rate": 1.4961832061068704e-05, + "loss": 0.7475, + "step": 294 + }, + { + "epoch": 0.045141545524101, + "grad_norm": 3.825987018762197, + "learning_rate": 1.5012722646310433e-05, + "loss": 0.7701, + "step": 295 + }, + { + "epoch": 0.04529456771231829, + "grad_norm": 3.925285067712268, + "learning_rate": 1.5063613231552163e-05, + "loss": 0.8424, + "step": 296 + }, + { + "epoch": 0.04544758990053558, + "grad_norm": 4.536531873258212, + "learning_rate": 1.5114503816793895e-05, + "loss": 0.8194, + "step": 297 + }, + { + "epoch": 0.04560061208875287, + "grad_norm": 3.849198877547217, + "learning_rate": 1.5165394402035624e-05, + "loss": 0.8097, + "step": 298 + }, + { + "epoch": 0.04575363427697016, + "grad_norm": 3.4268881311529, + "learning_rate": 1.5216284987277354e-05, + "loss": 0.7637, + "step": 299 + }, + { + "epoch": 0.045906656465187455, + "grad_norm": 3.844837610723275, + "learning_rate": 1.5267175572519086e-05, + "loss": 0.9146, + "step": 300 + }, + { + "epoch": 0.04605967865340475, + "grad_norm": 4.200860333219284, + "learning_rate": 1.5318066157760817e-05, + "loss": 0.8694, + "step": 301 + }, + { + "epoch": 0.04621270084162204, + "grad_norm": 3.708632369987497, + "learning_rate": 1.5368956743002547e-05, + "loss": 0.7955, + "step": 302 + }, + { + "epoch": 0.04636572302983933, + "grad_norm": 4.560895491616818, + "learning_rate": 1.5419847328244274e-05, + "loss": 0.8735, + "step": 303 + }, + { + "epoch": 0.04651874521805662, + "grad_norm": 4.1460005936560576, + "learning_rate": 1.5470737913486008e-05, + "loss": 0.9344, + "step": 304 + }, + { + "epoch": 0.046671767406273906, + "grad_norm": 3.5819446783097875, + "learning_rate": 1.5521628498727738e-05, + "loss": 0.8728, + "step": 305 + }, + { + "epoch": 0.0468247895944912, + "grad_norm": 3.5673030396101755, + "learning_rate": 1.5572519083969465e-05, 
+ "loss": 0.8558, + "step": 306 + }, + { + "epoch": 0.04697781178270849, + "grad_norm": 4.465799436044937, + "learning_rate": 1.56234096692112e-05, + "loss": 0.9466, + "step": 307 + }, + { + "epoch": 0.04713083397092578, + "grad_norm": 3.8186144985338335, + "learning_rate": 1.567430025445293e-05, + "loss": 0.8226, + "step": 308 + }, + { + "epoch": 0.04728385615914307, + "grad_norm": 3.0526556169711796, + "learning_rate": 1.5725190839694656e-05, + "loss": 0.8292, + "step": 309 + }, + { + "epoch": 0.047436878347360364, + "grad_norm": 3.8395154224833963, + "learning_rate": 1.577608142493639e-05, + "loss": 0.9308, + "step": 310 + }, + { + "epoch": 0.047589900535577656, + "grad_norm": 3.3739437105583048, + "learning_rate": 1.582697201017812e-05, + "loss": 0.8354, + "step": 311 + }, + { + "epoch": 0.04774292272379495, + "grad_norm": 3.428552157235765, + "learning_rate": 1.5877862595419847e-05, + "loss": 0.7738, + "step": 312 + }, + { + "epoch": 0.04789594491201224, + "grad_norm": 3.8083642028444857, + "learning_rate": 1.5928753180661577e-05, + "loss": 0.8379, + "step": 313 + }, + { + "epoch": 0.04804896710022953, + "grad_norm": 3.7472390334350854, + "learning_rate": 1.597964376590331e-05, + "loss": 0.7635, + "step": 314 + }, + { + "epoch": 0.04820198928844682, + "grad_norm": 3.5905420969046977, + "learning_rate": 1.6030534351145038e-05, + "loss": 0.84, + "step": 315 + }, + { + "epoch": 0.048355011476664114, + "grad_norm": 3.371134238680956, + "learning_rate": 1.608142493638677e-05, + "loss": 0.822, + "step": 316 + }, + { + "epoch": 0.048508033664881406, + "grad_norm": 3.251211187350607, + "learning_rate": 1.6132315521628502e-05, + "loss": 0.8663, + "step": 317 + }, + { + "epoch": 0.0486610558530987, + "grad_norm": 3.710129923001892, + "learning_rate": 1.618320610687023e-05, + "loss": 0.885, + "step": 318 + }, + { + "epoch": 0.04881407804131599, + "grad_norm": 3.4804407042961243, + "learning_rate": 1.623409669211196e-05, + "loss": 0.8388, + "step": 319 + }, + { + "epoch": 0.04896710022953328, + "grad_norm": 3.3398691087172763, + "learning_rate": 1.628498727735369e-05, + "loss": 0.8033, + "step": 320 + }, + { + "epoch": 0.04912012241775057, + "grad_norm": 3.4336060940387156, + "learning_rate": 1.633587786259542e-05, + "loss": 0.8656, + "step": 321 + }, + { + "epoch": 0.049273144605967864, + "grad_norm": 3.6774669512700693, + "learning_rate": 1.638676844783715e-05, + "loss": 0.9224, + "step": 322 + }, + { + "epoch": 0.049426166794185156, + "grad_norm": 3.707606788471563, + "learning_rate": 1.643765903307888e-05, + "loss": 0.8309, + "step": 323 + }, + { + "epoch": 0.04957918898240245, + "grad_norm": 3.235739424905745, + "learning_rate": 1.648854961832061e-05, + "loss": 0.7052, + "step": 324 + }, + { + "epoch": 0.04973221117061974, + "grad_norm": 3.7080138914780796, + "learning_rate": 1.653944020356234e-05, + "loss": 0.8157, + "step": 325 + }, + { + "epoch": 0.04988523335883703, + "grad_norm": 3.166803594516287, + "learning_rate": 1.6590330788804072e-05, + "loss": 0.8085, + "step": 326 + }, + { + "epoch": 0.05003825554705432, + "grad_norm": 3.7042887553890647, + "learning_rate": 1.6641221374045802e-05, + "loss": 0.8331, + "step": 327 + }, + { + "epoch": 0.050191277735271614, + "grad_norm": 4.6732971094593125, + "learning_rate": 1.6692111959287533e-05, + "loss": 0.8703, + "step": 328 + }, + { + "epoch": 0.050344299923488905, + "grad_norm": 3.5191964993901137, + "learning_rate": 1.6743002544529263e-05, + "loss": 0.7757, + "step": 329 + }, + { + "epoch": 0.0504973221117062, + "grad_norm": 
3.62621311940619, + "learning_rate": 1.6793893129770993e-05, + "loss": 0.9091, + "step": 330 + }, + { + "epoch": 0.05065034429992349, + "grad_norm": 3.4566006756009857, + "learning_rate": 1.6844783715012724e-05, + "loss": 0.8907, + "step": 331 + }, + { + "epoch": 0.05080336648814078, + "grad_norm": 4.226884860942899, + "learning_rate": 1.6895674300254454e-05, + "loss": 0.9568, + "step": 332 + }, + { + "epoch": 0.05095638867635807, + "grad_norm": 3.8334891651194445, + "learning_rate": 1.6946564885496184e-05, + "loss": 0.802, + "step": 333 + }, + { + "epoch": 0.051109410864575364, + "grad_norm": 3.643779521200893, + "learning_rate": 1.6997455470737915e-05, + "loss": 0.7904, + "step": 334 + }, + { + "epoch": 0.051262433052792655, + "grad_norm": 3.6109019840921177, + "learning_rate": 1.7048346055979645e-05, + "loss": 0.9173, + "step": 335 + }, + { + "epoch": 0.05141545524100995, + "grad_norm": 3.556033056937106, + "learning_rate": 1.7099236641221375e-05, + "loss": 0.6736, + "step": 336 + }, + { + "epoch": 0.05156847742922724, + "grad_norm": 3.4590858648740457, + "learning_rate": 1.7150127226463106e-05, + "loss": 0.7459, + "step": 337 + }, + { + "epoch": 0.05172149961744453, + "grad_norm": 3.883487644337928, + "learning_rate": 1.7201017811704836e-05, + "loss": 0.8293, + "step": 338 + }, + { + "epoch": 0.05187452180566182, + "grad_norm": 3.6817700556181308, + "learning_rate": 1.7251908396946566e-05, + "loss": 0.8543, + "step": 339 + }, + { + "epoch": 0.05202754399387911, + "grad_norm": 3.6092941197838107, + "learning_rate": 1.7302798982188297e-05, + "loss": 0.7518, + "step": 340 + }, + { + "epoch": 0.052180566182096405, + "grad_norm": 3.521936034926192, + "learning_rate": 1.7353689567430027e-05, + "loss": 0.7999, + "step": 341 + }, + { + "epoch": 0.0523335883703137, + "grad_norm": 3.2423067286052283, + "learning_rate": 1.7404580152671757e-05, + "loss": 0.8241, + "step": 342 + }, + { + "epoch": 0.05248661055853099, + "grad_norm": 3.5515781332359797, + "learning_rate": 1.7455470737913488e-05, + "loss": 0.8267, + "step": 343 + }, + { + "epoch": 0.05263963274674828, + "grad_norm": 3.361362826153268, + "learning_rate": 1.7506361323155218e-05, + "loss": 0.8923, + "step": 344 + }, + { + "epoch": 0.05279265493496557, + "grad_norm": 4.352751368237682, + "learning_rate": 1.755725190839695e-05, + "loss": 0.8396, + "step": 345 + }, + { + "epoch": 0.05294567712318286, + "grad_norm": 3.405295125419368, + "learning_rate": 1.760814249363868e-05, + "loss": 0.8015, + "step": 346 + }, + { + "epoch": 0.053098699311400155, + "grad_norm": 4.0707162407530255, + "learning_rate": 1.765903307888041e-05, + "loss": 0.9149, + "step": 347 + }, + { + "epoch": 0.053251721499617446, + "grad_norm": 3.199089427504116, + "learning_rate": 1.770992366412214e-05, + "loss": 0.859, + "step": 348 + }, + { + "epoch": 0.05340474368783474, + "grad_norm": 3.3830728370139505, + "learning_rate": 1.776081424936387e-05, + "loss": 0.9282, + "step": 349 + }, + { + "epoch": 0.05355776587605203, + "grad_norm": 3.3892952078432086, + "learning_rate": 1.78117048346056e-05, + "loss": 0.8592, + "step": 350 + }, + { + "epoch": 0.05371078806426932, + "grad_norm": 3.3837045648684434, + "learning_rate": 1.786259541984733e-05, + "loss": 0.8109, + "step": 351 + }, + { + "epoch": 0.05386381025248661, + "grad_norm": 4.01981710203223, + "learning_rate": 1.791348600508906e-05, + "loss": 0.8086, + "step": 352 + }, + { + "epoch": 0.054016832440703905, + "grad_norm": 3.268564561162783, + "learning_rate": 1.796437659033079e-05, + "loss": 0.8009, + "step": 353 + }, + 
{ + "epoch": 0.054169854628921196, + "grad_norm": 3.207034469484103, + "learning_rate": 1.8015267175572518e-05, + "loss": 0.7459, + "step": 354 + }, + { + "epoch": 0.05432287681713849, + "grad_norm": 3.7905236555543698, + "learning_rate": 1.8066157760814252e-05, + "loss": 0.9694, + "step": 355 + }, + { + "epoch": 0.05447589900535578, + "grad_norm": 3.899335977334721, + "learning_rate": 1.8117048346055982e-05, + "loss": 0.8516, + "step": 356 + }, + { + "epoch": 0.05462892119357307, + "grad_norm": 3.6683342704748183, + "learning_rate": 1.816793893129771e-05, + "loss": 0.8954, + "step": 357 + }, + { + "epoch": 0.05478194338179036, + "grad_norm": 3.2948530407738317, + "learning_rate": 1.8218829516539443e-05, + "loss": 0.8569, + "step": 358 + }, + { + "epoch": 0.054934965570007654, + "grad_norm": 3.107869632993156, + "learning_rate": 1.8269720101781173e-05, + "loss": 0.7997, + "step": 359 + }, + { + "epoch": 0.05508798775822494, + "grad_norm": 3.740911956061003, + "learning_rate": 1.83206106870229e-05, + "loss": 0.7949, + "step": 360 + }, + { + "epoch": 0.05524100994644223, + "grad_norm": 3.5260469337243086, + "learning_rate": 1.8371501272264634e-05, + "loss": 0.8832, + "step": 361 + }, + { + "epoch": 0.05539403213465952, + "grad_norm": 4.024532626989753, + "learning_rate": 1.8422391857506364e-05, + "loss": 0.8864, + "step": 362 + }, + { + "epoch": 0.055547054322876814, + "grad_norm": 3.7363845688636372, + "learning_rate": 1.847328244274809e-05, + "loss": 0.7486, + "step": 363 + }, + { + "epoch": 0.055700076511094106, + "grad_norm": 2.989844465377914, + "learning_rate": 1.852417302798982e-05, + "loss": 0.7627, + "step": 364 + }, + { + "epoch": 0.0558530986993114, + "grad_norm": 3.4845633849092814, + "learning_rate": 1.8575063613231555e-05, + "loss": 0.8626, + "step": 365 + }, + { + "epoch": 0.05600612088752869, + "grad_norm": 3.482589493869205, + "learning_rate": 1.8625954198473282e-05, + "loss": 0.7346, + "step": 366 + }, + { + "epoch": 0.05615914307574598, + "grad_norm": 3.2941484050459255, + "learning_rate": 1.8676844783715013e-05, + "loss": 0.8794, + "step": 367 + }, + { + "epoch": 0.05631216526396327, + "grad_norm": 3.7695913912244796, + "learning_rate": 1.8727735368956746e-05, + "loss": 0.8083, + "step": 368 + }, + { + "epoch": 0.056465187452180564, + "grad_norm": 3.247852909314503, + "learning_rate": 1.8778625954198473e-05, + "loss": 0.7414, + "step": 369 + }, + { + "epoch": 0.056618209640397855, + "grad_norm": 3.3192936130223103, + "learning_rate": 1.8829516539440204e-05, + "loss": 0.9026, + "step": 370 + }, + { + "epoch": 0.05677123182861515, + "grad_norm": 3.41682048630867, + "learning_rate": 1.8880407124681937e-05, + "loss": 0.7388, + "step": 371 + }, + { + "epoch": 0.05692425401683244, + "grad_norm": 3.623094017822295, + "learning_rate": 1.8931297709923668e-05, + "loss": 0.9649, + "step": 372 + }, + { + "epoch": 0.05707727620504973, + "grad_norm": 3.7002949571305903, + "learning_rate": 1.8982188295165395e-05, + "loss": 0.8871, + "step": 373 + }, + { + "epoch": 0.05723029839326702, + "grad_norm": 3.325309840511273, + "learning_rate": 1.9033078880407125e-05, + "loss": 0.9166, + "step": 374 + }, + { + "epoch": 0.057383320581484314, + "grad_norm": 3.1858695475335383, + "learning_rate": 1.908396946564886e-05, + "loss": 0.8516, + "step": 375 + }, + { + "epoch": 0.057536342769701605, + "grad_norm": 3.507805903591568, + "learning_rate": 1.9134860050890586e-05, + "loss": 0.9728, + "step": 376 + }, + { + "epoch": 0.0576893649579189, + "grad_norm": 2.953880567303911, + "learning_rate": 
1.9185750636132316e-05, + "loss": 0.9371, + "step": 377 + }, + { + "epoch": 0.05784238714613619, + "grad_norm": 3.2343974617308273, + "learning_rate": 1.923664122137405e-05, + "loss": 0.8903, + "step": 378 + }, + { + "epoch": 0.05799540933435348, + "grad_norm": 3.267836515525953, + "learning_rate": 1.9287531806615777e-05, + "loss": 0.8397, + "step": 379 + }, + { + "epoch": 0.05814843152257077, + "grad_norm": 3.594078860448196, + "learning_rate": 1.9338422391857507e-05, + "loss": 0.9597, + "step": 380 + }, + { + "epoch": 0.05830145371078806, + "grad_norm": 3.2708415507993034, + "learning_rate": 1.9389312977099238e-05, + "loss": 0.8571, + "step": 381 + }, + { + "epoch": 0.058454475899005355, + "grad_norm": 3.3418553549010057, + "learning_rate": 1.9440203562340968e-05, + "loss": 0.8239, + "step": 382 + }, + { + "epoch": 0.05860749808722265, + "grad_norm": 3.218696674016876, + "learning_rate": 1.9491094147582698e-05, + "loss": 0.8126, + "step": 383 + }, + { + "epoch": 0.05876052027543994, + "grad_norm": 3.171066381208782, + "learning_rate": 1.954198473282443e-05, + "loss": 0.757, + "step": 384 + }, + { + "epoch": 0.05891354246365723, + "grad_norm": 3.8453943663575982, + "learning_rate": 1.959287531806616e-05, + "loss": 0.9481, + "step": 385 + }, + { + "epoch": 0.05906656465187452, + "grad_norm": 3.262482788883625, + "learning_rate": 1.964376590330789e-05, + "loss": 0.8465, + "step": 386 + }, + { + "epoch": 0.05921958684009181, + "grad_norm": 3.303889728308021, + "learning_rate": 1.969465648854962e-05, + "loss": 0.7714, + "step": 387 + }, + { + "epoch": 0.059372609028309105, + "grad_norm": 3.519976681128204, + "learning_rate": 1.974554707379135e-05, + "loss": 0.832, + "step": 388 + }, + { + "epoch": 0.059525631216526396, + "grad_norm": 3.369259096460689, + "learning_rate": 1.979643765903308e-05, + "loss": 0.8669, + "step": 389 + }, + { + "epoch": 0.05967865340474369, + "grad_norm": 3.7927752076141963, + "learning_rate": 1.984732824427481e-05, + "loss": 0.846, + "step": 390 + }, + { + "epoch": 0.05983167559296098, + "grad_norm": 3.7018297119428523, + "learning_rate": 1.989821882951654e-05, + "loss": 0.819, + "step": 391 + }, + { + "epoch": 0.05998469778117827, + "grad_norm": 3.394158558872775, + "learning_rate": 1.994910941475827e-05, + "loss": 0.7892, + "step": 392 + }, + { + "epoch": 0.06013771996939556, + "grad_norm": 3.276133265082445, + "learning_rate": 2e-05, + "loss": 0.8333, + "step": 393 + }, + { + "epoch": 0.060290742157612855, + "grad_norm": 3.2855946364421804, + "learning_rate": 1.999999969293044e-05, + "loss": 0.815, + "step": 394 + }, + { + "epoch": 0.060443764345830146, + "grad_norm": 3.1185758818586318, + "learning_rate": 1.999999877172178e-05, + "loss": 0.779, + "step": 395 + }, + { + "epoch": 0.06059678653404744, + "grad_norm": 3.25149254683972, + "learning_rate": 1.9999997236374075e-05, + "loss": 0.83, + "step": 396 + }, + { + "epoch": 0.06074980872226473, + "grad_norm": 2.951296265260351, + "learning_rate": 1.9999995086887418e-05, + "loss": 0.8079, + "step": 397 + }, + { + "epoch": 0.06090283091048202, + "grad_norm": 3.0992783869324283, + "learning_rate": 1.9999992323261942e-05, + "loss": 0.8959, + "step": 398 + }, + { + "epoch": 0.06105585309869931, + "grad_norm": 3.3456301521826664, + "learning_rate": 1.9999988945497816e-05, + "loss": 0.9921, + "step": 399 + }, + { + "epoch": 0.061208875286916604, + "grad_norm": 3.569679380617617, + "learning_rate": 1.9999984953595253e-05, + "loss": 0.9131, + "step": 400 + }, + { + "epoch": 0.061361897475133896, + "grad_norm": 
3.972081462044736, + "learning_rate": 1.999998034755449e-05, + "loss": 0.8658, + "step": 401 + }, + { + "epoch": 0.06151491966335119, + "grad_norm": 3.1594994773921017, + "learning_rate": 1.9999975127375815e-05, + "loss": 0.7164, + "step": 402 + }, + { + "epoch": 0.06166794185156848, + "grad_norm": 3.003388020711569, + "learning_rate": 1.999996929305955e-05, + "loss": 0.8885, + "step": 403 + }, + { + "epoch": 0.06182096403978577, + "grad_norm": 3.9041072645619974, + "learning_rate": 1.9999962844606046e-05, + "loss": 0.9423, + "step": 404 + }, + { + "epoch": 0.06197398622800306, + "grad_norm": 3.604645472820965, + "learning_rate": 1.9999955782015706e-05, + "loss": 0.8915, + "step": 405 + }, + { + "epoch": 0.062127008416220354, + "grad_norm": 3.0395146768409615, + "learning_rate": 1.9999948105288963e-05, + "loss": 0.8001, + "step": 406 + }, + { + "epoch": 0.062280030604437646, + "grad_norm": 3.3622363017187085, + "learning_rate": 1.9999939814426283e-05, + "loss": 0.9596, + "step": 407 + }, + { + "epoch": 0.06243305279265494, + "grad_norm": 3.290114424412209, + "learning_rate": 1.9999930909428183e-05, + "loss": 0.8567, + "step": 408 + }, + { + "epoch": 0.06258607498087222, + "grad_norm": 3.4438483746832245, + "learning_rate": 1.9999921390295203e-05, + "loss": 0.9297, + "step": 409 + }, + { + "epoch": 0.06273909716908951, + "grad_norm": 3.407358852073633, + "learning_rate": 1.999991125702793e-05, + "loss": 0.9102, + "step": 410 + }, + { + "epoch": 0.0628921193573068, + "grad_norm": 3.9536948948238693, + "learning_rate": 1.999990050962699e-05, + "loss": 0.7925, + "step": 411 + }, + { + "epoch": 0.0630451415455241, + "grad_norm": 3.1915535537530517, + "learning_rate": 1.9999889148093036e-05, + "loss": 0.7997, + "step": 412 + }, + { + "epoch": 0.06319816373374139, + "grad_norm": 3.3935055650270645, + "learning_rate": 1.9999877172426775e-05, + "loss": 0.7743, + "step": 413 + }, + { + "epoch": 0.06335118592195868, + "grad_norm": 3.4416667252947377, + "learning_rate": 1.9999864582628932e-05, + "loss": 0.8931, + "step": 414 + }, + { + "epoch": 0.06350420811017597, + "grad_norm": 3.791402724599172, + "learning_rate": 1.9999851378700286e-05, + "loss": 0.979, + "step": 415 + }, + { + "epoch": 0.06365723029839326, + "grad_norm": 3.1724547503182703, + "learning_rate": 1.999983756064165e-05, + "loss": 0.7635, + "step": 416 + }, + { + "epoch": 0.06381025248661056, + "grad_norm": 3.2774243411313253, + "learning_rate": 1.999982312845387e-05, + "loss": 0.841, + "step": 417 + }, + { + "epoch": 0.06396327467482785, + "grad_norm": 3.1958537216114604, + "learning_rate": 1.9999808082137828e-05, + "loss": 0.9372, + "step": 418 + }, + { + "epoch": 0.06411629686304514, + "grad_norm": 3.5937707452395555, + "learning_rate": 1.9999792421694454e-05, + "loss": 1.0499, + "step": 419 + }, + { + "epoch": 0.06426931905126243, + "grad_norm": 3.0929075141412032, + "learning_rate": 1.999977614712471e-05, + "loss": 0.9361, + "step": 420 + }, + { + "epoch": 0.06442234123947972, + "grad_norm": 3.6154685592166524, + "learning_rate": 1.9999759258429592e-05, + "loss": 0.8855, + "step": 421 + }, + { + "epoch": 0.06457536342769701, + "grad_norm": 3.317458456160721, + "learning_rate": 1.9999741755610137e-05, + "loss": 0.762, + "step": 422 + }, + { + "epoch": 0.0647283856159143, + "grad_norm": 3.552919568915116, + "learning_rate": 1.9999723638667424e-05, + "loss": 0.9853, + "step": 423 + }, + { + "epoch": 0.0648814078041316, + "grad_norm": 3.614389994148897, + "learning_rate": 1.9999704907602564e-05, + "loss": 0.8873, + "step": 424 + }, + 
{ + "epoch": 0.06503442999234889, + "grad_norm": 3.5998150028550064, + "learning_rate": 1.9999685562416704e-05, + "loss": 0.9252, + "step": 425 + }, + { + "epoch": 0.06518745218056618, + "grad_norm": 3.049004959216067, + "learning_rate": 1.9999665603111035e-05, + "loss": 0.8648, + "step": 426 + }, + { + "epoch": 0.06534047436878347, + "grad_norm": 3.1745175809636237, + "learning_rate": 1.9999645029686784e-05, + "loss": 0.8266, + "step": 427 + }, + { + "epoch": 0.06549349655700076, + "grad_norm": 3.4882134719187805, + "learning_rate": 1.9999623842145212e-05, + "loss": 0.7766, + "step": 428 + }, + { + "epoch": 0.06564651874521805, + "grad_norm": 3.506680615344111, + "learning_rate": 1.999960204048762e-05, + "loss": 0.8902, + "step": 429 + }, + { + "epoch": 0.06579954093343535, + "grad_norm": 3.343900859921755, + "learning_rate": 1.9999579624715347e-05, + "loss": 0.8235, + "step": 430 + }, + { + "epoch": 0.06595256312165264, + "grad_norm": 3.506632382776108, + "learning_rate": 1.9999556594829775e-05, + "loss": 0.8132, + "step": 431 + }, + { + "epoch": 0.06610558530986993, + "grad_norm": 3.058113063265004, + "learning_rate": 1.9999532950832313e-05, + "loss": 0.8041, + "step": 432 + }, + { + "epoch": 0.06625860749808722, + "grad_norm": 3.3485252385202715, + "learning_rate": 1.999950869272441e-05, + "loss": 0.8817, + "step": 433 + }, + { + "epoch": 0.06641162968630451, + "grad_norm": 3.5781509996663234, + "learning_rate": 1.9999483820507562e-05, + "loss": 0.9097, + "step": 434 + }, + { + "epoch": 0.0665646518745218, + "grad_norm": 3.0972342262671324, + "learning_rate": 1.9999458334183296e-05, + "loss": 0.7467, + "step": 435 + }, + { + "epoch": 0.0667176740627391, + "grad_norm": 3.106512537323738, + "learning_rate": 1.9999432233753176e-05, + "loss": 0.8818, + "step": 436 + }, + { + "epoch": 0.06687069625095639, + "grad_norm": 3.2596638513194662, + "learning_rate": 1.9999405519218804e-05, + "loss": 0.9509, + "step": 437 + }, + { + "epoch": 0.06702371843917368, + "grad_norm": 3.477762465985897, + "learning_rate": 1.999937819058182e-05, + "loss": 0.8831, + "step": 438 + }, + { + "epoch": 0.06717674062739097, + "grad_norm": 3.3767003024179054, + "learning_rate": 1.9999350247843904e-05, + "loss": 0.9931, + "step": 439 + }, + { + "epoch": 0.06732976281560826, + "grad_norm": 3.0165691988373786, + "learning_rate": 1.999932169100677e-05, + "loss": 0.8623, + "step": 440 + }, + { + "epoch": 0.06748278500382555, + "grad_norm": 3.0751006001505594, + "learning_rate": 1.9999292520072177e-05, + "loss": 0.6962, + "step": 441 + }, + { + "epoch": 0.06763580719204285, + "grad_norm": 3.6467793080534756, + "learning_rate": 1.999926273504191e-05, + "loss": 0.9321, + "step": 442 + }, + { + "epoch": 0.06778882938026014, + "grad_norm": 3.0687590959557633, + "learning_rate": 1.99992323359178e-05, + "loss": 0.8116, + "step": 443 + }, + { + "epoch": 0.06794185156847743, + "grad_norm": 3.43496703843989, + "learning_rate": 1.9999201322701717e-05, + "loss": 0.8043, + "step": 444 + }, + { + "epoch": 0.06809487375669472, + "grad_norm": 3.544779183403212, + "learning_rate": 1.9999169695395566e-05, + "loss": 0.8923, + "step": 445 + }, + { + "epoch": 0.06824789594491201, + "grad_norm": 3.4034242125677707, + "learning_rate": 1.9999137454001282e-05, + "loss": 0.8836, + "step": 446 + }, + { + "epoch": 0.0684009181331293, + "grad_norm": 2.9508936139483257, + "learning_rate": 1.9999104598520854e-05, + "loss": 0.8638, + "step": 447 + }, + { + "epoch": 0.0685539403213466, + "grad_norm": 3.109658780002175, + "learning_rate": 
1.9999071128956294e-05, + "loss": 0.7898, + "step": 448 + }, + { + "epoch": 0.06870696250956389, + "grad_norm": 3.3036335036928475, + "learning_rate": 1.9999037045309663e-05, + "loss": 0.9348, + "step": 449 + }, + { + "epoch": 0.06885998469778118, + "grad_norm": 2.931573589022011, + "learning_rate": 1.9999002347583048e-05, + "loss": 0.8075, + "step": 450 + }, + { + "epoch": 0.06901300688599847, + "grad_norm": 3.1926862679514265, + "learning_rate": 1.999896703577858e-05, + "loss": 0.8226, + "step": 451 + }, + { + "epoch": 0.06916602907421576, + "grad_norm": 3.455774568574597, + "learning_rate": 1.9998931109898433e-05, + "loss": 0.872, + "step": 452 + }, + { + "epoch": 0.06931905126243305, + "grad_norm": 3.1693621632015807, + "learning_rate": 1.999889456994481e-05, + "loss": 0.7752, + "step": 453 + }, + { + "epoch": 0.06947207345065035, + "grad_norm": 2.9868239567077977, + "learning_rate": 1.9998857415919955e-05, + "loss": 0.9065, + "step": 454 + }, + { + "epoch": 0.06962509563886764, + "grad_norm": 3.0397111537097006, + "learning_rate": 1.999881964782615e-05, + "loss": 0.7929, + "step": 455 + }, + { + "epoch": 0.06977811782708493, + "grad_norm": 3.549361128360661, + "learning_rate": 1.9998781265665715e-05, + "loss": 0.8541, + "step": 456 + }, + { + "epoch": 0.06993114001530222, + "grad_norm": 3.044784684574379, + "learning_rate": 1.9998742269441008e-05, + "loss": 0.8231, + "step": 457 + }, + { + "epoch": 0.07008416220351951, + "grad_norm": 3.1048626907375954, + "learning_rate": 1.9998702659154423e-05, + "loss": 0.7968, + "step": 458 + }, + { + "epoch": 0.0702371843917368, + "grad_norm": 3.2945028895738107, + "learning_rate": 1.999866243480839e-05, + "loss": 0.8379, + "step": 459 + }, + { + "epoch": 0.0703902065799541, + "grad_norm": 3.178438471947671, + "learning_rate": 1.9998621596405384e-05, + "loss": 0.9277, + "step": 460 + }, + { + "epoch": 0.07054322876817139, + "grad_norm": 3.42484137351198, + "learning_rate": 1.9998580143947914e-05, + "loss": 0.8155, + "step": 461 + }, + { + "epoch": 0.07069625095638868, + "grad_norm": 3.0491149098944135, + "learning_rate": 1.999853807743852e-05, + "loss": 0.7763, + "step": 462 + }, + { + "epoch": 0.07084927314460597, + "grad_norm": 3.376002345401871, + "learning_rate": 1.9998495396879786e-05, + "loss": 0.9263, + "step": 463 + }, + { + "epoch": 0.07100229533282326, + "grad_norm": 3.2633106035230766, + "learning_rate": 1.9998452102274336e-05, + "loss": 0.9468, + "step": 464 + }, + { + "epoch": 0.07115531752104055, + "grad_norm": 3.1933206048414293, + "learning_rate": 1.9998408193624834e-05, + "loss": 0.8901, + "step": 465 + }, + { + "epoch": 0.07130833970925785, + "grad_norm": 3.4082992385767206, + "learning_rate": 1.999836367093396e-05, + "loss": 0.9357, + "step": 466 + }, + { + "epoch": 0.07146136189747514, + "grad_norm": 3.4102970874735745, + "learning_rate": 1.999831853420447e-05, + "loss": 0.9104, + "step": 467 + }, + { + "epoch": 0.07161438408569243, + "grad_norm": 3.014808972109428, + "learning_rate": 1.9998272783439118e-05, + "loss": 0.8472, + "step": 468 + }, + { + "epoch": 0.07176740627390972, + "grad_norm": 3.0016946749987983, + "learning_rate": 1.9998226418640724e-05, + "loss": 0.8353, + "step": 469 + }, + { + "epoch": 0.07192042846212701, + "grad_norm": 3.4092515518117836, + "learning_rate": 1.999817943981213e-05, + "loss": 0.7995, + "step": 470 + }, + { + "epoch": 0.0720734506503443, + "grad_norm": 2.9282228456092887, + "learning_rate": 1.9998131846956225e-05, + "loss": 0.899, + "step": 471 + }, + { + "epoch": 0.0722264728385616, + 
"grad_norm": 3.4435930622433037, + "learning_rate": 1.9998083640075935e-05, + "loss": 0.9694, + "step": 472 + }, + { + "epoch": 0.07237949502677889, + "grad_norm": 3.284470938610737, + "learning_rate": 1.9998034819174207e-05, + "loss": 0.878, + "step": 473 + }, + { + "epoch": 0.07253251721499618, + "grad_norm": 3.2453744318117943, + "learning_rate": 1.9997985384254057e-05, + "loss": 0.7952, + "step": 474 + }, + { + "epoch": 0.07268553940321347, + "grad_norm": 3.577377408891758, + "learning_rate": 1.9997935335318507e-05, + "loss": 0.8137, + "step": 475 + }, + { + "epoch": 0.07283856159143076, + "grad_norm": 3.2705975222934556, + "learning_rate": 1.9997884672370637e-05, + "loss": 0.918, + "step": 476 + }, + { + "epoch": 0.07299158377964805, + "grad_norm": 3.4645999740507385, + "learning_rate": 1.9997833395413554e-05, + "loss": 0.7966, + "step": 477 + }, + { + "epoch": 0.07314460596786534, + "grad_norm": 3.241023758964187, + "learning_rate": 1.9997781504450416e-05, + "loss": 0.7054, + "step": 478 + }, + { + "epoch": 0.07329762815608264, + "grad_norm": 3.1306221791660174, + "learning_rate": 1.99977289994844e-05, + "loss": 0.8397, + "step": 479 + }, + { + "epoch": 0.07345065034429993, + "grad_norm": 3.5542684374846676, + "learning_rate": 1.999767588051874e-05, + "loss": 0.9649, + "step": 480 + }, + { + "epoch": 0.07360367253251722, + "grad_norm": 3.1099423042693375, + "learning_rate": 1.9997622147556687e-05, + "loss": 0.8699, + "step": 481 + }, + { + "epoch": 0.07375669472073451, + "grad_norm": 3.092152194487772, + "learning_rate": 1.9997567800601548e-05, + "loss": 0.9186, + "step": 482 + }, + { + "epoch": 0.0739097169089518, + "grad_norm": 3.402202777509611, + "learning_rate": 1.9997512839656665e-05, + "loss": 0.8209, + "step": 483 + }, + { + "epoch": 0.0740627390971691, + "grad_norm": 2.9907255336981184, + "learning_rate": 1.9997457264725404e-05, + "loss": 0.9086, + "step": 484 + }, + { + "epoch": 0.07421576128538639, + "grad_norm": 3.131981310207802, + "learning_rate": 1.9997401075811182e-05, + "loss": 0.8904, + "step": 485 + }, + { + "epoch": 0.07436878347360368, + "grad_norm": 3.209634258627602, + "learning_rate": 1.999734427291745e-05, + "loss": 0.7896, + "step": 486 + }, + { + "epoch": 0.07452180566182097, + "grad_norm": 3.340921344451802, + "learning_rate": 1.9997286856047696e-05, + "loss": 0.8265, + "step": 487 + }, + { + "epoch": 0.07467482785003826, + "grad_norm": 3.13565494738022, + "learning_rate": 1.999722882520545e-05, + "loss": 0.8693, + "step": 488 + }, + { + "epoch": 0.07482785003825555, + "grad_norm": 3.02235992945377, + "learning_rate": 1.999717018039427e-05, + "loss": 0.8724, + "step": 489 + }, + { + "epoch": 0.07498087222647284, + "grad_norm": 3.4023434406169795, + "learning_rate": 1.999711092161776e-05, + "loss": 0.9148, + "step": 490 + }, + { + "epoch": 0.07513389441469014, + "grad_norm": 3.343264311302944, + "learning_rate": 1.999705104887956e-05, + "loss": 0.8204, + "step": 491 + }, + { + "epoch": 0.07528691660290743, + "grad_norm": 3.01072439243528, + "learning_rate": 1.9996990562183344e-05, + "loss": 0.8356, + "step": 492 + }, + { + "epoch": 0.07543993879112472, + "grad_norm": 3.0677819421014543, + "learning_rate": 1.999692946153283e-05, + "loss": 0.9457, + "step": 493 + }, + { + "epoch": 0.07559296097934201, + "grad_norm": 3.049481965152659, + "learning_rate": 1.999686774693177e-05, + "loss": 0.8406, + "step": 494 + }, + { + "epoch": 0.0757459831675593, + "grad_norm": 3.1273680772157997, + "learning_rate": 1.9996805418383957e-05, + "loss": 0.8543, + "step": 495 + }, 
+ { + "epoch": 0.0758990053557766, + "grad_norm": 3.2658135510698543, + "learning_rate": 1.999674247589321e-05, + "loss": 0.801, + "step": 496 + }, + { + "epoch": 0.07605202754399389, + "grad_norm": 3.2022841129021966, + "learning_rate": 1.9996678919463403e-05, + "loss": 0.9407, + "step": 497 + }, + { + "epoch": 0.07620504973221118, + "grad_norm": 3.2241072818356846, + "learning_rate": 1.9996614749098438e-05, + "loss": 0.9081, + "step": 498 + }, + { + "epoch": 0.07635807192042846, + "grad_norm": 3.141672970165098, + "learning_rate": 1.999654996480225e-05, + "loss": 0.8296, + "step": 499 + }, + { + "epoch": 0.07651109410864575, + "grad_norm": 2.8655818640002884, + "learning_rate": 1.9996484566578824e-05, + "loss": 0.7494, + "step": 500 + }, + { + "epoch": 0.07666411629686304, + "grad_norm": 3.005360495550959, + "learning_rate": 1.9996418554432175e-05, + "loss": 0.7814, + "step": 501 + }, + { + "epoch": 0.07681713848508033, + "grad_norm": 3.3007083879137364, + "learning_rate": 1.999635192836636e-05, + "loss": 0.8228, + "step": 502 + }, + { + "epoch": 0.07697016067329762, + "grad_norm": 3.239789821398647, + "learning_rate": 1.9996284688385458e-05, + "loss": 0.8911, + "step": 503 + }, + { + "epoch": 0.07712318286151491, + "grad_norm": 3.16801784653872, + "learning_rate": 1.9996216834493614e-05, + "loss": 0.8827, + "step": 504 + }, + { + "epoch": 0.0772762050497322, + "grad_norm": 3.2341267297113223, + "learning_rate": 1.9996148366694987e-05, + "loss": 0.8773, + "step": 505 + }, + { + "epoch": 0.0774292272379495, + "grad_norm": 3.0993619034994526, + "learning_rate": 1.9996079284993785e-05, + "loss": 0.898, + "step": 506 + }, + { + "epoch": 0.07758224942616679, + "grad_norm": 2.9243224608176637, + "learning_rate": 1.999600958939425e-05, + "loss": 0.8461, + "step": 507 + }, + { + "epoch": 0.07773527161438408, + "grad_norm": 3.307290573644802, + "learning_rate": 1.9995939279900658e-05, + "loss": 0.91, + "step": 508 + }, + { + "epoch": 0.07788829380260137, + "grad_norm": 2.9365920881459986, + "learning_rate": 1.999586835651733e-05, + "loss": 0.9606, + "step": 509 + }, + { + "epoch": 0.07804131599081866, + "grad_norm": 2.724422019837075, + "learning_rate": 1.999579681924863e-05, + "loss": 0.8498, + "step": 510 + }, + { + "epoch": 0.07819433817903595, + "grad_norm": 2.9876304383699326, + "learning_rate": 1.9995724668098936e-05, + "loss": 0.7418, + "step": 511 + }, + { + "epoch": 0.07834736036725325, + "grad_norm": 2.864681589216531, + "learning_rate": 1.999565190307269e-05, + "loss": 0.8075, + "step": 512 + }, + { + "epoch": 0.07850038255547054, + "grad_norm": 2.8644979232121517, + "learning_rate": 1.9995578524174354e-05, + "loss": 0.7353, + "step": 513 + }, + { + "epoch": 0.07865340474368783, + "grad_norm": 3.2377935340367707, + "learning_rate": 1.999550453140844e-05, + "loss": 0.8998, + "step": 514 + }, + { + "epoch": 0.07880642693190512, + "grad_norm": 3.0349644799969457, + "learning_rate": 1.999542992477949e-05, + "loss": 0.8999, + "step": 515 + }, + { + "epoch": 0.07895944912012241, + "grad_norm": 3.187108910079354, + "learning_rate": 1.9995354704292086e-05, + "loss": 0.9367, + "step": 516 + }, + { + "epoch": 0.0791124713083397, + "grad_norm": 3.6556676597386955, + "learning_rate": 1.9995278869950848e-05, + "loss": 0.8974, + "step": 517 + }, + { + "epoch": 0.079265493496557, + "grad_norm": 3.149907651192199, + "learning_rate": 1.9995202421760432e-05, + "loss": 0.8972, + "step": 518 + }, + { + "epoch": 0.07941851568477429, + "grad_norm": 3.1073516404536368, + "learning_rate": 
1.9995125359725534e-05, + "loss": 0.841, + "step": 519 + }, + { + "epoch": 0.07957153787299158, + "grad_norm": 3.3022754605273668, + "learning_rate": 1.9995047683850887e-05, + "loss": 0.9602, + "step": 520 + }, + { + "epoch": 0.07972456006120887, + "grad_norm": 3.0277051105867216, + "learning_rate": 1.999496939414126e-05, + "loss": 0.8321, + "step": 521 + }, + { + "epoch": 0.07987758224942616, + "grad_norm": 3.2127384679862416, + "learning_rate": 1.9994890490601463e-05, + "loss": 0.7626, + "step": 522 + }, + { + "epoch": 0.08003060443764345, + "grad_norm": 3.1231645870718814, + "learning_rate": 1.999481097323634e-05, + "loss": 0.8598, + "step": 523 + }, + { + "epoch": 0.08018362662586075, + "grad_norm": 2.919715281820122, + "learning_rate": 1.9994730842050776e-05, + "loss": 0.8244, + "step": 524 + }, + { + "epoch": 0.08033664881407804, + "grad_norm": 2.822749796794928, + "learning_rate": 1.999465009704969e-05, + "loss": 0.8562, + "step": 525 + }, + { + "epoch": 0.08048967100229533, + "grad_norm": 2.91444240792536, + "learning_rate": 1.9994568738238046e-05, + "loss": 1.0529, + "step": 526 + }, + { + "epoch": 0.08064269319051262, + "grad_norm": 3.210762650851107, + "learning_rate": 1.9994486765620834e-05, + "loss": 0.9525, + "step": 527 + }, + { + "epoch": 0.08079571537872991, + "grad_norm": 3.1679560833629528, + "learning_rate": 1.9994404179203092e-05, + "loss": 0.8337, + "step": 528 + }, + { + "epoch": 0.0809487375669472, + "grad_norm": 2.833453068833028, + "learning_rate": 1.9994320978989887e-05, + "loss": 0.9588, + "step": 529 + }, + { + "epoch": 0.0811017597551645, + "grad_norm": 3.2638311444981998, + "learning_rate": 1.9994237164986338e-05, + "loss": 0.9316, + "step": 530 + }, + { + "epoch": 0.08125478194338179, + "grad_norm": 3.4811522175868608, + "learning_rate": 1.9994152737197586e-05, + "loss": 0.9002, + "step": 531 + }, + { + "epoch": 0.08140780413159908, + "grad_norm": 3.0731278665818036, + "learning_rate": 1.9994067695628815e-05, + "loss": 0.8697, + "step": 532 + }, + { + "epoch": 0.08156082631981637, + "grad_norm": 2.8689492083403647, + "learning_rate": 1.999398204028525e-05, + "loss": 0.8766, + "step": 533 + }, + { + "epoch": 0.08171384850803366, + "grad_norm": 2.972650733628086, + "learning_rate": 1.9993895771172153e-05, + "loss": 0.7181, + "step": 534 + }, + { + "epoch": 0.08186687069625095, + "grad_norm": 2.880097853744865, + "learning_rate": 1.9993808888294816e-05, + "loss": 0.8629, + "step": 535 + }, + { + "epoch": 0.08201989288446825, + "grad_norm": 2.8699706044768116, + "learning_rate": 1.9993721391658584e-05, + "loss": 0.8897, + "step": 536 + }, + { + "epoch": 0.08217291507268554, + "grad_norm": 3.2307318749796448, + "learning_rate": 1.9993633281268825e-05, + "loss": 0.79, + "step": 537 + }, + { + "epoch": 0.08232593726090283, + "grad_norm": 3.4903511369450237, + "learning_rate": 1.999354455713095e-05, + "loss": 0.968, + "step": 538 + }, + { + "epoch": 0.08247895944912012, + "grad_norm": 2.487957846673477, + "learning_rate": 1.9993455219250407e-05, + "loss": 0.8133, + "step": 539 + }, + { + "epoch": 0.08263198163733741, + "grad_norm": 3.022484553326492, + "learning_rate": 1.999336526763269e-05, + "loss": 0.8825, + "step": 540 + }, + { + "epoch": 0.0827850038255547, + "grad_norm": 2.8453697669895015, + "learning_rate": 1.9993274702283313e-05, + "loss": 0.8821, + "step": 541 + }, + { + "epoch": 0.082938026013772, + "grad_norm": 3.3972904425220944, + "learning_rate": 1.999318352320784e-05, + "loss": 0.9181, + "step": 542 + }, + { + "epoch": 0.08309104820198929, + 
"grad_norm": 3.389004916540709, + "learning_rate": 1.999309173041188e-05, + "loss": 0.9292, + "step": 543 + }, + { + "epoch": 0.08324407039020658, + "grad_norm": 3.0472330893944806, + "learning_rate": 1.999299932390106e-05, + "loss": 0.7503, + "step": 544 + }, + { + "epoch": 0.08339709257842387, + "grad_norm": 3.062998869061395, + "learning_rate": 1.9992906303681057e-05, + "loss": 0.9149, + "step": 545 + }, + { + "epoch": 0.08355011476664116, + "grad_norm": 3.662910791698355, + "learning_rate": 1.999281266975759e-05, + "loss": 0.8746, + "step": 546 + }, + { + "epoch": 0.08370313695485845, + "grad_norm": 2.9445135400870637, + "learning_rate": 1.99927184221364e-05, + "loss": 0.8416, + "step": 547 + }, + { + "epoch": 0.08385615914307575, + "grad_norm": 2.9881136587386026, + "learning_rate": 1.9992623560823284e-05, + "loss": 0.86, + "step": 548 + }, + { + "epoch": 0.08400918133129304, + "grad_norm": 2.8957772667517085, + "learning_rate": 1.999252808582406e-05, + "loss": 0.8825, + "step": 549 + }, + { + "epoch": 0.08416220351951033, + "grad_norm": 2.7973789830398883, + "learning_rate": 1.99924319971446e-05, + "loss": 0.8052, + "step": 550 + }, + { + "epoch": 0.08431522570772762, + "grad_norm": 2.778130072053958, + "learning_rate": 1.9992335294790797e-05, + "loss": 0.7703, + "step": 551 + }, + { + "epoch": 0.08446824789594491, + "grad_norm": 3.1327192932994405, + "learning_rate": 1.9992237978768593e-05, + "loss": 0.9049, + "step": 552 + }, + { + "epoch": 0.0846212700841622, + "grad_norm": 3.1565650717986977, + "learning_rate": 1.9992140049083968e-05, + "loss": 0.8519, + "step": 553 + }, + { + "epoch": 0.0847742922723795, + "grad_norm": 3.1756761607270194, + "learning_rate": 1.999204150574293e-05, + "loss": 1.0047, + "step": 554 + }, + { + "epoch": 0.08492731446059679, + "grad_norm": 2.6052032947874864, + "learning_rate": 1.9991942348751534e-05, + "loss": 0.8446, + "step": 555 + }, + { + "epoch": 0.08508033664881408, + "grad_norm": 3.3747384938061846, + "learning_rate": 1.9991842578115872e-05, + "loss": 0.9783, + "step": 556 + }, + { + "epoch": 0.08523335883703137, + "grad_norm": 3.1466121416772532, + "learning_rate": 1.999174219384207e-05, + "loss": 0.9396, + "step": 557 + }, + { + "epoch": 0.08538638102524866, + "grad_norm": 3.203806072282312, + "learning_rate": 1.999164119593629e-05, + "loss": 0.8774, + "step": 558 + }, + { + "epoch": 0.08553940321346595, + "grad_norm": 2.8295832347751544, + "learning_rate": 1.9991539584404734e-05, + "loss": 0.7787, + "step": 559 + }, + { + "epoch": 0.08569242540168324, + "grad_norm": 2.9986481486485315, + "learning_rate": 1.9991437359253647e-05, + "loss": 0.8303, + "step": 560 + }, + { + "epoch": 0.08584544758990054, + "grad_norm": 3.1163102632937525, + "learning_rate": 1.9991334520489304e-05, + "loss": 0.8202, + "step": 561 + }, + { + "epoch": 0.08599846977811783, + "grad_norm": 3.3672338761393616, + "learning_rate": 1.9991231068118024e-05, + "loss": 0.8453, + "step": 562 + }, + { + "epoch": 0.08615149196633512, + "grad_norm": 2.9326058066627123, + "learning_rate": 1.9991127002146157e-05, + "loss": 0.816, + "step": 563 + }, + { + "epoch": 0.08630451415455241, + "grad_norm": 2.9286950810713277, + "learning_rate": 1.9991022322580096e-05, + "loss": 0.893, + "step": 564 + }, + { + "epoch": 0.0864575363427697, + "grad_norm": 3.390923338954129, + "learning_rate": 1.999091702942627e-05, + "loss": 0.942, + "step": 565 + }, + { + "epoch": 0.086610558530987, + "grad_norm": 3.100849028861991, + "learning_rate": 1.9990811122691142e-05, + "loss": 0.8572, + "step": 566 + 
}, + { + "epoch": 0.08676358071920429, + "grad_norm": 3.024939074035441, + "learning_rate": 1.9990704602381222e-05, + "loss": 0.9249, + "step": 567 + }, + { + "epoch": 0.08691660290742158, + "grad_norm": 3.086292608040693, + "learning_rate": 1.9990597468503044e-05, + "loss": 0.7923, + "step": 568 + }, + { + "epoch": 0.08706962509563887, + "grad_norm": 2.8094297793401397, + "learning_rate": 1.9990489721063194e-05, + "loss": 0.7916, + "step": 569 + }, + { + "epoch": 0.08722264728385616, + "grad_norm": 2.906165406278737, + "learning_rate": 1.999038136006829e-05, + "loss": 0.8567, + "step": 570 + }, + { + "epoch": 0.08737566947207345, + "grad_norm": 3.027894811082444, + "learning_rate": 1.9990272385524983e-05, + "loss": 0.8776, + "step": 571 + }, + { + "epoch": 0.08752869166029074, + "grad_norm": 2.951885113296094, + "learning_rate": 1.9990162797439964e-05, + "loss": 0.7175, + "step": 572 + }, + { + "epoch": 0.08768171384850804, + "grad_norm": 2.8448885126107464, + "learning_rate": 1.9990052595819966e-05, + "loss": 0.7836, + "step": 573 + }, + { + "epoch": 0.08783473603672533, + "grad_norm": 3.0008723156763635, + "learning_rate": 1.998994178067176e-05, + "loss": 0.9061, + "step": 574 + }, + { + "epoch": 0.08798775822494262, + "grad_norm": 2.766808323475977, + "learning_rate": 1.9989830352002144e-05, + "loss": 0.7398, + "step": 575 + }, + { + "epoch": 0.08814078041315991, + "grad_norm": 3.089772730908918, + "learning_rate": 1.998971830981797e-05, + "loss": 0.9764, + "step": 576 + }, + { + "epoch": 0.0882938026013772, + "grad_norm": 3.2210205902228903, + "learning_rate": 1.9989605654126113e-05, + "loss": 0.88, + "step": 577 + }, + { + "epoch": 0.0884468247895945, + "grad_norm": 3.3361093624428526, + "learning_rate": 1.9989492384933493e-05, + "loss": 0.9849, + "step": 578 + }, + { + "epoch": 0.08859984697781179, + "grad_norm": 2.9746314886323773, + "learning_rate": 1.9989378502247067e-05, + "loss": 0.8476, + "step": 579 + }, + { + "epoch": 0.08875286916602908, + "grad_norm": 2.8992176182889104, + "learning_rate": 1.9989264006073826e-05, + "loss": 0.7202, + "step": 580 + }, + { + "epoch": 0.08890589135424637, + "grad_norm": 2.7055032415422726, + "learning_rate": 1.998914889642081e-05, + "loss": 0.7931, + "step": 581 + }, + { + "epoch": 0.08905891354246366, + "grad_norm": 2.97229656950559, + "learning_rate": 1.9989033173295075e-05, + "loss": 0.7473, + "step": 582 + }, + { + "epoch": 0.08921193573068095, + "grad_norm": 3.013939775038768, + "learning_rate": 1.9988916836703738e-05, + "loss": 0.8688, + "step": 583 + }, + { + "epoch": 0.08936495791889824, + "grad_norm": 3.2961620765077106, + "learning_rate": 1.998879988665394e-05, + "loss": 0.9014, + "step": 584 + }, + { + "epoch": 0.08951798010711554, + "grad_norm": 3.136110348623102, + "learning_rate": 1.998868232315287e-05, + "loss": 0.8517, + "step": 585 + }, + { + "epoch": 0.08967100229533283, + "grad_norm": 2.873765409966888, + "learning_rate": 1.998856414620774e-05, + "loss": 0.8392, + "step": 586 + }, + { + "epoch": 0.08982402448355012, + "grad_norm": 3.0620369003779997, + "learning_rate": 1.9988445355825808e-05, + "loss": 0.9311, + "step": 587 + }, + { + "epoch": 0.08997704667176741, + "grad_norm": 2.9488999826259583, + "learning_rate": 1.9988325952014375e-05, + "loss": 0.7894, + "step": 588 + }, + { + "epoch": 0.0901300688599847, + "grad_norm": 3.364620998306335, + "learning_rate": 1.998820593478077e-05, + "loss": 0.9131, + "step": 589 + }, + { + "epoch": 0.090283091048202, + "grad_norm": 3.486214934509759, + "learning_rate": 
1.9988085304132362e-05, + "loss": 0.9463, + "step": 590 + }, + { + "epoch": 0.09043611323641929, + "grad_norm": 3.103071840216333, + "learning_rate": 1.9987964060076565e-05, + "loss": 0.9319, + "step": 591 + }, + { + "epoch": 0.09058913542463658, + "grad_norm": 2.7855301279351736, + "learning_rate": 1.998784220262082e-05, + "loss": 0.8917, + "step": 592 + }, + { + "epoch": 0.09074215761285387, + "grad_norm": 2.628571920591738, + "learning_rate": 1.998771973177261e-05, + "loss": 0.8585, + "step": 593 + }, + { + "epoch": 0.09089517980107116, + "grad_norm": 3.0308283014483135, + "learning_rate": 1.9987596647539464e-05, + "loss": 0.8611, + "step": 594 + }, + { + "epoch": 0.09104820198928845, + "grad_norm": 3.395921773906306, + "learning_rate": 1.9987472949928936e-05, + "loss": 0.8705, + "step": 595 + }, + { + "epoch": 0.09120122417750574, + "grad_norm": 3.185931373181077, + "learning_rate": 1.9987348638948623e-05, + "loss": 0.8794, + "step": 596 + }, + { + "epoch": 0.09135424636572304, + "grad_norm": 3.2153230873365857, + "learning_rate": 1.9987223714606156e-05, + "loss": 0.9029, + "step": 597 + }, + { + "epoch": 0.09150726855394033, + "grad_norm": 2.615904068828914, + "learning_rate": 1.9987098176909213e-05, + "loss": 0.8384, + "step": 598 + }, + { + "epoch": 0.09166029074215762, + "grad_norm": 3.1448963558441023, + "learning_rate": 1.99869720258655e-05, + "loss": 0.8621, + "step": 599 + }, + { + "epoch": 0.09181331293037491, + "grad_norm": 2.848509682376616, + "learning_rate": 1.9986845261482767e-05, + "loss": 0.8979, + "step": 600 + }, + { + "epoch": 0.0919663351185922, + "grad_norm": 2.8631042534970117, + "learning_rate": 1.9986717883768796e-05, + "loss": 0.7573, + "step": 601 + }, + { + "epoch": 0.0921193573068095, + "grad_norm": 2.877288020891183, + "learning_rate": 1.998658989273141e-05, + "loss": 0.8908, + "step": 602 + }, + { + "epoch": 0.09227237949502679, + "grad_norm": 2.9466061917507855, + "learning_rate": 1.9986461288378475e-05, + "loss": 0.8146, + "step": 603 + }, + { + "epoch": 0.09242540168324408, + "grad_norm": 3.2326794041122344, + "learning_rate": 1.9986332070717882e-05, + "loss": 0.866, + "step": 604 + }, + { + "epoch": 0.09257842387146137, + "grad_norm": 3.2136492245233987, + "learning_rate": 1.9986202239757572e-05, + "loss": 0.9044, + "step": 605 + }, + { + "epoch": 0.09273144605967866, + "grad_norm": 2.9185939116175796, + "learning_rate": 1.9986071795505516e-05, + "loss": 0.799, + "step": 606 + }, + { + "epoch": 0.09288446824789595, + "grad_norm": 2.5518184921354545, + "learning_rate": 1.9985940737969724e-05, + "loss": 0.7654, + "step": 607 + }, + { + "epoch": 0.09303749043611324, + "grad_norm": 3.4896794353885157, + "learning_rate": 1.9985809067158246e-05, + "loss": 0.8924, + "step": 608 + }, + { + "epoch": 0.09319051262433053, + "grad_norm": 2.9217667565577408, + "learning_rate": 1.998567678307917e-05, + "loss": 0.7938, + "step": 609 + }, + { + "epoch": 0.09334353481254781, + "grad_norm": 2.9217531851694076, + "learning_rate": 1.9985543885740616e-05, + "loss": 0.8824, + "step": 610 + }, + { + "epoch": 0.0934965570007651, + "grad_norm": 3.466534744413211, + "learning_rate": 1.998541037515075e-05, + "loss": 0.8839, + "step": 611 + }, + { + "epoch": 0.0936495791889824, + "grad_norm": 2.8647068519856473, + "learning_rate": 1.998527625131777e-05, + "loss": 0.7641, + "step": 612 + }, + { + "epoch": 0.09380260137719969, + "grad_norm": 2.990874053782021, + "learning_rate": 1.9985141514249913e-05, + "loss": 0.8285, + "step": 613 + }, + { + "epoch": 0.09395562356541698, + 
"grad_norm": 3.213841555154716, + "learning_rate": 1.9985006163955454e-05, + "loss": 0.8348, + "step": 614 + }, + { + "epoch": 0.09410864575363427, + "grad_norm": 2.897833932939109, + "learning_rate": 1.9984870200442704e-05, + "loss": 0.8974, + "step": 615 + }, + { + "epoch": 0.09426166794185156, + "grad_norm": 3.255093140463334, + "learning_rate": 1.998473362372001e-05, + "loss": 0.77, + "step": 616 + }, + { + "epoch": 0.09441469013006885, + "grad_norm": 3.5541022603639654, + "learning_rate": 1.9984596433795768e-05, + "loss": 0.9675, + "step": 617 + }, + { + "epoch": 0.09456771231828615, + "grad_norm": 3.156911376432198, + "learning_rate": 1.99844586306784e-05, + "loss": 0.7643, + "step": 618 + }, + { + "epoch": 0.09472073450650344, + "grad_norm": 3.0327127020510014, + "learning_rate": 1.9984320214376367e-05, + "loss": 0.9454, + "step": 619 + }, + { + "epoch": 0.09487375669472073, + "grad_norm": 2.7145164111445763, + "learning_rate": 1.998418118489817e-05, + "loss": 0.78, + "step": 620 + }, + { + "epoch": 0.09502677888293802, + "grad_norm": 3.042572988904212, + "learning_rate": 1.998404154225235e-05, + "loss": 0.8324, + "step": 621 + }, + { + "epoch": 0.09517980107115531, + "grad_norm": 2.590002583866103, + "learning_rate": 1.9983901286447477e-05, + "loss": 0.7565, + "step": 622 + }, + { + "epoch": 0.0953328232593726, + "grad_norm": 3.2287495750309696, + "learning_rate": 1.9983760417492173e-05, + "loss": 0.7237, + "step": 623 + }, + { + "epoch": 0.0954858454475899, + "grad_norm": 2.9615171389313732, + "learning_rate": 1.9983618935395084e-05, + "loss": 0.8673, + "step": 624 + }, + { + "epoch": 0.09563886763580719, + "grad_norm": 2.975886701578284, + "learning_rate": 1.9983476840164896e-05, + "loss": 0.8856, + "step": 625 + }, + { + "epoch": 0.09579188982402448, + "grad_norm": 3.3574876890478054, + "learning_rate": 1.9983334131810346e-05, + "loss": 0.9716, + "step": 626 + }, + { + "epoch": 0.09594491201224177, + "grad_norm": 2.7507257274616648, + "learning_rate": 1.9983190810340186e-05, + "loss": 0.9263, + "step": 627 + }, + { + "epoch": 0.09609793420045906, + "grad_norm": 3.191008180869532, + "learning_rate": 1.9983046875763228e-05, + "loss": 0.931, + "step": 628 + }, + { + "epoch": 0.09625095638867635, + "grad_norm": 2.853979783408991, + "learning_rate": 1.998290232808831e-05, + "loss": 0.786, + "step": 629 + }, + { + "epoch": 0.09640397857689365, + "grad_norm": 3.0623527098455945, + "learning_rate": 1.99827571673243e-05, + "loss": 0.9594, + "step": 630 + }, + { + "epoch": 0.09655700076511094, + "grad_norm": 3.0612621394357387, + "learning_rate": 1.9982611393480124e-05, + "loss": 0.8467, + "step": 631 + }, + { + "epoch": 0.09671002295332823, + "grad_norm": 3.145922754185688, + "learning_rate": 1.9982465006564727e-05, + "loss": 0.837, + "step": 632 + }, + { + "epoch": 0.09686304514154552, + "grad_norm": 3.0267411008622602, + "learning_rate": 1.9982318006587107e-05, + "loss": 0.9817, + "step": 633 + }, + { + "epoch": 0.09701606732976281, + "grad_norm": 2.9130437517324594, + "learning_rate": 1.9982170393556282e-05, + "loss": 0.7791, + "step": 634 + }, + { + "epoch": 0.0971690895179801, + "grad_norm": 2.8442538652688922, + "learning_rate": 1.9982022167481324e-05, + "loss": 0.9116, + "step": 635 + }, + { + "epoch": 0.0973221117061974, + "grad_norm": 2.912158188776152, + "learning_rate": 1.9981873328371338e-05, + "loss": 0.8294, + "step": 636 + }, + { + "epoch": 0.09747513389441469, + "grad_norm": 3.099465213513769, + "learning_rate": 1.9981723876235457e-05, + "loss": 0.8317, + "step": 637 + }, 
+ { + "epoch": 0.09762815608263198, + "grad_norm": 2.9188799056701296, + "learning_rate": 1.9981573811082868e-05, + "loss": 0.8401, + "step": 638 + }, + { + "epoch": 0.09778117827084927, + "grad_norm": 2.819614664399572, + "learning_rate": 1.998142313292278e-05, + "loss": 0.8368, + "step": 639 + }, + { + "epoch": 0.09793420045906656, + "grad_norm": 2.6738959579696577, + "learning_rate": 1.9981271841764452e-05, + "loss": 0.9057, + "step": 640 + }, + { + "epoch": 0.09808722264728385, + "grad_norm": 2.9601321974365735, + "learning_rate": 1.9981119937617174e-05, + "loss": 0.8107, + "step": 641 + }, + { + "epoch": 0.09824024483550114, + "grad_norm": 3.0233216111788472, + "learning_rate": 1.9980967420490273e-05, + "loss": 0.9779, + "step": 642 + }, + { + "epoch": 0.09839326702371844, + "grad_norm": 2.7557734535440166, + "learning_rate": 1.9980814290393115e-05, + "loss": 0.838, + "step": 643 + }, + { + "epoch": 0.09854628921193573, + "grad_norm": 2.704043797297548, + "learning_rate": 1.998066054733511e-05, + "loss": 0.8751, + "step": 644 + }, + { + "epoch": 0.09869931140015302, + "grad_norm": 3.1041651898470657, + "learning_rate": 1.9980506191325694e-05, + "loss": 0.8846, + "step": 645 + }, + { + "epoch": 0.09885233358837031, + "grad_norm": 2.6868333111971476, + "learning_rate": 1.9980351222374347e-05, + "loss": 0.8375, + "step": 646 + }, + { + "epoch": 0.0990053557765876, + "grad_norm": 2.777208715705951, + "learning_rate": 1.9980195640490592e-05, + "loss": 0.8029, + "step": 647 + }, + { + "epoch": 0.0991583779648049, + "grad_norm": 3.4013194486190548, + "learning_rate": 1.9980039445683978e-05, + "loss": 0.8727, + "step": 648 + }, + { + "epoch": 0.09931140015302219, + "grad_norm": 3.0668546286557543, + "learning_rate": 1.99798826379641e-05, + "loss": 0.9092, + "step": 649 + }, + { + "epoch": 0.09946442234123948, + "grad_norm": 3.0453985288017904, + "learning_rate": 1.9979725217340587e-05, + "loss": 0.8486, + "step": 650 + }, + { + "epoch": 0.09961744452945677, + "grad_norm": 2.7942211304315143, + "learning_rate": 1.9979567183823108e-05, + "loss": 0.8158, + "step": 651 + }, + { + "epoch": 0.09977046671767406, + "grad_norm": 3.3292124901062206, + "learning_rate": 1.9979408537421367e-05, + "loss": 0.9559, + "step": 652 + }, + { + "epoch": 0.09992348890589135, + "grad_norm": 3.134527582640561, + "learning_rate": 1.997924927814511e-05, + "loss": 0.8503, + "step": 653 + }, + { + "epoch": 0.10007651109410864, + "grad_norm": 2.9257436635589857, + "learning_rate": 1.9979089406004115e-05, + "loss": 0.7602, + "step": 654 + }, + { + "epoch": 0.10022953328232594, + "grad_norm": 2.9114862208984373, + "learning_rate": 1.99789289210082e-05, + "loss": 0.8381, + "step": 655 + }, + { + "epoch": 0.10038255547054323, + "grad_norm": 3.281387510746206, + "learning_rate": 1.9978767823167224e-05, + "loss": 0.8544, + "step": 656 + }, + { + "epoch": 0.10053557765876052, + "grad_norm": 2.5934944798509774, + "learning_rate": 1.9978606112491076e-05, + "loss": 0.8238, + "step": 657 + }, + { + "epoch": 0.10068859984697781, + "grad_norm": 2.803648838613119, + "learning_rate": 1.9978443788989695e-05, + "loss": 0.8357, + "step": 658 + }, + { + "epoch": 0.1008416220351951, + "grad_norm": 3.2256668016008705, + "learning_rate": 1.9978280852673038e-05, + "loss": 0.9185, + "step": 659 + }, + { + "epoch": 0.1009946442234124, + "grad_norm": 2.9180166328828108, + "learning_rate": 1.9978117303551127e-05, + "loss": 0.9, + "step": 660 + }, + { + "epoch": 0.10114766641162969, + "grad_norm": 2.759551164175027, + "learning_rate": 
1.997795314163399e-05, + "loss": 0.8243, + "step": 661 + }, + { + "epoch": 0.10130068859984698, + "grad_norm": 2.8452679100467564, + "learning_rate": 1.997778836693172e-05, + "loss": 0.8834, + "step": 662 + }, + { + "epoch": 0.10145371078806427, + "grad_norm": 2.85925562143535, + "learning_rate": 1.9977622979454433e-05, + "loss": 0.8022, + "step": 663 + }, + { + "epoch": 0.10160673297628156, + "grad_norm": 2.925375455401089, + "learning_rate": 1.9977456979212286e-05, + "loss": 0.8822, + "step": 664 + }, + { + "epoch": 0.10175975516449885, + "grad_norm": 2.725147333625341, + "learning_rate": 1.9977290366215473e-05, + "loss": 0.8138, + "step": 665 + }, + { + "epoch": 0.10191277735271614, + "grad_norm": 3.1953282021420013, + "learning_rate": 1.997712314047423e-05, + "loss": 0.857, + "step": 666 + }, + { + "epoch": 0.10206579954093344, + "grad_norm": 3.0746423357238943, + "learning_rate": 1.9976955301998822e-05, + "loss": 0.8515, + "step": 667 + }, + { + "epoch": 0.10221882172915073, + "grad_norm": 2.900938526635504, + "learning_rate": 1.997678685079956e-05, + "loss": 0.8601, + "step": 668 + }, + { + "epoch": 0.10237184391736802, + "grad_norm": 2.689075444923604, + "learning_rate": 1.997661778688679e-05, + "loss": 0.8373, + "step": 669 + }, + { + "epoch": 0.10252486610558531, + "grad_norm": 2.859953745768656, + "learning_rate": 1.9976448110270888e-05, + "loss": 0.7555, + "step": 670 + }, + { + "epoch": 0.1026778882938026, + "grad_norm": 3.145897880802856, + "learning_rate": 1.997627782096228e-05, + "loss": 0.9535, + "step": 671 + }, + { + "epoch": 0.1028309104820199, + "grad_norm": 2.969611797938469, + "learning_rate": 1.9976106918971428e-05, + "loss": 0.8239, + "step": 672 + }, + { + "epoch": 0.10298393267023719, + "grad_norm": 2.9387550683170014, + "learning_rate": 1.9975935404308818e-05, + "loss": 0.8714, + "step": 673 + }, + { + "epoch": 0.10313695485845448, + "grad_norm": 2.9648398626987444, + "learning_rate": 1.9975763276984993e-05, + "loss": 0.8172, + "step": 674 + }, + { + "epoch": 0.10328997704667177, + "grad_norm": 3.295868102675818, + "learning_rate": 1.9975590537010515e-05, + "loss": 0.9202, + "step": 675 + }, + { + "epoch": 0.10344299923488906, + "grad_norm": 3.22903200685603, + "learning_rate": 1.9975417184396005e-05, + "loss": 0.9028, + "step": 676 + }, + { + "epoch": 0.10359602142310635, + "grad_norm": 2.825966414769792, + "learning_rate": 1.9975243219152095e-05, + "loss": 0.9263, + "step": 677 + }, + { + "epoch": 0.10374904361132364, + "grad_norm": 3.030014157036485, + "learning_rate": 1.9975068641289478e-05, + "loss": 0.718, + "step": 678 + }, + { + "epoch": 0.10390206579954094, + "grad_norm": 3.475541487458047, + "learning_rate": 1.9974893450818875e-05, + "loss": 0.9376, + "step": 679 + }, + { + "epoch": 0.10405508798775823, + "grad_norm": 3.2549605814152356, + "learning_rate": 1.997471764775104e-05, + "loss": 0.7524, + "step": 680 + }, + { + "epoch": 0.10420811017597552, + "grad_norm": 3.126998944683233, + "learning_rate": 1.9974541232096774e-05, + "loss": 0.9323, + "step": 681 + }, + { + "epoch": 0.10436113236419281, + "grad_norm": 3.0013467824393025, + "learning_rate": 1.997436420386691e-05, + "loss": 0.806, + "step": 682 + }, + { + "epoch": 0.1045141545524101, + "grad_norm": 3.0259375619997253, + "learning_rate": 1.997418656307232e-05, + "loss": 0.8783, + "step": 683 + }, + { + "epoch": 0.1046671767406274, + "grad_norm": 3.0450368516418447, + "learning_rate": 1.9974008309723917e-05, + "loss": 0.9218, + "step": 684 + }, + { + "epoch": 0.10482019892884469, + "grad_norm": 
2.825921818621975, + "learning_rate": 1.997382944383264e-05, + "loss": 0.8584, + "step": 685 + }, + { + "epoch": 0.10497322111706198, + "grad_norm": 2.912770169006268, + "learning_rate": 1.9973649965409483e-05, + "loss": 0.8629, + "step": 686 + }, + { + "epoch": 0.10512624330527927, + "grad_norm": 2.5954835007580477, + "learning_rate": 1.9973469874465464e-05, + "loss": 0.8466, + "step": 687 + }, + { + "epoch": 0.10527926549349656, + "grad_norm": 2.917363261917072, + "learning_rate": 1.997328917101164e-05, + "loss": 0.8829, + "step": 688 + }, + { + "epoch": 0.10543228768171385, + "grad_norm": 2.812270404547237, + "learning_rate": 1.9973107855059116e-05, + "loss": 0.8442, + "step": 689 + }, + { + "epoch": 0.10558530986993114, + "grad_norm": 3.0483780333514257, + "learning_rate": 1.9972925926619023e-05, + "loss": 0.8695, + "step": 690 + }, + { + "epoch": 0.10573833205814843, + "grad_norm": 2.9664567475153074, + "learning_rate": 1.9972743385702535e-05, + "loss": 0.9386, + "step": 691 + }, + { + "epoch": 0.10589135424636573, + "grad_norm": 2.7137242073331804, + "learning_rate": 1.9972560232320863e-05, + "loss": 0.7839, + "step": 692 + }, + { + "epoch": 0.10604437643458302, + "grad_norm": 3.784136448397792, + "learning_rate": 1.9972376466485252e-05, + "loss": 0.9644, + "step": 693 + }, + { + "epoch": 0.10619739862280031, + "grad_norm": 2.6044337361202183, + "learning_rate": 1.997219208820699e-05, + "loss": 0.8399, + "step": 694 + }, + { + "epoch": 0.1063504208110176, + "grad_norm": 3.106258109412711, + "learning_rate": 1.99720070974974e-05, + "loss": 0.8544, + "step": 695 + }, + { + "epoch": 0.10650344299923489, + "grad_norm": 3.1386122334768376, + "learning_rate": 1.9971821494367844e-05, + "loss": 0.9707, + "step": 696 + }, + { + "epoch": 0.10665646518745218, + "grad_norm": 2.912783268372566, + "learning_rate": 1.997163527882972e-05, + "loss": 0.8637, + "step": 697 + }, + { + "epoch": 0.10680948737566948, + "grad_norm": 2.794401741879464, + "learning_rate": 1.9971448450894467e-05, + "loss": 0.7666, + "step": 698 + }, + { + "epoch": 0.10696250956388677, + "grad_norm": 2.787277032267778, + "learning_rate": 1.9971261010573553e-05, + "loss": 0.9049, + "step": 699 + }, + { + "epoch": 0.10711553175210406, + "grad_norm": 3.1363693151187535, + "learning_rate": 1.9971072957878494e-05, + "loss": 0.8289, + "step": 700 + }, + { + "epoch": 0.10726855394032135, + "grad_norm": 3.016667481448861, + "learning_rate": 1.9970884292820837e-05, + "loss": 0.7704, + "step": 701 + }, + { + "epoch": 0.10742157612853864, + "grad_norm": 3.2062104544910355, + "learning_rate": 1.997069501541217e-05, + "loss": 0.8958, + "step": 702 + }, + { + "epoch": 0.10757459831675593, + "grad_norm": 2.680574521626054, + "learning_rate": 1.9970505125664116e-05, + "loss": 0.8637, + "step": 703 + }, + { + "epoch": 0.10772762050497323, + "grad_norm": 2.674813823925464, + "learning_rate": 1.9970314623588335e-05, + "loss": 0.7913, + "step": 704 + }, + { + "epoch": 0.10788064269319052, + "grad_norm": 2.9406076887807706, + "learning_rate": 1.9970123509196533e-05, + "loss": 0.9162, + "step": 705 + }, + { + "epoch": 0.10803366488140781, + "grad_norm": 3.261303730361806, + "learning_rate": 1.996993178250044e-05, + "loss": 0.8425, + "step": 706 + }, + { + "epoch": 0.1081866870696251, + "grad_norm": 2.8193883845987737, + "learning_rate": 1.9969739443511835e-05, + "loss": 0.748, + "step": 707 + }, + { + "epoch": 0.10833970925784239, + "grad_norm": 2.7504729360532973, + "learning_rate": 1.996954649224253e-05, + "loss": 0.8859, + "step": 708 + }, + { + 
"epoch": 0.10849273144605968, + "grad_norm": 2.7955831501250716, + "learning_rate": 1.996935292870437e-05, + "loss": 0.8175, + "step": 709 + }, + { + "epoch": 0.10864575363427698, + "grad_norm": 2.9447725337658444, + "learning_rate": 1.9969158752909247e-05, + "loss": 0.7753, + "step": 710 + }, + { + "epoch": 0.10879877582249427, + "grad_norm": 2.78759682667099, + "learning_rate": 1.9968963964869088e-05, + "loss": 0.958, + "step": 711 + }, + { + "epoch": 0.10895179801071156, + "grad_norm": 3.074769801484582, + "learning_rate": 1.9968768564595856e-05, + "loss": 0.9086, + "step": 712 + }, + { + "epoch": 0.10910482019892885, + "grad_norm": 2.9070592094240797, + "learning_rate": 1.9968572552101544e-05, + "loss": 0.9174, + "step": 713 + }, + { + "epoch": 0.10925784238714614, + "grad_norm": 3.233405582020585, + "learning_rate": 1.9968375927398195e-05, + "loss": 0.9356, + "step": 714 + }, + { + "epoch": 0.10941086457536343, + "grad_norm": 3.022272822063928, + "learning_rate": 1.9968178690497884e-05, + "loss": 0.851, + "step": 715 + }, + { + "epoch": 0.10956388676358073, + "grad_norm": 2.8195304743616285, + "learning_rate": 1.9967980841412722e-05, + "loss": 0.7116, + "step": 716 + }, + { + "epoch": 0.10971690895179802, + "grad_norm": 2.9245399474491602, + "learning_rate": 1.9967782380154864e-05, + "loss": 0.8679, + "step": 717 + }, + { + "epoch": 0.10986993114001531, + "grad_norm": 2.988694346304424, + "learning_rate": 1.9967583306736494e-05, + "loss": 0.7599, + "step": 718 + }, + { + "epoch": 0.1100229533282326, + "grad_norm": 3.1688493992216618, + "learning_rate": 1.996738362116984e-05, + "loss": 0.9484, + "step": 719 + }, + { + "epoch": 0.11017597551644988, + "grad_norm": 2.9048182310387776, + "learning_rate": 1.996718332346717e-05, + "loss": 0.7532, + "step": 720 + }, + { + "epoch": 0.11032899770466717, + "grad_norm": 2.637213696698448, + "learning_rate": 1.9966982413640772e-05, + "loss": 0.8024, + "step": 721 + }, + { + "epoch": 0.11048201989288446, + "grad_norm": 2.89859467937182, + "learning_rate": 1.9966780891703e-05, + "loss": 0.9697, + "step": 722 + }, + { + "epoch": 0.11063504208110175, + "grad_norm": 2.836160191245258, + "learning_rate": 1.996657875766622e-05, + "loss": 0.8161, + "step": 723 + }, + { + "epoch": 0.11078806426931904, + "grad_norm": 3.0972974916050564, + "learning_rate": 1.996637601154285e-05, + "loss": 0.905, + "step": 724 + }, + { + "epoch": 0.11094108645753634, + "grad_norm": 2.746119102608403, + "learning_rate": 1.9966172653345337e-05, + "loss": 0.7602, + "step": 725 + }, + { + "epoch": 0.11109410864575363, + "grad_norm": 2.6662164718564765, + "learning_rate": 1.9965968683086177e-05, + "loss": 0.8078, + "step": 726 + }, + { + "epoch": 0.11124713083397092, + "grad_norm": 3.135727407852158, + "learning_rate": 1.9965764100777892e-05, + "loss": 0.8687, + "step": 727 + }, + { + "epoch": 0.11140015302218821, + "grad_norm": 2.902712664303539, + "learning_rate": 1.996555890643305e-05, + "loss": 0.8878, + "step": 728 + }, + { + "epoch": 0.1115531752104055, + "grad_norm": 3.2232593590832535, + "learning_rate": 1.9965353100064246e-05, + "loss": 0.8875, + "step": 729 + }, + { + "epoch": 0.1117061973986228, + "grad_norm": 3.098086025014026, + "learning_rate": 1.9965146681684126e-05, + "loss": 0.8271, + "step": 730 + }, + { + "epoch": 0.11185921958684009, + "grad_norm": 2.852123909711368, + "learning_rate": 1.996493965130536e-05, + "loss": 0.7657, + "step": 731 + }, + { + "epoch": 0.11201224177505738, + "grad_norm": 3.08617436112628, + "learning_rate": 1.9964732008940673e-05, + 
"loss": 0.9049, + "step": 732 + }, + { + "epoch": 0.11216526396327467, + "grad_norm": 3.0408480858160787, + "learning_rate": 1.996452375460281e-05, + "loss": 0.9891, + "step": 733 + }, + { + "epoch": 0.11231828615149196, + "grad_norm": 2.5643968058230313, + "learning_rate": 1.9964314888304563e-05, + "loss": 0.8169, + "step": 734 + }, + { + "epoch": 0.11247130833970925, + "grad_norm": 3.2776028562449184, + "learning_rate": 1.9964105410058754e-05, + "loss": 0.8984, + "step": 735 + }, + { + "epoch": 0.11262433052792654, + "grad_norm": 3.1486146940763375, + "learning_rate": 1.9963895319878252e-05, + "loss": 0.8662, + "step": 736 + }, + { + "epoch": 0.11277735271614384, + "grad_norm": 3.0577728216797855, + "learning_rate": 1.996368461777596e-05, + "loss": 1.0304, + "step": 737 + }, + { + "epoch": 0.11293037490436113, + "grad_norm": 2.558019479134406, + "learning_rate": 1.996347330376482e-05, + "loss": 0.7169, + "step": 738 + }, + { + "epoch": 0.11308339709257842, + "grad_norm": 2.7971191724016577, + "learning_rate": 1.9963261377857805e-05, + "loss": 0.8585, + "step": 739 + }, + { + "epoch": 0.11323641928079571, + "grad_norm": 2.7918459247533183, + "learning_rate": 1.996304884006793e-05, + "loss": 0.8193, + "step": 740 + }, + { + "epoch": 0.113389441469013, + "grad_norm": 3.0171317643440276, + "learning_rate": 1.9962835690408255e-05, + "loss": 0.9036, + "step": 741 + }, + { + "epoch": 0.1135424636572303, + "grad_norm": 2.825490934258436, + "learning_rate": 1.9962621928891863e-05, + "loss": 0.6977, + "step": 742 + }, + { + "epoch": 0.11369548584544759, + "grad_norm": 2.946410812576731, + "learning_rate": 1.9962407555531884e-05, + "loss": 0.9062, + "step": 743 + }, + { + "epoch": 0.11384850803366488, + "grad_norm": 2.63568342518963, + "learning_rate": 1.9962192570341485e-05, + "loss": 0.8382, + "step": 744 + }, + { + "epoch": 0.11400153022188217, + "grad_norm": 2.6540164369542145, + "learning_rate": 1.9961976973333868e-05, + "loss": 0.8843, + "step": 745 + }, + { + "epoch": 0.11415455241009946, + "grad_norm": 3.1360366019870254, + "learning_rate": 1.996176076452227e-05, + "loss": 0.9828, + "step": 746 + }, + { + "epoch": 0.11430757459831675, + "grad_norm": 2.4591066537053603, + "learning_rate": 1.996154394391998e-05, + "loss": 0.8476, + "step": 747 + }, + { + "epoch": 0.11446059678653404, + "grad_norm": 2.9487269261454645, + "learning_rate": 1.9961326511540303e-05, + "loss": 0.907, + "step": 748 + }, + { + "epoch": 0.11461361897475134, + "grad_norm": 2.9947147155525338, + "learning_rate": 1.996110846739659e-05, + "loss": 0.8579, + "step": 749 + }, + { + "epoch": 0.11476664116296863, + "grad_norm": 3.037066986180684, + "learning_rate": 1.9960889811502247e-05, + "loss": 0.8599, + "step": 750 + }, + { + "epoch": 0.11491966335118592, + "grad_norm": 3.051458512563892, + "learning_rate": 1.9960670543870692e-05, + "loss": 0.8731, + "step": 751 + }, + { + "epoch": 0.11507268553940321, + "grad_norm": 2.8398841037795317, + "learning_rate": 1.996045066451539e-05, + "loss": 0.8207, + "step": 752 + }, + { + "epoch": 0.1152257077276205, + "grad_norm": 2.7469614353497955, + "learning_rate": 1.9960230173449845e-05, + "loss": 0.9335, + "step": 753 + }, + { + "epoch": 0.1153787299158378, + "grad_norm": 2.935660071595326, + "learning_rate": 1.9960009070687603e-05, + "loss": 0.8881, + "step": 754 + }, + { + "epoch": 0.11553175210405509, + "grad_norm": 2.5830086234835994, + "learning_rate": 1.9959787356242243e-05, + "loss": 0.8157, + "step": 755 + }, + { + "epoch": 0.11568477429227238, + "grad_norm": 
2.902450900848275, + "learning_rate": 1.9959565030127375e-05, + "loss": 0.8613, + "step": 756 + }, + { + "epoch": 0.11583779648048967, + "grad_norm": 3.1217190149525544, + "learning_rate": 1.9959342092356656e-05, + "loss": 0.8571, + "step": 757 + }, + { + "epoch": 0.11599081866870696, + "grad_norm": 3.0119235092110737, + "learning_rate": 1.995911854294378e-05, + "loss": 0.7914, + "step": 758 + }, + { + "epoch": 0.11614384085692425, + "grad_norm": 3.270646329337315, + "learning_rate": 1.9958894381902473e-05, + "loss": 0.9937, + "step": 759 + }, + { + "epoch": 0.11629686304514154, + "grad_norm": 2.6432702250244597, + "learning_rate": 1.99586696092465e-05, + "loss": 0.9079, + "step": 760 + }, + { + "epoch": 0.11644988523335884, + "grad_norm": 2.86168615726239, + "learning_rate": 1.9958444224989673e-05, + "loss": 0.7816, + "step": 761 + }, + { + "epoch": 0.11660290742157613, + "grad_norm": 3.1978664221309447, + "learning_rate": 1.9958218229145828e-05, + "loss": 0.8444, + "step": 762 + }, + { + "epoch": 0.11675592960979342, + "grad_norm": 2.932323475678271, + "learning_rate": 1.995799162172884e-05, + "loss": 0.9456, + "step": 763 + }, + { + "epoch": 0.11690895179801071, + "grad_norm": 3.038083319492739, + "learning_rate": 1.9957764402752632e-05, + "loss": 0.9396, + "step": 764 + }, + { + "epoch": 0.117061973986228, + "grad_norm": 2.848333848765862, + "learning_rate": 1.995753657223116e-05, + "loss": 0.9366, + "step": 765 + }, + { + "epoch": 0.1172149961744453, + "grad_norm": 3.0057853013541935, + "learning_rate": 1.995730813017841e-05, + "loss": 0.9175, + "step": 766 + }, + { + "epoch": 0.11736801836266259, + "grad_norm": 2.872981414780168, + "learning_rate": 1.9957079076608416e-05, + "loss": 0.904, + "step": 767 + }, + { + "epoch": 0.11752104055087988, + "grad_norm": 2.77278331612368, + "learning_rate": 1.9956849411535243e-05, + "loss": 0.8077, + "step": 768 + }, + { + "epoch": 0.11767406273909717, + "grad_norm": 2.809951235769412, + "learning_rate": 1.9956619134973e-05, + "loss": 0.8337, + "step": 769 + }, + { + "epoch": 0.11782708492731446, + "grad_norm": 2.5883083215594413, + "learning_rate": 1.995638824693582e-05, + "loss": 0.8807, + "step": 770 + }, + { + "epoch": 0.11798010711553175, + "grad_norm": 2.8036993992336203, + "learning_rate": 1.9956156747437892e-05, + "loss": 0.8703, + "step": 771 + }, + { + "epoch": 0.11813312930374904, + "grad_norm": 3.3243628707668234, + "learning_rate": 1.9955924636493427e-05, + "loss": 0.7931, + "step": 772 + }, + { + "epoch": 0.11828615149196633, + "grad_norm": 2.873881310062135, + "learning_rate": 1.995569191411668e-05, + "loss": 0.9049, + "step": 773 + }, + { + "epoch": 0.11843917368018363, + "grad_norm": 2.7069595333520895, + "learning_rate": 1.995545858032195e-05, + "loss": 0.8754, + "step": 774 + }, + { + "epoch": 0.11859219586840092, + "grad_norm": 2.9096351619045002, + "learning_rate": 1.9955224635123563e-05, + "loss": 0.7806, + "step": 775 + }, + { + "epoch": 0.11874521805661821, + "grad_norm": 2.8827356138086566, + "learning_rate": 1.9954990078535882e-05, + "loss": 0.8809, + "step": 776 + }, + { + "epoch": 0.1188982402448355, + "grad_norm": 2.826962196189732, + "learning_rate": 1.9954754910573322e-05, + "loss": 0.87, + "step": 777 + }, + { + "epoch": 0.11905126243305279, + "grad_norm": 2.8069784613392637, + "learning_rate": 1.9954519131250315e-05, + "loss": 0.8496, + "step": 778 + }, + { + "epoch": 0.11920428462127008, + "grad_norm": 2.971601835723486, + "learning_rate": 1.9954282740581347e-05, + "loss": 0.8614, + "step": 779 + }, + { + "epoch": 
0.11935730680948738, + "grad_norm": 2.9075497971975746, + "learning_rate": 1.9954045738580935e-05, + "loss": 0.8203, + "step": 780 + }, + { + "epoch": 0.11951032899770467, + "grad_norm": 2.9402914645046336, + "learning_rate": 1.9953808125263634e-05, + "loss": 0.9068, + "step": 781 + }, + { + "epoch": 0.11966335118592196, + "grad_norm": 2.7248354648371245, + "learning_rate": 1.9953569900644038e-05, + "loss": 0.8354, + "step": 782 + }, + { + "epoch": 0.11981637337413925, + "grad_norm": 2.738997062960359, + "learning_rate": 1.9953331064736772e-05, + "loss": 0.8416, + "step": 783 + }, + { + "epoch": 0.11996939556235654, + "grad_norm": 2.408678484778986, + "learning_rate": 1.9953091617556508e-05, + "loss": 0.9647, + "step": 784 + }, + { + "epoch": 0.12012241775057383, + "grad_norm": 3.397543310223297, + "learning_rate": 1.995285155911795e-05, + "loss": 0.9912, + "step": 785 + }, + { + "epoch": 0.12027543993879113, + "grad_norm": 2.492146839514386, + "learning_rate": 1.9952610889435847e-05, + "loss": 0.7809, + "step": 786 + }, + { + "epoch": 0.12042846212700842, + "grad_norm": 3.186689185906449, + "learning_rate": 1.995236960852497e-05, + "loss": 0.8792, + "step": 787 + }, + { + "epoch": 0.12058148431522571, + "grad_norm": 2.6501731876923884, + "learning_rate": 1.9952127716400147e-05, + "loss": 0.9151, + "step": 788 + }, + { + "epoch": 0.120734506503443, + "grad_norm": 2.5758608454690717, + "learning_rate": 1.9951885213076224e-05, + "loss": 0.772, + "step": 789 + }, + { + "epoch": 0.12088752869166029, + "grad_norm": 2.8652593890546627, + "learning_rate": 1.9951642098568098e-05, + "loss": 0.8747, + "step": 790 + }, + { + "epoch": 0.12104055087987758, + "grad_norm": 3.0338057992308727, + "learning_rate": 1.9951398372890698e-05, + "loss": 0.8661, + "step": 791 + }, + { + "epoch": 0.12119357306809488, + "grad_norm": 2.921221055918673, + "learning_rate": 1.9951154036058996e-05, + "loss": 0.9477, + "step": 792 + }, + { + "epoch": 0.12134659525631217, + "grad_norm": 2.9305423440900045, + "learning_rate": 1.9950909088087998e-05, + "loss": 0.8084, + "step": 793 + }, + { + "epoch": 0.12149961744452946, + "grad_norm": 2.8420265459295773, + "learning_rate": 1.995066352899274e-05, + "loss": 0.7906, + "step": 794 + }, + { + "epoch": 0.12165263963274675, + "grad_norm": 2.6560288005388255, + "learning_rate": 1.995041735878831e-05, + "loss": 0.9672, + "step": 795 + }, + { + "epoch": 0.12180566182096404, + "grad_norm": 2.899871208782027, + "learning_rate": 1.9950170577489823e-05, + "loss": 0.8082, + "step": 796 + }, + { + "epoch": 0.12195868400918133, + "grad_norm": 2.426459161276832, + "learning_rate": 1.9949923185112437e-05, + "loss": 0.7931, + "step": 797 + }, + { + "epoch": 0.12211170619739863, + "grad_norm": 2.840994852998752, + "learning_rate": 1.9949675181671343e-05, + "loss": 0.7934, + "step": 798 + }, + { + "epoch": 0.12226472838561592, + "grad_norm": 2.808195404634968, + "learning_rate": 1.9949426567181773e-05, + "loss": 0.8969, + "step": 799 + }, + { + "epoch": 0.12241775057383321, + "grad_norm": 2.534032946860274, + "learning_rate": 1.9949177341658995e-05, + "loss": 0.8507, + "step": 800 + }, + { + "epoch": 0.1225707727620505, + "grad_norm": 2.9255867633682167, + "learning_rate": 1.9948927505118312e-05, + "loss": 0.9297, + "step": 801 + }, + { + "epoch": 0.12272379495026779, + "grad_norm": 2.644371265898017, + "learning_rate": 1.9948677057575074e-05, + "loss": 0.7543, + "step": 802 + }, + { + "epoch": 0.12287681713848508, + "grad_norm": 3.1021955628731375, + "learning_rate": 1.9948425999044657e-05, + 
"loss": 0.8462, + "step": 803 + }, + { + "epoch": 0.12302983932670238, + "grad_norm": 2.8070466213969336, + "learning_rate": 1.9948174329542483e-05, + "loss": 0.8161, + "step": 804 + }, + { + "epoch": 0.12318286151491967, + "grad_norm": 2.5722394249819294, + "learning_rate": 1.9947922049084007e-05, + "loss": 0.8785, + "step": 805 + }, + { + "epoch": 0.12333588370313696, + "grad_norm": 2.872258390308318, + "learning_rate": 1.9947669157684718e-05, + "loss": 0.9878, + "step": 806 + }, + { + "epoch": 0.12348890589135425, + "grad_norm": 2.8229511128297933, + "learning_rate": 1.9947415655360148e-05, + "loss": 0.8757, + "step": 807 + }, + { + "epoch": 0.12364192807957154, + "grad_norm": 2.974193887323895, + "learning_rate": 1.994716154212587e-05, + "loss": 0.8089, + "step": 808 + }, + { + "epoch": 0.12379495026778883, + "grad_norm": 2.452950207328061, + "learning_rate": 1.9946906817997495e-05, + "loss": 0.8796, + "step": 809 + }, + { + "epoch": 0.12394797245600613, + "grad_norm": 2.6646128965830385, + "learning_rate": 1.9946651482990654e-05, + "loss": 0.8494, + "step": 810 + }, + { + "epoch": 0.12410099464422342, + "grad_norm": 2.974247053047141, + "learning_rate": 1.9946395537121033e-05, + "loss": 0.8141, + "step": 811 + }, + { + "epoch": 0.12425401683244071, + "grad_norm": 2.77324872264311, + "learning_rate": 1.9946138980404352e-05, + "loss": 0.8281, + "step": 812 + }, + { + "epoch": 0.124407039020658, + "grad_norm": 2.8947035176206204, + "learning_rate": 1.994588181285637e-05, + "loss": 0.865, + "step": 813 + }, + { + "epoch": 0.12456006120887529, + "grad_norm": 2.481415993737972, + "learning_rate": 1.9945624034492876e-05, + "loss": 0.8163, + "step": 814 + }, + { + "epoch": 0.12471308339709258, + "grad_norm": 2.812818009991962, + "learning_rate": 1.99453656453297e-05, + "loss": 0.8919, + "step": 815 + }, + { + "epoch": 0.12486610558530988, + "grad_norm": 2.606521546663759, + "learning_rate": 1.9945106645382713e-05, + "loss": 0.8239, + "step": 816 + }, + { + "epoch": 0.12501912777352717, + "grad_norm": 2.7952461518148692, + "learning_rate": 1.9944847034667826e-05, + "loss": 0.8532, + "step": 817 + }, + { + "epoch": 0.12517214996174444, + "grad_norm": 2.7255334999000977, + "learning_rate": 1.9944586813200975e-05, + "loss": 0.8189, + "step": 818 + }, + { + "epoch": 0.12532517214996175, + "grad_norm": 2.3712871371410578, + "learning_rate": 1.9944325980998143e-05, + "loss": 0.7809, + "step": 819 + }, + { + "epoch": 0.12547819433817903, + "grad_norm": 2.7615076995086567, + "learning_rate": 1.9944064538075355e-05, + "loss": 0.8229, + "step": 820 + }, + { + "epoch": 0.12563121652639633, + "grad_norm": 2.932395122434818, + "learning_rate": 1.994380248444866e-05, + "loss": 0.862, + "step": 821 + }, + { + "epoch": 0.1257842387146136, + "grad_norm": 2.6788048670584175, + "learning_rate": 1.994353982013415e-05, + "loss": 0.8015, + "step": 822 + }, + { + "epoch": 0.12593726090283092, + "grad_norm": 3.0278059097501893, + "learning_rate": 1.9943276545147966e-05, + "loss": 0.9526, + "step": 823 + }, + { + "epoch": 0.1260902830910482, + "grad_norm": 2.9252691667903608, + "learning_rate": 1.9943012659506268e-05, + "loss": 0.8529, + "step": 824 + }, + { + "epoch": 0.1262433052792655, + "grad_norm": 2.8583585269123994, + "learning_rate": 1.9942748163225264e-05, + "loss": 0.8131, + "step": 825 + }, + { + "epoch": 0.12639632746748278, + "grad_norm": 3.2704363944279664, + "learning_rate": 1.9942483056321204e-05, + "loss": 0.8934, + "step": 826 + }, + { + "epoch": 0.12654934965570008, + "grad_norm": 
2.9576396568597705, + "learning_rate": 1.994221733881036e-05, + "loss": 0.9039, + "step": 827 + }, + { + "epoch": 0.12670237184391736, + "grad_norm": 2.5940180643661406, + "learning_rate": 1.9941951010709054e-05, + "loss": 0.891, + "step": 828 + }, + { + "epoch": 0.12685539403213467, + "grad_norm": 2.7471602663867722, + "learning_rate": 1.9941684072033646e-05, + "loss": 0.8782, + "step": 829 + }, + { + "epoch": 0.12700841622035194, + "grad_norm": 3.0403029906730894, + "learning_rate": 1.9941416522800528e-05, + "loss": 0.8783, + "step": 830 + }, + { + "epoch": 0.12716143840856925, + "grad_norm": 2.967016562837652, + "learning_rate": 1.994114836302613e-05, + "loss": 0.9094, + "step": 831 + }, + { + "epoch": 0.12731446059678653, + "grad_norm": 2.697380123869265, + "learning_rate": 1.994087959272692e-05, + "loss": 0.8034, + "step": 832 + }, + { + "epoch": 0.12746748278500383, + "grad_norm": 2.6505703341369, + "learning_rate": 1.9940610211919404e-05, + "loss": 0.762, + "step": 833 + }, + { + "epoch": 0.1276205049732211, + "grad_norm": 2.9265591451269377, + "learning_rate": 1.9940340220620128e-05, + "loss": 0.9365, + "step": 834 + }, + { + "epoch": 0.12777352716143842, + "grad_norm": 2.6800900011379114, + "learning_rate": 1.9940069618845674e-05, + "loss": 0.8783, + "step": 835 + }, + { + "epoch": 0.1279265493496557, + "grad_norm": 2.7506250769096963, + "learning_rate": 1.9939798406612657e-05, + "loss": 0.8799, + "step": 836 + }, + { + "epoch": 0.128079571537873, + "grad_norm": 2.77472886449925, + "learning_rate": 1.9939526583937736e-05, + "loss": 0.7991, + "step": 837 + }, + { + "epoch": 0.12823259372609028, + "grad_norm": 2.8680315970983177, + "learning_rate": 1.9939254150837603e-05, + "loss": 0.7995, + "step": 838 + }, + { + "epoch": 0.12838561591430758, + "grad_norm": 2.6373678887412253, + "learning_rate": 1.993898110732899e-05, + "loss": 0.8318, + "step": 839 + }, + { + "epoch": 0.12853863810252486, + "grad_norm": 2.912194374140499, + "learning_rate": 1.9938707453428665e-05, + "loss": 0.8977, + "step": 840 + }, + { + "epoch": 0.12869166029074217, + "grad_norm": 2.664614405723492, + "learning_rate": 1.9938433189153437e-05, + "loss": 0.7767, + "step": 841 + }, + { + "epoch": 0.12884468247895944, + "grad_norm": 2.787127868229937, + "learning_rate": 1.9938158314520145e-05, + "loss": 0.882, + "step": 842 + }, + { + "epoch": 0.12899770466717675, + "grad_norm": 3.050383550626497, + "learning_rate": 1.9937882829545673e-05, + "loss": 0.916, + "step": 843 + }, + { + "epoch": 0.12915072685539403, + "grad_norm": 2.749560412191003, + "learning_rate": 1.9937606734246943e-05, + "loss": 0.8087, + "step": 844 + }, + { + "epoch": 0.12930374904361133, + "grad_norm": 3.0722048039396954, + "learning_rate": 1.9937330028640903e-05, + "loss": 0.8669, + "step": 845 + }, + { + "epoch": 0.1294567712318286, + "grad_norm": 2.7330294561178254, + "learning_rate": 1.9937052712744552e-05, + "loss": 0.9029, + "step": 846 + }, + { + "epoch": 0.12960979342004592, + "grad_norm": 2.846524441855076, + "learning_rate": 1.993677478657492e-05, + "loss": 0.8833, + "step": 847 + }, + { + "epoch": 0.1297628156082632, + "grad_norm": 2.7819980747272983, + "learning_rate": 1.9936496250149077e-05, + "loss": 0.8333, + "step": 848 + }, + { + "epoch": 0.1299158377964805, + "grad_norm": 2.8784712404720016, + "learning_rate": 1.9936217103484126e-05, + "loss": 0.8171, + "step": 849 + }, + { + "epoch": 0.13006885998469778, + "grad_norm": 2.6849041458366423, + "learning_rate": 1.9935937346597213e-05, + "loss": 0.8568, + "step": 850 + }, + { + 
"epoch": 0.13022188217291508, + "grad_norm": 2.7137336876141913, + "learning_rate": 1.9935656979505518e-05, + "loss": 0.7485, + "step": 851 + }, + { + "epoch": 0.13037490436113236, + "grad_norm": 2.679237726202307, + "learning_rate": 1.993537600222626e-05, + "loss": 0.8848, + "step": 852 + }, + { + "epoch": 0.13052792654934967, + "grad_norm": 2.9623536876476013, + "learning_rate": 1.993509441477669e-05, + "loss": 0.9203, + "step": 853 + }, + { + "epoch": 0.13068094873756694, + "grad_norm": 3.0078297852392613, + "learning_rate": 1.9934812217174112e-05, + "loss": 0.8651, + "step": 854 + }, + { + "epoch": 0.13083397092578425, + "grad_norm": 2.924067755236479, + "learning_rate": 1.9934529409435845e-05, + "loss": 0.7741, + "step": 855 + }, + { + "epoch": 0.13098699311400153, + "grad_norm": 2.7098897161661557, + "learning_rate": 1.9934245991579265e-05, + "loss": 0.8354, + "step": 856 + }, + { + "epoch": 0.13114001530221883, + "grad_norm": 2.650605982041534, + "learning_rate": 1.9933961963621777e-05, + "loss": 0.819, + "step": 857 + }, + { + "epoch": 0.1312930374904361, + "grad_norm": 2.9612775731325844, + "learning_rate": 1.993367732558082e-05, + "loss": 0.9485, + "step": 858 + }, + { + "epoch": 0.13144605967865342, + "grad_norm": 2.6114054372319644, + "learning_rate": 1.993339207747388e-05, + "loss": 0.7404, + "step": 859 + }, + { + "epoch": 0.1315990818668707, + "grad_norm": 3.0422415925389, + "learning_rate": 1.9933106219318474e-05, + "loss": 0.8357, + "step": 860 + }, + { + "epoch": 0.131752104055088, + "grad_norm": 2.898403955243632, + "learning_rate": 1.9932819751132153e-05, + "loss": 0.78, + "step": 861 + }, + { + "epoch": 0.13190512624330528, + "grad_norm": 2.9726198703448383, + "learning_rate": 1.9932532672932515e-05, + "loss": 0.7844, + "step": 862 + }, + { + "epoch": 0.13205814843152258, + "grad_norm": 2.961682855155522, + "learning_rate": 1.993224498473719e-05, + "loss": 0.8929, + "step": 863 + }, + { + "epoch": 0.13221117061973986, + "grad_norm": 2.9954828687652064, + "learning_rate": 1.9931956686563848e-05, + "loss": 0.9105, + "step": 864 + }, + { + "epoch": 0.13236419280795717, + "grad_norm": 2.5682006472322163, + "learning_rate": 1.9931667778430188e-05, + "loss": 0.717, + "step": 865 + }, + { + "epoch": 0.13251721499617444, + "grad_norm": 2.771936142857185, + "learning_rate": 1.9931378260353957e-05, + "loss": 0.8252, + "step": 866 + }, + { + "epoch": 0.13267023718439175, + "grad_norm": 2.6416325626387613, + "learning_rate": 1.9931088132352933e-05, + "loss": 0.7803, + "step": 867 + }, + { + "epoch": 0.13282325937260903, + "grad_norm": 2.833863980241223, + "learning_rate": 1.993079739444494e-05, + "loss": 0.9553, + "step": 868 + }, + { + "epoch": 0.13297628156082633, + "grad_norm": 2.6645516366644424, + "learning_rate": 1.993050604664783e-05, + "loss": 0.7632, + "step": 869 + }, + { + "epoch": 0.1331293037490436, + "grad_norm": 2.7887899146599278, + "learning_rate": 1.9930214088979492e-05, + "loss": 0.7871, + "step": 870 + }, + { + "epoch": 0.13328232593726091, + "grad_norm": 2.7303368182841417, + "learning_rate": 1.9929921521457865e-05, + "loss": 0.8759, + "step": 871 + }, + { + "epoch": 0.1334353481254782, + "grad_norm": 2.5203669088161336, + "learning_rate": 1.9929628344100907e-05, + "loss": 0.8028, + "step": 872 + }, + { + "epoch": 0.1335883703136955, + "grad_norm": 2.7611918709365124, + "learning_rate": 1.9929334556926628e-05, + "loss": 0.8294, + "step": 873 + }, + { + "epoch": 0.13374139250191278, + "grad_norm": 2.6936499626342556, + "learning_rate": 1.992904015995307e-05, + 
"loss": 0.8079, + "step": 874 + }, + { + "epoch": 0.13389441469013008, + "grad_norm": 2.593917756784823, + "learning_rate": 1.9928745153198313e-05, + "loss": 0.7629, + "step": 875 + }, + { + "epoch": 0.13404743687834736, + "grad_norm": 2.578488093197608, + "learning_rate": 1.9928449536680476e-05, + "loss": 0.7517, + "step": 876 + }, + { + "epoch": 0.13420045906656466, + "grad_norm": 2.499522490435344, + "learning_rate": 1.9928153310417712e-05, + "loss": 0.8801, + "step": 877 + }, + { + "epoch": 0.13435348125478194, + "grad_norm": 2.867320358801422, + "learning_rate": 1.9927856474428215e-05, + "loss": 0.9224, + "step": 878 + }, + { + "epoch": 0.13450650344299925, + "grad_norm": 2.5535956136527846, + "learning_rate": 1.9927559028730212e-05, + "loss": 0.7598, + "step": 879 + }, + { + "epoch": 0.13465952563121653, + "grad_norm": 2.779439531223504, + "learning_rate": 1.992726097334197e-05, + "loss": 0.9068, + "step": 880 + }, + { + "epoch": 0.13481254781943383, + "grad_norm": 2.821228950007975, + "learning_rate": 1.9926962308281802e-05, + "loss": 0.7944, + "step": 881 + }, + { + "epoch": 0.1349655700076511, + "grad_norm": 2.6330247037928824, + "learning_rate": 1.992666303356804e-05, + "loss": 0.7822, + "step": 882 + }, + { + "epoch": 0.13511859219586841, + "grad_norm": 2.840038821329292, + "learning_rate": 1.992636314921907e-05, + "loss": 0.8301, + "step": 883 + }, + { + "epoch": 0.1352716143840857, + "grad_norm": 2.6793404976515434, + "learning_rate": 1.9926062655253305e-05, + "loss": 0.7915, + "step": 884 + }, + { + "epoch": 0.135424636572303, + "grad_norm": 2.689530769295994, + "learning_rate": 1.9925761551689203e-05, + "loss": 0.9103, + "step": 885 + }, + { + "epoch": 0.13557765876052028, + "grad_norm": 2.887001810355891, + "learning_rate": 1.9925459838545252e-05, + "loss": 0.8768, + "step": 886 + }, + { + "epoch": 0.13573068094873755, + "grad_norm": 2.877012186365154, + "learning_rate": 1.9925157515839984e-05, + "loss": 0.8887, + "step": 887 + }, + { + "epoch": 0.13588370313695486, + "grad_norm": 2.8588587834893406, + "learning_rate": 1.992485458359197e-05, + "loss": 0.8792, + "step": 888 + }, + { + "epoch": 0.13603672532517214, + "grad_norm": 3.1905984297528964, + "learning_rate": 1.9924551041819807e-05, + "loss": 0.895, + "step": 889 + }, + { + "epoch": 0.13618974751338944, + "grad_norm": 2.7402318928244083, + "learning_rate": 1.9924246890542137e-05, + "loss": 0.8693, + "step": 890 + }, + { + "epoch": 0.13634276970160672, + "grad_norm": 2.818887709481401, + "learning_rate": 1.9923942129777644e-05, + "loss": 0.7851, + "step": 891 + }, + { + "epoch": 0.13649579188982403, + "grad_norm": 2.6259577978312407, + "learning_rate": 1.992363675954504e-05, + "loss": 0.9576, + "step": 892 + }, + { + "epoch": 0.1366488140780413, + "grad_norm": 2.863449512098866, + "learning_rate": 1.9923330779863084e-05, + "loss": 0.9511, + "step": 893 + }, + { + "epoch": 0.1368018362662586, + "grad_norm": 2.562371413427193, + "learning_rate": 1.992302419075056e-05, + "loss": 0.8163, + "step": 894 + }, + { + "epoch": 0.1369548584544759, + "grad_norm": 2.8870573577788727, + "learning_rate": 1.992271699222631e-05, + "loss": 0.9528, + "step": 895 + }, + { + "epoch": 0.1371078806426932, + "grad_norm": 2.6937895637721723, + "learning_rate": 1.9922409184309184e-05, + "loss": 0.8091, + "step": 896 + }, + { + "epoch": 0.13726090283091047, + "grad_norm": 2.6630413615518163, + "learning_rate": 1.9922100767018095e-05, + "loss": 0.7998, + "step": 897 + }, + { + "epoch": 0.13741392501912778, + "grad_norm": 2.625511387398233, + 
"learning_rate": 1.9921791740371982e-05, + "loss": 0.9357, + "step": 898 + }, + { + "epoch": 0.13756694720734505, + "grad_norm": 2.758458249926434, + "learning_rate": 1.9921482104389827e-05, + "loss": 0.8647, + "step": 899 + }, + { + "epoch": 0.13771996939556236, + "grad_norm": 2.976298176316275, + "learning_rate": 1.992117185909064e-05, + "loss": 0.7932, + "step": 900 + }, + { + "epoch": 0.13787299158377964, + "grad_norm": 2.68384833081395, + "learning_rate": 1.9920861004493477e-05, + "loss": 0.9454, + "step": 901 + }, + { + "epoch": 0.13802601377199694, + "grad_norm": 2.7413100902098653, + "learning_rate": 1.992054954061743e-05, + "loss": 0.8505, + "step": 902 + }, + { + "epoch": 0.13817903596021422, + "grad_norm": 3.218955544647937, + "learning_rate": 1.9920237467481628e-05, + "loss": 0.9033, + "step": 903 + }, + { + "epoch": 0.13833205814843152, + "grad_norm": 2.5946224723970035, + "learning_rate": 1.9919924785105235e-05, + "loss": 0.7389, + "step": 904 + }, + { + "epoch": 0.1384850803366488, + "grad_norm": 2.6176807752494726, + "learning_rate": 1.991961149350745e-05, + "loss": 0.8513, + "step": 905 + }, + { + "epoch": 0.1386381025248661, + "grad_norm": 3.1291931843932335, + "learning_rate": 1.991929759270752e-05, + "loss": 0.8245, + "step": 906 + }, + { + "epoch": 0.13879112471308339, + "grad_norm": 2.8736841925384025, + "learning_rate": 1.991898308272472e-05, + "loss": 0.9262, + "step": 907 + }, + { + "epoch": 0.1389441469013007, + "grad_norm": 2.609901479819786, + "learning_rate": 1.9918667963578368e-05, + "loss": 0.783, + "step": 908 + }, + { + "epoch": 0.13909716908951797, + "grad_norm": 2.6331118876099873, + "learning_rate": 1.991835223528781e-05, + "loss": 0.7673, + "step": 909 + }, + { + "epoch": 0.13925019127773527, + "grad_norm": 2.9613767339162305, + "learning_rate": 1.9918035897872445e-05, + "loss": 0.8643, + "step": 910 + }, + { + "epoch": 0.13940321346595255, + "grad_norm": 3.039493882439516, + "learning_rate": 1.9917718951351692e-05, + "loss": 0.9873, + "step": 911 + }, + { + "epoch": 0.13955623565416986, + "grad_norm": 3.239850256883201, + "learning_rate": 1.991740139574502e-05, + "loss": 0.884, + "step": 912 + }, + { + "epoch": 0.13970925784238714, + "grad_norm": 2.8892605380358294, + "learning_rate": 1.9917083231071933e-05, + "loss": 0.8356, + "step": 913 + }, + { + "epoch": 0.13986228003060444, + "grad_norm": 3.2132002955197034, + "learning_rate": 1.991676445735197e-05, + "loss": 0.837, + "step": 914 + }, + { + "epoch": 0.14001530221882172, + "grad_norm": 3.2006127364915913, + "learning_rate": 1.9916445074604705e-05, + "loss": 0.8063, + "step": 915 + }, + { + "epoch": 0.14016832440703902, + "grad_norm": 2.899427273902653, + "learning_rate": 1.9916125082849755e-05, + "loss": 0.7584, + "step": 916 + }, + { + "epoch": 0.1403213465952563, + "grad_norm": 2.914187996315459, + "learning_rate": 1.991580448210677e-05, + "loss": 0.8503, + "step": 917 + }, + { + "epoch": 0.1404743687834736, + "grad_norm": 2.804034994852326, + "learning_rate": 1.9915483272395445e-05, + "loss": 0.8292, + "step": 918 + }, + { + "epoch": 0.14062739097169089, + "grad_norm": 2.680970492007158, + "learning_rate": 1.99151614537355e-05, + "loss": 0.9034, + "step": 919 + }, + { + "epoch": 0.1407804131599082, + "grad_norm": 2.7084201578294516, + "learning_rate": 1.9914839026146702e-05, + "loss": 0.8518, + "step": 920 + }, + { + "epoch": 0.14093343534812547, + "grad_norm": 2.553189897331636, + "learning_rate": 1.9914515989648852e-05, + "loss": 0.7878, + "step": 921 + }, + { + "epoch": 0.14108645753634277, 
+ "grad_norm": 3.050378968443345, + "learning_rate": 1.991419234426179e-05, + "loss": 0.784, + "step": 922 + }, + { + "epoch": 0.14123947972456005, + "grad_norm": 2.977989619703046, + "learning_rate": 1.991386809000539e-05, + "loss": 1.0098, + "step": 923 + }, + { + "epoch": 0.14139250191277736, + "grad_norm": 2.8242683639181667, + "learning_rate": 1.991354322689957e-05, + "loss": 0.8582, + "step": 924 + }, + { + "epoch": 0.14154552410099464, + "grad_norm": 2.7128793061444596, + "learning_rate": 1.991321775496428e-05, + "loss": 0.7909, + "step": 925 + }, + { + "epoch": 0.14169854628921194, + "grad_norm": 2.9979822954093884, + "learning_rate": 1.9912891674219502e-05, + "loss": 0.9373, + "step": 926 + }, + { + "epoch": 0.14185156847742922, + "grad_norm": 2.9482931261488936, + "learning_rate": 1.991256498468527e-05, + "loss": 0.6891, + "step": 927 + }, + { + "epoch": 0.14200459066564652, + "grad_norm": 3.0299388842655723, + "learning_rate": 1.9912237686381643e-05, + "loss": 0.8743, + "step": 928 + }, + { + "epoch": 0.1421576128538638, + "grad_norm": 2.6801748229278917, + "learning_rate": 1.991190977932872e-05, + "loss": 0.863, + "step": 929 + }, + { + "epoch": 0.1423106350420811, + "grad_norm": 3.085060897266117, + "learning_rate": 1.9911581263546643e-05, + "loss": 0.8992, + "step": 930 + }, + { + "epoch": 0.14246365723029839, + "grad_norm": 2.800811746754135, + "learning_rate": 1.991125213905559e-05, + "loss": 0.9399, + "step": 931 + }, + { + "epoch": 0.1426166794185157, + "grad_norm": 2.830458349120088, + "learning_rate": 1.991092240587577e-05, + "loss": 0.9702, + "step": 932 + }, + { + "epoch": 0.14276970160673297, + "grad_norm": 2.8432982293955016, + "learning_rate": 1.991059206402743e-05, + "loss": 0.8011, + "step": 933 + }, + { + "epoch": 0.14292272379495027, + "grad_norm": 2.739167967766181, + "learning_rate": 1.9910261113530863e-05, + "loss": 0.8164, + "step": 934 + }, + { + "epoch": 0.14307574598316755, + "grad_norm": 2.563160167423824, + "learning_rate": 1.9909929554406388e-05, + "loss": 0.7779, + "step": 935 + }, + { + "epoch": 0.14322876817138486, + "grad_norm": 3.186579436293255, + "learning_rate": 1.9909597386674374e-05, + "loss": 0.7571, + "step": 936 + }, + { + "epoch": 0.14338179035960213, + "grad_norm": 2.708795534392877, + "learning_rate": 1.990926461035522e-05, + "loss": 0.8351, + "step": 937 + }, + { + "epoch": 0.14353481254781944, + "grad_norm": 2.5425722820092065, + "learning_rate": 1.990893122546936e-05, + "loss": 0.8501, + "step": 938 + }, + { + "epoch": 0.14368783473603672, + "grad_norm": 2.670809594852604, + "learning_rate": 1.990859723203727e-05, + "loss": 0.7912, + "step": 939 + }, + { + "epoch": 0.14384085692425402, + "grad_norm": 3.1185793441810232, + "learning_rate": 1.9908262630079454e-05, + "loss": 0.8205, + "step": 940 + }, + { + "epoch": 0.1439938791124713, + "grad_norm": 2.731495965280675, + "learning_rate": 1.9907927419616477e-05, + "loss": 0.7788, + "step": 941 + }, + { + "epoch": 0.1441469013006886, + "grad_norm": 3.275322213863118, + "learning_rate": 1.9907591600668916e-05, + "loss": 1.0141, + "step": 942 + }, + { + "epoch": 0.14429992348890588, + "grad_norm": 2.6128430204439765, + "learning_rate": 1.9907255173257393e-05, + "loss": 0.9174, + "step": 943 + }, + { + "epoch": 0.1444529456771232, + "grad_norm": 2.908802026725082, + "learning_rate": 1.9906918137402574e-05, + "loss": 0.8608, + "step": 944 + }, + { + "epoch": 0.14460596786534047, + "grad_norm": 2.7143909626282055, + "learning_rate": 1.990658049312516e-05, + "loss": 0.772, + "step": 945 + }, + 
{ + "epoch": 0.14475899005355777, + "grad_norm": 2.7624040519127706, + "learning_rate": 1.9906242240445878e-05, + "loss": 0.9278, + "step": 946 + }, + { + "epoch": 0.14491201224177505, + "grad_norm": 2.7166448739759246, + "learning_rate": 1.990590337938551e-05, + "loss": 0.8404, + "step": 947 + }, + { + "epoch": 0.14506503442999236, + "grad_norm": 2.9800310838070847, + "learning_rate": 1.990556390996486e-05, + "loss": 0.9234, + "step": 948 + }, + { + "epoch": 0.14521805661820963, + "grad_norm": 3.0606738732407477, + "learning_rate": 1.990522383220478e-05, + "loss": 0.8102, + "step": 949 + }, + { + "epoch": 0.14537107880642694, + "grad_norm": 3.1187386786744176, + "learning_rate": 1.9904883146126157e-05, + "loss": 0.7964, + "step": 950 + }, + { + "epoch": 0.14552410099464422, + "grad_norm": 2.848220059970956, + "learning_rate": 1.990454185174991e-05, + "loss": 0.897, + "step": 951 + }, + { + "epoch": 0.14567712318286152, + "grad_norm": 2.6548873702733613, + "learning_rate": 1.9904199949097e-05, + "loss": 0.7779, + "step": 952 + }, + { + "epoch": 0.1458301453710788, + "grad_norm": 2.846580038404024, + "learning_rate": 1.990385743818843e-05, + "loss": 0.8872, + "step": 953 + }, + { + "epoch": 0.1459831675592961, + "grad_norm": 3.1386716203357143, + "learning_rate": 1.9903514319045224e-05, + "loss": 0.981, + "step": 954 + }, + { + "epoch": 0.14613618974751338, + "grad_norm": 2.6161336399690107, + "learning_rate": 1.990317059168847e-05, + "loss": 0.8891, + "step": 955 + }, + { + "epoch": 0.1462892119357307, + "grad_norm": 2.9030163643864477, + "learning_rate": 1.990282625613926e-05, + "loss": 0.9119, + "step": 956 + }, + { + "epoch": 0.14644223412394797, + "grad_norm": 2.6876433319201656, + "learning_rate": 1.9902481312418754e-05, + "loss": 0.9507, + "step": 957 + }, + { + "epoch": 0.14659525631216527, + "grad_norm": 2.654743185149196, + "learning_rate": 1.990213576054813e-05, + "loss": 0.7974, + "step": 958 + }, + { + "epoch": 0.14674827850038255, + "grad_norm": 2.444896860388207, + "learning_rate": 1.9901789600548612e-05, + "loss": 0.7975, + "step": 959 + }, + { + "epoch": 0.14690130068859986, + "grad_norm": 2.6102754444080176, + "learning_rate": 1.990144283244146e-05, + "loss": 0.8526, + "step": 960 + }, + { + "epoch": 0.14705432287681713, + "grad_norm": 2.3982164679534983, + "learning_rate": 1.990109545624797e-05, + "loss": 0.7627, + "step": 961 + }, + { + "epoch": 0.14720734506503444, + "grad_norm": 2.816740248073328, + "learning_rate": 1.990074747198947e-05, + "loss": 0.8622, + "step": 962 + }, + { + "epoch": 0.14736036725325172, + "grad_norm": 2.3312900266202554, + "learning_rate": 1.9900398879687343e-05, + "loss": 0.6515, + "step": 963 + }, + { + "epoch": 0.14751338944146902, + "grad_norm": 3.27051923019561, + "learning_rate": 1.9900049679362982e-05, + "loss": 0.9881, + "step": 964 + }, + { + "epoch": 0.1476664116296863, + "grad_norm": 2.515455754322227, + "learning_rate": 1.9899699871037847e-05, + "loss": 0.8661, + "step": 965 + }, + { + "epoch": 0.1478194338179036, + "grad_norm": 2.814087317408005, + "learning_rate": 1.989934945473341e-05, + "loss": 0.8411, + "step": 966 + }, + { + "epoch": 0.14797245600612088, + "grad_norm": 2.8017718511765213, + "learning_rate": 1.9898998430471202e-05, + "loss": 0.9098, + "step": 967 + }, + { + "epoch": 0.1481254781943382, + "grad_norm": 2.629421952387108, + "learning_rate": 1.9898646798272773e-05, + "loss": 0.8979, + "step": 968 + }, + { + "epoch": 0.14827850038255547, + "grad_norm": 2.841954083166271, + "learning_rate": 1.9898294558159722e-05, + 
"loss": 0.899, + "step": 969 + }, + { + "epoch": 0.14843152257077277, + "grad_norm": 2.8553527981636013, + "learning_rate": 1.9897941710153677e-05, + "loss": 0.8999, + "step": 970 + }, + { + "epoch": 0.14858454475899005, + "grad_norm": 2.813663450639181, + "learning_rate": 1.989758825427631e-05, + "loss": 0.8266, + "step": 971 + }, + { + "epoch": 0.14873756694720736, + "grad_norm": 2.8659970900711063, + "learning_rate": 1.989723419054933e-05, + "loss": 0.7943, + "step": 972 + }, + { + "epoch": 0.14889058913542463, + "grad_norm": 2.7945735429929965, + "learning_rate": 1.9896879518994483e-05, + "loss": 0.794, + "step": 973 + }, + { + "epoch": 0.14904361132364194, + "grad_norm": 2.7096673500989565, + "learning_rate": 1.9896524239633543e-05, + "loss": 0.7983, + "step": 974 + }, + { + "epoch": 0.14919663351185922, + "grad_norm": 3.053720034752578, + "learning_rate": 1.9896168352488336e-05, + "loss": 0.9787, + "step": 975 + }, + { + "epoch": 0.14934965570007652, + "grad_norm": 2.6802833840317244, + "learning_rate": 1.9895811857580717e-05, + "loss": 0.8805, + "step": 976 + }, + { + "epoch": 0.1495026778882938, + "grad_norm": 2.9100073841921876, + "learning_rate": 1.989545475493258e-05, + "loss": 0.9091, + "step": 977 + }, + { + "epoch": 0.1496557000765111, + "grad_norm": 2.8957628677948177, + "learning_rate": 1.9895097044565853e-05, + "loss": 0.8288, + "step": 978 + }, + { + "epoch": 0.14980872226472838, + "grad_norm": 3.038264240319414, + "learning_rate": 1.989473872650251e-05, + "loss": 0.9798, + "step": 979 + }, + { + "epoch": 0.1499617444529457, + "grad_norm": 2.5609975366129483, + "learning_rate": 1.9894379800764548e-05, + "loss": 0.7845, + "step": 980 + }, + { + "epoch": 0.15011476664116297, + "grad_norm": 2.7823981767835066, + "learning_rate": 1.9894020267374025e-05, + "loss": 0.7783, + "step": 981 + }, + { + "epoch": 0.15026778882938027, + "grad_norm": 3.3243553824353214, + "learning_rate": 1.9893660126353002e-05, + "loss": 1.0154, + "step": 982 + }, + { + "epoch": 0.15042081101759755, + "grad_norm": 2.480366976477657, + "learning_rate": 1.9893299377723608e-05, + "loss": 0.7665, + "step": 983 + }, + { + "epoch": 0.15057383320581486, + "grad_norm": 2.571561037126822, + "learning_rate": 1.9892938021508e-05, + "loss": 0.8231, + "step": 984 + }, + { + "epoch": 0.15072685539403213, + "grad_norm": 3.066861871830307, + "learning_rate": 1.9892576057728366e-05, + "loss": 0.8987, + "step": 985 + }, + { + "epoch": 0.15087987758224944, + "grad_norm": 3.0035198245097923, + "learning_rate": 1.9892213486406937e-05, + "loss": 0.8285, + "step": 986 + }, + { + "epoch": 0.15103289977046672, + "grad_norm": 2.4764522581212387, + "learning_rate": 1.9891850307565976e-05, + "loss": 0.7679, + "step": 987 + }, + { + "epoch": 0.15118592195868402, + "grad_norm": 2.7267601838920257, + "learning_rate": 1.989148652122779e-05, + "loss": 0.8122, + "step": 988 + }, + { + "epoch": 0.1513389441469013, + "grad_norm": 2.5889920763864374, + "learning_rate": 1.9891122127414725e-05, + "loss": 0.7737, + "step": 989 + }, + { + "epoch": 0.1514919663351186, + "grad_norm": 2.8995325510612138, + "learning_rate": 1.9890757126149154e-05, + "loss": 0.9062, + "step": 990 + }, + { + "epoch": 0.15164498852333588, + "grad_norm": 2.7144127871621495, + "learning_rate": 1.9890391517453495e-05, + "loss": 0.8394, + "step": 991 + }, + { + "epoch": 0.1517980107115532, + "grad_norm": 2.9785649600008, + "learning_rate": 1.9890025301350202e-05, + "loss": 0.7811, + "step": 992 + }, + { + "epoch": 0.15195103289977047, + "grad_norm": 2.7814497882213547, 
+ "learning_rate": 1.9889658477861764e-05, + "loss": 0.7973, + "step": 993 + }, + { + "epoch": 0.15210405508798777, + "grad_norm": 3.16891375953589, + "learning_rate": 1.9889291047010713e-05, + "loss": 0.8412, + "step": 994 + }, + { + "epoch": 0.15225707727620505, + "grad_norm": 2.5061691308106853, + "learning_rate": 1.9888923008819607e-05, + "loss": 0.7916, + "step": 995 + }, + { + "epoch": 0.15241009946442236, + "grad_norm": 2.863002070757968, + "learning_rate": 1.9888554363311058e-05, + "loss": 0.8654, + "step": 996 + }, + { + "epoch": 0.15256312165263963, + "grad_norm": 2.6516947662903343, + "learning_rate": 1.9888185110507702e-05, + "loss": 0.8693, + "step": 997 + }, + { + "epoch": 0.1527161438408569, + "grad_norm": 2.655655971232325, + "learning_rate": 1.988781525043221e-05, + "loss": 0.8319, + "step": 998 + }, + { + "epoch": 0.15286916602907422, + "grad_norm": 2.450582508006812, + "learning_rate": 1.9887444783107302e-05, + "loss": 0.8347, + "step": 999 + }, + { + "epoch": 0.1530221882172915, + "grad_norm": 2.670839990441853, + "learning_rate": 1.9887073708555736e-05, + "loss": 0.8599, + "step": 1000 + }, + { + "epoch": 0.1531752104055088, + "grad_norm": 2.3507137132115736, + "learning_rate": 1.9886702026800295e-05, + "loss": 0.7578, + "step": 1001 + }, + { + "epoch": 0.15332823259372608, + "grad_norm": 2.7099399060859444, + "learning_rate": 1.98863297378638e-05, + "loss": 0.8655, + "step": 1002 + }, + { + "epoch": 0.15348125478194338, + "grad_norm": 2.5790296307424883, + "learning_rate": 1.9885956841769124e-05, + "loss": 0.8469, + "step": 1003 + }, + { + "epoch": 0.15363427697016066, + "grad_norm": 2.265260853462987, + "learning_rate": 1.9885583338539162e-05, + "loss": 0.8579, + "step": 1004 + }, + { + "epoch": 0.15378729915837797, + "grad_norm": 2.4073495126193305, + "learning_rate": 1.9885209228196854e-05, + "loss": 0.7811, + "step": 1005 + }, + { + "epoch": 0.15394032134659524, + "grad_norm": 2.736856123838533, + "learning_rate": 1.988483451076518e-05, + "loss": 0.8012, + "step": 1006 + }, + { + "epoch": 0.15409334353481255, + "grad_norm": 2.9918761922844714, + "learning_rate": 1.988445918626715e-05, + "loss": 0.9105, + "step": 1007 + }, + { + "epoch": 0.15424636572302983, + "grad_norm": 2.770423326239471, + "learning_rate": 1.9884083254725808e-05, + "loss": 0.862, + "step": 1008 + }, + { + "epoch": 0.15439938791124713, + "grad_norm": 2.596810493782701, + "learning_rate": 1.988370671616425e-05, + "loss": 0.8444, + "step": 1009 + }, + { + "epoch": 0.1545524100994644, + "grad_norm": 2.570741057177668, + "learning_rate": 1.9883329570605594e-05, + "loss": 0.835, + "step": 1010 + }, + { + "epoch": 0.15470543228768172, + "grad_norm": 2.4655694875759706, + "learning_rate": 1.9882951818073008e-05, + "loss": 0.9392, + "step": 1011 + }, + { + "epoch": 0.154858454475899, + "grad_norm": 2.881513696814611, + "learning_rate": 1.9882573458589687e-05, + "loss": 0.871, + "step": 1012 + }, + { + "epoch": 0.1550114766641163, + "grad_norm": 3.1776782245839135, + "learning_rate": 1.988219449217887e-05, + "loss": 0.7849, + "step": 1013 + }, + { + "epoch": 0.15516449885233358, + "grad_norm": 2.911172268228049, + "learning_rate": 1.988181491886383e-05, + "loss": 0.8837, + "step": 1014 + }, + { + "epoch": 0.15531752104055088, + "grad_norm": 2.4686030359884743, + "learning_rate": 1.9881434738667877e-05, + "loss": 0.8141, + "step": 1015 + }, + { + "epoch": 0.15547054322876816, + "grad_norm": 2.897765487407931, + "learning_rate": 1.988105395161436e-05, + "loss": 0.9853, + "step": 1016 + }, + { + "epoch": 
0.15562356541698547, + "grad_norm": 2.71964317721296, + "learning_rate": 1.9880672557726667e-05, + "loss": 0.7614, + "step": 1017 + }, + { + "epoch": 0.15577658760520274, + "grad_norm": 2.9302612921371485, + "learning_rate": 1.9880290557028215e-05, + "loss": 0.8674, + "step": 1018 + }, + { + "epoch": 0.15592960979342005, + "grad_norm": 2.7434432525327455, + "learning_rate": 1.9879907949542475e-05, + "loss": 0.8911, + "step": 1019 + }, + { + "epoch": 0.15608263198163733, + "grad_norm": 3.1889761223051107, + "learning_rate": 1.987952473529293e-05, + "loss": 0.8377, + "step": 1020 + }, + { + "epoch": 0.15623565416985463, + "grad_norm": 2.81375672067027, + "learning_rate": 1.987914091430313e-05, + "loss": 0.8919, + "step": 1021 + }, + { + "epoch": 0.1563886763580719, + "grad_norm": 2.8180296209140563, + "learning_rate": 1.987875648659663e-05, + "loss": 0.9621, + "step": 1022 + }, + { + "epoch": 0.15654169854628922, + "grad_norm": 3.2066379325461916, + "learning_rate": 1.9878371452197053e-05, + "loss": 0.957, + "step": 1023 + }, + { + "epoch": 0.1566947207345065, + "grad_norm": 2.6277701752914204, + "learning_rate": 1.987798581112804e-05, + "loss": 0.8304, + "step": 1024 + }, + { + "epoch": 0.1568477429227238, + "grad_norm": 2.6966984629555903, + "learning_rate": 1.9877599563413277e-05, + "loss": 0.7235, + "step": 1025 + }, + { + "epoch": 0.15700076511094108, + "grad_norm": 2.73955117421016, + "learning_rate": 1.987721270907648e-05, + "loss": 0.7999, + "step": 1026 + }, + { + "epoch": 0.15715378729915838, + "grad_norm": 3.725546674908976, + "learning_rate": 1.9876825248141413e-05, + "loss": 0.8339, + "step": 1027 + }, + { + "epoch": 0.15730680948737566, + "grad_norm": 2.578167705290565, + "learning_rate": 1.9876437180631873e-05, + "loss": 0.7681, + "step": 1028 + }, + { + "epoch": 0.15745983167559297, + "grad_norm": 2.7771069205470837, + "learning_rate": 1.987604850657168e-05, + "loss": 0.7977, + "step": 1029 + }, + { + "epoch": 0.15761285386381024, + "grad_norm": 2.8394656456803653, + "learning_rate": 1.987565922598472e-05, + "loss": 0.7927, + "step": 1030 + }, + { + "epoch": 0.15776587605202755, + "grad_norm": 2.6510579492848088, + "learning_rate": 1.9875269338894888e-05, + "loss": 0.725, + "step": 1031 + }, + { + "epoch": 0.15791889824024483, + "grad_norm": 2.837067274020622, + "learning_rate": 1.9874878845326137e-05, + "loss": 0.9011, + "step": 1032 + }, + { + "epoch": 0.15807192042846213, + "grad_norm": 2.8520975345419792, + "learning_rate": 1.9874487745302446e-05, + "loss": 0.8493, + "step": 1033 + }, + { + "epoch": 0.1582249426166794, + "grad_norm": 3.1030948946703525, + "learning_rate": 1.9874096038847834e-05, + "loss": 1.0212, + "step": 1034 + }, + { + "epoch": 0.15837796480489671, + "grad_norm": 2.9449253861471143, + "learning_rate": 1.9873703725986353e-05, + "loss": 0.9892, + "step": 1035 + }, + { + "epoch": 0.158530986993114, + "grad_norm": 2.6094643163438906, + "learning_rate": 1.9873310806742103e-05, + "loss": 0.8207, + "step": 1036 + }, + { + "epoch": 0.1586840091813313, + "grad_norm": 2.8265307957870913, + "learning_rate": 1.9872917281139208e-05, + "loss": 0.79, + "step": 1037 + }, + { + "epoch": 0.15883703136954858, + "grad_norm": 2.900708034355136, + "learning_rate": 1.9872523149201844e-05, + "loss": 0.9009, + "step": 1038 + }, + { + "epoch": 0.15899005355776588, + "grad_norm": 2.8149903877112554, + "learning_rate": 1.987212841095421e-05, + "loss": 0.9657, + "step": 1039 + }, + { + "epoch": 0.15914307574598316, + "grad_norm": 2.8216182032358788, + "learning_rate": 
1.9871733066420548e-05, + "loss": 0.916, + "step": 1040 + }, + { + "epoch": 0.15929609793420046, + "grad_norm": 3.0351034690911467, + "learning_rate": 1.9871337115625146e-05, + "loss": 0.8505, + "step": 1041 + }, + { + "epoch": 0.15944912012241774, + "grad_norm": 2.4801658941328246, + "learning_rate": 1.9870940558592312e-05, + "loss": 0.8227, + "step": 1042 + }, + { + "epoch": 0.15960214231063505, + "grad_norm": 2.745895940283436, + "learning_rate": 1.98705433953464e-05, + "loss": 0.6975, + "step": 1043 + }, + { + "epoch": 0.15975516449885233, + "grad_norm": 2.8931887662338664, + "learning_rate": 1.9870145625911806e-05, + "loss": 0.8135, + "step": 1044 + }, + { + "epoch": 0.15990818668706963, + "grad_norm": 3.020183308577327, + "learning_rate": 1.9869747250312956e-05, + "loss": 0.9374, + "step": 1045 + }, + { + "epoch": 0.1600612088752869, + "grad_norm": 2.91238434506869, + "learning_rate": 1.986934826857432e-05, + "loss": 0.7985, + "step": 1046 + }, + { + "epoch": 0.16021423106350421, + "grad_norm": 2.8074374602676264, + "learning_rate": 1.9868948680720396e-05, + "loss": 0.762, + "step": 1047 + }, + { + "epoch": 0.1603672532517215, + "grad_norm": 2.8655970184823967, + "learning_rate": 1.9868548486775726e-05, + "loss": 0.9527, + "step": 1048 + }, + { + "epoch": 0.1605202754399388, + "grad_norm": 3.3510217577059924, + "learning_rate": 1.9868147686764887e-05, + "loss": 0.9492, + "step": 1049 + }, + { + "epoch": 0.16067329762815608, + "grad_norm": 2.628760206614971, + "learning_rate": 1.9867746280712494e-05, + "loss": 0.7946, + "step": 1050 + }, + { + "epoch": 0.16082631981637338, + "grad_norm": 2.70806765868512, + "learning_rate": 1.98673442686432e-05, + "loss": 0.8205, + "step": 1051 + }, + { + "epoch": 0.16097934200459066, + "grad_norm": 2.4609824531759905, + "learning_rate": 1.986694165058169e-05, + "loss": 0.8578, + "step": 1052 + }, + { + "epoch": 0.16113236419280796, + "grad_norm": 3.141891000216467, + "learning_rate": 1.9866538426552698e-05, + "loss": 0.9828, + "step": 1053 + }, + { + "epoch": 0.16128538638102524, + "grad_norm": 3.061783048506876, + "learning_rate": 1.986613459658098e-05, + "loss": 0.8136, + "step": 1054 + }, + { + "epoch": 0.16143840856924255, + "grad_norm": 2.6758943412455745, + "learning_rate": 1.9865730160691343e-05, + "loss": 0.8794, + "step": 1055 + }, + { + "epoch": 0.16159143075745983, + "grad_norm": 2.7201604131125876, + "learning_rate": 1.986532511890862e-05, + "loss": 0.9024, + "step": 1056 + }, + { + "epoch": 0.16174445294567713, + "grad_norm": 3.5299447043757337, + "learning_rate": 1.9864919471257685e-05, + "loss": 0.8843, + "step": 1057 + }, + { + "epoch": 0.1618974751338944, + "grad_norm": 2.801548938363029, + "learning_rate": 1.9864513217763458e-05, + "loss": 0.8217, + "step": 1058 + }, + { + "epoch": 0.16205049732211171, + "grad_norm": 2.5274711526613194, + "learning_rate": 1.9864106358450884e-05, + "loss": 0.8106, + "step": 1059 + }, + { + "epoch": 0.162203519510329, + "grad_norm": 2.7733624193991093, + "learning_rate": 1.9863698893344947e-05, + "loss": 0.7936, + "step": 1060 + }, + { + "epoch": 0.1623565416985463, + "grad_norm": 2.5893301183647273, + "learning_rate": 1.9863290822470675e-05, + "loss": 0.792, + "step": 1061 + }, + { + "epoch": 0.16250956388676358, + "grad_norm": 2.8266067518782934, + "learning_rate": 1.9862882145853127e-05, + "loss": 0.9559, + "step": 1062 + }, + { + "epoch": 0.16266258607498088, + "grad_norm": 2.951855197304437, + "learning_rate": 1.986247286351741e-05, + "loss": 0.8523, + "step": 1063 + }, + { + "epoch": 
0.16281560826319816, + "grad_norm": 2.378517248975909, + "learning_rate": 1.9862062975488645e-05, + "loss": 0.8576, + "step": 1064 + }, + { + "epoch": 0.16296863045141546, + "grad_norm": 2.795790531253483, + "learning_rate": 1.986165248179201e-05, + "loss": 0.8383, + "step": 1065 + }, + { + "epoch": 0.16312165263963274, + "grad_norm": 2.618907367926037, + "learning_rate": 1.9861241382452724e-05, + "loss": 0.8413, + "step": 1066 + }, + { + "epoch": 0.16327467482785005, + "grad_norm": 2.8494588393913336, + "learning_rate": 1.9860829677496024e-05, + "loss": 0.8867, + "step": 1067 + }, + { + "epoch": 0.16342769701606732, + "grad_norm": 2.5805689918609773, + "learning_rate": 1.98604173669472e-05, + "loss": 0.8482, + "step": 1068 + }, + { + "epoch": 0.16358071920428463, + "grad_norm": 2.711715960933929, + "learning_rate": 1.9860004450831566e-05, + "loss": 0.935, + "step": 1069 + }, + { + "epoch": 0.1637337413925019, + "grad_norm": 2.7090531671259797, + "learning_rate": 1.9859590929174487e-05, + "loss": 0.7718, + "step": 1070 + }, + { + "epoch": 0.1638867635807192, + "grad_norm": 2.6128893829365705, + "learning_rate": 1.985917680200136e-05, + "loss": 0.9498, + "step": 1071 + }, + { + "epoch": 0.1640397857689365, + "grad_norm": 2.8396674469625998, + "learning_rate": 1.985876206933762e-05, + "loss": 0.9958, + "step": 1072 + }, + { + "epoch": 0.1641928079571538, + "grad_norm": 2.8043208696375586, + "learning_rate": 1.9858346731208732e-05, + "loss": 0.8735, + "step": 1073 + }, + { + "epoch": 0.16434583014537107, + "grad_norm": 2.684593665840827, + "learning_rate": 1.98579307876402e-05, + "loss": 0.8892, + "step": 1074 + }, + { + "epoch": 0.16449885233358838, + "grad_norm": 2.596969620943833, + "learning_rate": 1.9857514238657576e-05, + "loss": 0.8741, + "step": 1075 + }, + { + "epoch": 0.16465187452180566, + "grad_norm": 2.9338466904139917, + "learning_rate": 1.9857097084286445e-05, + "loss": 0.9696, + "step": 1076 + }, + { + "epoch": 0.16480489671002296, + "grad_norm": 2.6540784341987087, + "learning_rate": 1.9856679324552415e-05, + "loss": 0.8988, + "step": 1077 + }, + { + "epoch": 0.16495791889824024, + "grad_norm": 2.7019156426629034, + "learning_rate": 1.985626095948115e-05, + "loss": 0.8065, + "step": 1078 + }, + { + "epoch": 0.16511094108645755, + "grad_norm": 2.615891811054439, + "learning_rate": 1.9855841989098343e-05, + "loss": 0.7563, + "step": 1079 + }, + { + "epoch": 0.16526396327467482, + "grad_norm": 2.409266564450927, + "learning_rate": 1.985542241342972e-05, + "loss": 0.8721, + "step": 1080 + }, + { + "epoch": 0.16541698546289213, + "grad_norm": 2.5645546399602397, + "learning_rate": 1.985500223250105e-05, + "loss": 0.7746, + "step": 1081 + }, + { + "epoch": 0.1655700076511094, + "grad_norm": 2.606535862888543, + "learning_rate": 1.985458144633815e-05, + "loss": 0.9245, + "step": 1082 + }, + { + "epoch": 0.1657230298393267, + "grad_norm": 3.102399821161681, + "learning_rate": 1.9854160054966845e-05, + "loss": 0.8806, + "step": 1083 + }, + { + "epoch": 0.165876052027544, + "grad_norm": 2.5601279913055994, + "learning_rate": 1.985373805841302e-05, + "loss": 0.8015, + "step": 1084 + }, + { + "epoch": 0.1660290742157613, + "grad_norm": 2.61724976207117, + "learning_rate": 1.9853315456702594e-05, + "loss": 0.7555, + "step": 1085 + }, + { + "epoch": 0.16618209640397857, + "grad_norm": 2.725383208135442, + "learning_rate": 1.9852892249861522e-05, + "loss": 0.8368, + "step": 1086 + }, + { + "epoch": 0.16633511859219588, + "grad_norm": 2.765961715816743, + "learning_rate": 
1.985246843791579e-05, + "loss": 0.8246, + "step": 1087 + }, + { + "epoch": 0.16648814078041316, + "grad_norm": 2.8936965697100505, + "learning_rate": 1.985204402089143e-05, + "loss": 0.8021, + "step": 1088 + }, + { + "epoch": 0.16664116296863046, + "grad_norm": 2.713935819782728, + "learning_rate": 1.9851618998814503e-05, + "loss": 0.7641, + "step": 1089 + }, + { + "epoch": 0.16679418515684774, + "grad_norm": 2.7965825996455957, + "learning_rate": 1.9851193371711113e-05, + "loss": 0.8254, + "step": 1090 + }, + { + "epoch": 0.16694720734506505, + "grad_norm": 2.695814350476115, + "learning_rate": 1.9850767139607404e-05, + "loss": 0.8409, + "step": 1091 + }, + { + "epoch": 0.16710022953328232, + "grad_norm": 2.8587205196165515, + "learning_rate": 1.9850340302529545e-05, + "loss": 0.8999, + "step": 1092 + }, + { + "epoch": 0.16725325172149963, + "grad_norm": 2.616711026159911, + "learning_rate": 1.9849912860503753e-05, + "loss": 0.8085, + "step": 1093 + }, + { + "epoch": 0.1674062739097169, + "grad_norm": 2.942385943591848, + "learning_rate": 1.984948481355628e-05, + "loss": 0.9305, + "step": 1094 + }, + { + "epoch": 0.1675592960979342, + "grad_norm": 2.8217129762291595, + "learning_rate": 1.9849056161713415e-05, + "loss": 0.8427, + "step": 1095 + }, + { + "epoch": 0.1677123182861515, + "grad_norm": 2.92693636303928, + "learning_rate": 1.984862690500148e-05, + "loss": 0.8569, + "step": 1096 + }, + { + "epoch": 0.1678653404743688, + "grad_norm": 2.558373593775746, + "learning_rate": 1.984819704344684e-05, + "loss": 0.9141, + "step": 1097 + }, + { + "epoch": 0.16801836266258607, + "grad_norm": 2.6662816349876004, + "learning_rate": 1.984776657707589e-05, + "loss": 0.8173, + "step": 1098 + }, + { + "epoch": 0.16817138485080338, + "grad_norm": 2.914507887638554, + "learning_rate": 1.984733550591507e-05, + "loss": 0.8787, + "step": 1099 + }, + { + "epoch": 0.16832440703902066, + "grad_norm": 2.909457834414278, + "learning_rate": 1.9846903829990857e-05, + "loss": 0.9803, + "step": 1100 + }, + { + "epoch": 0.16847742922723796, + "grad_norm": 2.639150497743505, + "learning_rate": 1.9846471549329758e-05, + "loss": 0.8015, + "step": 1101 + }, + { + "epoch": 0.16863045141545524, + "grad_norm": 2.594125634192865, + "learning_rate": 1.984603866395832e-05, + "loss": 0.7903, + "step": 1102 + }, + { + "epoch": 0.16878347360367255, + "grad_norm": 2.369582246149647, + "learning_rate": 1.984560517390313e-05, + "loss": 0.7992, + "step": 1103 + }, + { + "epoch": 0.16893649579188982, + "grad_norm": 2.555457979137369, + "learning_rate": 1.984517107919081e-05, + "loss": 0.8961, + "step": 1104 + }, + { + "epoch": 0.16908951798010713, + "grad_norm": 2.615173559437897, + "learning_rate": 1.984473637984802e-05, + "loss": 0.9241, + "step": 1105 + }, + { + "epoch": 0.1692425401683244, + "grad_norm": 2.6096821694922188, + "learning_rate": 1.9844301075901455e-05, + "loss": 0.8641, + "step": 1106 + }, + { + "epoch": 0.1693955623565417, + "grad_norm": 2.559217882335341, + "learning_rate": 1.984386516737785e-05, + "loss": 0.927, + "step": 1107 + }, + { + "epoch": 0.169548584544759, + "grad_norm": 2.6360804443043073, + "learning_rate": 1.9843428654303977e-05, + "loss": 0.8263, + "step": 1108 + }, + { + "epoch": 0.16970160673297627, + "grad_norm": 2.7158983215786465, + "learning_rate": 1.984299153670664e-05, + "loss": 0.813, + "step": 1109 + }, + { + "epoch": 0.16985462892119357, + "grad_norm": 2.850752676076143, + "learning_rate": 1.984255381461269e-05, + "loss": 0.7828, + "step": 1110 + }, + { + "epoch": 0.17000765110941085, 
+ "grad_norm": 2.849267604288, + "learning_rate": 1.9842115488049006e-05, + "loss": 0.8832, + "step": 1111 + }, + { + "epoch": 0.17016067329762816, + "grad_norm": 2.8725800405214286, + "learning_rate": 1.9841676557042505e-05, + "loss": 0.8527, + "step": 1112 + }, + { + "epoch": 0.17031369548584543, + "grad_norm": 2.7923030948061034, + "learning_rate": 1.9841237021620147e-05, + "loss": 0.9163, + "step": 1113 + }, + { + "epoch": 0.17046671767406274, + "grad_norm": 2.6630219303474822, + "learning_rate": 1.9840796881808922e-05, + "loss": 0.774, + "step": 1114 + }, + { + "epoch": 0.17061973986228002, + "grad_norm": 2.798714223571827, + "learning_rate": 1.9840356137635865e-05, + "loss": 0.8822, + "step": 1115 + }, + { + "epoch": 0.17077276205049732, + "grad_norm": 2.8105521810924237, + "learning_rate": 1.983991478912804e-05, + "loss": 0.8292, + "step": 1116 + }, + { + "epoch": 0.1709257842387146, + "grad_norm": 2.9679950044260988, + "learning_rate": 1.9839472836312558e-05, + "loss": 0.8392, + "step": 1117 + }, + { + "epoch": 0.1710788064269319, + "grad_norm": 2.5546239972059923, + "learning_rate": 1.9839030279216557e-05, + "loss": 0.8779, + "step": 1118 + }, + { + "epoch": 0.17123182861514918, + "grad_norm": 2.6350772004841367, + "learning_rate": 1.983858711786721e-05, + "loss": 0.9282, + "step": 1119 + }, + { + "epoch": 0.1713848508033665, + "grad_norm": 2.40493852138416, + "learning_rate": 1.9838143352291747e-05, + "loss": 0.7634, + "step": 1120 + }, + { + "epoch": 0.17153787299158377, + "grad_norm": 2.904701675083358, + "learning_rate": 1.9837698982517408e-05, + "loss": 0.8338, + "step": 1121 + }, + { + "epoch": 0.17169089517980107, + "grad_norm": 2.6070853971468466, + "learning_rate": 1.983725400857149e-05, + "loss": 0.9001, + "step": 1122 + }, + { + "epoch": 0.17184391736801835, + "grad_norm": 2.6858795073590023, + "learning_rate": 1.983680843048132e-05, + "loss": 0.9843, + "step": 1123 + }, + { + "epoch": 0.17199693955623566, + "grad_norm": 2.44821873068937, + "learning_rate": 1.9836362248274262e-05, + "loss": 0.8795, + "step": 1124 + }, + { + "epoch": 0.17214996174445293, + "grad_norm": 2.699038981515868, + "learning_rate": 1.983591546197772e-05, + "loss": 0.8473, + "step": 1125 + }, + { + "epoch": 0.17230298393267024, + "grad_norm": 2.4135541032139654, + "learning_rate": 1.9835468071619132e-05, + "loss": 0.8204, + "step": 1126 + }, + { + "epoch": 0.17245600612088752, + "grad_norm": 2.545206003492443, + "learning_rate": 1.983502007722597e-05, + "loss": 0.8033, + "step": 1127 + }, + { + "epoch": 0.17260902830910482, + "grad_norm": 2.5812248093359975, + "learning_rate": 1.9834571478825752e-05, + "loss": 0.7242, + "step": 1128 + }, + { + "epoch": 0.1727620504973221, + "grad_norm": 3.048017837391477, + "learning_rate": 1.983412227644603e-05, + "loss": 0.9028, + "step": 1129 + }, + { + "epoch": 0.1729150726855394, + "grad_norm": 3.0440038836635597, + "learning_rate": 1.983367247011438e-05, + "loss": 0.7642, + "step": 1130 + }, + { + "epoch": 0.17306809487375668, + "grad_norm": 2.4646195094640513, + "learning_rate": 1.9833222059858438e-05, + "loss": 0.737, + "step": 1131 + }, + { + "epoch": 0.173221117061974, + "grad_norm": 2.9135955129234916, + "learning_rate": 1.9832771045705862e-05, + "loss": 0.8295, + "step": 1132 + }, + { + "epoch": 0.17337413925019127, + "grad_norm": 2.517525658493596, + "learning_rate": 1.9832319427684352e-05, + "loss": 0.9521, + "step": 1133 + }, + { + "epoch": 0.17352716143840857, + "grad_norm": 2.563415773261328, + "learning_rate": 1.983186720582164e-05, + "loss": 
0.6452, + "step": 1134 + }, + { + "epoch": 0.17368018362662585, + "grad_norm": 2.551066647247192, + "learning_rate": 1.98314143801455e-05, + "loss": 0.8596, + "step": 1135 + }, + { + "epoch": 0.17383320581484316, + "grad_norm": 2.309987774636032, + "learning_rate": 1.983096095068374e-05, + "loss": 0.7836, + "step": 1136 + }, + { + "epoch": 0.17398622800306043, + "grad_norm": 2.968328488790905, + "learning_rate": 1.983050691746421e-05, + "loss": 0.967, + "step": 1137 + }, + { + "epoch": 0.17413925019127774, + "grad_norm": 2.70579171197833, + "learning_rate": 1.9830052280514795e-05, + "loss": 0.9944, + "step": 1138 + }, + { + "epoch": 0.17429227237949502, + "grad_norm": 2.461411813599091, + "learning_rate": 1.9829597039863416e-05, + "loss": 0.8744, + "step": 1139 + }, + { + "epoch": 0.17444529456771232, + "grad_norm": 2.907849288421829, + "learning_rate": 1.9829141195538025e-05, + "loss": 0.8599, + "step": 1140 + }, + { + "epoch": 0.1745983167559296, + "grad_norm": 2.8382037466956875, + "learning_rate": 1.9828684747566625e-05, + "loss": 0.8959, + "step": 1141 + }, + { + "epoch": 0.1747513389441469, + "grad_norm": 2.748550273869275, + "learning_rate": 1.982822769597724e-05, + "loss": 0.8289, + "step": 1142 + }, + { + "epoch": 0.17490436113236418, + "grad_norm": 2.8067275425318052, + "learning_rate": 1.9827770040797946e-05, + "loss": 0.9106, + "step": 1143 + }, + { + "epoch": 0.1750573833205815, + "grad_norm": 2.846345213122419, + "learning_rate": 1.982731178205685e-05, + "loss": 0.7729, + "step": 1144 + }, + { + "epoch": 0.17521040550879877, + "grad_norm": 3.2293890063364796, + "learning_rate": 1.982685291978209e-05, + "loss": 0.9251, + "step": 1145 + }, + { + "epoch": 0.17536342769701607, + "grad_norm": 2.858144968984894, + "learning_rate": 1.9826393454001848e-05, + "loss": 0.7992, + "step": 1146 + }, + { + "epoch": 0.17551644988523335, + "grad_norm": 2.8071648371182345, + "learning_rate": 1.9825933384744343e-05, + "loss": 0.8398, + "step": 1147 + }, + { + "epoch": 0.17566947207345066, + "grad_norm": 2.875344520665991, + "learning_rate": 1.9825472712037836e-05, + "loss": 0.8488, + "step": 1148 + }, + { + "epoch": 0.17582249426166793, + "grad_norm": 2.6795215539666732, + "learning_rate": 1.9825011435910606e-05, + "loss": 0.8287, + "step": 1149 + }, + { + "epoch": 0.17597551644988524, + "grad_norm": 2.6519672698029524, + "learning_rate": 1.982454955639099e-05, + "loss": 0.9665, + "step": 1150 + }, + { + "epoch": 0.17612853863810252, + "grad_norm": 2.9398416836616263, + "learning_rate": 1.982408707350735e-05, + "loss": 0.9661, + "step": 1151 + }, + { + "epoch": 0.17628156082631982, + "grad_norm": 2.5372992843733444, + "learning_rate": 1.9823623987288096e-05, + "loss": 0.7973, + "step": 1152 + }, + { + "epoch": 0.1764345830145371, + "grad_norm": 2.5779136243237804, + "learning_rate": 1.9823160297761657e-05, + "loss": 0.8823, + "step": 1153 + }, + { + "epoch": 0.1765876052027544, + "grad_norm": 2.6819788898485943, + "learning_rate": 1.9822696004956516e-05, + "loss": 0.757, + "step": 1154 + }, + { + "epoch": 0.17674062739097168, + "grad_norm": 2.7476885475300827, + "learning_rate": 1.982223110890119e-05, + "loss": 0.9533, + "step": 1155 + }, + { + "epoch": 0.176893649579189, + "grad_norm": 2.5992700129932493, + "learning_rate": 1.9821765609624223e-05, + "loss": 0.7441, + "step": 1156 + }, + { + "epoch": 0.17704667176740627, + "grad_norm": 2.8631617649345626, + "learning_rate": 1.982129950715421e-05, + "loss": 0.8553, + "step": 1157 + }, + { + "epoch": 0.17719969395562357, + "grad_norm": 
2.8390171782673996, + "learning_rate": 1.9820832801519772e-05, + "loss": 0.8184, + "step": 1158 + }, + { + "epoch": 0.17735271614384085, + "grad_norm": 2.712066208521167, + "learning_rate": 1.9820365492749577e-05, + "loss": 0.8139, + "step": 1159 + }, + { + "epoch": 0.17750573833205815, + "grad_norm": 2.613425607348815, + "learning_rate": 1.9819897580872313e-05, + "loss": 0.8787, + "step": 1160 + }, + { + "epoch": 0.17765876052027543, + "grad_norm": 2.7109084220530533, + "learning_rate": 1.9819429065916725e-05, + "loss": 0.9128, + "step": 1161 + }, + { + "epoch": 0.17781178270849274, + "grad_norm": 2.9223737813686057, + "learning_rate": 1.9818959947911585e-05, + "loss": 0.8176, + "step": 1162 + }, + { + "epoch": 0.17796480489671002, + "grad_norm": 2.778954100850369, + "learning_rate": 1.9818490226885703e-05, + "loss": 0.8663, + "step": 1163 + }, + { + "epoch": 0.17811782708492732, + "grad_norm": 2.6877635305323593, + "learning_rate": 1.9818019902867924e-05, + "loss": 0.9036, + "step": 1164 + }, + { + "epoch": 0.1782708492731446, + "grad_norm": 3.094006409195956, + "learning_rate": 1.9817548975887136e-05, + "loss": 0.822, + "step": 1165 + }, + { + "epoch": 0.1784238714613619, + "grad_norm": 2.487275168235667, + "learning_rate": 1.981707744597226e-05, + "loss": 0.8939, + "step": 1166 + }, + { + "epoch": 0.17857689364957918, + "grad_norm": 2.328184674285479, + "learning_rate": 1.981660531315225e-05, + "loss": 0.8309, + "step": 1167 + }, + { + "epoch": 0.1787299158377965, + "grad_norm": 2.3136962546636464, + "learning_rate": 1.9816132577456105e-05, + "loss": 0.7128, + "step": 1168 + }, + { + "epoch": 0.17888293802601377, + "grad_norm": 2.644074314289146, + "learning_rate": 1.981565923891286e-05, + "loss": 0.8859, + "step": 1169 + }, + { + "epoch": 0.17903596021423107, + "grad_norm": 2.5503162195908544, + "learning_rate": 1.9815185297551584e-05, + "loss": 0.8524, + "step": 1170 + }, + { + "epoch": 0.17918898240244835, + "grad_norm": 2.702749830814934, + "learning_rate": 1.981471075340138e-05, + "loss": 0.8143, + "step": 1171 + }, + { + "epoch": 0.17934200459066565, + "grad_norm": 2.6253033708655713, + "learning_rate": 1.981423560649139e-05, + "loss": 0.7908, + "step": 1172 + }, + { + "epoch": 0.17949502677888293, + "grad_norm": 2.5121491386593924, + "learning_rate": 1.98137598568508e-05, + "loss": 0.8573, + "step": 1173 + }, + { + "epoch": 0.17964804896710024, + "grad_norm": 3.5925654232893134, + "learning_rate": 1.9813283504508828e-05, + "loss": 0.8668, + "step": 1174 + }, + { + "epoch": 0.17980107115531752, + "grad_norm": 3.0099094303601195, + "learning_rate": 1.9812806549494723e-05, + "loss": 0.7845, + "step": 1175 + }, + { + "epoch": 0.17995409334353482, + "grad_norm": 7.263338996571074, + "learning_rate": 1.981232899183778e-05, + "loss": 0.9048, + "step": 1176 + }, + { + "epoch": 0.1801071155317521, + "grad_norm": 4.591653363432459, + "learning_rate": 1.9811850831567327e-05, + "loss": 0.7974, + "step": 1177 + }, + { + "epoch": 0.1802601377199694, + "grad_norm": 3.825510779009282, + "learning_rate": 1.9811372068712734e-05, + "loss": 0.8295, + "step": 1178 + }, + { + "epoch": 0.18041315990818668, + "grad_norm": 2.9921320333176706, + "learning_rate": 1.9810892703303398e-05, + "loss": 0.8919, + "step": 1179 + }, + { + "epoch": 0.180566182096404, + "grad_norm": 2.7588519917332617, + "learning_rate": 1.981041273536876e-05, + "loss": 0.7638, + "step": 1180 + }, + { + "epoch": 0.18071920428462127, + "grad_norm": 3.083353753553559, + "learning_rate": 1.9809932164938297e-05, + "loss": 0.942, + 
"step": 1181 + }, + { + "epoch": 0.18087222647283857, + "grad_norm": 2.728694292712886, + "learning_rate": 1.9809450992041522e-05, + "loss": 0.9411, + "step": 1182 + }, + { + "epoch": 0.18102524866105585, + "grad_norm": 2.7319529906411835, + "learning_rate": 1.980896921670799e-05, + "loss": 0.874, + "step": 1183 + }, + { + "epoch": 0.18117827084927315, + "grad_norm": 2.7865710261296917, + "learning_rate": 1.9808486838967286e-05, + "loss": 0.9014, + "step": 1184 + }, + { + "epoch": 0.18133129303749043, + "grad_norm": 2.5143812899440268, + "learning_rate": 1.9808003858849032e-05, + "loss": 0.8666, + "step": 1185 + }, + { + "epoch": 0.18148431522570774, + "grad_norm": 2.4430869214626516, + "learning_rate": 1.980752027638289e-05, + "loss": 0.7535, + "step": 1186 + }, + { + "epoch": 0.18163733741392502, + "grad_norm": 2.2877893367824726, + "learning_rate": 1.9807036091598563e-05, + "loss": 0.6914, + "step": 1187 + }, + { + "epoch": 0.18179035960214232, + "grad_norm": 3.0901473909433124, + "learning_rate": 1.9806551304525784e-05, + "loss": 0.795, + "step": 1188 + }, + { + "epoch": 0.1819433817903596, + "grad_norm": 2.7091237157952612, + "learning_rate": 1.9806065915194326e-05, + "loss": 0.8558, + "step": 1189 + }, + { + "epoch": 0.1820964039785769, + "grad_norm": 2.7289003459871743, + "learning_rate": 1.9805579923633997e-05, + "loss": 0.9272, + "step": 1190 + }, + { + "epoch": 0.18224942616679418, + "grad_norm": 2.9092943582139377, + "learning_rate": 1.980509332987465e-05, + "loss": 0.7949, + "step": 1191 + }, + { + "epoch": 0.1824024483550115, + "grad_norm": 2.8490336831731002, + "learning_rate": 1.980460613394616e-05, + "loss": 0.9731, + "step": 1192 + }, + { + "epoch": 0.18255547054322876, + "grad_norm": 2.5321760664034225, + "learning_rate": 1.9804118335878452e-05, + "loss": 0.8599, + "step": 1193 + }, + { + "epoch": 0.18270849273144607, + "grad_norm": 2.582180096249362, + "learning_rate": 1.980362993570148e-05, + "loss": 0.7725, + "step": 1194 + }, + { + "epoch": 0.18286151491966335, + "grad_norm": 3.050407598760349, + "learning_rate": 1.9803140933445246e-05, + "loss": 0.9098, + "step": 1195 + }, + { + "epoch": 0.18301453710788065, + "grad_norm": 2.9433028267353025, + "learning_rate": 1.980265132913978e-05, + "loss": 0.9321, + "step": 1196 + }, + { + "epoch": 0.18316755929609793, + "grad_norm": 2.439342164588578, + "learning_rate": 1.980216112281514e-05, + "loss": 0.745, + "step": 1197 + }, + { + "epoch": 0.18332058148431524, + "grad_norm": 2.8658445163454886, + "learning_rate": 1.9801670314501445e-05, + "loss": 0.9419, + "step": 1198 + }, + { + "epoch": 0.18347360367253251, + "grad_norm": 2.3996411264820825, + "learning_rate": 1.980117890422883e-05, + "loss": 0.7613, + "step": 1199 + }, + { + "epoch": 0.18362662586074982, + "grad_norm": 2.634152697556565, + "learning_rate": 1.9800686892027476e-05, + "loss": 0.838, + "step": 1200 + }, + { + "epoch": 0.1837796480489671, + "grad_norm": 2.507128419724863, + "learning_rate": 1.9800194277927598e-05, + "loss": 0.7374, + "step": 1201 + }, + { + "epoch": 0.1839326702371844, + "grad_norm": 2.5150477885289453, + "learning_rate": 1.9799701061959453e-05, + "loss": 0.8103, + "step": 1202 + }, + { + "epoch": 0.18408569242540168, + "grad_norm": 2.7743946382832214, + "learning_rate": 1.9799207244153328e-05, + "loss": 0.7684, + "step": 1203 + }, + { + "epoch": 0.184238714613619, + "grad_norm": 2.7916364471729262, + "learning_rate": 1.979871282453955e-05, + "loss": 0.9085, + "step": 1204 + }, + { + "epoch": 0.18439173680183626, + "grad_norm": 
2.6761395292421652, + "learning_rate": 1.9798217803148488e-05, + "loss": 0.8737, + "step": 1205 + }, + { + "epoch": 0.18454475899005357, + "grad_norm": 2.7327345821631353, + "learning_rate": 1.9797722180010536e-05, + "loss": 0.8907, + "step": 1206 + }, + { + "epoch": 0.18469778117827085, + "grad_norm": 2.926716209435097, + "learning_rate": 1.979722595515614e-05, + "loss": 0.9471, + "step": 1207 + }, + { + "epoch": 0.18485080336648815, + "grad_norm": 2.4288423001399724, + "learning_rate": 1.979672912861577e-05, + "loss": 0.8499, + "step": 1208 + }, + { + "epoch": 0.18500382555470543, + "grad_norm": 2.8360903216784865, + "learning_rate": 1.9796231700419937e-05, + "loss": 0.9216, + "step": 1209 + }, + { + "epoch": 0.18515684774292274, + "grad_norm": 2.9513922158366466, + "learning_rate": 1.9795733670599194e-05, + "loss": 0.8389, + "step": 1210 + }, + { + "epoch": 0.18530986993114001, + "grad_norm": 2.480276836129265, + "learning_rate": 1.9795235039184122e-05, + "loss": 0.8037, + "step": 1211 + }, + { + "epoch": 0.18546289211935732, + "grad_norm": 2.5886986020209632, + "learning_rate": 1.979473580620535e-05, + "loss": 0.837, + "step": 1212 + }, + { + "epoch": 0.1856159143075746, + "grad_norm": 2.673110598202028, + "learning_rate": 1.9794235971693537e-05, + "loss": 0.8506, + "step": 1213 + }, + { + "epoch": 0.1857689364957919, + "grad_norm": 2.8120113685946833, + "learning_rate": 1.9793735535679373e-05, + "loss": 0.8873, + "step": 1214 + }, + { + "epoch": 0.18592195868400918, + "grad_norm": 2.4841528470474525, + "learning_rate": 1.97932344981936e-05, + "loss": 0.8227, + "step": 1215 + }, + { + "epoch": 0.1860749808722265, + "grad_norm": 2.6262957390756423, + "learning_rate": 1.9792732859266985e-05, + "loss": 0.831, + "step": 1216 + }, + { + "epoch": 0.18622800306044376, + "grad_norm": 2.7867995827118905, + "learning_rate": 1.979223061893033e-05, + "loss": 0.8887, + "step": 1217 + }, + { + "epoch": 0.18638102524866107, + "grad_norm": 2.4630014832920133, + "learning_rate": 1.9791727777214494e-05, + "loss": 0.7623, + "step": 1218 + }, + { + "epoch": 0.18653404743687835, + "grad_norm": 2.8608171398956714, + "learning_rate": 1.9791224334150344e-05, + "loss": 0.8973, + "step": 1219 + }, + { + "epoch": 0.18668706962509563, + "grad_norm": 2.625067202467613, + "learning_rate": 1.9790720289768807e-05, + "loss": 0.88, + "step": 1220 + }, + { + "epoch": 0.18684009181331293, + "grad_norm": 2.6002917464889475, + "learning_rate": 1.9790215644100834e-05, + "loss": 0.8389, + "step": 1221 + }, + { + "epoch": 0.1869931140015302, + "grad_norm": 2.9459681926716046, + "learning_rate": 1.978971039717742e-05, + "loss": 0.8296, + "step": 1222 + }, + { + "epoch": 0.18714613618974751, + "grad_norm": 2.713734464464466, + "learning_rate": 1.978920454902959e-05, + "loss": 0.8426, + "step": 1223 + }, + { + "epoch": 0.1872991583779648, + "grad_norm": 2.7203914331532415, + "learning_rate": 1.9788698099688416e-05, + "loss": 0.9216, + "step": 1224 + }, + { + "epoch": 0.1874521805661821, + "grad_norm": 2.410057875984283, + "learning_rate": 1.9788191049184998e-05, + "loss": 0.8333, + "step": 1225 + }, + { + "epoch": 0.18760520275439937, + "grad_norm": 2.7449351455709827, + "learning_rate": 1.9787683397550476e-05, + "loss": 0.8998, + "step": 1226 + }, + { + "epoch": 0.18775822494261668, + "grad_norm": 2.852865788819563, + "learning_rate": 1.9787175144816024e-05, + "loss": 0.7877, + "step": 1227 + }, + { + "epoch": 0.18791124713083396, + "grad_norm": 2.50764849784216, + "learning_rate": 1.9786666291012865e-05, + "loss": 0.8286, + 
"step": 1228 + }, + { + "epoch": 0.18806426931905126, + "grad_norm": 2.572027875501263, + "learning_rate": 1.9786156836172237e-05, + "loss": 0.8445, + "step": 1229 + }, + { + "epoch": 0.18821729150726854, + "grad_norm": 2.484590343748145, + "learning_rate": 1.9785646780325435e-05, + "loss": 0.8945, + "step": 1230 + }, + { + "epoch": 0.18837031369548585, + "grad_norm": 2.5100868462046817, + "learning_rate": 1.9785136123503787e-05, + "loss": 0.8939, + "step": 1231 + }, + { + "epoch": 0.18852333588370312, + "grad_norm": 2.455177735428078, + "learning_rate": 1.9784624865738643e-05, + "loss": 0.7709, + "step": 1232 + }, + { + "epoch": 0.18867635807192043, + "grad_norm": 2.4916566077574567, + "learning_rate": 1.9784113007061414e-05, + "loss": 0.8872, + "step": 1233 + }, + { + "epoch": 0.1888293802601377, + "grad_norm": 2.430966096318744, + "learning_rate": 1.9783600547503528e-05, + "loss": 0.9165, + "step": 1234 + }, + { + "epoch": 0.188982402448355, + "grad_norm": 2.4779511899944264, + "learning_rate": 1.978308748709646e-05, + "loss": 0.7428, + "step": 1235 + }, + { + "epoch": 0.1891354246365723, + "grad_norm": 2.7661213468146175, + "learning_rate": 1.9782573825871715e-05, + "loss": 0.7765, + "step": 1236 + }, + { + "epoch": 0.1892884468247896, + "grad_norm": 2.5469813018884793, + "learning_rate": 1.9782059563860844e-05, + "loss": 0.7575, + "step": 1237 + }, + { + "epoch": 0.18944146901300687, + "grad_norm": 2.6609455008160396, + "learning_rate": 1.9781544701095426e-05, + "loss": 0.9117, + "step": 1238 + }, + { + "epoch": 0.18959449120122418, + "grad_norm": 2.8769593063146948, + "learning_rate": 1.9781029237607082e-05, + "loss": 0.8893, + "step": 1239 + }, + { + "epoch": 0.18974751338944146, + "grad_norm": 2.910752462671504, + "learning_rate": 1.9780513173427472e-05, + "loss": 0.862, + "step": 1240 + }, + { + "epoch": 0.18990053557765876, + "grad_norm": 2.8258652202291135, + "learning_rate": 1.977999650858828e-05, + "loss": 0.9601, + "step": 1241 + }, + { + "epoch": 0.19005355776587604, + "grad_norm": 2.3361137453572756, + "learning_rate": 1.977947924312125e-05, + "loss": 0.6773, + "step": 1242 + }, + { + "epoch": 0.19020657995409335, + "grad_norm": 2.732147684563504, + "learning_rate": 1.9778961377058138e-05, + "loss": 0.9071, + "step": 1243 + }, + { + "epoch": 0.19035960214231062, + "grad_norm": 2.6873683102739014, + "learning_rate": 1.9778442910430753e-05, + "loss": 0.7924, + "step": 1244 + }, + { + "epoch": 0.19051262433052793, + "grad_norm": 2.8385621841540196, + "learning_rate": 1.9777923843270937e-05, + "loss": 0.9363, + "step": 1245 + }, + { + "epoch": 0.1906656465187452, + "grad_norm": 2.6345781764376106, + "learning_rate": 1.9777404175610563e-05, + "loss": 0.8486, + "step": 1246 + }, + { + "epoch": 0.1908186687069625, + "grad_norm": 2.7282412099331967, + "learning_rate": 1.9776883907481556e-05, + "loss": 0.8968, + "step": 1247 + }, + { + "epoch": 0.1909716908951798, + "grad_norm": 3.1198583665695385, + "learning_rate": 1.9776363038915853e-05, + "loss": 0.8591, + "step": 1248 + }, + { + "epoch": 0.1911247130833971, + "grad_norm": 2.9650963500849605, + "learning_rate": 1.9775841569945455e-05, + "loss": 0.895, + "step": 1249 + }, + { + "epoch": 0.19127773527161437, + "grad_norm": 3.005956067515491, + "learning_rate": 1.9775319500602383e-05, + "loss": 0.8774, + "step": 1250 + }, + { + "epoch": 0.19143075745983168, + "grad_norm": 2.785558603450888, + "learning_rate": 1.9774796830918696e-05, + "loss": 0.7999, + "step": 1251 + }, + { + "epoch": 0.19158377964804896, + "grad_norm": 
2.754290803774593, + "learning_rate": 1.97742735609265e-05, + "loss": 0.8619, + "step": 1252 + }, + { + "epoch": 0.19173680183626626, + "grad_norm": 2.59123133889139, + "learning_rate": 1.9773749690657923e-05, + "loss": 0.8181, + "step": 1253 + }, + { + "epoch": 0.19188982402448354, + "grad_norm": 2.423499848751921, + "learning_rate": 1.9773225220145144e-05, + "loss": 0.858, + "step": 1254 + }, + { + "epoch": 0.19204284621270085, + "grad_norm": 2.651915413290144, + "learning_rate": 1.9772700149420374e-05, + "loss": 0.9273, + "step": 1255 + }, + { + "epoch": 0.19219586840091812, + "grad_norm": 2.4797552410990154, + "learning_rate": 1.9772174478515853e-05, + "loss": 0.8235, + "step": 1256 + }, + { + "epoch": 0.19234889058913543, + "grad_norm": 2.5741982732357824, + "learning_rate": 1.977164820746387e-05, + "loss": 0.77, + "step": 1257 + }, + { + "epoch": 0.1925019127773527, + "grad_norm": 2.610715685337163, + "learning_rate": 1.9771121336296744e-05, + "loss": 0.8574, + "step": 1258 + }, + { + "epoch": 0.19265493496557, + "grad_norm": 2.4847035418283254, + "learning_rate": 1.9770593865046832e-05, + "loss": 0.8437, + "step": 1259 + }, + { + "epoch": 0.1928079571537873, + "grad_norm": 2.4929585213596672, + "learning_rate": 1.9770065793746528e-05, + "loss": 0.8486, + "step": 1260 + }, + { + "epoch": 0.1929609793420046, + "grad_norm": 3.044420651513117, + "learning_rate": 1.9769537122428264e-05, + "loss": 0.9013, + "step": 1261 + }, + { + "epoch": 0.19311400153022187, + "grad_norm": 2.6620986896543237, + "learning_rate": 1.9769007851124505e-05, + "loss": 0.9234, + "step": 1262 + }, + { + "epoch": 0.19326702371843918, + "grad_norm": 2.8157426489061135, + "learning_rate": 1.9768477979867757e-05, + "loss": 0.9035, + "step": 1263 + }, + { + "epoch": 0.19342004590665646, + "grad_norm": 2.529021689411045, + "learning_rate": 1.9767947508690562e-05, + "loss": 0.8663, + "step": 1264 + }, + { + "epoch": 0.19357306809487376, + "grad_norm": 2.5843925171763535, + "learning_rate": 1.97674164376255e-05, + "loss": 0.8378, + "step": 1265 + }, + { + "epoch": 0.19372609028309104, + "grad_norm": 3.0335901445285525, + "learning_rate": 1.9766884766705182e-05, + "loss": 0.8908, + "step": 1266 + }, + { + "epoch": 0.19387911247130835, + "grad_norm": 3.145047505720264, + "learning_rate": 1.9766352495962263e-05, + "loss": 1.0083, + "step": 1267 + }, + { + "epoch": 0.19403213465952562, + "grad_norm": 2.4039128378461543, + "learning_rate": 1.976581962542943e-05, + "loss": 0.7459, + "step": 1268 + }, + { + "epoch": 0.19418515684774293, + "grad_norm": 2.2548402455018133, + "learning_rate": 1.976528615513941e-05, + "loss": 0.8713, + "step": 1269 + }, + { + "epoch": 0.1943381790359602, + "grad_norm": 2.5574308945342876, + "learning_rate": 1.9764752085124968e-05, + "loss": 0.8306, + "step": 1270 + }, + { + "epoch": 0.1944912012241775, + "grad_norm": 2.73069294916681, + "learning_rate": 1.97642174154189e-05, + "loss": 0.8369, + "step": 1271 + }, + { + "epoch": 0.1946442234123948, + "grad_norm": 2.48980869464948, + "learning_rate": 1.9763682146054036e-05, + "loss": 0.7243, + "step": 1272 + }, + { + "epoch": 0.1947972456006121, + "grad_norm": 2.736038468896169, + "learning_rate": 1.9763146277063264e-05, + "loss": 0.8022, + "step": 1273 + }, + { + "epoch": 0.19495026778882937, + "grad_norm": 2.698636507396173, + "learning_rate": 1.976260980847948e-05, + "loss": 0.8665, + "step": 1274 + }, + { + "epoch": 0.19510328997704668, + "grad_norm": 2.728859710561004, + "learning_rate": 1.976207274033564e-05, + "loss": 0.7747, + "step": 1275 + 
}, + { + "epoch": 0.19525631216526396, + "grad_norm": 2.4838979015966607, + "learning_rate": 1.976153507266472e-05, + "loss": 0.8504, + "step": 1276 + }, + { + "epoch": 0.19540933435348126, + "grad_norm": 2.659727034754188, + "learning_rate": 1.976099680549975e-05, + "loss": 0.9027, + "step": 1277 + }, + { + "epoch": 0.19556235654169854, + "grad_norm": 2.7146676990146257, + "learning_rate": 1.9760457938873773e-05, + "loss": 0.8086, + "step": 1278 + }, + { + "epoch": 0.19571537872991585, + "grad_norm": 2.476438101864568, + "learning_rate": 1.9759918472819896e-05, + "loss": 0.8024, + "step": 1279 + }, + { + "epoch": 0.19586840091813312, + "grad_norm": 2.494365497726393, + "learning_rate": 1.9759378407371242e-05, + "loss": 0.8212, + "step": 1280 + }, + { + "epoch": 0.19602142310635043, + "grad_norm": 2.7005260217584444, + "learning_rate": 1.9758837742560984e-05, + "loss": 0.8976, + "step": 1281 + }, + { + "epoch": 0.1961744452945677, + "grad_norm": 2.590370938150912, + "learning_rate": 1.975829647842232e-05, + "loss": 0.7617, + "step": 1282 + }, + { + "epoch": 0.196327467482785, + "grad_norm": 2.5882464926232824, + "learning_rate": 1.9757754614988495e-05, + "loss": 0.7785, + "step": 1283 + }, + { + "epoch": 0.1964804896710023, + "grad_norm": 2.502547989366547, + "learning_rate": 1.975721215229279e-05, + "loss": 0.9019, + "step": 1284 + }, + { + "epoch": 0.1966335118592196, + "grad_norm": 2.5390514701365268, + "learning_rate": 1.9756669090368514e-05, + "loss": 0.821, + "step": 1285 + }, + { + "epoch": 0.19678653404743687, + "grad_norm": 2.639839580822911, + "learning_rate": 1.975612542924902e-05, + "loss": 0.8262, + "step": 1286 + }, + { + "epoch": 0.19693955623565418, + "grad_norm": 2.4825562925435443, + "learning_rate": 1.97555811689677e-05, + "loss": 0.79, + "step": 1287 + }, + { + "epoch": 0.19709257842387146, + "grad_norm": 2.7116933120169704, + "learning_rate": 1.9755036309557974e-05, + "loss": 0.9368, + "step": 1288 + }, + { + "epoch": 0.19724560061208876, + "grad_norm": 2.663602543382191, + "learning_rate": 1.9754490851053306e-05, + "loss": 0.8395, + "step": 1289 + }, + { + "epoch": 0.19739862280030604, + "grad_norm": 2.642048603050421, + "learning_rate": 1.9753944793487195e-05, + "loss": 0.9173, + "step": 1290 + }, + { + "epoch": 0.19755164498852334, + "grad_norm": 2.619082257934699, + "learning_rate": 1.975339813689318e-05, + "loss": 0.8949, + "step": 1291 + }, + { + "epoch": 0.19770466717674062, + "grad_norm": 2.469477303471826, + "learning_rate": 1.9752850881304827e-05, + "loss": 0.8011, + "step": 1292 + }, + { + "epoch": 0.19785768936495793, + "grad_norm": 2.332170742162521, + "learning_rate": 1.9752303026755747e-05, + "loss": 0.7746, + "step": 1293 + }, + { + "epoch": 0.1980107115531752, + "grad_norm": 2.465506481746153, + "learning_rate": 1.975175457327959e-05, + "loss": 0.7254, + "step": 1294 + }, + { + "epoch": 0.1981637337413925, + "grad_norm": 2.504242954456863, + "learning_rate": 1.975120552091003e-05, + "loss": 0.8851, + "step": 1295 + }, + { + "epoch": 0.1983167559296098, + "grad_norm": 2.7474656960473345, + "learning_rate": 1.97506558696808e-05, + "loss": 0.8854, + "step": 1296 + }, + { + "epoch": 0.1984697781178271, + "grad_norm": 2.55032537694633, + "learning_rate": 1.9750105619625644e-05, + "loss": 0.8987, + "step": 1297 + }, + { + "epoch": 0.19862280030604437, + "grad_norm": 3.147485983482909, + "learning_rate": 1.9749554770778358e-05, + "loss": 0.8879, + "step": 1298 + }, + { + "epoch": 0.19877582249426168, + "grad_norm": 2.774617380445243, + "learning_rate": 
1.974900332317277e-05, + "loss": 0.9347, + "step": 1299 + }, + { + "epoch": 0.19892884468247896, + "grad_norm": 2.5337569604258623, + "learning_rate": 1.974845127684276e-05, + "loss": 0.7679, + "step": 1300 + }, + { + "epoch": 0.19908186687069626, + "grad_norm": 2.432147574123346, + "learning_rate": 1.9747898631822213e-05, + "loss": 0.8188, + "step": 1301 + }, + { + "epoch": 0.19923488905891354, + "grad_norm": 2.6080426910470775, + "learning_rate": 1.9747345388145082e-05, + "loss": 0.8023, + "step": 1302 + }, + { + "epoch": 0.19938791124713084, + "grad_norm": 2.4662419988741497, + "learning_rate": 1.9746791545845335e-05, + "loss": 0.7576, + "step": 1303 + }, + { + "epoch": 0.19954093343534812, + "grad_norm": 2.6788843876154353, + "learning_rate": 1.974623710495699e-05, + "loss": 0.807, + "step": 1304 + }, + { + "epoch": 0.19969395562356543, + "grad_norm": 2.575256113628228, + "learning_rate": 1.9745682065514096e-05, + "loss": 0.8279, + "step": 1305 + }, + { + "epoch": 0.1998469778117827, + "grad_norm": 2.714741610802897, + "learning_rate": 1.9745126427550742e-05, + "loss": 0.8878, + "step": 1306 + }, + { + "epoch": 0.2, + "grad_norm": 2.8040762303011126, + "learning_rate": 1.9744570191101053e-05, + "loss": 0.8519, + "step": 1307 + }, + { + "epoch": 0.2001530221882173, + "grad_norm": 2.824683717072596, + "learning_rate": 1.9744013356199186e-05, + "loss": 0.8524, + "step": 1308 + }, + { + "epoch": 0.2003060443764346, + "grad_norm": 2.6712623088554306, + "learning_rate": 1.974345592287934e-05, + "loss": 0.9416, + "step": 1309 + }, + { + "epoch": 0.20045906656465187, + "grad_norm": 2.6566874873753794, + "learning_rate": 1.9742897891175746e-05, + "loss": 0.8073, + "step": 1310 + }, + { + "epoch": 0.20061208875286918, + "grad_norm": 2.8105095398771174, + "learning_rate": 1.9742339261122682e-05, + "loss": 0.8246, + "step": 1311 + }, + { + "epoch": 0.20076511094108646, + "grad_norm": 2.4516251972652334, + "learning_rate": 1.9741780032754452e-05, + "loss": 0.8449, + "step": 1312 + }, + { + "epoch": 0.20091813312930376, + "grad_norm": 2.7270021374282036, + "learning_rate": 1.9741220206105398e-05, + "loss": 0.9152, + "step": 1313 + }, + { + "epoch": 0.20107115531752104, + "grad_norm": 2.6346059254428815, + "learning_rate": 1.9740659781209905e-05, + "loss": 0.9087, + "step": 1314 + }, + { + "epoch": 0.20122417750573834, + "grad_norm": 2.6401950334296433, + "learning_rate": 1.9740098758102388e-05, + "loss": 0.8442, + "step": 1315 + }, + { + "epoch": 0.20137719969395562, + "grad_norm": 2.590632699986551, + "learning_rate": 1.9739537136817303e-05, + "loss": 0.7427, + "step": 1316 + }, + { + "epoch": 0.20153022188217293, + "grad_norm": 2.515449790487725, + "learning_rate": 1.973897491738914e-05, + "loss": 0.852, + "step": 1317 + }, + { + "epoch": 0.2016832440703902, + "grad_norm": 2.5728999188481927, + "learning_rate": 1.973841209985243e-05, + "loss": 0.8264, + "step": 1318 + }, + { + "epoch": 0.2018362662586075, + "grad_norm": 2.762783647383601, + "learning_rate": 1.973784868424174e-05, + "loss": 0.9527, + "step": 1319 + }, + { + "epoch": 0.2019892884468248, + "grad_norm": 2.5039467780176, + "learning_rate": 1.9737284670591662e-05, + "loss": 0.8145, + "step": 1320 + }, + { + "epoch": 0.2021423106350421, + "grad_norm": 2.741462373035885, + "learning_rate": 1.973672005893684e-05, + "loss": 0.8645, + "step": 1321 + }, + { + "epoch": 0.20229533282325937, + "grad_norm": 2.589649068698681, + "learning_rate": 1.973615484931195e-05, + "loss": 0.9089, + "step": 1322 + }, + { + "epoch": 0.20244835501147668, + 
"grad_norm": 2.2420383393474625, + "learning_rate": 1.9735589041751702e-05, + "loss": 0.7429, + "step": 1323 + }, + { + "epoch": 0.20260137719969395, + "grad_norm": 2.8834726302997473, + "learning_rate": 1.9735022636290845e-05, + "loss": 0.8063, + "step": 1324 + }, + { + "epoch": 0.20275439938791126, + "grad_norm": 2.8281724474513608, + "learning_rate": 1.9734455632964166e-05, + "loss": 0.9345, + "step": 1325 + }, + { + "epoch": 0.20290742157612854, + "grad_norm": 2.6418928860290967, + "learning_rate": 1.9733888031806485e-05, + "loss": 0.8527, + "step": 1326 + }, + { + "epoch": 0.20306044376434584, + "grad_norm": 2.801742930828167, + "learning_rate": 1.973331983285266e-05, + "loss": 0.902, + "step": 1327 + }, + { + "epoch": 0.20321346595256312, + "grad_norm": 2.724387569739316, + "learning_rate": 1.9732751036137588e-05, + "loss": 0.9353, + "step": 1328 + }, + { + "epoch": 0.20336648814078043, + "grad_norm": 2.852911846115286, + "learning_rate": 1.97321816416962e-05, + "loss": 0.9431, + "step": 1329 + }, + { + "epoch": 0.2035195103289977, + "grad_norm": 2.9385006447379203, + "learning_rate": 1.973161164956346e-05, + "loss": 0.7805, + "step": 1330 + }, + { + "epoch": 0.20367253251721498, + "grad_norm": 2.6617267844097, + "learning_rate": 1.9731041059774387e-05, + "loss": 0.9011, + "step": 1331 + }, + { + "epoch": 0.2038255547054323, + "grad_norm": 2.535089745451929, + "learning_rate": 1.973046987236401e-05, + "loss": 0.8131, + "step": 1332 + }, + { + "epoch": 0.20397857689364957, + "grad_norm": 2.5575312901886074, + "learning_rate": 1.972989808736741e-05, + "loss": 0.9255, + "step": 1333 + }, + { + "epoch": 0.20413159908186687, + "grad_norm": 2.9489071986972246, + "learning_rate": 1.972932570481971e-05, + "loss": 0.8734, + "step": 1334 + }, + { + "epoch": 0.20428462127008415, + "grad_norm": 2.6401501138319814, + "learning_rate": 1.9728752724756052e-05, + "loss": 0.8659, + "step": 1335 + }, + { + "epoch": 0.20443764345830145, + "grad_norm": 2.57743017743404, + "learning_rate": 1.9728179147211634e-05, + "loss": 0.6302, + "step": 1336 + }, + { + "epoch": 0.20459066564651873, + "grad_norm": 2.696725935953648, + "learning_rate": 1.9727604972221674e-05, + "loss": 0.9445, + "step": 1337 + }, + { + "epoch": 0.20474368783473604, + "grad_norm": 2.627476393799533, + "learning_rate": 1.9727030199821443e-05, + "loss": 0.7543, + "step": 1338 + }, + { + "epoch": 0.20489671002295332, + "grad_norm": 2.3880575508880604, + "learning_rate": 1.9726454830046233e-05, + "loss": 0.7506, + "step": 1339 + }, + { + "epoch": 0.20504973221117062, + "grad_norm": 2.7835239139366177, + "learning_rate": 1.9725878862931376e-05, + "loss": 0.8102, + "step": 1340 + }, + { + "epoch": 0.2052027543993879, + "grad_norm": 2.7321404831940614, + "learning_rate": 1.9725302298512257e-05, + "loss": 0.7956, + "step": 1341 + }, + { + "epoch": 0.2053557765876052, + "grad_norm": 2.457638556107576, + "learning_rate": 1.9724725136824277e-05, + "loss": 0.8221, + "step": 1342 + }, + { + "epoch": 0.20550879877582248, + "grad_norm": 2.4536782205822516, + "learning_rate": 1.9724147377902884e-05, + "loss": 0.71, + "step": 1343 + }, + { + "epoch": 0.2056618209640398, + "grad_norm": 2.7635778874710564, + "learning_rate": 1.9723569021783557e-05, + "loss": 0.8422, + "step": 1344 + }, + { + "epoch": 0.20581484315225707, + "grad_norm": 2.600934468297566, + "learning_rate": 1.9722990068501818e-05, + "loss": 0.8543, + "step": 1345 + }, + { + "epoch": 0.20596786534047437, + "grad_norm": 2.5400207546187388, + "learning_rate": 1.972241051809322e-05, + "loss": 
0.8007, + "step": 1346 + }, + { + "epoch": 0.20612088752869165, + "grad_norm": 2.709332583626073, + "learning_rate": 1.9721830370593364e-05, + "loss": 0.8138, + "step": 1347 + }, + { + "epoch": 0.20627390971690895, + "grad_norm": 2.724189789443129, + "learning_rate": 1.972124962603787e-05, + "loss": 0.9304, + "step": 1348 + }, + { + "epoch": 0.20642693190512623, + "grad_norm": 2.4858669918490834, + "learning_rate": 1.9720668284462407e-05, + "loss": 0.7141, + "step": 1349 + }, + { + "epoch": 0.20657995409334354, + "grad_norm": 2.4065464378747254, + "learning_rate": 1.9720086345902675e-05, + "loss": 0.7101, + "step": 1350 + }, + { + "epoch": 0.20673297628156082, + "grad_norm": 2.524649685819583, + "learning_rate": 1.9719503810394417e-05, + "loss": 0.9611, + "step": 1351 + }, + { + "epoch": 0.20688599846977812, + "grad_norm": 2.7392602306838083, + "learning_rate": 1.9718920677973407e-05, + "loss": 0.8701, + "step": 1352 + }, + { + "epoch": 0.2070390206579954, + "grad_norm": 2.8684316541922716, + "learning_rate": 1.9718336948675457e-05, + "loss": 0.8392, + "step": 1353 + }, + { + "epoch": 0.2071920428462127, + "grad_norm": 2.797057123846008, + "learning_rate": 1.9717752622536417e-05, + "loss": 0.8712, + "step": 1354 + }, + { + "epoch": 0.20734506503442998, + "grad_norm": 2.7197161819835194, + "learning_rate": 1.9717167699592173e-05, + "loss": 0.8252, + "step": 1355 + }, + { + "epoch": 0.2074980872226473, + "grad_norm": 2.3072808087513605, + "learning_rate": 1.9716582179878645e-05, + "loss": 0.803, + "step": 1356 + }, + { + "epoch": 0.20765110941086456, + "grad_norm": 2.923572676762194, + "learning_rate": 1.97159960634318e-05, + "loss": 0.9197, + "step": 1357 + }, + { + "epoch": 0.20780413159908187, + "grad_norm": 2.792292441459427, + "learning_rate": 1.9715409350287618e-05, + "loss": 0.8846, + "step": 1358 + }, + { + "epoch": 0.20795715378729915, + "grad_norm": 2.793577571557601, + "learning_rate": 1.9714822040482144e-05, + "loss": 0.9456, + "step": 1359 + }, + { + "epoch": 0.20811017597551645, + "grad_norm": 2.5197653839780094, + "learning_rate": 1.9714234134051446e-05, + "loss": 0.8231, + "step": 1360 + }, + { + "epoch": 0.20826319816373373, + "grad_norm": 2.5856070289772943, + "learning_rate": 1.9713645631031628e-05, + "loss": 0.8052, + "step": 1361 + }, + { + "epoch": 0.20841622035195104, + "grad_norm": 2.4578741682910694, + "learning_rate": 1.971305653145883e-05, + "loss": 0.7517, + "step": 1362 + }, + { + "epoch": 0.20856924254016831, + "grad_norm": 2.519889808986514, + "learning_rate": 1.9712466835369234e-05, + "loss": 0.8433, + "step": 1363 + }, + { + "epoch": 0.20872226472838562, + "grad_norm": 2.377119912314756, + "learning_rate": 1.9711876542799056e-05, + "loss": 0.8387, + "step": 1364 + }, + { + "epoch": 0.2088752869166029, + "grad_norm": 2.577350155100122, + "learning_rate": 1.9711285653784543e-05, + "loss": 0.8693, + "step": 1365 + }, + { + "epoch": 0.2090283091048202, + "grad_norm": 2.3743002683845544, + "learning_rate": 1.971069416836199e-05, + "loss": 0.7715, + "step": 1366 + }, + { + "epoch": 0.20918133129303748, + "grad_norm": 2.5970257954430167, + "learning_rate": 1.9710102086567714e-05, + "loss": 0.8364, + "step": 1367 + }, + { + "epoch": 0.2093343534812548, + "grad_norm": 2.463807934762222, + "learning_rate": 1.9709509408438087e-05, + "loss": 0.9367, + "step": 1368 + }, + { + "epoch": 0.20948737566947206, + "grad_norm": 2.5215756250590173, + "learning_rate": 1.9708916134009507e-05, + "loss": 0.9188, + "step": 1369 + }, + { + "epoch": 0.20964039785768937, + "grad_norm": 
2.5332861323220026, + "learning_rate": 1.97083222633184e-05, + "loss": 0.8427, + "step": 1370 + }, + { + "epoch": 0.20979342004590665, + "grad_norm": 3.1280702666688738, + "learning_rate": 1.9707727796401243e-05, + "loss": 0.8749, + "step": 1371 + }, + { + "epoch": 0.20994644223412395, + "grad_norm": 2.7501921842495447, + "learning_rate": 1.970713273329455e-05, + "loss": 0.929, + "step": 1372 + }, + { + "epoch": 0.21009946442234123, + "grad_norm": 2.489037234979162, + "learning_rate": 1.970653707403486e-05, + "loss": 0.8092, + "step": 1373 + }, + { + "epoch": 0.21025248661055854, + "grad_norm": 2.6729264987086436, + "learning_rate": 1.9705940818658753e-05, + "loss": 0.8513, + "step": 1374 + }, + { + "epoch": 0.21040550879877581, + "grad_norm": 2.7135658037185815, + "learning_rate": 1.9705343967202853e-05, + "loss": 0.8504, + "step": 1375 + }, + { + "epoch": 0.21055853098699312, + "grad_norm": 2.748488375492164, + "learning_rate": 1.970474651970381e-05, + "loss": 0.8899, + "step": 1376 + }, + { + "epoch": 0.2107115531752104, + "grad_norm": 2.851236345307254, + "learning_rate": 1.9704148476198323e-05, + "loss": 0.8719, + "step": 1377 + }, + { + "epoch": 0.2108645753634277, + "grad_norm": 2.7526677950267637, + "learning_rate": 1.9703549836723112e-05, + "loss": 0.9131, + "step": 1378 + }, + { + "epoch": 0.21101759755164498, + "grad_norm": 2.7251933214674784, + "learning_rate": 1.9702950601314948e-05, + "loss": 0.7765, + "step": 1379 + }, + { + "epoch": 0.2111706197398623, + "grad_norm": 2.422786440415208, + "learning_rate": 1.9702350770010625e-05, + "loss": 0.816, + "step": 1380 + }, + { + "epoch": 0.21132364192807956, + "grad_norm": 2.8510678470175823, + "learning_rate": 1.9701750342846985e-05, + "loss": 0.9267, + "step": 1381 + }, + { + "epoch": 0.21147666411629687, + "grad_norm": 2.445685396692264, + "learning_rate": 1.970114931986091e-05, + "loss": 0.7711, + "step": 1382 + }, + { + "epoch": 0.21162968630451415, + "grad_norm": 2.5729642145381395, + "learning_rate": 1.9700547701089297e-05, + "loss": 0.8154, + "step": 1383 + }, + { + "epoch": 0.21178270849273145, + "grad_norm": 2.3053407957709378, + "learning_rate": 1.9699945486569102e-05, + "loss": 0.8914, + "step": 1384 + }, + { + "epoch": 0.21193573068094873, + "grad_norm": 2.3823797011223373, + "learning_rate": 1.969934267633731e-05, + "loss": 0.7488, + "step": 1385 + }, + { + "epoch": 0.21208875286916604, + "grad_norm": 2.482422953417084, + "learning_rate": 1.969873927043094e-05, + "loss": 0.8749, + "step": 1386 + }, + { + "epoch": 0.21224177505738331, + "grad_norm": 2.779771599316904, + "learning_rate": 1.969813526888705e-05, + "loss": 0.8083, + "step": 1387 + }, + { + "epoch": 0.21239479724560062, + "grad_norm": 2.651475617855006, + "learning_rate": 1.9697530671742733e-05, + "loss": 0.8822, + "step": 1388 + }, + { + "epoch": 0.2125478194338179, + "grad_norm": 2.4441971083865113, + "learning_rate": 1.969692547903512e-05, + "loss": 0.9099, + "step": 1389 + }, + { + "epoch": 0.2127008416220352, + "grad_norm": 2.901432036147306, + "learning_rate": 1.969631969080138e-05, + "loss": 0.8652, + "step": 1390 + }, + { + "epoch": 0.21285386381025248, + "grad_norm": 2.7019808196456974, + "learning_rate": 1.9695713307078718e-05, + "loss": 0.8447, + "step": 1391 + }, + { + "epoch": 0.21300688599846979, + "grad_norm": 2.524291797108313, + "learning_rate": 1.9695106327904367e-05, + "loss": 0.9771, + "step": 1392 + }, + { + "epoch": 0.21315990818668706, + "grad_norm": 2.432632551140893, + "learning_rate": 1.9694498753315613e-05, + "loss": 0.816, + 
"step": 1393 + }, + { + "epoch": 0.21331293037490437, + "grad_norm": 3.0185482439437012, + "learning_rate": 1.9693890583349762e-05, + "loss": 1.0286, + "step": 1394 + }, + { + "epoch": 0.21346595256312165, + "grad_norm": 2.473777410059679, + "learning_rate": 1.9693281818044168e-05, + "loss": 0.8313, + "step": 1395 + }, + { + "epoch": 0.21361897475133895, + "grad_norm": 2.9398419262263973, + "learning_rate": 1.969267245743622e-05, + "loss": 0.9696, + "step": 1396 + }, + { + "epoch": 0.21377199693955623, + "grad_norm": 2.5598888551944197, + "learning_rate": 1.9692062501563333e-05, + "loss": 0.7388, + "step": 1397 + }, + { + "epoch": 0.21392501912777354, + "grad_norm": 2.6365253523922365, + "learning_rate": 1.9691451950462977e-05, + "loss": 0.8378, + "step": 1398 + }, + { + "epoch": 0.2140780413159908, + "grad_norm": 2.6272557509945313, + "learning_rate": 1.9690840804172644e-05, + "loss": 0.7908, + "step": 1399 + }, + { + "epoch": 0.21423106350420812, + "grad_norm": 2.457956626306656, + "learning_rate": 1.9690229062729863e-05, + "loss": 0.9102, + "step": 1400 + }, + { + "epoch": 0.2143840856924254, + "grad_norm": 2.7856149622579003, + "learning_rate": 1.968961672617221e-05, + "loss": 0.7951, + "step": 1401 + }, + { + "epoch": 0.2145371078806427, + "grad_norm": 2.5890160262046766, + "learning_rate": 1.9689003794537286e-05, + "loss": 0.8368, + "step": 1402 + }, + { + "epoch": 0.21469013006885998, + "grad_norm": 2.5397138932158865, + "learning_rate": 1.9688390267862737e-05, + "loss": 0.8831, + "step": 1403 + }, + { + "epoch": 0.21484315225707729, + "grad_norm": 2.5658431838731195, + "learning_rate": 1.968777614618624e-05, + "loss": 0.8381, + "step": 1404 + }, + { + "epoch": 0.21499617444529456, + "grad_norm": 2.551108162079543, + "learning_rate": 1.9687161429545512e-05, + "loss": 0.9045, + "step": 1405 + }, + { + "epoch": 0.21514919663351187, + "grad_norm": 2.4869151629476436, + "learning_rate": 1.96865461179783e-05, + "loss": 0.7808, + "step": 1406 + }, + { + "epoch": 0.21530221882172915, + "grad_norm": 2.733337979301424, + "learning_rate": 1.96859302115224e-05, + "loss": 0.9888, + "step": 1407 + }, + { + "epoch": 0.21545524100994645, + "grad_norm": 2.6573788126722486, + "learning_rate": 1.9685313710215634e-05, + "loss": 0.8592, + "step": 1408 + }, + { + "epoch": 0.21560826319816373, + "grad_norm": 2.4780481991806367, + "learning_rate": 1.9684696614095865e-05, + "loss": 0.7615, + "step": 1409 + }, + { + "epoch": 0.21576128538638104, + "grad_norm": 2.8468670949990713, + "learning_rate": 1.968407892320099e-05, + "loss": 0.8195, + "step": 1410 + }, + { + "epoch": 0.2159143075745983, + "grad_norm": 2.542501474269545, + "learning_rate": 1.968346063756894e-05, + "loss": 0.7449, + "step": 1411 + }, + { + "epoch": 0.21606732976281562, + "grad_norm": 2.5323321906232006, + "learning_rate": 1.9682841757237693e-05, + "loss": 0.8259, + "step": 1412 + }, + { + "epoch": 0.2162203519510329, + "grad_norm": 2.6822987672510616, + "learning_rate": 1.9682222282245255e-05, + "loss": 0.9143, + "step": 1413 + }, + { + "epoch": 0.2163733741392502, + "grad_norm": 2.661368983707966, + "learning_rate": 1.9681602212629668e-05, + "loss": 0.8521, + "step": 1414 + }, + { + "epoch": 0.21652639632746748, + "grad_norm": 2.4421559076683925, + "learning_rate": 1.9680981548429017e-05, + "loss": 0.7393, + "step": 1415 + }, + { + "epoch": 0.21667941851568479, + "grad_norm": 2.7182110574776375, + "learning_rate": 1.9680360289681415e-05, + "loss": 0.8152, + "step": 1416 + }, + { + "epoch": 0.21683244070390206, + "grad_norm": 
2.7587215745106426, + "learning_rate": 1.967973843642502e-05, + "loss": 0.9543, + "step": 1417 + }, + { + "epoch": 0.21698546289211937, + "grad_norm": 2.443920441762142, + "learning_rate": 1.967911598869802e-05, + "loss": 0.8022, + "step": 1418 + }, + { + "epoch": 0.21713848508033665, + "grad_norm": 2.770489472508194, + "learning_rate": 1.967849294653864e-05, + "loss": 0.8629, + "step": 1419 + }, + { + "epoch": 0.21729150726855395, + "grad_norm": 2.6168180326566186, + "learning_rate": 1.9677869309985146e-05, + "loss": 0.8642, + "step": 1420 + }, + { + "epoch": 0.21744452945677123, + "grad_norm": 2.752417489975351, + "learning_rate": 1.9677245079075837e-05, + "loss": 0.789, + "step": 1421 + }, + { + "epoch": 0.21759755164498853, + "grad_norm": 2.683755581101207, + "learning_rate": 1.967662025384905e-05, + "loss": 0.9186, + "step": 1422 + }, + { + "epoch": 0.2177505738332058, + "grad_norm": 2.804693835725841, + "learning_rate": 1.967599483434316e-05, + "loss": 0.9001, + "step": 1423 + }, + { + "epoch": 0.21790359602142312, + "grad_norm": 3.135183658906024, + "learning_rate": 1.9675368820596575e-05, + "loss": 0.9598, + "step": 1424 + }, + { + "epoch": 0.2180566182096404, + "grad_norm": 3.0719430556931067, + "learning_rate": 1.9674742212647738e-05, + "loss": 0.8038, + "step": 1425 + }, + { + "epoch": 0.2182096403978577, + "grad_norm": 2.60160695139502, + "learning_rate": 1.9674115010535135e-05, + "loss": 0.8428, + "step": 1426 + }, + { + "epoch": 0.21836266258607498, + "grad_norm": 2.6151319424341897, + "learning_rate": 1.9673487214297284e-05, + "loss": 0.8505, + "step": 1427 + }, + { + "epoch": 0.21851568477429228, + "grad_norm": 2.575352044936147, + "learning_rate": 1.967285882397274e-05, + "loss": 0.826, + "step": 1428 + }, + { + "epoch": 0.21866870696250956, + "grad_norm": 2.628612019649863, + "learning_rate": 1.9672229839600098e-05, + "loss": 0.7893, + "step": 1429 + }, + { + "epoch": 0.21882172915072687, + "grad_norm": 2.512796903702364, + "learning_rate": 1.9671600261217978e-05, + "loss": 0.7888, + "step": 1430 + }, + { + "epoch": 0.21897475133894415, + "grad_norm": 2.6086882195208525, + "learning_rate": 1.9670970088865052e-05, + "loss": 0.794, + "step": 1431 + }, + { + "epoch": 0.21912777352716145, + "grad_norm": 2.9600265005459225, + "learning_rate": 1.9670339322580023e-05, + "loss": 0.932, + "step": 1432 + }, + { + "epoch": 0.21928079571537873, + "grad_norm": 2.366110521394664, + "learning_rate": 1.966970796240162e-05, + "loss": 0.885, + "step": 1433 + }, + { + "epoch": 0.21943381790359603, + "grad_norm": 2.406125948360677, + "learning_rate": 1.966907600836863e-05, + "loss": 0.8062, + "step": 1434 + }, + { + "epoch": 0.2195868400918133, + "grad_norm": 2.735448209351719, + "learning_rate": 1.9668443460519854e-05, + "loss": 0.9436, + "step": 1435 + }, + { + "epoch": 0.21973986228003062, + "grad_norm": 2.468087419582429, + "learning_rate": 1.966781031889414e-05, + "loss": 0.8398, + "step": 1436 + }, + { + "epoch": 0.2198928844682479, + "grad_norm": 2.4874153722433316, + "learning_rate": 1.9667176583530377e-05, + "loss": 0.8523, + "step": 1437 + }, + { + "epoch": 0.2200459066564652, + "grad_norm": 2.6445890152061917, + "learning_rate": 1.9666542254467478e-05, + "loss": 1.0136, + "step": 1438 + }, + { + "epoch": 0.22019892884468248, + "grad_norm": 2.607525745722285, + "learning_rate": 1.9665907331744404e-05, + "loss": 0.8658, + "step": 1439 + }, + { + "epoch": 0.22035195103289976, + "grad_norm": 2.7498225059483348, + "learning_rate": 1.966527181540015e-05, + "loss": 0.8795, + "step": 1440 
+ }, + { + "epoch": 0.22050497322111706, + "grad_norm": 2.449442723272099, + "learning_rate": 1.9664635705473745e-05, + "loss": 0.7556, + "step": 1441 + }, + { + "epoch": 0.22065799540933434, + "grad_norm": 2.5182627516208265, + "learning_rate": 1.9663999002004247e-05, + "loss": 0.9188, + "step": 1442 + }, + { + "epoch": 0.22081101759755165, + "grad_norm": 3.0815320780904214, + "learning_rate": 1.966336170503077e-05, + "loss": 0.8873, + "step": 1443 + }, + { + "epoch": 0.22096403978576892, + "grad_norm": 2.1521962301015205, + "learning_rate": 1.9662723814592443e-05, + "loss": 0.6666, + "step": 1444 + }, + { + "epoch": 0.22111706197398623, + "grad_norm": 2.617498957904519, + "learning_rate": 1.9662085330728448e-05, + "loss": 0.8849, + "step": 1445 + }, + { + "epoch": 0.2212700841622035, + "grad_norm": 2.6878494492762717, + "learning_rate": 1.9661446253477995e-05, + "loss": 0.8731, + "step": 1446 + }, + { + "epoch": 0.2214231063504208, + "grad_norm": 2.4562980436730086, + "learning_rate": 1.966080658288033e-05, + "loss": 0.7922, + "step": 1447 + }, + { + "epoch": 0.2215761285386381, + "grad_norm": 2.4564471373976278, + "learning_rate": 1.966016631897474e-05, + "loss": 0.8254, + "step": 1448 + }, + { + "epoch": 0.2217291507268554, + "grad_norm": 2.649776130638235, + "learning_rate": 1.9659525461800546e-05, + "loss": 0.9063, + "step": 1449 + }, + { + "epoch": 0.22188217291507267, + "grad_norm": 2.2959740420216237, + "learning_rate": 1.9658884011397105e-05, + "loss": 0.7877, + "step": 1450 + }, + { + "epoch": 0.22203519510328998, + "grad_norm": 2.5813619059244166, + "learning_rate": 1.965824196780381e-05, + "loss": 0.8595, + "step": 1451 + }, + { + "epoch": 0.22218821729150726, + "grad_norm": 2.584172187703942, + "learning_rate": 1.9657599331060097e-05, + "loss": 0.7845, + "step": 1452 + }, + { + "epoch": 0.22234123947972456, + "grad_norm": 2.7062610966861564, + "learning_rate": 1.9656956101205426e-05, + "loss": 0.8721, + "step": 1453 + }, + { + "epoch": 0.22249426166794184, + "grad_norm": 2.374713272758218, + "learning_rate": 1.9656312278279298e-05, + "loss": 0.6398, + "step": 1454 + }, + { + "epoch": 0.22264728385615914, + "grad_norm": 2.562393686891158, + "learning_rate": 1.965566786232126e-05, + "loss": 0.8596, + "step": 1455 + }, + { + "epoch": 0.22280030604437642, + "grad_norm": 2.2857421843137518, + "learning_rate": 1.9655022853370886e-05, + "loss": 0.7233, + "step": 1456 + }, + { + "epoch": 0.22295332823259373, + "grad_norm": 2.364375523326793, + "learning_rate": 1.965437725146779e-05, + "loss": 0.777, + "step": 1457 + }, + { + "epoch": 0.223106350420811, + "grad_norm": 2.326524319190945, + "learning_rate": 1.9653731056651616e-05, + "loss": 0.7536, + "step": 1458 + }, + { + "epoch": 0.2232593726090283, + "grad_norm": 2.417153715891703, + "learning_rate": 1.9653084268962054e-05, + "loss": 0.7981, + "step": 1459 + }, + { + "epoch": 0.2234123947972456, + "grad_norm": 2.5776287610942306, + "learning_rate": 1.965243688843882e-05, + "loss": 0.7941, + "step": 1460 + }, + { + "epoch": 0.2235654169854629, + "grad_norm": 2.392772961147377, + "learning_rate": 1.965178891512168e-05, + "loss": 0.9104, + "step": 1461 + }, + { + "epoch": 0.22371843917368017, + "grad_norm": 2.487273411363196, + "learning_rate": 1.9651140349050428e-05, + "loss": 0.7555, + "step": 1462 + }, + { + "epoch": 0.22387146136189748, + "grad_norm": 2.7143668921668245, + "learning_rate": 1.9650491190264887e-05, + "loss": 0.8837, + "step": 1463 + }, + { + "epoch": 0.22402448355011476, + "grad_norm": 2.734873235580024, + 
"learning_rate": 1.964984143880493e-05, + "loss": 0.815, + "step": 1464 + }, + { + "epoch": 0.22417750573833206, + "grad_norm": 2.928990160124733, + "learning_rate": 1.9649191094710457e-05, + "loss": 0.8404, + "step": 1465 + }, + { + "epoch": 0.22433052792654934, + "grad_norm": 2.5593116581170583, + "learning_rate": 1.9648540158021416e-05, + "loss": 0.7353, + "step": 1466 + }, + { + "epoch": 0.22448355011476664, + "grad_norm": 2.6266572490060804, + "learning_rate": 1.9647888628777777e-05, + "loss": 0.9329, + "step": 1467 + }, + { + "epoch": 0.22463657230298392, + "grad_norm": 2.8704971084818878, + "learning_rate": 1.9647236507019552e-05, + "loss": 0.9432, + "step": 1468 + }, + { + "epoch": 0.22478959449120123, + "grad_norm": 2.590453922700811, + "learning_rate": 1.9646583792786795e-05, + "loss": 0.7691, + "step": 1469 + }, + { + "epoch": 0.2249426166794185, + "grad_norm": 3.1415820506912624, + "learning_rate": 1.9645930486119588e-05, + "loss": 1.0224, + "step": 1470 + }, + { + "epoch": 0.2250956388676358, + "grad_norm": 2.5309118552802836, + "learning_rate": 1.964527658705806e-05, + "loss": 0.886, + "step": 1471 + }, + { + "epoch": 0.2252486610558531, + "grad_norm": 2.5972158241040657, + "learning_rate": 1.9644622095642358e-05, + "loss": 0.6973, + "step": 1472 + }, + { + "epoch": 0.2254016832440704, + "grad_norm": 2.5546268806279455, + "learning_rate": 1.9643967011912685e-05, + "loss": 0.8449, + "step": 1473 + }, + { + "epoch": 0.22555470543228767, + "grad_norm": 2.3479011681317314, + "learning_rate": 1.964331133590927e-05, + "loss": 0.8669, + "step": 1474 + }, + { + "epoch": 0.22570772762050498, + "grad_norm": 2.515862803842189, + "learning_rate": 1.9642655067672384e-05, + "loss": 0.8085, + "step": 1475 + }, + { + "epoch": 0.22586074980872226, + "grad_norm": 2.3668911776660755, + "learning_rate": 1.9641998207242325e-05, + "loss": 0.7926, + "step": 1476 + }, + { + "epoch": 0.22601377199693956, + "grad_norm": 2.280772898900187, + "learning_rate": 1.9641340754659438e-05, + "loss": 0.5904, + "step": 1477 + }, + { + "epoch": 0.22616679418515684, + "grad_norm": 2.465742578131, + "learning_rate": 1.9640682709964098e-05, + "loss": 0.7349, + "step": 1478 + }, + { + "epoch": 0.22631981637337414, + "grad_norm": 2.530176243255086, + "learning_rate": 1.964002407319672e-05, + "loss": 0.8513, + "step": 1479 + }, + { + "epoch": 0.22647283856159142, + "grad_norm": 2.5405734648853873, + "learning_rate": 1.9639364844397745e-05, + "loss": 0.9168, + "step": 1480 + }, + { + "epoch": 0.22662586074980873, + "grad_norm": 2.8804862950288337, + "learning_rate": 1.9638705023607668e-05, + "loss": 0.8588, + "step": 1481 + }, + { + "epoch": 0.226778882938026, + "grad_norm": 2.260546018171997, + "learning_rate": 1.9638044610867013e-05, + "loss": 0.7765, + "step": 1482 + }, + { + "epoch": 0.2269319051262433, + "grad_norm": 2.536491527405855, + "learning_rate": 1.963738360621633e-05, + "loss": 0.764, + "step": 1483 + }, + { + "epoch": 0.2270849273144606, + "grad_norm": 2.0029390016703763, + "learning_rate": 1.963672200969622e-05, + "loss": 0.7933, + "step": 1484 + }, + { + "epoch": 0.2272379495026779, + "grad_norm": 2.681926900943292, + "learning_rate": 1.9636059821347315e-05, + "loss": 0.8794, + "step": 1485 + }, + { + "epoch": 0.22739097169089517, + "grad_norm": 2.460030338333229, + "learning_rate": 1.9635397041210274e-05, + "loss": 0.8535, + "step": 1486 + }, + { + "epoch": 0.22754399387911248, + "grad_norm": 2.666549304410844, + "learning_rate": 1.963473366932581e-05, + "loss": 0.7369, + "step": 1487 + }, + { + 
"epoch": 0.22769701606732975, + "grad_norm": 2.5057662138797676, + "learning_rate": 1.9634069705734662e-05, + "loss": 0.8288, + "step": 1488 + }, + { + "epoch": 0.22785003825554706, + "grad_norm": 2.5683484789985256, + "learning_rate": 1.9633405150477603e-05, + "loss": 0.7962, + "step": 1489 + }, + { + "epoch": 0.22800306044376434, + "grad_norm": 3.0952874862064816, + "learning_rate": 1.9632740003595447e-05, + "loss": 0.879, + "step": 1490 + }, + { + "epoch": 0.22815608263198164, + "grad_norm": 2.8884222769603065, + "learning_rate": 1.9632074265129044e-05, + "loss": 0.7994, + "step": 1491 + }, + { + "epoch": 0.22830910482019892, + "grad_norm": 2.735873639194007, + "learning_rate": 1.963140793511928e-05, + "loss": 0.8619, + "step": 1492 + }, + { + "epoch": 0.22846212700841623, + "grad_norm": 2.631822280768736, + "learning_rate": 1.9630741013607077e-05, + "loss": 0.8749, + "step": 1493 + }, + { + "epoch": 0.2286151491966335, + "grad_norm": 2.577737303029769, + "learning_rate": 1.9630073500633392e-05, + "loss": 0.8527, + "step": 1494 + }, + { + "epoch": 0.2287681713848508, + "grad_norm": 2.852657029630802, + "learning_rate": 1.962940539623922e-05, + "loss": 0.9213, + "step": 1495 + }, + { + "epoch": 0.2289211935730681, + "grad_norm": 2.5092520712765447, + "learning_rate": 1.962873670046559e-05, + "loss": 0.7863, + "step": 1496 + }, + { + "epoch": 0.2290742157612854, + "grad_norm": 3.0271335789574065, + "learning_rate": 1.962806741335357e-05, + "loss": 0.8167, + "step": 1497 + }, + { + "epoch": 0.22922723794950267, + "grad_norm": 2.671356145739248, + "learning_rate": 1.962739753494427e-05, + "loss": 0.9197, + "step": 1498 + }, + { + "epoch": 0.22938026013771998, + "grad_norm": 2.6843150913405354, + "learning_rate": 1.9626727065278827e-05, + "loss": 0.9332, + "step": 1499 + }, + { + "epoch": 0.22953328232593725, + "grad_norm": 2.5279817214021385, + "learning_rate": 1.962605600439841e-05, + "loss": 0.8936, + "step": 1500 + }, + { + "epoch": 0.22968630451415456, + "grad_norm": 2.825279808824693, + "learning_rate": 1.9625384352344235e-05, + "loss": 0.8902, + "step": 1501 + }, + { + "epoch": 0.22983932670237184, + "grad_norm": 2.592876328071922, + "learning_rate": 1.9624712109157554e-05, + "loss": 0.9093, + "step": 1502 + }, + { + "epoch": 0.22999234889058914, + "grad_norm": 2.328063927339844, + "learning_rate": 1.9624039274879654e-05, + "loss": 0.7551, + "step": 1503 + }, + { + "epoch": 0.23014537107880642, + "grad_norm": 2.597131304137858, + "learning_rate": 1.962336584955185e-05, + "loss": 0.8054, + "step": 1504 + }, + { + "epoch": 0.23029839326702373, + "grad_norm": 2.561933334091904, + "learning_rate": 1.9622691833215502e-05, + "loss": 0.8699, + "step": 1505 + }, + { + "epoch": 0.230451415455241, + "grad_norm": 2.6185390256998904, + "learning_rate": 1.9622017225912e-05, + "loss": 0.882, + "step": 1506 + }, + { + "epoch": 0.2306044376434583, + "grad_norm": 2.4996637324030844, + "learning_rate": 1.9621342027682785e-05, + "loss": 0.7593, + "step": 1507 + }, + { + "epoch": 0.2307574598316756, + "grad_norm": 2.337279426626155, + "learning_rate": 1.9620666238569317e-05, + "loss": 0.7016, + "step": 1508 + }, + { + "epoch": 0.2309104820198929, + "grad_norm": 2.644195150141663, + "learning_rate": 1.96199898586131e-05, + "loss": 0.8604, + "step": 1509 + }, + { + "epoch": 0.23106350420811017, + "grad_norm": 2.6142675375463624, + "learning_rate": 1.9619312887855666e-05, + "loss": 0.8094, + "step": 1510 + }, + { + "epoch": 0.23121652639632748, + "grad_norm": 2.331606054900535, + "learning_rate": 
1.96186353263386e-05, + "loss": 0.7867, + "step": 1511 + }, + { + "epoch": 0.23136954858454475, + "grad_norm": 2.8201130356617057, + "learning_rate": 1.9617957174103512e-05, + "loss": 0.9134, + "step": 1512 + }, + { + "epoch": 0.23152257077276206, + "grad_norm": 3.166099027860909, + "learning_rate": 1.9617278431192045e-05, + "loss": 0.7511, + "step": 1513 + }, + { + "epoch": 0.23167559296097934, + "grad_norm": 2.5832948593251595, + "learning_rate": 1.961659909764589e-05, + "loss": 0.6853, + "step": 1514 + }, + { + "epoch": 0.23182861514919664, + "grad_norm": 3.0072774469631534, + "learning_rate": 1.9615919173506763e-05, + "loss": 0.8164, + "step": 1515 + }, + { + "epoch": 0.23198163733741392, + "grad_norm": 2.5885075319239084, + "learning_rate": 1.961523865881642e-05, + "loss": 0.8215, + "step": 1516 + }, + { + "epoch": 0.23213465952563123, + "grad_norm": 2.5422510418493127, + "learning_rate": 1.9614557553616658e-05, + "loss": 0.8625, + "step": 1517 + }, + { + "epoch": 0.2322876817138485, + "grad_norm": 2.994853277911608, + "learning_rate": 1.96138758579493e-05, + "loss": 1.0072, + "step": 1518 + }, + { + "epoch": 0.2324407039020658, + "grad_norm": 3.8292304341773473, + "learning_rate": 1.961319357185622e-05, + "loss": 0.8986, + "step": 1519 + }, + { + "epoch": 0.2325937260902831, + "grad_norm": 2.4955902381832273, + "learning_rate": 1.9612510695379318e-05, + "loss": 0.8552, + "step": 1520 + }, + { + "epoch": 0.2327467482785004, + "grad_norm": 2.5365745905944976, + "learning_rate": 1.9611827228560526e-05, + "loss": 0.833, + "step": 1521 + }, + { + "epoch": 0.23289977046671767, + "grad_norm": 2.5287596898930675, + "learning_rate": 1.9611143171441824e-05, + "loss": 0.7321, + "step": 1522 + }, + { + "epoch": 0.23305279265493498, + "grad_norm": 2.5755887252276604, + "learning_rate": 1.961045852406522e-05, + "loss": 0.8743, + "step": 1523 + }, + { + "epoch": 0.23320581484315225, + "grad_norm": 2.6042826275761564, + "learning_rate": 1.960977328647276e-05, + "loss": 0.9544, + "step": 1524 + }, + { + "epoch": 0.23335883703136956, + "grad_norm": 2.622514539233016, + "learning_rate": 1.9609087458706528e-05, + "loss": 0.9218, + "step": 1525 + }, + { + "epoch": 0.23351185921958684, + "grad_norm": 2.4768249685418735, + "learning_rate": 1.9608401040808648e-05, + "loss": 0.8509, + "step": 1526 + }, + { + "epoch": 0.23366488140780414, + "grad_norm": 2.5099475586849036, + "learning_rate": 1.960771403282127e-05, + "loss": 0.7543, + "step": 1527 + }, + { + "epoch": 0.23381790359602142, + "grad_norm": 2.4341180953680035, + "learning_rate": 1.9607026434786587e-05, + "loss": 0.7317, + "step": 1528 + }, + { + "epoch": 0.23397092578423873, + "grad_norm": 2.722094841689053, + "learning_rate": 1.960633824674683e-05, + "loss": 0.8697, + "step": 1529 + }, + { + "epoch": 0.234123947972456, + "grad_norm": 2.8260615483151588, + "learning_rate": 1.960564946874426e-05, + "loss": 0.8851, + "step": 1530 + }, + { + "epoch": 0.2342769701606733, + "grad_norm": 2.5417326953835824, + "learning_rate": 1.960496010082118e-05, + "loss": 0.8586, + "step": 1531 + }, + { + "epoch": 0.2344299923488906, + "grad_norm": 2.5826334101465944, + "learning_rate": 1.9604270143019923e-05, + "loss": 0.8883, + "step": 1532 + }, + { + "epoch": 0.2345830145371079, + "grad_norm": 2.678630361021353, + "learning_rate": 1.9603579595382866e-05, + "loss": 0.8858, + "step": 1533 + }, + { + "epoch": 0.23473603672532517, + "grad_norm": 2.5681223877610346, + "learning_rate": 1.9602888457952414e-05, + "loss": 0.8861, + "step": 1534 + }, + { + "epoch": 
0.23488905891354248, + "grad_norm": 2.556048444081866, + "learning_rate": 1.9602196730771017e-05, + "loss": 0.9105, + "step": 1535 + }, + { + "epoch": 0.23504208110175975, + "grad_norm": 2.2656890021267966, + "learning_rate": 1.9601504413881155e-05, + "loss": 0.8354, + "step": 1536 + }, + { + "epoch": 0.23519510328997706, + "grad_norm": 2.6891425941651375, + "learning_rate": 1.9600811507325346e-05, + "loss": 0.7968, + "step": 1537 + }, + { + "epoch": 0.23534812547819434, + "grad_norm": 2.6911301647078805, + "learning_rate": 1.9600118011146142e-05, + "loss": 0.8348, + "step": 1538 + }, + { + "epoch": 0.23550114766641164, + "grad_norm": 2.9627416505056092, + "learning_rate": 1.9599423925386135e-05, + "loss": 0.8664, + "step": 1539 + }, + { + "epoch": 0.23565416985462892, + "grad_norm": 2.5888079738464387, + "learning_rate": 1.9598729250087953e-05, + "loss": 0.834, + "step": 1540 + }, + { + "epoch": 0.23580719204284623, + "grad_norm": 2.8789926115688007, + "learning_rate": 1.9598033985294262e-05, + "loss": 0.9464, + "step": 1541 + }, + { + "epoch": 0.2359602142310635, + "grad_norm": 2.575145545960676, + "learning_rate": 1.9597338131047747e-05, + "loss": 0.8041, + "step": 1542 + }, + { + "epoch": 0.2361132364192808, + "grad_norm": 2.4750916407006516, + "learning_rate": 1.9596641687391157e-05, + "loss": 0.938, + "step": 1543 + }, + { + "epoch": 0.2362662586074981, + "grad_norm": 2.3274446744899477, + "learning_rate": 1.9595944654367257e-05, + "loss": 0.8365, + "step": 1544 + }, + { + "epoch": 0.2364192807957154, + "grad_norm": 2.474251844393818, + "learning_rate": 1.959524703201886e-05, + "loss": 0.8646, + "step": 1545 + }, + { + "epoch": 0.23657230298393267, + "grad_norm": 2.5147567170404153, + "learning_rate": 1.9594548820388802e-05, + "loss": 0.8311, + "step": 1546 + }, + { + "epoch": 0.23672532517214998, + "grad_norm": 2.5166123534386813, + "learning_rate": 1.959385001951997e-05, + "loss": 0.8027, + "step": 1547 + }, + { + "epoch": 0.23687834736036725, + "grad_norm": 2.740847576540663, + "learning_rate": 1.9593150629455276e-05, + "loss": 0.953, + "step": 1548 + }, + { + "epoch": 0.23703136954858456, + "grad_norm": 2.3306374527547735, + "learning_rate": 1.9592450650237672e-05, + "loss": 0.8136, + "step": 1549 + }, + { + "epoch": 0.23718439173680184, + "grad_norm": 2.777170941620961, + "learning_rate": 1.9591750081910147e-05, + "loss": 0.8697, + "step": 1550 + }, + { + "epoch": 0.23733741392501911, + "grad_norm": 2.5805389495767574, + "learning_rate": 1.9591048924515727e-05, + "loss": 0.8805, + "step": 1551 + }, + { + "epoch": 0.23749043611323642, + "grad_norm": 2.514446714606269, + "learning_rate": 1.9590347178097472e-05, + "loss": 0.763, + "step": 1552 + }, + { + "epoch": 0.2376434583014537, + "grad_norm": 2.8092464729269424, + "learning_rate": 1.9589644842698483e-05, + "loss": 0.8816, + "step": 1553 + }, + { + "epoch": 0.237796480489671, + "grad_norm": 2.483884600727865, + "learning_rate": 1.9588941918361887e-05, + "loss": 0.8546, + "step": 1554 + }, + { + "epoch": 0.23794950267788828, + "grad_norm": 2.474736699712295, + "learning_rate": 1.958823840513085e-05, + "loss": 0.8574, + "step": 1555 + }, + { + "epoch": 0.23810252486610559, + "grad_norm": 2.4571375008471383, + "learning_rate": 1.9587534303048593e-05, + "loss": 0.8287, + "step": 1556 + }, + { + "epoch": 0.23825554705432286, + "grad_norm": 2.3407274090716546, + "learning_rate": 1.9586829612158344e-05, + "loss": 0.6982, + "step": 1557 + }, + { + "epoch": 0.23840856924254017, + "grad_norm": 2.377616391628943, + "learning_rate": 
1.958612433250338e-05, + "loss": 0.8394, + "step": 1558 + }, + { + "epoch": 0.23856159143075745, + "grad_norm": 2.5486874556181407, + "learning_rate": 1.9585418464127024e-05, + "loss": 0.761, + "step": 1559 + }, + { + "epoch": 0.23871461361897475, + "grad_norm": 2.766035827712233, + "learning_rate": 1.958471200707262e-05, + "loss": 0.9256, + "step": 1560 + }, + { + "epoch": 0.23886763580719203, + "grad_norm": 2.7270798479698035, + "learning_rate": 1.958400496138356e-05, + "loss": 0.8441, + "step": 1561 + }, + { + "epoch": 0.23902065799540934, + "grad_norm": 2.5294552861595556, + "learning_rate": 1.9583297327103258e-05, + "loss": 0.8277, + "step": 1562 + }, + { + "epoch": 0.2391736801836266, + "grad_norm": 2.5286458879300002, + "learning_rate": 1.9582589104275178e-05, + "loss": 0.9408, + "step": 1563 + }, + { + "epoch": 0.23932670237184392, + "grad_norm": 2.290446335806122, + "learning_rate": 1.9581880292942813e-05, + "loss": 0.7325, + "step": 1564 + }, + { + "epoch": 0.2394797245600612, + "grad_norm": 2.542319946851815, + "learning_rate": 1.9581170893149696e-05, + "loss": 0.8568, + "step": 1565 + }, + { + "epoch": 0.2396327467482785, + "grad_norm": 2.6996924947494594, + "learning_rate": 1.9580460904939393e-05, + "loss": 0.8699, + "step": 1566 + }, + { + "epoch": 0.23978576893649578, + "grad_norm": 2.7752847934882894, + "learning_rate": 1.9579750328355505e-05, + "loss": 0.8267, + "step": 1567 + }, + { + "epoch": 0.23993879112471309, + "grad_norm": 2.564510429754208, + "learning_rate": 1.957903916344168e-05, + "loss": 0.8308, + "step": 1568 + }, + { + "epoch": 0.24009181331293036, + "grad_norm": 2.425102215801101, + "learning_rate": 1.957832741024158e-05, + "loss": 0.806, + "step": 1569 + }, + { + "epoch": 0.24024483550114767, + "grad_norm": 2.6676736135165364, + "learning_rate": 1.957761506879892e-05, + "loss": 0.8601, + "step": 1570 + }, + { + "epoch": 0.24039785768936495, + "grad_norm": 2.4198971288915163, + "learning_rate": 1.9576902139157455e-05, + "loss": 0.77, + "step": 1571 + }, + { + "epoch": 0.24055087987758225, + "grad_norm": 2.2966349523292307, + "learning_rate": 1.9576188621360966e-05, + "loss": 0.7461, + "step": 1572 + }, + { + "epoch": 0.24070390206579953, + "grad_norm": 2.6079230848347956, + "learning_rate": 1.957547451545327e-05, + "loss": 0.8364, + "step": 1573 + }, + { + "epoch": 0.24085692425401684, + "grad_norm": 2.758369012562051, + "learning_rate": 1.9574759821478223e-05, + "loss": 0.9007, + "step": 1574 + }, + { + "epoch": 0.2410099464422341, + "grad_norm": 2.3613131551646673, + "learning_rate": 1.9574044539479722e-05, + "loss": 0.8054, + "step": 1575 + }, + { + "epoch": 0.24116296863045142, + "grad_norm": 2.464175192909486, + "learning_rate": 1.9573328669501693e-05, + "loss": 0.7988, + "step": 1576 + }, + { + "epoch": 0.2413159908186687, + "grad_norm": 2.8590820543841837, + "learning_rate": 1.9572612211588094e-05, + "loss": 0.7991, + "step": 1577 + }, + { + "epoch": 0.241469013006886, + "grad_norm": 2.6513303722544386, + "learning_rate": 1.9571895165782933e-05, + "loss": 0.8747, + "step": 1578 + }, + { + "epoch": 0.24162203519510328, + "grad_norm": 2.6797342921700555, + "learning_rate": 1.9571177532130245e-05, + "loss": 0.9601, + "step": 1579 + }, + { + "epoch": 0.24177505738332059, + "grad_norm": 2.656625618019867, + "learning_rate": 1.9570459310674104e-05, + "loss": 0.9296, + "step": 1580 + }, + { + "epoch": 0.24192807957153786, + "grad_norm": 2.476973262298971, + "learning_rate": 1.9569740501458615e-05, + "loss": 0.8351, + "step": 1581 + }, + { + "epoch": 
0.24208110175975517, + "grad_norm": 2.6908872499133283, + "learning_rate": 1.9569021104527924e-05, + "loss": 0.9745, + "step": 1582 + }, + { + "epoch": 0.24223412394797245, + "grad_norm": 2.400852771610583, + "learning_rate": 1.956830111992621e-05, + "loss": 0.8625, + "step": 1583 + }, + { + "epoch": 0.24238714613618975, + "grad_norm": 2.7311721684322574, + "learning_rate": 1.95675805476977e-05, + "loss": 0.8294, + "step": 1584 + }, + { + "epoch": 0.24254016832440703, + "grad_norm": 3.031322195765501, + "learning_rate": 1.9566859387886638e-05, + "loss": 0.8948, + "step": 1585 + }, + { + "epoch": 0.24269319051262433, + "grad_norm": 2.386831353760463, + "learning_rate": 1.956613764053731e-05, + "loss": 0.8555, + "step": 1586 + }, + { + "epoch": 0.2428462127008416, + "grad_norm": 2.5958286967053223, + "learning_rate": 1.956541530569405e-05, + "loss": 0.8871, + "step": 1587 + }, + { + "epoch": 0.24299923488905892, + "grad_norm": 2.351972886279123, + "learning_rate": 1.9564692383401218e-05, + "loss": 0.7892, + "step": 1588 + }, + { + "epoch": 0.2431522570772762, + "grad_norm": 2.563518227284029, + "learning_rate": 1.9563968873703206e-05, + "loss": 0.8589, + "step": 1589 + }, + { + "epoch": 0.2433052792654935, + "grad_norm": 2.7469234929921242, + "learning_rate": 1.956324477664445e-05, + "loss": 0.9476, + "step": 1590 + }, + { + "epoch": 0.24345830145371078, + "grad_norm": 2.5793234210805354, + "learning_rate": 1.9562520092269426e-05, + "loss": 0.9077, + "step": 1591 + }, + { + "epoch": 0.24361132364192808, + "grad_norm": 2.487999582227551, + "learning_rate": 1.956179482062263e-05, + "loss": 0.8971, + "step": 1592 + }, + { + "epoch": 0.24376434583014536, + "grad_norm": 2.5411762234948734, + "learning_rate": 1.9561068961748612e-05, + "loss": 0.7937, + "step": 1593 + }, + { + "epoch": 0.24391736801836267, + "grad_norm": 2.7545988237977275, + "learning_rate": 1.9560342515691942e-05, + "loss": 0.9105, + "step": 1594 + }, + { + "epoch": 0.24407039020657995, + "grad_norm": 2.338750092973846, + "learning_rate": 1.9559615482497237e-05, + "loss": 0.8024, + "step": 1595 + }, + { + "epoch": 0.24422341239479725, + "grad_norm": 2.5892173696364145, + "learning_rate": 1.9558887862209153e-05, + "loss": 0.9056, + "step": 1596 + }, + { + "epoch": 0.24437643458301453, + "grad_norm": 2.779375109330424, + "learning_rate": 1.955815965487237e-05, + "loss": 0.8351, + "step": 1597 + }, + { + "epoch": 0.24452945677123183, + "grad_norm": 2.85621778641634, + "learning_rate": 1.9557430860531605e-05, + "loss": 0.8458, + "step": 1598 + }, + { + "epoch": 0.2446824789594491, + "grad_norm": 3.0230061074447794, + "learning_rate": 1.955670147923163e-05, + "loss": 0.8472, + "step": 1599 + }, + { + "epoch": 0.24483550114766642, + "grad_norm": 2.81465165939412, + "learning_rate": 1.9555971511017225e-05, + "loss": 0.8206, + "step": 1600 + }, + { + "epoch": 0.2449885233358837, + "grad_norm": 2.4138314641623984, + "learning_rate": 1.9555240955933227e-05, + "loss": 0.8138, + "step": 1601 + }, + { + "epoch": 0.245141545524101, + "grad_norm": 2.8178514221242126, + "learning_rate": 1.9554509814024502e-05, + "loss": 0.9016, + "step": 1602 + }, + { + "epoch": 0.24529456771231828, + "grad_norm": 2.5742380441275796, + "learning_rate": 1.9553778085335953e-05, + "loss": 0.8801, + "step": 1603 + }, + { + "epoch": 0.24544758990053558, + "grad_norm": 2.2493911411311296, + "learning_rate": 1.955304576991252e-05, + "loss": 0.7488, + "step": 1604 + }, + { + "epoch": 0.24560061208875286, + "grad_norm": 2.618681068118831, + "learning_rate": 
1.9552312867799168e-05, + "loss": 0.8048, + "step": 1605 + }, + { + "epoch": 0.24575363427697017, + "grad_norm": 2.882616971639018, + "learning_rate": 1.955157937904092e-05, + "loss": 0.9131, + "step": 1606 + }, + { + "epoch": 0.24590665646518745, + "grad_norm": 2.5178465278639792, + "learning_rate": 1.955084530368281e-05, + "loss": 0.8579, + "step": 1607 + }, + { + "epoch": 0.24605967865340475, + "grad_norm": 2.760529263401625, + "learning_rate": 1.955011064176993e-05, + "loss": 0.8543, + "step": 1608 + }, + { + "epoch": 0.24621270084162203, + "grad_norm": 2.6012155141285143, + "learning_rate": 1.9549375393347397e-05, + "loss": 1.1095, + "step": 1609 + }, + { + "epoch": 0.24636572302983933, + "grad_norm": 2.379103223108997, + "learning_rate": 1.9548639558460363e-05, + "loss": 0.7103, + "step": 1610 + }, + { + "epoch": 0.2465187452180566, + "grad_norm": 2.4002591433736105, + "learning_rate": 1.954790313715402e-05, + "loss": 0.8428, + "step": 1611 + }, + { + "epoch": 0.24667176740627392, + "grad_norm": 2.4612350390198134, + "learning_rate": 1.9547166129473592e-05, + "loss": 0.8282, + "step": 1612 + }, + { + "epoch": 0.2468247895944912, + "grad_norm": 2.7307204781232044, + "learning_rate": 1.9546428535464342e-05, + "loss": 0.8591, + "step": 1613 + }, + { + "epoch": 0.2469778117827085, + "grad_norm": 2.4852927020406375, + "learning_rate": 1.954569035517157e-05, + "loss": 0.8924, + "step": 1614 + }, + { + "epoch": 0.24713083397092578, + "grad_norm": 2.4986642056279673, + "learning_rate": 1.9544951588640613e-05, + "loss": 0.8122, + "step": 1615 + }, + { + "epoch": 0.24728385615914308, + "grad_norm": 2.4127278623822526, + "learning_rate": 1.954421223591684e-05, + "loss": 0.6704, + "step": 1616 + }, + { + "epoch": 0.24743687834736036, + "grad_norm": 2.4815806633333413, + "learning_rate": 1.9543472297045652e-05, + "loss": 0.9251, + "step": 1617 + }, + { + "epoch": 0.24758990053557767, + "grad_norm": 2.5451951843300957, + "learning_rate": 1.95427317720725e-05, + "loss": 0.77, + "step": 1618 + }, + { + "epoch": 0.24774292272379494, + "grad_norm": 2.6155792651075465, + "learning_rate": 1.9541990661042855e-05, + "loss": 0.7563, + "step": 1619 + }, + { + "epoch": 0.24789594491201225, + "grad_norm": 2.6364533209981116, + "learning_rate": 1.9541248964002237e-05, + "loss": 0.7738, + "step": 1620 + }, + { + "epoch": 0.24804896710022953, + "grad_norm": 2.4555048705871365, + "learning_rate": 1.9540506680996194e-05, + "loss": 0.844, + "step": 1621 + }, + { + "epoch": 0.24820198928844683, + "grad_norm": 2.5501946462827707, + "learning_rate": 1.9539763812070315e-05, + "loss": 0.8376, + "step": 1622 + }, + { + "epoch": 0.2483550114766641, + "grad_norm": 2.393788755143763, + "learning_rate": 1.953902035727022e-05, + "loss": 0.8484, + "step": 1623 + }, + { + "epoch": 0.24850803366488142, + "grad_norm": 2.7383748703121813, + "learning_rate": 1.9538276316641563e-05, + "loss": 0.8219, + "step": 1624 + }, + { + "epoch": 0.2486610558530987, + "grad_norm": 2.7572133382018844, + "learning_rate": 1.9537531690230047e-05, + "loss": 0.8975, + "step": 1625 + }, + { + "epoch": 0.248814078041316, + "grad_norm": 2.515487334192462, + "learning_rate": 1.95367864780814e-05, + "loss": 0.8214, + "step": 1626 + }, + { + "epoch": 0.24896710022953328, + "grad_norm": 2.48662376579883, + "learning_rate": 1.9536040680241386e-05, + "loss": 0.8922, + "step": 1627 + }, + { + "epoch": 0.24912012241775058, + "grad_norm": 2.9232465851956424, + "learning_rate": 1.953529429675581e-05, + "loss": 0.9557, + "step": 1628 + }, + { + "epoch": 
0.24927314460596786, + "grad_norm": 2.6749171268509278, + "learning_rate": 1.9534547327670507e-05, + "loss": 0.8247, + "step": 1629 + }, + { + "epoch": 0.24942616679418517, + "grad_norm": 2.4601486076921932, + "learning_rate": 1.9533799773031355e-05, + "loss": 0.8779, + "step": 1630 + }, + { + "epoch": 0.24957918898240244, + "grad_norm": 2.24378715386169, + "learning_rate": 1.9533051632884262e-05, + "loss": 0.7781, + "step": 1631 + }, + { + "epoch": 0.24973221117061975, + "grad_norm": 2.2968548749052977, + "learning_rate": 1.9532302907275172e-05, + "loss": 0.8105, + "step": 1632 + }, + { + "epoch": 0.24988523335883703, + "grad_norm": 2.5467470529707246, + "learning_rate": 1.9531553596250076e-05, + "loss": 0.8232, + "step": 1633 + }, + { + "epoch": 0.25003825554705433, + "grad_norm": 2.7549851558941967, + "learning_rate": 1.953080369985498e-05, + "loss": 0.866, + "step": 1634 + }, + { + "epoch": 0.2501912777352716, + "grad_norm": 2.5660121185422793, + "learning_rate": 1.9530053218135947e-05, + "loss": 0.8282, + "step": 1635 + }, + { + "epoch": 0.2503442999234889, + "grad_norm": 2.8719794869613264, + "learning_rate": 1.9529302151139062e-05, + "loss": 0.9236, + "step": 1636 + }, + { + "epoch": 0.2504973221117062, + "grad_norm": 2.708214993200167, + "learning_rate": 1.9528550498910454e-05, + "loss": 0.8374, + "step": 1637 + }, + { + "epoch": 0.2506503442999235, + "grad_norm": 2.604499565659554, + "learning_rate": 1.9527798261496283e-05, + "loss": 0.9509, + "step": 1638 + }, + { + "epoch": 0.2508033664881408, + "grad_norm": 2.4786457903057473, + "learning_rate": 1.952704543894275e-05, + "loss": 0.7756, + "step": 1639 + }, + { + "epoch": 0.25095638867635806, + "grad_norm": 2.6065249288861603, + "learning_rate": 1.9526292031296086e-05, + "loss": 0.8183, + "step": 1640 + }, + { + "epoch": 0.2511094108645754, + "grad_norm": 2.644774458166758, + "learning_rate": 1.9525538038602563e-05, + "loss": 0.8495, + "step": 1641 + }, + { + "epoch": 0.25126243305279267, + "grad_norm": 2.7492322691250735, + "learning_rate": 1.9524783460908482e-05, + "loss": 0.8283, + "step": 1642 + }, + { + "epoch": 0.25141545524100994, + "grad_norm": 2.3011963876888757, + "learning_rate": 1.952402829826019e-05, + "loss": 0.7487, + "step": 1643 + }, + { + "epoch": 0.2515684774292272, + "grad_norm": 2.885621875830341, + "learning_rate": 1.952327255070406e-05, + "loss": 0.86, + "step": 1644 + }, + { + "epoch": 0.25172149961744456, + "grad_norm": 2.4666631770259895, + "learning_rate": 1.9522516218286508e-05, + "loss": 0.7159, + "step": 1645 + }, + { + "epoch": 0.25187452180566183, + "grad_norm": 2.3549607097582608, + "learning_rate": 1.9521759301053983e-05, + "loss": 0.8202, + "step": 1646 + }, + { + "epoch": 0.2520275439938791, + "grad_norm": 2.4507380115808357, + "learning_rate": 1.9521001799052967e-05, + "loss": 0.8397, + "step": 1647 + }, + { + "epoch": 0.2521805661820964, + "grad_norm": 2.5911841336571433, + "learning_rate": 1.952024371232999e-05, + "loss": 0.9088, + "step": 1648 + }, + { + "epoch": 0.2523335883703137, + "grad_norm": 2.6750521928124256, + "learning_rate": 1.9519485040931597e-05, + "loss": 0.793, + "step": 1649 + }, + { + "epoch": 0.252486610558531, + "grad_norm": 2.7834755569688503, + "learning_rate": 1.951872578490439e-05, + "loss": 0.8295, + "step": 1650 + }, + { + "epoch": 0.2526396327467483, + "grad_norm": 2.237899404312587, + "learning_rate": 1.9517965944295e-05, + "loss": 0.8427, + "step": 1651 + }, + { + "epoch": 0.25279265493496555, + "grad_norm": 2.4852949812861906, + "learning_rate": 
1.951720551915008e-05, + "loss": 0.8564, + "step": 1652 + }, + { + "epoch": 0.2529456771231829, + "grad_norm": 2.632829816927367, + "learning_rate": 1.951644450951634e-05, + "loss": 0.7319, + "step": 1653 + }, + { + "epoch": 0.25309869931140017, + "grad_norm": 2.40826639657792, + "learning_rate": 1.951568291544051e-05, + "loss": 0.8, + "step": 1654 + }, + { + "epoch": 0.25325172149961744, + "grad_norm": 2.3396314856198694, + "learning_rate": 1.9514920736969374e-05, + "loss": 0.8018, + "step": 1655 + }, + { + "epoch": 0.2534047436878347, + "grad_norm": 2.4346994936885356, + "learning_rate": 1.951415797414973e-05, + "loss": 0.8905, + "step": 1656 + }, + { + "epoch": 0.25355776587605205, + "grad_norm": 2.644073064070106, + "learning_rate": 1.9513394627028423e-05, + "loss": 0.9219, + "step": 1657 + }, + { + "epoch": 0.25371078806426933, + "grad_norm": 2.4464354186477433, + "learning_rate": 1.9512630695652336e-05, + "loss": 0.7972, + "step": 1658 + }, + { + "epoch": 0.2538638102524866, + "grad_norm": 2.590677816544767, + "learning_rate": 1.9511866180068385e-05, + "loss": 0.7883, + "step": 1659 + }, + { + "epoch": 0.2540168324407039, + "grad_norm": 2.361634237448809, + "learning_rate": 1.9511101080323524e-05, + "loss": 0.7745, + "step": 1660 + }, + { + "epoch": 0.2541698546289212, + "grad_norm": 2.577411232973206, + "learning_rate": 1.9510335396464736e-05, + "loss": 0.867, + "step": 1661 + }, + { + "epoch": 0.2543228768171385, + "grad_norm": 2.9471633774424477, + "learning_rate": 1.9509569128539048e-05, + "loss": 0.902, + "step": 1662 + }, + { + "epoch": 0.2544758990053558, + "grad_norm": 2.3233960734786345, + "learning_rate": 1.9508802276593514e-05, + "loss": 0.7731, + "step": 1663 + }, + { + "epoch": 0.25462892119357305, + "grad_norm": 3.0664701594842296, + "learning_rate": 1.9508034840675236e-05, + "loss": 0.8958, + "step": 1664 + }, + { + "epoch": 0.25478194338179033, + "grad_norm": 2.7694989075868914, + "learning_rate": 1.9507266820831344e-05, + "loss": 0.8821, + "step": 1665 + }, + { + "epoch": 0.25493496557000767, + "grad_norm": 2.4964849392361006, + "learning_rate": 1.9506498217109003e-05, + "loss": 0.8569, + "step": 1666 + }, + { + "epoch": 0.25508798775822494, + "grad_norm": 2.5239295777251534, + "learning_rate": 1.9505729029555417e-05, + "loss": 0.9162, + "step": 1667 + }, + { + "epoch": 0.2552410099464422, + "grad_norm": 2.5052913112386426, + "learning_rate": 1.9504959258217823e-05, + "loss": 0.922, + "step": 1668 + }, + { + "epoch": 0.2553940321346595, + "grad_norm": 2.36279363557302, + "learning_rate": 1.95041889031435e-05, + "loss": 0.8044, + "step": 1669 + }, + { + "epoch": 0.25554705432287683, + "grad_norm": 2.349428630092639, + "learning_rate": 1.9503417964379754e-05, + "loss": 0.8111, + "step": 1670 + }, + { + "epoch": 0.2557000765110941, + "grad_norm": 2.4173485389188247, + "learning_rate": 1.9502646441973933e-05, + "loss": 0.8887, + "step": 1671 + }, + { + "epoch": 0.2558530986993114, + "grad_norm": 2.3382379947820344, + "learning_rate": 1.9501874335973422e-05, + "loss": 0.848, + "step": 1672 + }, + { + "epoch": 0.25600612088752867, + "grad_norm": 2.610146018950607, + "learning_rate": 1.9501101646425633e-05, + "loss": 0.8671, + "step": 1673 + }, + { + "epoch": 0.256159143075746, + "grad_norm": 2.4976105687857166, + "learning_rate": 1.9500328373378026e-05, + "loss": 0.79, + "step": 1674 + }, + { + "epoch": 0.2563121652639633, + "grad_norm": 2.4734090750179103, + "learning_rate": 1.9499554516878088e-05, + "loss": 0.8003, + "step": 1675 + }, + { + "epoch": 0.25646518745218055, 
+ "grad_norm": 2.481384562214497, + "learning_rate": 1.949878007697334e-05, + "loss": 0.8418, + "step": 1676 + }, + { + "epoch": 0.25661820964039783, + "grad_norm": 2.0977882078690824, + "learning_rate": 1.9498005053711354e-05, + "loss": 0.819, + "step": 1677 + }, + { + "epoch": 0.25677123182861517, + "grad_norm": 2.421363761561868, + "learning_rate": 1.9497229447139717e-05, + "loss": 0.9224, + "step": 1678 + }, + { + "epoch": 0.25692425401683244, + "grad_norm": 2.5633269767032894, + "learning_rate": 1.949645325730607e-05, + "loss": 0.849, + "step": 1679 + }, + { + "epoch": 0.2570772762050497, + "grad_norm": 2.8582041097741344, + "learning_rate": 1.949567648425808e-05, + "loss": 0.8003, + "step": 1680 + }, + { + "epoch": 0.257230298393267, + "grad_norm": 2.6758874575609957, + "learning_rate": 1.9494899128043442e-05, + "loss": 0.8408, + "step": 1681 + }, + { + "epoch": 0.25738332058148433, + "grad_norm": 2.5888294107853147, + "learning_rate": 1.949412118870991e-05, + "loss": 0.7712, + "step": 1682 + }, + { + "epoch": 0.2575363427697016, + "grad_norm": 2.485349688173273, + "learning_rate": 1.9493342666305254e-05, + "loss": 0.7898, + "step": 1683 + }, + { + "epoch": 0.2576893649579189, + "grad_norm": 2.4074664249571707, + "learning_rate": 1.9492563560877285e-05, + "loss": 0.9074, + "step": 1684 + }, + { + "epoch": 0.25784238714613616, + "grad_norm": 3.006519080206942, + "learning_rate": 1.949178387247385e-05, + "loss": 0.9606, + "step": 1685 + }, + { + "epoch": 0.2579954093343535, + "grad_norm": 2.9344864762687353, + "learning_rate": 1.9491003601142842e-05, + "loss": 0.7411, + "step": 1686 + }, + { + "epoch": 0.2581484315225708, + "grad_norm": 2.160570396212931, + "learning_rate": 1.949022274693217e-05, + "loss": 0.6894, + "step": 1687 + }, + { + "epoch": 0.25830145371078805, + "grad_norm": 2.438882169334196, + "learning_rate": 1.9489441309889794e-05, + "loss": 0.7715, + "step": 1688 + }, + { + "epoch": 0.25845447589900533, + "grad_norm": 2.436285424122042, + "learning_rate": 1.9488659290063702e-05, + "loss": 0.7852, + "step": 1689 + }, + { + "epoch": 0.25860749808722266, + "grad_norm": 2.854857000650002, + "learning_rate": 1.9487876687501926e-05, + "loss": 0.8264, + "step": 1690 + }, + { + "epoch": 0.25876052027543994, + "grad_norm": 2.271191622502827, + "learning_rate": 1.9487093502252526e-05, + "loss": 0.7694, + "step": 1691 + }, + { + "epoch": 0.2589135424636572, + "grad_norm": 2.489235162164264, + "learning_rate": 1.94863097343636e-05, + "loss": 0.7871, + "step": 1692 + }, + { + "epoch": 0.2590665646518745, + "grad_norm": 3.0714855530371015, + "learning_rate": 1.948552538388328e-05, + "loss": 0.8953, + "step": 1693 + }, + { + "epoch": 0.25921958684009183, + "grad_norm": 2.5084220890499007, + "learning_rate": 1.9484740450859743e-05, + "loss": 0.8391, + "step": 1694 + }, + { + "epoch": 0.2593726090283091, + "grad_norm": 2.5910117597918116, + "learning_rate": 1.9483954935341186e-05, + "loss": 0.8472, + "step": 1695 + }, + { + "epoch": 0.2595256312165264, + "grad_norm": 2.4874414300183982, + "learning_rate": 1.9483168837375856e-05, + "loss": 0.8618, + "step": 1696 + }, + { + "epoch": 0.25967865340474366, + "grad_norm": 2.4390882665604536, + "learning_rate": 1.9482382157012033e-05, + "loss": 0.8633, + "step": 1697 + }, + { + "epoch": 0.259831675592961, + "grad_norm": 2.7537562928055355, + "learning_rate": 1.9481594894298023e-05, + "loss": 0.8594, + "step": 1698 + }, + { + "epoch": 0.2599846977811783, + "grad_norm": 2.3613754614350078, + "learning_rate": 1.9480807049282177e-05, + "loss": 
0.8136, + "step": 1699 + }, + { + "epoch": 0.26013771996939555, + "grad_norm": 2.611634039266382, + "learning_rate": 1.9480018622012884e-05, + "loss": 0.9738, + "step": 1700 + }, + { + "epoch": 0.26029074215761283, + "grad_norm": 2.4645726484909, + "learning_rate": 1.9479229612538558e-05, + "loss": 0.8931, + "step": 1701 + }, + { + "epoch": 0.26044376434583016, + "grad_norm": 2.4373875784863284, + "learning_rate": 1.9478440020907662e-05, + "loss": 0.8618, + "step": 1702 + }, + { + "epoch": 0.26059678653404744, + "grad_norm": 2.642912343584658, + "learning_rate": 1.9477649847168685e-05, + "loss": 0.8661, + "step": 1703 + }, + { + "epoch": 0.2607498087222647, + "grad_norm": 2.433110527065469, + "learning_rate": 1.9476859091370153e-05, + "loss": 0.7962, + "step": 1704 + }, + { + "epoch": 0.260902830910482, + "grad_norm": 2.5788838749611966, + "learning_rate": 1.947606775356063e-05, + "loss": 0.9376, + "step": 1705 + }, + { + "epoch": 0.26105585309869933, + "grad_norm": 2.3973604286815458, + "learning_rate": 1.9475275833788714e-05, + "loss": 0.8217, + "step": 1706 + }, + { + "epoch": 0.2612088752869166, + "grad_norm": 2.669068275728912, + "learning_rate": 1.9474483332103043e-05, + "loss": 0.8883, + "step": 1707 + }, + { + "epoch": 0.2613618974751339, + "grad_norm": 2.5071451612106883, + "learning_rate": 1.947369024855229e-05, + "loss": 0.8878, + "step": 1708 + }, + { + "epoch": 0.26151491966335116, + "grad_norm": 2.2570325349738556, + "learning_rate": 1.947289658318515e-05, + "loss": 0.8426, + "step": 1709 + }, + { + "epoch": 0.2616679418515685, + "grad_norm": 2.5071883489100015, + "learning_rate": 1.947210233605038e-05, + "loss": 0.8524, + "step": 1710 + }, + { + "epoch": 0.2618209640397858, + "grad_norm": 2.4286658527705343, + "learning_rate": 1.9471307507196746e-05, + "loss": 0.9167, + "step": 1711 + }, + { + "epoch": 0.26197398622800305, + "grad_norm": 2.5819637026712683, + "learning_rate": 1.9470512096673065e-05, + "loss": 0.7922, + "step": 1712 + }, + { + "epoch": 0.26212700841622033, + "grad_norm": 2.4051224205801027, + "learning_rate": 1.946971610452819e-05, + "loss": 0.7694, + "step": 1713 + }, + { + "epoch": 0.26228003060443766, + "grad_norm": 2.5521899267184667, + "learning_rate": 1.9468919530811002e-05, + "loss": 0.847, + "step": 1714 + }, + { + "epoch": 0.26243305279265494, + "grad_norm": 2.4020801572092516, + "learning_rate": 1.9468122375570425e-05, + "loss": 0.8737, + "step": 1715 + }, + { + "epoch": 0.2625860749808722, + "grad_norm": 2.3615195314704582, + "learning_rate": 1.946732463885541e-05, + "loss": 0.7862, + "step": 1716 + }, + { + "epoch": 0.2627390971690895, + "grad_norm": 2.5988427829601415, + "learning_rate": 1.9466526320714956e-05, + "loss": 0.8121, + "step": 1717 + }, + { + "epoch": 0.26289211935730683, + "grad_norm": 2.720083299441531, + "learning_rate": 1.9465727421198086e-05, + "loss": 0.9416, + "step": 1718 + }, + { + "epoch": 0.2630451415455241, + "grad_norm": 2.3491570168515534, + "learning_rate": 1.9464927940353865e-05, + "loss": 0.8364, + "step": 1719 + }, + { + "epoch": 0.2631981637337414, + "grad_norm": 2.3320233151056846, + "learning_rate": 1.9464127878231393e-05, + "loss": 0.9151, + "step": 1720 + }, + { + "epoch": 0.26335118592195866, + "grad_norm": 2.4609771161448064, + "learning_rate": 1.9463327234879805e-05, + "loss": 0.7901, + "step": 1721 + }, + { + "epoch": 0.263504208110176, + "grad_norm": 2.576087752470663, + "learning_rate": 1.946252601034827e-05, + "loss": 0.9416, + "step": 1722 + }, + { + "epoch": 0.2636572302983933, + "grad_norm": 
2.746683862451466, + "learning_rate": 1.946172420468599e-05, + "loss": 0.9327, + "step": 1723 + }, + { + "epoch": 0.26381025248661055, + "grad_norm": 2.544995404935606, + "learning_rate": 1.9460921817942217e-05, + "loss": 0.7567, + "step": 1724 + }, + { + "epoch": 0.26396327467482783, + "grad_norm": 2.2425437074057832, + "learning_rate": 1.9460118850166223e-05, + "loss": 0.6886, + "step": 1725 + }, + { + "epoch": 0.26411629686304516, + "grad_norm": 2.4009536341104036, + "learning_rate": 1.9459315301407323e-05, + "loss": 0.9018, + "step": 1726 + }, + { + "epoch": 0.26426931905126244, + "grad_norm": 2.610099232671235, + "learning_rate": 1.9458511171714863e-05, + "loss": 0.8207, + "step": 1727 + }, + { + "epoch": 0.2644223412394797, + "grad_norm": 2.4320257408549635, + "learning_rate": 1.9457706461138233e-05, + "loss": 0.7446, + "step": 1728 + }, + { + "epoch": 0.264575363427697, + "grad_norm": 2.5185379316749548, + "learning_rate": 1.945690116972685e-05, + "loss": 0.7479, + "step": 1729 + }, + { + "epoch": 0.26472838561591433, + "grad_norm": 2.895728792841351, + "learning_rate": 1.945609529753017e-05, + "loss": 0.8769, + "step": 1730 + }, + { + "epoch": 0.2648814078041316, + "grad_norm": 2.730992822149975, + "learning_rate": 1.945528884459768e-05, + "loss": 0.916, + "step": 1731 + }, + { + "epoch": 0.2650344299923489, + "grad_norm": 2.693860601738405, + "learning_rate": 1.945448181097892e-05, + "loss": 0.8514, + "step": 1732 + }, + { + "epoch": 0.26518745218056616, + "grad_norm": 2.6405405825364494, + "learning_rate": 1.9453674196723445e-05, + "loss": 0.9356, + "step": 1733 + }, + { + "epoch": 0.2653404743687835, + "grad_norm": 2.5688295938831693, + "learning_rate": 1.9452866001880852e-05, + "loss": 0.9191, + "step": 1734 + }, + { + "epoch": 0.2654934965570008, + "grad_norm": 2.499877747605039, + "learning_rate": 1.945205722650078e-05, + "loss": 0.888, + "step": 1735 + }, + { + "epoch": 0.26564651874521805, + "grad_norm": 2.922492979194366, + "learning_rate": 1.9451247870632898e-05, + "loss": 0.8968, + "step": 1736 + }, + { + "epoch": 0.26579954093343533, + "grad_norm": 2.5466573461677076, + "learning_rate": 1.9450437934326906e-05, + "loss": 0.7677, + "step": 1737 + }, + { + "epoch": 0.26595256312165266, + "grad_norm": 2.5820858780258753, + "learning_rate": 1.9449627417632554e-05, + "loss": 0.8154, + "step": 1738 + }, + { + "epoch": 0.26610558530986994, + "grad_norm": 2.2852352856657903, + "learning_rate": 1.9448816320599615e-05, + "loss": 0.8817, + "step": 1739 + }, + { + "epoch": 0.2662586074980872, + "grad_norm": 2.473054849169737, + "learning_rate": 1.94480046432779e-05, + "loss": 0.8555, + "step": 1740 + }, + { + "epoch": 0.2664116296863045, + "grad_norm": 2.761146652490966, + "learning_rate": 1.944719238571726e-05, + "loss": 0.8623, + "step": 1741 + }, + { + "epoch": 0.26656465187452183, + "grad_norm": 2.687345946730486, + "learning_rate": 1.944637954796758e-05, + "loss": 0.7884, + "step": 1742 + }, + { + "epoch": 0.2667176740627391, + "grad_norm": 2.6170414834070415, + "learning_rate": 1.9445566130078774e-05, + "loss": 0.823, + "step": 1743 + }, + { + "epoch": 0.2668706962509564, + "grad_norm": 2.5065210386666315, + "learning_rate": 1.94447521321008e-05, + "loss": 0.8013, + "step": 1744 + }, + { + "epoch": 0.26702371843917366, + "grad_norm": 2.569502254268044, + "learning_rate": 1.9443937554083655e-05, + "loss": 0.9016, + "step": 1745 + }, + { + "epoch": 0.267176740627391, + "grad_norm": 2.318313240796993, + "learning_rate": 1.9443122396077357e-05, + "loss": 0.8254, + "step": 1746 + 
}, + { + "epoch": 0.2673297628156083, + "grad_norm": 2.3525786375234063, + "learning_rate": 1.9442306658131967e-05, + "loss": 0.7866, + "step": 1747 + }, + { + "epoch": 0.26748278500382555, + "grad_norm": 2.558329696352076, + "learning_rate": 1.944149034029759e-05, + "loss": 0.8594, + "step": 1748 + }, + { + "epoch": 0.26763580719204283, + "grad_norm": 2.285331779879835, + "learning_rate": 1.944067344262436e-05, + "loss": 0.7182, + "step": 1749 + }, + { + "epoch": 0.26778882938026016, + "grad_norm": 2.607640802866723, + "learning_rate": 1.9439855965162436e-05, + "loss": 0.7751, + "step": 1750 + }, + { + "epoch": 0.26794185156847744, + "grad_norm": 2.412052455367587, + "learning_rate": 1.9439037907962032e-05, + "loss": 0.8451, + "step": 1751 + }, + { + "epoch": 0.2680948737566947, + "grad_norm": 2.6293034239554456, + "learning_rate": 1.943821927107338e-05, + "loss": 0.9424, + "step": 1752 + }, + { + "epoch": 0.268247895944912, + "grad_norm": 2.6107845855461695, + "learning_rate": 1.9437400054546765e-05, + "loss": 0.8322, + "step": 1753 + }, + { + "epoch": 0.26840091813312933, + "grad_norm": 2.548893203671856, + "learning_rate": 1.9436580258432488e-05, + "loss": 0.7892, + "step": 1754 + }, + { + "epoch": 0.2685539403213466, + "grad_norm": 3.0202000676385783, + "learning_rate": 1.9435759882780905e-05, + "loss": 0.9641, + "step": 1755 + }, + { + "epoch": 0.2687069625095639, + "grad_norm": 2.5840683936343467, + "learning_rate": 1.9434938927642393e-05, + "loss": 0.8314, + "step": 1756 + }, + { + "epoch": 0.26885998469778116, + "grad_norm": 2.754920380157699, + "learning_rate": 1.9434117393067375e-05, + "loss": 0.9488, + "step": 1757 + }, + { + "epoch": 0.2690130068859985, + "grad_norm": 2.48864745158179, + "learning_rate": 1.9433295279106296e-05, + "loss": 0.8753, + "step": 1758 + }, + { + "epoch": 0.2691660290742158, + "grad_norm": 2.3149579588067573, + "learning_rate": 1.9432472585809657e-05, + "loss": 0.8767, + "step": 1759 + }, + { + "epoch": 0.26931905126243305, + "grad_norm": 2.6859581863399087, + "learning_rate": 1.9431649313227972e-05, + "loss": 0.8624, + "step": 1760 + }, + { + "epoch": 0.26947207345065033, + "grad_norm": 2.2212850321280593, + "learning_rate": 1.9430825461411805e-05, + "loss": 0.761, + "step": 1761 + }, + { + "epoch": 0.26962509563886766, + "grad_norm": 2.4027565272940654, + "learning_rate": 1.9430001030411757e-05, + "loss": 0.8739, + "step": 1762 + }, + { + "epoch": 0.26977811782708494, + "grad_norm": 2.660172589499753, + "learning_rate": 1.942917602027845e-05, + "loss": 0.9251, + "step": 1763 + }, + { + "epoch": 0.2699311400153022, + "grad_norm": 2.743206963419242, + "learning_rate": 1.942835043106256e-05, + "loss": 0.9153, + "step": 1764 + }, + { + "epoch": 0.2700841622035195, + "grad_norm": 2.6989866804235145, + "learning_rate": 1.9427524262814786e-05, + "loss": 0.9009, + "step": 1765 + }, + { + "epoch": 0.27023718439173683, + "grad_norm": 2.487307962781265, + "learning_rate": 1.9426697515585865e-05, + "loss": 0.7067, + "step": 1766 + }, + { + "epoch": 0.2703902065799541, + "grad_norm": 2.4830826951639353, + "learning_rate": 1.9425870189426573e-05, + "loss": 0.8055, + "step": 1767 + }, + { + "epoch": 0.2705432287681714, + "grad_norm": 2.5304574169611134, + "learning_rate": 1.942504228438772e-05, + "loss": 0.766, + "step": 1768 + }, + { + "epoch": 0.27069625095638866, + "grad_norm": 2.328713565548941, + "learning_rate": 1.9424213800520147e-05, + "loss": 0.7621, + "step": 1769 + }, + { + "epoch": 0.270849273144606, + "grad_norm": 2.6578397488472723, + "learning_rate": 
1.9423384737874738e-05, + "loss": 0.8997, + "step": 1770 + }, + { + "epoch": 0.2710022953328233, + "grad_norm": 2.406319383474901, + "learning_rate": 1.9422555096502406e-05, + "loss": 0.6949, + "step": 1771 + }, + { + "epoch": 0.27115531752104055, + "grad_norm": 2.320190474995434, + "learning_rate": 1.9421724876454108e-05, + "loss": 0.7765, + "step": 1772 + }, + { + "epoch": 0.27130833970925783, + "grad_norm": 2.255508796586933, + "learning_rate": 1.9420894077780826e-05, + "loss": 0.7989, + "step": 1773 + }, + { + "epoch": 0.2714613618974751, + "grad_norm": 2.4808465555872967, + "learning_rate": 1.942006270053358e-05, + "loss": 0.7873, + "step": 1774 + }, + { + "epoch": 0.27161438408569244, + "grad_norm": 2.6957590465125505, + "learning_rate": 1.9419230744763437e-05, + "loss": 0.9488, + "step": 1775 + }, + { + "epoch": 0.2717674062739097, + "grad_norm": 2.6825223214394205, + "learning_rate": 1.9418398210521486e-05, + "loss": 0.8551, + "step": 1776 + }, + { + "epoch": 0.271920428462127, + "grad_norm": 2.319812276713523, + "learning_rate": 1.941756509785885e-05, + "loss": 0.8255, + "step": 1777 + }, + { + "epoch": 0.2720734506503443, + "grad_norm": 2.736308455119988, + "learning_rate": 1.9416731406826704e-05, + "loss": 0.784, + "step": 1778 + }, + { + "epoch": 0.2722264728385616, + "grad_norm": 2.623518511491986, + "learning_rate": 1.9415897137476243e-05, + "loss": 0.8191, + "step": 1779 + }, + { + "epoch": 0.2723794950267789, + "grad_norm": 2.3011853266463924, + "learning_rate": 1.9415062289858702e-05, + "loss": 0.7144, + "step": 1780 + }, + { + "epoch": 0.27253251721499616, + "grad_norm": 2.820252509123847, + "learning_rate": 1.941422686402536e-05, + "loss": 0.8803, + "step": 1781 + }, + { + "epoch": 0.27268553940321344, + "grad_norm": 2.3683267612707195, + "learning_rate": 1.9413390860027512e-05, + "loss": 0.7924, + "step": 1782 + }, + { + "epoch": 0.2728385615914308, + "grad_norm": 2.489890268313438, + "learning_rate": 1.9412554277916506e-05, + "loss": 0.8228, + "step": 1783 + }, + { + "epoch": 0.27299158377964805, + "grad_norm": 2.5950610129814966, + "learning_rate": 1.941171711774372e-05, + "loss": 0.8059, + "step": 1784 + }, + { + "epoch": 0.27314460596786533, + "grad_norm": 2.4309315036464603, + "learning_rate": 1.941087937956057e-05, + "loss": 0.8338, + "step": 1785 + }, + { + "epoch": 0.2732976281560826, + "grad_norm": 2.4619004282248484, + "learning_rate": 1.94100410634185e-05, + "loss": 0.775, + "step": 1786 + }, + { + "epoch": 0.27345065034429994, + "grad_norm": 2.553665078049513, + "learning_rate": 1.9409202169368994e-05, + "loss": 0.7981, + "step": 1787 + }, + { + "epoch": 0.2736036725325172, + "grad_norm": 2.7012115848050375, + "learning_rate": 1.9408362697463576e-05, + "loss": 0.7708, + "step": 1788 + }, + { + "epoch": 0.2737566947207345, + "grad_norm": 2.548331990200666, + "learning_rate": 1.94075226477538e-05, + "loss": 0.8804, + "step": 1789 + }, + { + "epoch": 0.2739097169089518, + "grad_norm": 2.348960462028775, + "learning_rate": 1.9406682020291253e-05, + "loss": 0.7993, + "step": 1790 + }, + { + "epoch": 0.2740627390971691, + "grad_norm": 2.4769272628241517, + "learning_rate": 1.9405840815127567e-05, + "loss": 0.8696, + "step": 1791 + }, + { + "epoch": 0.2742157612853864, + "grad_norm": 2.659296349245691, + "learning_rate": 1.9404999032314397e-05, + "loss": 0.8705, + "step": 1792 + }, + { + "epoch": 0.27436878347360366, + "grad_norm": 2.5496822689710177, + "learning_rate": 1.9404156671903443e-05, + "loss": 0.7476, + "step": 1793 + }, + { + "epoch": 
0.27452180566182094, + "grad_norm": 2.1802122613902726, + "learning_rate": 1.9403313733946442e-05, + "loss": 0.7877, + "step": 1794 + }, + { + "epoch": 0.2746748278500383, + "grad_norm": 2.636214632414026, + "learning_rate": 1.9402470218495158e-05, + "loss": 0.8912, + "step": 1795 + }, + { + "epoch": 0.27482785003825555, + "grad_norm": 2.5169431177436996, + "learning_rate": 1.9401626125601395e-05, + "loss": 0.7936, + "step": 1796 + }, + { + "epoch": 0.27498087222647283, + "grad_norm": 2.6274651789471486, + "learning_rate": 1.940078145531699e-05, + "loss": 0.8681, + "step": 1797 + }, + { + "epoch": 0.2751338944146901, + "grad_norm": 2.6433366981390223, + "learning_rate": 1.9399936207693826e-05, + "loss": 0.7657, + "step": 1798 + }, + { + "epoch": 0.27528691660290744, + "grad_norm": 2.3588536871322914, + "learning_rate": 1.9399090382783802e-05, + "loss": 0.708, + "step": 1799 + }, + { + "epoch": 0.2754399387911247, + "grad_norm": 2.6128210317047076, + "learning_rate": 1.9398243980638867e-05, + "loss": 0.8084, + "step": 1800 + }, + { + "epoch": 0.275592960979342, + "grad_norm": 2.384807803529249, + "learning_rate": 1.9397397001311007e-05, + "loss": 0.8958, + "step": 1801 + }, + { + "epoch": 0.27574598316755927, + "grad_norm": 2.561729566518759, + "learning_rate": 1.939654944485223e-05, + "loss": 0.8511, + "step": 1802 + }, + { + "epoch": 0.2758990053557766, + "grad_norm": 2.409093911820784, + "learning_rate": 1.9395701311314594e-05, + "loss": 0.7611, + "step": 1803 + }, + { + "epoch": 0.2760520275439939, + "grad_norm": 2.369205457750817, + "learning_rate": 1.9394852600750184e-05, + "loss": 0.7975, + "step": 1804 + }, + { + "epoch": 0.27620504973221116, + "grad_norm": 2.501377346696639, + "learning_rate": 1.9394003313211126e-05, + "loss": 0.7295, + "step": 1805 + }, + { + "epoch": 0.27635807192042844, + "grad_norm": 2.419062632867766, + "learning_rate": 1.9393153448749572e-05, + "loss": 0.8551, + "step": 1806 + }, + { + "epoch": 0.27651109410864577, + "grad_norm": 2.4102900990912928, + "learning_rate": 1.9392303007417717e-05, + "loss": 0.9076, + "step": 1807 + }, + { + "epoch": 0.27666411629686305, + "grad_norm": 2.2795357293501457, + "learning_rate": 1.9391451989267795e-05, + "loss": 0.7942, + "step": 1808 + }, + { + "epoch": 0.2768171384850803, + "grad_norm": 2.650801351754482, + "learning_rate": 1.9390600394352066e-05, + "loss": 0.8162, + "step": 1809 + }, + { + "epoch": 0.2769701606732976, + "grad_norm": 2.389261821947559, + "learning_rate": 1.9389748222722827e-05, + "loss": 0.7481, + "step": 1810 + }, + { + "epoch": 0.27712318286151494, + "grad_norm": 2.1568992212804363, + "learning_rate": 1.938889547443242e-05, + "loss": 0.7289, + "step": 1811 + }, + { + "epoch": 0.2772762050497322, + "grad_norm": 2.5354031323568242, + "learning_rate": 1.9388042149533214e-05, + "loss": 0.8771, + "step": 1812 + }, + { + "epoch": 0.2774292272379495, + "grad_norm": 2.7834820672844067, + "learning_rate": 1.938718824807761e-05, + "loss": 0.8314, + "step": 1813 + }, + { + "epoch": 0.27758224942616677, + "grad_norm": 2.342286831367304, + "learning_rate": 1.9386333770118054e-05, + "loss": 0.7752, + "step": 1814 + }, + { + "epoch": 0.2777352716143841, + "grad_norm": 2.431265873833426, + "learning_rate": 1.9385478715707024e-05, + "loss": 0.8154, + "step": 1815 + }, + { + "epoch": 0.2778882938026014, + "grad_norm": 3.472642793672234, + "learning_rate": 1.9384623084897025e-05, + "loss": 0.8133, + "step": 1816 + }, + { + "epoch": 0.27804131599081866, + "grad_norm": 2.4898768514747287, + "learning_rate": 
1.938376687774061e-05, + "loss": 0.9039, + "step": 1817 + }, + { + "epoch": 0.27819433817903594, + "grad_norm": 2.453143034799608, + "learning_rate": 1.9382910094290367e-05, + "loss": 0.7147, + "step": 1818 + }, + { + "epoch": 0.27834736036725327, + "grad_norm": 2.3760737629972275, + "learning_rate": 1.9382052734598902e-05, + "loss": 0.8291, + "step": 1819 + }, + { + "epoch": 0.27850038255547055, + "grad_norm": 2.9747112957484365, + "learning_rate": 1.938119479871888e-05, + "loss": 0.8505, + "step": 1820 + }, + { + "epoch": 0.2786534047436878, + "grad_norm": 2.6897445660961323, + "learning_rate": 1.9380336286702987e-05, + "loss": 0.8376, + "step": 1821 + }, + { + "epoch": 0.2788064269319051, + "grad_norm": 2.9716563210946103, + "learning_rate": 1.9379477198603944e-05, + "loss": 0.9802, + "step": 1822 + }, + { + "epoch": 0.27895944912012244, + "grad_norm": 2.508289472632622, + "learning_rate": 1.9378617534474514e-05, + "loss": 0.9134, + "step": 1823 + }, + { + "epoch": 0.2791124713083397, + "grad_norm": 2.458112494711284, + "learning_rate": 1.937775729436749e-05, + "loss": 0.775, + "step": 1824 + }, + { + "epoch": 0.279265493496557, + "grad_norm": 2.334406138363872, + "learning_rate": 1.937689647833571e-05, + "loss": 0.82, + "step": 1825 + }, + { + "epoch": 0.27941851568477427, + "grad_norm": 2.4747860463869005, + "learning_rate": 1.937603508643203e-05, + "loss": 0.8692, + "step": 1826 + }, + { + "epoch": 0.2795715378729916, + "grad_norm": 2.477096190126597, + "learning_rate": 1.9375173118709357e-05, + "loss": 0.8041, + "step": 1827 + }, + { + "epoch": 0.2797245600612089, + "grad_norm": 2.2376490321825253, + "learning_rate": 1.937431057522063e-05, + "loss": 0.8329, + "step": 1828 + }, + { + "epoch": 0.27987758224942616, + "grad_norm": 2.7371937042935217, + "learning_rate": 1.9373447456018814e-05, + "loss": 0.8114, + "step": 1829 + }, + { + "epoch": 0.28003060443764344, + "grad_norm": 2.4625318159596223, + "learning_rate": 1.9372583761156924e-05, + "loss": 0.8698, + "step": 1830 + }, + { + "epoch": 0.28018362662586077, + "grad_norm": 2.572144753716217, + "learning_rate": 1.9371719490687994e-05, + "loss": 0.7726, + "step": 1831 + }, + { + "epoch": 0.28033664881407805, + "grad_norm": 3.1187471865773766, + "learning_rate": 1.9370854644665113e-05, + "loss": 0.907, + "step": 1832 + }, + { + "epoch": 0.2804896710022953, + "grad_norm": 2.8014006276479577, + "learning_rate": 1.9369989223141386e-05, + "loss": 0.879, + "step": 1833 + }, + { + "epoch": 0.2806426931905126, + "grad_norm": 2.3574224893672655, + "learning_rate": 1.9369123226169967e-05, + "loss": 0.7876, + "step": 1834 + }, + { + "epoch": 0.28079571537872994, + "grad_norm": 2.4041536999686706, + "learning_rate": 1.936825665380404e-05, + "loss": 0.7086, + "step": 1835 + }, + { + "epoch": 0.2809487375669472, + "grad_norm": 2.629825929620783, + "learning_rate": 1.936738950609682e-05, + "loss": 0.8817, + "step": 1836 + }, + { + "epoch": 0.2811017597551645, + "grad_norm": 2.501772945163813, + "learning_rate": 1.9366521783101566e-05, + "loss": 0.8242, + "step": 1837 + }, + { + "epoch": 0.28125478194338177, + "grad_norm": 2.5759467678625065, + "learning_rate": 1.9365653484871567e-05, + "loss": 0.8058, + "step": 1838 + }, + { + "epoch": 0.2814078041315991, + "grad_norm": 2.506404915094637, + "learning_rate": 1.936478461146015e-05, + "loss": 0.7182, + "step": 1839 + }, + { + "epoch": 0.2815608263198164, + "grad_norm": 2.654103256153006, + "learning_rate": 1.9363915162920676e-05, + "loss": 0.8209, + "step": 1840 + }, + { + "epoch": 
0.28171384850803366, + "grad_norm": 2.352377734931323, + "learning_rate": 1.9363045139306536e-05, + "loss": 0.7696, + "step": 1841 + }, + { + "epoch": 0.28186687069625094, + "grad_norm": 2.4722586829533966, + "learning_rate": 1.9362174540671167e-05, + "loss": 0.7529, + "step": 1842 + }, + { + "epoch": 0.28201989288446827, + "grad_norm": 2.573994784803902, + "learning_rate": 1.9361303367068035e-05, + "loss": 0.7462, + "step": 1843 + }, + { + "epoch": 0.28217291507268555, + "grad_norm": 2.2498326088485974, + "learning_rate": 1.9360431618550645e-05, + "loss": 0.7454, + "step": 1844 + }, + { + "epoch": 0.2823259372609028, + "grad_norm": 2.289597736923765, + "learning_rate": 1.9359559295172525e-05, + "loss": 0.8085, + "step": 1845 + }, + { + "epoch": 0.2824789594491201, + "grad_norm": 2.7762561379472954, + "learning_rate": 1.9358686396987256e-05, + "loss": 0.7927, + "step": 1846 + }, + { + "epoch": 0.28263198163733744, + "grad_norm": 2.6661352318965044, + "learning_rate": 1.9357812924048445e-05, + "loss": 0.841, + "step": 1847 + }, + { + "epoch": 0.2827850038255547, + "grad_norm": 2.5975812171329964, + "learning_rate": 1.9356938876409735e-05, + "loss": 0.8676, + "step": 1848 + }, + { + "epoch": 0.282938026013772, + "grad_norm": 2.125735235990031, + "learning_rate": 1.9356064254124803e-05, + "loss": 0.7212, + "step": 1849 + }, + { + "epoch": 0.28309104820198927, + "grad_norm": 2.7059435749550556, + "learning_rate": 1.9355189057247363e-05, + "loss": 0.7979, + "step": 1850 + }, + { + "epoch": 0.2832440703902066, + "grad_norm": 2.4770619838430448, + "learning_rate": 1.9354313285831167e-05, + "loss": 0.8406, + "step": 1851 + }, + { + "epoch": 0.2833970925784239, + "grad_norm": 2.695527673356984, + "learning_rate": 1.9353436939929997e-05, + "loss": 0.7533, + "step": 1852 + }, + { + "epoch": 0.28355011476664116, + "grad_norm": 3.0149128657035362, + "learning_rate": 1.9352560019597675e-05, + "loss": 0.8231, + "step": 1853 + }, + { + "epoch": 0.28370313695485844, + "grad_norm": 2.6130169564784893, + "learning_rate": 1.9351682524888052e-05, + "loss": 0.7279, + "step": 1854 + }, + { + "epoch": 0.28385615914307577, + "grad_norm": 2.2246128982518347, + "learning_rate": 1.935080445585502e-05, + "loss": 0.8827, + "step": 1855 + }, + { + "epoch": 0.28400918133129305, + "grad_norm": 2.21223342955157, + "learning_rate": 1.934992581255251e-05, + "loss": 0.727, + "step": 1856 + }, + { + "epoch": 0.2841622035195103, + "grad_norm": 2.579774600746488, + "learning_rate": 1.934904659503448e-05, + "loss": 0.8197, + "step": 1857 + }, + { + "epoch": 0.2843152257077276, + "grad_norm": 2.485921849818407, + "learning_rate": 1.9348166803354923e-05, + "loss": 0.77, + "step": 1858 + }, + { + "epoch": 0.28446824789594494, + "grad_norm": 2.407776330565242, + "learning_rate": 1.9347286437567868e-05, + "loss": 0.8485, + "step": 1859 + }, + { + "epoch": 0.2846212700841622, + "grad_norm": 2.462948009991273, + "learning_rate": 1.934640549772739e-05, + "loss": 0.748, + "step": 1860 + }, + { + "epoch": 0.2847742922723795, + "grad_norm": 2.5536317903542, + "learning_rate": 1.9345523983887585e-05, + "loss": 0.8609, + "step": 1861 + }, + { + "epoch": 0.28492731446059677, + "grad_norm": 2.4342429583754637, + "learning_rate": 1.9344641896102596e-05, + "loss": 0.8274, + "step": 1862 + }, + { + "epoch": 0.2850803366488141, + "grad_norm": 2.629328839248284, + "learning_rate": 1.934375923442659e-05, + "loss": 0.7806, + "step": 1863 + }, + { + "epoch": 0.2852333588370314, + "grad_norm": 2.825115788194663, + "learning_rate": 
1.9342875998913774e-05, + "loss": 0.843, + "step": 1864 + }, + { + "epoch": 0.28538638102524866, + "grad_norm": 2.432079122950358, + "learning_rate": 1.93419921896184e-05, + "loss": 0.6456, + "step": 1865 + }, + { + "epoch": 0.28553940321346594, + "grad_norm": 2.437685754761619, + "learning_rate": 1.9341107806594733e-05, + "loss": 0.8309, + "step": 1866 + }, + { + "epoch": 0.28569242540168327, + "grad_norm": 2.551675385381315, + "learning_rate": 1.9340222849897096e-05, + "loss": 0.8865, + "step": 1867 + }, + { + "epoch": 0.28584544758990055, + "grad_norm": 2.508467686617371, + "learning_rate": 1.9339337319579833e-05, + "loss": 0.7803, + "step": 1868 + }, + { + "epoch": 0.2859984697781178, + "grad_norm": 2.5126960561564413, + "learning_rate": 1.933845121569733e-05, + "loss": 0.7849, + "step": 1869 + }, + { + "epoch": 0.2861514919663351, + "grad_norm": 2.613958666791803, + "learning_rate": 1.9337564538304004e-05, + "loss": 0.8853, + "step": 1870 + }, + { + "epoch": 0.28630451415455244, + "grad_norm": 2.5244937795138376, + "learning_rate": 1.9336677287454316e-05, + "loss": 0.8044, + "step": 1871 + }, + { + "epoch": 0.2864575363427697, + "grad_norm": 2.565342492613703, + "learning_rate": 1.9335789463202744e-05, + "loss": 0.7959, + "step": 1872 + }, + { + "epoch": 0.286610558530987, + "grad_norm": 2.5403185668043773, + "learning_rate": 1.9334901065603823e-05, + "loss": 0.7555, + "step": 1873 + }, + { + "epoch": 0.28676358071920427, + "grad_norm": 2.4550949680199, + "learning_rate": 1.9334012094712108e-05, + "loss": 0.7674, + "step": 1874 + }, + { + "epoch": 0.2869166029074216, + "grad_norm": 2.3284267657849793, + "learning_rate": 1.9333122550582197e-05, + "loss": 0.7756, + "step": 1875 + }, + { + "epoch": 0.2870696250956389, + "grad_norm": 2.6275948038792323, + "learning_rate": 1.9332232433268718e-05, + "loss": 0.7993, + "step": 1876 + }, + { + "epoch": 0.28722264728385616, + "grad_norm": 2.568517017557821, + "learning_rate": 1.9331341742826337e-05, + "loss": 0.7721, + "step": 1877 + }, + { + "epoch": 0.28737566947207344, + "grad_norm": 2.334517381546241, + "learning_rate": 1.933045047930976e-05, + "loss": 0.8133, + "step": 1878 + }, + { + "epoch": 0.28752869166029077, + "grad_norm": 2.3005139977871942, + "learning_rate": 1.932955864277371e-05, + "loss": 0.8128, + "step": 1879 + }, + { + "epoch": 0.28768171384850805, + "grad_norm": 2.7207446187801017, + "learning_rate": 1.932866623327297e-05, + "loss": 0.8604, + "step": 1880 + }, + { + "epoch": 0.2878347360367253, + "grad_norm": 2.492245625318649, + "learning_rate": 1.9327773250862344e-05, + "loss": 0.845, + "step": 1881 + }, + { + "epoch": 0.2879877582249426, + "grad_norm": 2.467518676739865, + "learning_rate": 1.932687969559667e-05, + "loss": 0.9091, + "step": 1882 + }, + { + "epoch": 0.2881407804131599, + "grad_norm": 2.538560762578847, + "learning_rate": 1.9325985567530825e-05, + "loss": 0.7151, + "step": 1883 + }, + { + "epoch": 0.2882938026013772, + "grad_norm": 2.3038764537536305, + "learning_rate": 1.9325090866719726e-05, + "loss": 0.8092, + "step": 1884 + }, + { + "epoch": 0.2884468247895945, + "grad_norm": 2.703874276713587, + "learning_rate": 1.9324195593218315e-05, + "loss": 0.7405, + "step": 1885 + }, + { + "epoch": 0.28859984697781177, + "grad_norm": 2.2950766344341047, + "learning_rate": 1.932329974708158e-05, + "loss": 0.6749, + "step": 1886 + }, + { + "epoch": 0.28875286916602905, + "grad_norm": 2.4093547511917826, + "learning_rate": 1.932240332836453e-05, + "loss": 0.9128, + "step": 1887 + }, + { + "epoch": 0.2889058913542464, 
+ "grad_norm": 2.743832608962814, + "learning_rate": 1.9321506337122224e-05, + "loss": 0.8467, + "step": 1888 + }, + { + "epoch": 0.28905891354246366, + "grad_norm": 2.418044806593421, + "learning_rate": 1.932060877340975e-05, + "loss": 0.7427, + "step": 1889 + }, + { + "epoch": 0.28921193573068094, + "grad_norm": 2.4645967587420987, + "learning_rate": 1.9319710637282227e-05, + "loss": 0.8417, + "step": 1890 + }, + { + "epoch": 0.2893649579188982, + "grad_norm": 2.6800041375280763, + "learning_rate": 1.9318811928794817e-05, + "loss": 0.8604, + "step": 1891 + }, + { + "epoch": 0.28951798010711555, + "grad_norm": 2.8142534613435033, + "learning_rate": 1.9317912648002708e-05, + "loss": 0.8685, + "step": 1892 + }, + { + "epoch": 0.2896710022953328, + "grad_norm": 2.517363713662083, + "learning_rate": 1.931701279496113e-05, + "loss": 0.8123, + "step": 1893 + }, + { + "epoch": 0.2898240244835501, + "grad_norm": 2.559876231147071, + "learning_rate": 1.9316112369725354e-05, + "loss": 0.8617, + "step": 1894 + }, + { + "epoch": 0.2899770466717674, + "grad_norm": 2.247438925725813, + "learning_rate": 1.9315211372350667e-05, + "loss": 0.8346, + "step": 1895 + }, + { + "epoch": 0.2901300688599847, + "grad_norm": 2.4545703617872365, + "learning_rate": 1.9314309802892407e-05, + "loss": 0.7889, + "step": 1896 + }, + { + "epoch": 0.290283091048202, + "grad_norm": 2.3768578395555835, + "learning_rate": 1.931340766140595e-05, + "loss": 0.8748, + "step": 1897 + }, + { + "epoch": 0.29043611323641927, + "grad_norm": 2.429358667095976, + "learning_rate": 1.931250494794669e-05, + "loss": 0.8916, + "step": 1898 + }, + { + "epoch": 0.29058913542463655, + "grad_norm": 2.494900204607473, + "learning_rate": 1.9311601662570072e-05, + "loss": 0.8617, + "step": 1899 + }, + { + "epoch": 0.2907421576128539, + "grad_norm": 2.4468745440891295, + "learning_rate": 1.931069780533157e-05, + "loss": 0.7456, + "step": 1900 + }, + { + "epoch": 0.29089517980107116, + "grad_norm": 2.736762375685349, + "learning_rate": 1.930979337628669e-05, + "loss": 0.7943, + "step": 1901 + }, + { + "epoch": 0.29104820198928844, + "grad_norm": 2.702694904834764, + "learning_rate": 1.930888837549098e-05, + "loss": 0.8252, + "step": 1902 + }, + { + "epoch": 0.2912012241775057, + "grad_norm": 2.527780804908085, + "learning_rate": 1.9307982803000017e-05, + "loss": 0.7839, + "step": 1903 + }, + { + "epoch": 0.29135424636572305, + "grad_norm": 2.579662212769781, + "learning_rate": 1.9307076658869417e-05, + "loss": 0.8185, + "step": 1904 + }, + { + "epoch": 0.2915072685539403, + "grad_norm": 2.3463765755521186, + "learning_rate": 1.9306169943154832e-05, + "loss": 0.7127, + "step": 1905 + }, + { + "epoch": 0.2916602907421576, + "grad_norm": 2.5764100564188106, + "learning_rate": 1.9305262655911946e-05, + "loss": 0.8555, + "step": 1906 + }, + { + "epoch": 0.2918133129303749, + "grad_norm": 2.7225480761530623, + "learning_rate": 1.9304354797196475e-05, + "loss": 0.8489, + "step": 1907 + }, + { + "epoch": 0.2919663351185922, + "grad_norm": 2.9225439859512687, + "learning_rate": 1.9303446367064175e-05, + "loss": 0.8204, + "step": 1908 + }, + { + "epoch": 0.2921193573068095, + "grad_norm": 2.7728976761121276, + "learning_rate": 1.9302537365570843e-05, + "loss": 0.8533, + "step": 1909 + }, + { + "epoch": 0.29227237949502677, + "grad_norm": 2.788291953812961, + "learning_rate": 1.9301627792772297e-05, + "loss": 0.9378, + "step": 1910 + }, + { + "epoch": 0.29242540168324405, + "grad_norm": 2.4399042097338235, + "learning_rate": 1.93007176487244e-05, + "loss": 
0.7785, + "step": 1911 + }, + { + "epoch": 0.2925784238714614, + "grad_norm": 2.428125304895574, + "learning_rate": 1.929980693348305e-05, + "loss": 0.7808, + "step": 1912 + }, + { + "epoch": 0.29273144605967866, + "grad_norm": 2.472689699702656, + "learning_rate": 1.929889564710417e-05, + "loss": 0.8331, + "step": 1913 + }, + { + "epoch": 0.29288446824789593, + "grad_norm": 2.5262689416061037, + "learning_rate": 1.9297983789643735e-05, + "loss": 0.8749, + "step": 1914 + }, + { + "epoch": 0.2930374904361132, + "grad_norm": 2.638386784656946, + "learning_rate": 1.929707136115774e-05, + "loss": 0.919, + "step": 1915 + }, + { + "epoch": 0.29319051262433055, + "grad_norm": 2.4550326677156344, + "learning_rate": 1.9296158361702223e-05, + "loss": 0.7279, + "step": 1916 + }, + { + "epoch": 0.2933435348125478, + "grad_norm": 2.643293435117193, + "learning_rate": 1.9295244791333255e-05, + "loss": 0.8779, + "step": 1917 + }, + { + "epoch": 0.2934965570007651, + "grad_norm": 2.5727735075626557, + "learning_rate": 1.929433065010694e-05, + "loss": 0.7984, + "step": 1918 + }, + { + "epoch": 0.2936495791889824, + "grad_norm": 2.434329291686872, + "learning_rate": 1.929341593807942e-05, + "loss": 0.858, + "step": 1919 + }, + { + "epoch": 0.2938026013771997, + "grad_norm": 2.4388397392820393, + "learning_rate": 1.9292500655306872e-05, + "loss": 0.8588, + "step": 1920 + }, + { + "epoch": 0.293955623565417, + "grad_norm": 2.469418880655886, + "learning_rate": 1.9291584801845508e-05, + "loss": 0.8674, + "step": 1921 + }, + { + "epoch": 0.29410864575363427, + "grad_norm": 2.300386835534169, + "learning_rate": 1.929066837775157e-05, + "loss": 0.8688, + "step": 1922 + }, + { + "epoch": 0.29426166794185155, + "grad_norm": 2.317847305611336, + "learning_rate": 1.9289751383081342e-05, + "loss": 0.7381, + "step": 1923 + }, + { + "epoch": 0.2944146901300689, + "grad_norm": 2.4895403782250067, + "learning_rate": 1.928883381789114e-05, + "loss": 0.9119, + "step": 1924 + }, + { + "epoch": 0.29456771231828616, + "grad_norm": 2.56795549530539, + "learning_rate": 1.9287915682237314e-05, + "loss": 0.7846, + "step": 1925 + }, + { + "epoch": 0.29472073450650343, + "grad_norm": 2.549866674572653, + "learning_rate": 1.928699697617625e-05, + "loss": 0.7652, + "step": 1926 + }, + { + "epoch": 0.2948737566947207, + "grad_norm": 2.4390351626614057, + "learning_rate": 1.9286077699764376e-05, + "loss": 0.7653, + "step": 1927 + }, + { + "epoch": 0.29502677888293805, + "grad_norm": 2.4418868117867394, + "learning_rate": 1.9285157853058142e-05, + "loss": 0.8644, + "step": 1928 + }, + { + "epoch": 0.2951798010711553, + "grad_norm": 2.502305903557577, + "learning_rate": 1.9284237436114038e-05, + "loss": 0.9264, + "step": 1929 + }, + { + "epoch": 0.2953328232593726, + "grad_norm": 2.2963754527727933, + "learning_rate": 1.9283316448988593e-05, + "loss": 0.7938, + "step": 1930 + }, + { + "epoch": 0.2954858454475899, + "grad_norm": 3.0614462809673526, + "learning_rate": 1.928239489173837e-05, + "loss": 0.8869, + "step": 1931 + }, + { + "epoch": 0.2956388676358072, + "grad_norm": 2.4080850709017385, + "learning_rate": 1.9281472764419962e-05, + "loss": 0.8448, + "step": 1932 + }, + { + "epoch": 0.2957918898240245, + "grad_norm": 2.5403826284863418, + "learning_rate": 1.9280550067090003e-05, + "loss": 0.7956, + "step": 1933 + }, + { + "epoch": 0.29594491201224177, + "grad_norm": 2.2358612452947093, + "learning_rate": 1.927962679980516e-05, + "loss": 0.7314, + "step": 1934 + }, + { + "epoch": 0.29609793420045905, + "grad_norm": 2.43089626487857, 
+ "learning_rate": 1.927870296262213e-05, + "loss": 0.8507, + "step": 1935 + }, + { + "epoch": 0.2962509563886764, + "grad_norm": 2.455578482886986, + "learning_rate": 1.927777855559766e-05, + "loss": 0.8509, + "step": 1936 + }, + { + "epoch": 0.29640397857689366, + "grad_norm": 2.564323058941907, + "learning_rate": 1.9276853578788502e-05, + "loss": 0.9223, + "step": 1937 + }, + { + "epoch": 0.29655700076511093, + "grad_norm": 2.7623433756645994, + "learning_rate": 1.9275928032251484e-05, + "loss": 0.9499, + "step": 1938 + }, + { + "epoch": 0.2967100229533282, + "grad_norm": 2.3216848528978558, + "learning_rate": 1.9275001916043436e-05, + "loss": 0.7305, + "step": 1939 + }, + { + "epoch": 0.29686304514154555, + "grad_norm": 2.388246616293578, + "learning_rate": 1.927407523022123e-05, + "loss": 0.928, + "step": 1940 + }, + { + "epoch": 0.2970160673297628, + "grad_norm": 2.3536391454598906, + "learning_rate": 1.927314797484179e-05, + "loss": 0.7234, + "step": 1941 + }, + { + "epoch": 0.2971690895179801, + "grad_norm": 2.424189518386324, + "learning_rate": 1.9272220149962057e-05, + "loss": 0.8198, + "step": 1942 + }, + { + "epoch": 0.2973221117061974, + "grad_norm": 2.763169194414181, + "learning_rate": 1.927129175563901e-05, + "loss": 0.8644, + "step": 1943 + }, + { + "epoch": 0.2974751338944147, + "grad_norm": 2.4385903706286274, + "learning_rate": 1.9270362791929663e-05, + "loss": 0.7624, + "step": 1944 + }, + { + "epoch": 0.297628156082632, + "grad_norm": 2.3830375578538048, + "learning_rate": 1.9269433258891076e-05, + "loss": 0.785, + "step": 1945 + }, + { + "epoch": 0.29778117827084927, + "grad_norm": 3.284351176237654, + "learning_rate": 1.926850315658033e-05, + "loss": 0.9291, + "step": 1946 + }, + { + "epoch": 0.29793420045906654, + "grad_norm": 2.3165311177119188, + "learning_rate": 1.9267572485054544e-05, + "loss": 0.8254, + "step": 1947 + }, + { + "epoch": 0.2980872226472839, + "grad_norm": 2.4524168762163483, + "learning_rate": 1.926664124437088e-05, + "loss": 0.7993, + "step": 1948 + }, + { + "epoch": 0.29824024483550116, + "grad_norm": 2.6058768862175388, + "learning_rate": 1.9265709434586522e-05, + "loss": 0.816, + "step": 1949 + }, + { + "epoch": 0.29839326702371843, + "grad_norm": 2.243298273276326, + "learning_rate": 1.9264777055758704e-05, + "loss": 0.7164, + "step": 1950 + }, + { + "epoch": 0.2985462892119357, + "grad_norm": 2.62737281453155, + "learning_rate": 1.926384410794468e-05, + "loss": 0.9408, + "step": 1951 + }, + { + "epoch": 0.29869931140015304, + "grad_norm": 2.661164387856838, + "learning_rate": 1.9262910591201752e-05, + "loss": 0.8959, + "step": 1952 + }, + { + "epoch": 0.2988523335883703, + "grad_norm": 2.463256788156899, + "learning_rate": 1.9261976505587245e-05, + "loss": 0.7347, + "step": 1953 + }, + { + "epoch": 0.2990053557765876, + "grad_norm": 2.696961971747593, + "learning_rate": 1.9261041851158526e-05, + "loss": 0.8956, + "step": 1954 + }, + { + "epoch": 0.2991583779648049, + "grad_norm": 2.3188961750528403, + "learning_rate": 1.9260106627973e-05, + "loss": 0.8279, + "step": 1955 + }, + { + "epoch": 0.2993114001530222, + "grad_norm": 2.4269478964045117, + "learning_rate": 1.92591708360881e-05, + "loss": 0.7931, + "step": 1956 + }, + { + "epoch": 0.2994644223412395, + "grad_norm": 2.48052465786968, + "learning_rate": 1.92582344755613e-05, + "loss": 0.8876, + "step": 1957 + }, + { + "epoch": 0.29961744452945677, + "grad_norm": 2.309190634912905, + "learning_rate": 1.9257297546450097e-05, + "loss": 0.8796, + "step": 1958 + }, + { + "epoch": 
0.29977046671767404, + "grad_norm": 2.5168712148937717, + "learning_rate": 1.925636004881204e-05, + "loss": 0.9515, + "step": 1959 + }, + { + "epoch": 0.2999234889058914, + "grad_norm": 2.3043960080089723, + "learning_rate": 1.92554219827047e-05, + "loss": 0.8175, + "step": 1960 + }, + { + "epoch": 0.30007651109410866, + "grad_norm": 2.5426296255764176, + "learning_rate": 1.925448334818569e-05, + "loss": 0.7521, + "step": 1961 + }, + { + "epoch": 0.30022953328232593, + "grad_norm": 2.414666262662447, + "learning_rate": 1.9253544145312654e-05, + "loss": 0.9249, + "step": 1962 + }, + { + "epoch": 0.3003825554705432, + "grad_norm": 2.5867144779294184, + "learning_rate": 1.925260437414327e-05, + "loss": 0.9349, + "step": 1963 + }, + { + "epoch": 0.30053557765876054, + "grad_norm": 2.473431576610012, + "learning_rate": 1.9251664034735258e-05, + "loss": 0.8798, + "step": 1964 + }, + { + "epoch": 0.3006885998469778, + "grad_norm": 2.1385680671542278, + "learning_rate": 1.9250723127146362e-05, + "loss": 0.7429, + "step": 1965 + }, + { + "epoch": 0.3008416220351951, + "grad_norm": 2.5981076958560956, + "learning_rate": 1.9249781651434372e-05, + "loss": 0.7934, + "step": 1966 + }, + { + "epoch": 0.3009946442234124, + "grad_norm": 2.455515129458916, + "learning_rate": 1.92488396076571e-05, + "loss": 0.905, + "step": 1967 + }, + { + "epoch": 0.3011476664116297, + "grad_norm": 2.8389366912830303, + "learning_rate": 1.9247896995872413e-05, + "loss": 0.843, + "step": 1968 + }, + { + "epoch": 0.301300688599847, + "grad_norm": 2.420967448885132, + "learning_rate": 1.924695381613819e-05, + "loss": 0.8977, + "step": 1969 + }, + { + "epoch": 0.30145371078806427, + "grad_norm": 2.2695379545380314, + "learning_rate": 1.924601006851236e-05, + "loss": 0.6859, + "step": 1970 + }, + { + "epoch": 0.30160673297628154, + "grad_norm": 2.2636926091187144, + "learning_rate": 1.924506575305288e-05, + "loss": 0.7773, + "step": 1971 + }, + { + "epoch": 0.3017597551644989, + "grad_norm": 2.7898273917009755, + "learning_rate": 1.9244120869817746e-05, + "loss": 0.8155, + "step": 1972 + }, + { + "epoch": 0.30191277735271616, + "grad_norm": 2.4340034177552856, + "learning_rate": 1.924317541886499e-05, + "loss": 0.8266, + "step": 1973 + }, + { + "epoch": 0.30206579954093343, + "grad_norm": 2.2766416109438583, + "learning_rate": 1.9242229400252666e-05, + "loss": 0.8102, + "step": 1974 + }, + { + "epoch": 0.3022188217291507, + "grad_norm": 2.5561178004217955, + "learning_rate": 1.924128281403888e-05, + "loss": 0.8036, + "step": 1975 + }, + { + "epoch": 0.30237184391736804, + "grad_norm": 2.379553828425003, + "learning_rate": 1.9240335660281764e-05, + "loss": 0.7768, + "step": 1976 + }, + { + "epoch": 0.3025248661055853, + "grad_norm": 2.5252891466757292, + "learning_rate": 1.923938793903949e-05, + "loss": 0.7895, + "step": 1977 + }, + { + "epoch": 0.3026778882938026, + "grad_norm": 2.2715040776578252, + "learning_rate": 1.9238439650370256e-05, + "loss": 0.8965, + "step": 1978 + }, + { + "epoch": 0.3028309104820199, + "grad_norm": 2.465898560897087, + "learning_rate": 1.92374907943323e-05, + "loss": 0.7972, + "step": 1979 + }, + { + "epoch": 0.3029839326702372, + "grad_norm": 2.628192379084707, + "learning_rate": 1.92365413709839e-05, + "loss": 0.8772, + "step": 1980 + }, + { + "epoch": 0.3031369548584545, + "grad_norm": 2.59756485098828, + "learning_rate": 1.9235591380383362e-05, + "loss": 0.8435, + "step": 1981 + }, + { + "epoch": 0.30328997704667177, + "grad_norm": 2.3284828288876116, + "learning_rate": 1.9234640822589023e-05, + 
"loss": 0.7544, + "step": 1982 + }, + { + "epoch": 0.30344299923488904, + "grad_norm": 2.603774040163367, + "learning_rate": 1.9233689697659268e-05, + "loss": 0.8142, + "step": 1983 + }, + { + "epoch": 0.3035960214231064, + "grad_norm": 2.4923640409532393, + "learning_rate": 1.923273800565251e-05, + "loss": 0.9354, + "step": 1984 + }, + { + "epoch": 0.30374904361132365, + "grad_norm": 2.6499802758692765, + "learning_rate": 1.9231785746627186e-05, + "loss": 0.8094, + "step": 1985 + }, + { + "epoch": 0.30390206579954093, + "grad_norm": 2.2595162035840697, + "learning_rate": 1.9230832920641784e-05, + "loss": 0.8003, + "step": 1986 + }, + { + "epoch": 0.3040550879877582, + "grad_norm": 2.4192559742903246, + "learning_rate": 1.922987952775482e-05, + "loss": 0.804, + "step": 1987 + }, + { + "epoch": 0.30420811017597554, + "grad_norm": 2.316981550914932, + "learning_rate": 1.9228925568024855e-05, + "loss": 0.754, + "step": 1988 + }, + { + "epoch": 0.3043611323641928, + "grad_norm": 7.151782661755483, + "learning_rate": 1.9227971041510463e-05, + "loss": 0.8844, + "step": 1989 + }, + { + "epoch": 0.3045141545524101, + "grad_norm": 2.734107657629692, + "learning_rate": 1.9227015948270266e-05, + "loss": 0.7944, + "step": 1990 + }, + { + "epoch": 0.3046671767406274, + "grad_norm": 2.529668625783798, + "learning_rate": 1.9226060288362927e-05, + "loss": 0.8836, + "step": 1991 + }, + { + "epoch": 0.3048201989288447, + "grad_norm": 2.546234327275014, + "learning_rate": 1.922510406184713e-05, + "loss": 0.7898, + "step": 1992 + }, + { + "epoch": 0.304973221117062, + "grad_norm": 2.395162331895941, + "learning_rate": 1.922414726878161e-05, + "loss": 0.7806, + "step": 1993 + }, + { + "epoch": 0.30512624330527927, + "grad_norm": 2.55658252083043, + "learning_rate": 1.9223189909225114e-05, + "loss": 0.8855, + "step": 1994 + }, + { + "epoch": 0.30527926549349654, + "grad_norm": 2.4501674002626976, + "learning_rate": 1.9222231983236447e-05, + "loss": 0.8802, + "step": 1995 + }, + { + "epoch": 0.3054322876817138, + "grad_norm": 2.4889603134355314, + "learning_rate": 1.9221273490874436e-05, + "loss": 0.7977, + "step": 1996 + }, + { + "epoch": 0.30558530986993115, + "grad_norm": 2.3944327612655725, + "learning_rate": 1.922031443219795e-05, + "loss": 0.832, + "step": 1997 + }, + { + "epoch": 0.30573833205814843, + "grad_norm": 2.995473969176262, + "learning_rate": 1.921935480726588e-05, + "loss": 0.7878, + "step": 1998 + }, + { + "epoch": 0.3058913542463657, + "grad_norm": 2.7168338445829474, + "learning_rate": 1.9218394616137166e-05, + "loss": 0.8424, + "step": 1999 + }, + { + "epoch": 0.306044376434583, + "grad_norm": 2.5441476948697854, + "learning_rate": 1.921743385887078e-05, + "loss": 0.8708, + "step": 2000 + }, + { + "epoch": 0.3061973986228003, + "grad_norm": 2.4494392535414096, + "learning_rate": 1.9216472535525717e-05, + "loss": 0.8481, + "step": 2001 + }, + { + "epoch": 0.3063504208110176, + "grad_norm": 2.6112325729047923, + "learning_rate": 1.921551064616102e-05, + "loss": 0.8245, + "step": 2002 + }, + { + "epoch": 0.3065034429992349, + "grad_norm": 2.3444443429432416, + "learning_rate": 1.9214548190835766e-05, + "loss": 0.7449, + "step": 2003 + }, + { + "epoch": 0.30665646518745215, + "grad_norm": 2.446188822684606, + "learning_rate": 1.9213585169609058e-05, + "loss": 0.831, + "step": 2004 + }, + { + "epoch": 0.3068094873756695, + "grad_norm": 2.2145856338125602, + "learning_rate": 1.9212621582540042e-05, + "loss": 0.7718, + "step": 2005 + }, + { + "epoch": 0.30696250956388677, + "grad_norm": 
2.5819277402890526, + "learning_rate": 1.9211657429687895e-05, + "loss": 0.8231, + "step": 2006 + }, + { + "epoch": 0.30711553175210404, + "grad_norm": 2.0650856048224617, + "learning_rate": 1.9210692711111827e-05, + "loss": 0.8361, + "step": 2007 + }, + { + "epoch": 0.3072685539403213, + "grad_norm": 2.6944527828836615, + "learning_rate": 1.920972742687109e-05, + "loss": 0.8581, + "step": 2008 + }, + { + "epoch": 0.30742157612853865, + "grad_norm": 2.1937400834532825, + "learning_rate": 1.920876157702496e-05, + "loss": 0.7833, + "step": 2009 + }, + { + "epoch": 0.30757459831675593, + "grad_norm": 2.5777698912582205, + "learning_rate": 1.920779516163276e-05, + "loss": 0.8232, + "step": 2010 + }, + { + "epoch": 0.3077276205049732, + "grad_norm": 2.2556884773655463, + "learning_rate": 1.920682818075384e-05, + "loss": 0.7792, + "step": 2011 + }, + { + "epoch": 0.3078806426931905, + "grad_norm": 2.682048126514191, + "learning_rate": 1.920586063444758e-05, + "loss": 0.8919, + "step": 2012 + }, + { + "epoch": 0.3080336648814078, + "grad_norm": 2.570180102807476, + "learning_rate": 1.9204892522773405e-05, + "loss": 0.8524, + "step": 2013 + }, + { + "epoch": 0.3081866870696251, + "grad_norm": 2.563307171833568, + "learning_rate": 1.9203923845790767e-05, + "loss": 0.9107, + "step": 2014 + }, + { + "epoch": 0.3083397092578424, + "grad_norm": 2.2104842092303336, + "learning_rate": 1.9202954603559164e-05, + "loss": 0.843, + "step": 2015 + }, + { + "epoch": 0.30849273144605965, + "grad_norm": 2.5360024853513417, + "learning_rate": 1.920198479613812e-05, + "loss": 0.7056, + "step": 2016 + }, + { + "epoch": 0.308645753634277, + "grad_norm": 2.7399669049444255, + "learning_rate": 1.9201014423587187e-05, + "loss": 0.8708, + "step": 2017 + }, + { + "epoch": 0.30879877582249426, + "grad_norm": 2.4553097150606447, + "learning_rate": 1.9200043485965963e-05, + "loss": 0.8834, + "step": 2018 + }, + { + "epoch": 0.30895179801071154, + "grad_norm": 2.8655692998053697, + "learning_rate": 1.919907198333408e-05, + "loss": 0.9893, + "step": 2019 + }, + { + "epoch": 0.3091048201989288, + "grad_norm": 2.2561640988275546, + "learning_rate": 1.9198099915751198e-05, + "loss": 0.8193, + "step": 2020 + }, + { + "epoch": 0.30925784238714615, + "grad_norm": 2.4836616435229, + "learning_rate": 1.9197127283277017e-05, + "loss": 0.8635, + "step": 2021 + }, + { + "epoch": 0.30941086457536343, + "grad_norm": 2.5715982339637065, + "learning_rate": 1.919615408597127e-05, + "loss": 0.8791, + "step": 2022 + }, + { + "epoch": 0.3095638867635807, + "grad_norm": 2.5943107227567848, + "learning_rate": 1.9195180323893728e-05, + "loss": 0.8121, + "step": 2023 + }, + { + "epoch": 0.309716908951798, + "grad_norm": 2.2801541193319492, + "learning_rate": 1.9194205997104187e-05, + "loss": 0.7939, + "step": 2024 + }, + { + "epoch": 0.3098699311400153, + "grad_norm": 2.459839184463675, + "learning_rate": 1.919323110566249e-05, + "loss": 0.9424, + "step": 2025 + }, + { + "epoch": 0.3100229533282326, + "grad_norm": 2.1911119079151993, + "learning_rate": 1.9192255649628503e-05, + "loss": 0.8289, + "step": 2026 + }, + { + "epoch": 0.3101759755164499, + "grad_norm": 2.4644860950022722, + "learning_rate": 1.919127962906214e-05, + "loss": 0.8372, + "step": 2027 + }, + { + "epoch": 0.31032899770466715, + "grad_norm": 2.2356742468806026, + "learning_rate": 1.919030304402334e-05, + "loss": 0.7998, + "step": 2028 + }, + { + "epoch": 0.3104820198928845, + "grad_norm": 2.5171160782594795, + "learning_rate": 1.9189325894572076e-05, + "loss": 0.8296, + "step": 
2029 + }, + { + "epoch": 0.31063504208110176, + "grad_norm": 2.7014831046380934, + "learning_rate": 1.9188348180768356e-05, + "loss": 0.9374, + "step": 2030 + }, + { + "epoch": 0.31078806426931904, + "grad_norm": 2.734471374350287, + "learning_rate": 1.918736990267223e-05, + "loss": 0.7857, + "step": 2031 + }, + { + "epoch": 0.3109410864575363, + "grad_norm": 2.7131784176496487, + "learning_rate": 1.918639106034378e-05, + "loss": 0.8601, + "step": 2032 + }, + { + "epoch": 0.31109410864575365, + "grad_norm": 2.513092380080759, + "learning_rate": 1.918541165384312e-05, + "loss": 0.7936, + "step": 2033 + }, + { + "epoch": 0.31124713083397093, + "grad_norm": 2.927728563084092, + "learning_rate": 1.9184431683230392e-05, + "loss": 1.0046, + "step": 2034 + }, + { + "epoch": 0.3114001530221882, + "grad_norm": 2.404772452838863, + "learning_rate": 1.9183451148565787e-05, + "loss": 0.7403, + "step": 2035 + }, + { + "epoch": 0.3115531752104055, + "grad_norm": 2.516603840983081, + "learning_rate": 1.918247004990952e-05, + "loss": 0.7745, + "step": 2036 + }, + { + "epoch": 0.3117061973986228, + "grad_norm": 2.386770013882607, + "learning_rate": 1.9181488387321848e-05, + "loss": 0.8298, + "step": 2037 + }, + { + "epoch": 0.3118592195868401, + "grad_norm": 2.6428767252854555, + "learning_rate": 1.918050616086305e-05, + "loss": 0.8006, + "step": 2038 + }, + { + "epoch": 0.3120122417750574, + "grad_norm": 3.1631302400947825, + "learning_rate": 1.917952337059346e-05, + "loss": 0.8649, + "step": 2039 + }, + { + "epoch": 0.31216526396327465, + "grad_norm": 2.442779992040568, + "learning_rate": 1.9178540016573425e-05, + "loss": 0.6726, + "step": 2040 + }, + { + "epoch": 0.312318286151492, + "grad_norm": 2.44707016093417, + "learning_rate": 1.9177556098863345e-05, + "loss": 0.8137, + "step": 2041 + }, + { + "epoch": 0.31247130833970926, + "grad_norm": 2.525282513459153, + "learning_rate": 1.9176571617523637e-05, + "loss": 0.686, + "step": 2042 + }, + { + "epoch": 0.31262433052792654, + "grad_norm": 2.5982517000505405, + "learning_rate": 1.9175586572614772e-05, + "loss": 0.7722, + "step": 2043 + }, + { + "epoch": 0.3127773527161438, + "grad_norm": 2.452454622090508, + "learning_rate": 1.9174600964197243e-05, + "loss": 0.8334, + "step": 2044 + }, + { + "epoch": 0.31293037490436115, + "grad_norm": 2.618928484678063, + "learning_rate": 1.917361479233157e-05, + "loss": 0.9241, + "step": 2045 + }, + { + "epoch": 0.31308339709257843, + "grad_norm": 2.435076329407817, + "learning_rate": 1.9172628057078328e-05, + "loss": 0.7933, + "step": 2046 + }, + { + "epoch": 0.3132364192807957, + "grad_norm": 2.533700576320131, + "learning_rate": 1.9171640758498117e-05, + "loss": 0.7797, + "step": 2047 + }, + { + "epoch": 0.313389441469013, + "grad_norm": 2.4696336812088546, + "learning_rate": 1.917065289665156e-05, + "loss": 0.7301, + "step": 2048 + }, + { + "epoch": 0.3135424636572303, + "grad_norm": 2.4661352148431868, + "learning_rate": 1.9169664471599338e-05, + "loss": 0.8389, + "step": 2049 + }, + { + "epoch": 0.3136954858454476, + "grad_norm": 2.3361325316370767, + "learning_rate": 1.916867548340215e-05, + "loss": 0.7848, + "step": 2050 + }, + { + "epoch": 0.3138485080336649, + "grad_norm": 2.7039151927382106, + "learning_rate": 1.916768593212073e-05, + "loss": 0.7517, + "step": 2051 + }, + { + "epoch": 0.31400153022188215, + "grad_norm": 2.302389279273721, + "learning_rate": 1.9166695817815854e-05, + "loss": 0.816, + "step": 2052 + }, + { + "epoch": 0.3141545524100995, + "grad_norm": 2.33484318748721, + "learning_rate": 
1.9165705140548325e-05, + "loss": 0.7547, + "step": 2053 + }, + { + "epoch": 0.31430757459831676, + "grad_norm": 2.4695189957920336, + "learning_rate": 1.916471390037899e-05, + "loss": 0.8419, + "step": 2054 + }, + { + "epoch": 0.31446059678653404, + "grad_norm": 2.6640125173002573, + "learning_rate": 1.916372209736872e-05, + "loss": 0.8851, + "step": 2055 + }, + { + "epoch": 0.3146136189747513, + "grad_norm": 2.5342638647426154, + "learning_rate": 1.9162729731578428e-05, + "loss": 0.7527, + "step": 2056 + }, + { + "epoch": 0.31476664116296865, + "grad_norm": 2.534955417695161, + "learning_rate": 1.916173680306906e-05, + "loss": 0.8765, + "step": 2057 + }, + { + "epoch": 0.31491966335118593, + "grad_norm": 2.6830494811488, + "learning_rate": 1.9160743311901592e-05, + "loss": 0.7909, + "step": 2058 + }, + { + "epoch": 0.3150726855394032, + "grad_norm": 2.550549374015315, + "learning_rate": 1.915974925813704e-05, + "loss": 0.7659, + "step": 2059 + }, + { + "epoch": 0.3152257077276205, + "grad_norm": 2.3928412556709624, + "learning_rate": 1.9158754641836452e-05, + "loss": 0.7826, + "step": 2060 + }, + { + "epoch": 0.3153787299158378, + "grad_norm": 2.9224489475145274, + "learning_rate": 1.9157759463060914e-05, + "loss": 1.0251, + "step": 2061 + }, + { + "epoch": 0.3155317521040551, + "grad_norm": 2.5484811016781466, + "learning_rate": 1.9156763721871542e-05, + "loss": 0.8764, + "step": 2062 + }, + { + "epoch": 0.3156847742922724, + "grad_norm": 2.446852756534984, + "learning_rate": 1.9155767418329488e-05, + "loss": 0.8835, + "step": 2063 + }, + { + "epoch": 0.31583779648048965, + "grad_norm": 2.72442600547987, + "learning_rate": 1.915477055249594e-05, + "loss": 0.8934, + "step": 2064 + }, + { + "epoch": 0.315990818668707, + "grad_norm": 2.4582101077154195, + "learning_rate": 1.9153773124432117e-05, + "loss": 0.7802, + "step": 2065 + }, + { + "epoch": 0.31614384085692426, + "grad_norm": 2.5396120010948358, + "learning_rate": 1.9152775134199278e-05, + "loss": 0.867, + "step": 2066 + }, + { + "epoch": 0.31629686304514154, + "grad_norm": 2.3762790446329487, + "learning_rate": 1.9151776581858708e-05, + "loss": 0.7767, + "step": 2067 + }, + { + "epoch": 0.3164498852333588, + "grad_norm": 2.433257226366239, + "learning_rate": 1.915077746747174e-05, + "loss": 0.791, + "step": 2068 + }, + { + "epoch": 0.31660290742157615, + "grad_norm": 2.4692823299271867, + "learning_rate": 1.914977779109973e-05, + "loss": 0.8087, + "step": 2069 + }, + { + "epoch": 0.31675592960979343, + "grad_norm": 2.131604538235996, + "learning_rate": 1.9148777552804075e-05, + "loss": 0.7566, + "step": 2070 + }, + { + "epoch": 0.3169089517980107, + "grad_norm": 2.4472979026288533, + "learning_rate": 1.9147776752646193e-05, + "loss": 0.9065, + "step": 2071 + }, + { + "epoch": 0.317061973986228, + "grad_norm": 2.484643272652111, + "learning_rate": 1.914677539068756e-05, + "loss": 0.8602, + "step": 2072 + }, + { + "epoch": 0.3172149961744453, + "grad_norm": 2.5281446531443112, + "learning_rate": 1.9145773466989665e-05, + "loss": 0.8182, + "step": 2073 + }, + { + "epoch": 0.3173680183626626, + "grad_norm": 2.5908840665780755, + "learning_rate": 1.9144770981614043e-05, + "loss": 0.7987, + "step": 2074 + }, + { + "epoch": 0.3175210405508799, + "grad_norm": 2.0941908773437943, + "learning_rate": 1.914376793462226e-05, + "loss": 0.896, + "step": 2075 + }, + { + "epoch": 0.31767406273909715, + "grad_norm": 2.2421817228071705, + "learning_rate": 1.914276432607592e-05, + "loss": 0.6662, + "step": 2076 + }, + { + "epoch": 0.3178270849273145, 
+ "grad_norm": 2.4922409133172394, + "learning_rate": 1.9141760156036656e-05, + "loss": 0.7889, + "step": 2077 + }, + { + "epoch": 0.31798010711553176, + "grad_norm": 2.3788296857697695, + "learning_rate": 1.9140755424566135e-05, + "loss": 0.7257, + "step": 2078 + }, + { + "epoch": 0.31813312930374904, + "grad_norm": 2.3379923933589017, + "learning_rate": 1.9139750131726067e-05, + "loss": 0.7745, + "step": 2079 + }, + { + "epoch": 0.3182861514919663, + "grad_norm": 2.873252027227465, + "learning_rate": 1.9138744277578186e-05, + "loss": 0.8744, + "step": 2080 + }, + { + "epoch": 0.31843917368018365, + "grad_norm": 2.4216698005628503, + "learning_rate": 1.9137737862184272e-05, + "loss": 0.8559, + "step": 2081 + }, + { + "epoch": 0.31859219586840093, + "grad_norm": 2.4418905760060454, + "learning_rate": 1.9136730885606124e-05, + "loss": 0.7539, + "step": 2082 + }, + { + "epoch": 0.3187452180566182, + "grad_norm": 2.4020958445199936, + "learning_rate": 1.9135723347905593e-05, + "loss": 0.827, + "step": 2083 + }, + { + "epoch": 0.3188982402448355, + "grad_norm": 2.5406101984226908, + "learning_rate": 1.913471524914455e-05, + "loss": 0.8508, + "step": 2084 + }, + { + "epoch": 0.3190512624330528, + "grad_norm": 2.2918266846443736, + "learning_rate": 1.913370658938491e-05, + "loss": 0.7455, + "step": 2085 + }, + { + "epoch": 0.3192042846212701, + "grad_norm": 2.480568139223291, + "learning_rate": 1.9132697368688616e-05, + "loss": 0.8116, + "step": 2086 + }, + { + "epoch": 0.3193573068094874, + "grad_norm": 2.854465154555059, + "learning_rate": 1.913168758711765e-05, + "loss": 0.9847, + "step": 2087 + }, + { + "epoch": 0.31951032899770465, + "grad_norm": 2.572434816180969, + "learning_rate": 1.9130677244734026e-05, + "loss": 0.8183, + "step": 2088 + }, + { + "epoch": 0.319663351185922, + "grad_norm": 3.003205537027778, + "learning_rate": 1.9129666341599793e-05, + "loss": 0.853, + "step": 2089 + }, + { + "epoch": 0.31981637337413926, + "grad_norm": 2.4337824036824953, + "learning_rate": 1.9128654877777034e-05, + "loss": 0.7883, + "step": 2090 + }, + { + "epoch": 0.31996939556235654, + "grad_norm": 2.5391007309566302, + "learning_rate": 1.9127642853327867e-05, + "loss": 0.7448, + "step": 2091 + }, + { + "epoch": 0.3201224177505738, + "grad_norm": 2.5479704491291986, + "learning_rate": 1.9126630268314447e-05, + "loss": 0.7959, + "step": 2092 + }, + { + "epoch": 0.32027543993879115, + "grad_norm": 2.7326391324081016, + "learning_rate": 1.9125617122798952e-05, + "loss": 0.8832, + "step": 2093 + }, + { + "epoch": 0.32042846212700843, + "grad_norm": 2.1860153989983298, + "learning_rate": 1.9124603416843617e-05, + "loss": 0.8377, + "step": 2094 + }, + { + "epoch": 0.3205814843152257, + "grad_norm": 2.503472081997963, + "learning_rate": 1.912358915051069e-05, + "loss": 0.7138, + "step": 2095 + }, + { + "epoch": 0.320734506503443, + "grad_norm": 2.2062061908006845, + "learning_rate": 1.912257432386246e-05, + "loss": 0.7035, + "step": 2096 + }, + { + "epoch": 0.3208875286916603, + "grad_norm": 2.077975827552243, + "learning_rate": 1.912155893696125e-05, + "loss": 0.71, + "step": 2097 + }, + { + "epoch": 0.3210405508798776, + "grad_norm": 2.4162267436191307, + "learning_rate": 1.9120542989869427e-05, + "loss": 0.8393, + "step": 2098 + }, + { + "epoch": 0.3211935730680949, + "grad_norm": 2.5988343283265154, + "learning_rate": 1.911952648264938e-05, + "loss": 0.8368, + "step": 2099 + }, + { + "epoch": 0.32134659525631215, + "grad_norm": 2.2723363777964662, + "learning_rate": 1.911850941536353e-05, + "loss": 
0.7547, + "step": 2100 + }, + { + "epoch": 0.3214996174445295, + "grad_norm": 2.413632966823172, + "learning_rate": 1.9117491788074348e-05, + "loss": 0.774, + "step": 2101 + }, + { + "epoch": 0.32165263963274676, + "grad_norm": 2.439605514523114, + "learning_rate": 1.9116473600844327e-05, + "loss": 0.7549, + "step": 2102 + }, + { + "epoch": 0.32180566182096404, + "grad_norm": 2.2988123659235473, + "learning_rate": 1.9115454853736e-05, + "loss": 0.7875, + "step": 2103 + }, + { + "epoch": 0.3219586840091813, + "grad_norm": 2.3470597989174697, + "learning_rate": 1.9114435546811928e-05, + "loss": 0.7903, + "step": 2104 + }, + { + "epoch": 0.3221117061973986, + "grad_norm": 2.562614663596328, + "learning_rate": 1.9113415680134717e-05, + "loss": 0.8583, + "step": 2105 + }, + { + "epoch": 0.32226472838561593, + "grad_norm": 2.6369756053615907, + "learning_rate": 1.9112395253766995e-05, + "loss": 0.9672, + "step": 2106 + }, + { + "epoch": 0.3224177505738332, + "grad_norm": 2.914943083424258, + "learning_rate": 1.911137426777143e-05, + "loss": 0.827, + "step": 2107 + }, + { + "epoch": 0.3225707727620505, + "grad_norm": 2.3463201198738863, + "learning_rate": 1.911035272221073e-05, + "loss": 0.8843, + "step": 2108 + }, + { + "epoch": 0.32272379495026776, + "grad_norm": 2.5753832932412637, + "learning_rate": 1.910933061714763e-05, + "loss": 0.7845, + "step": 2109 + }, + { + "epoch": 0.3228768171384851, + "grad_norm": 2.677331183762575, + "learning_rate": 1.91083079526449e-05, + "loss": 0.8315, + "step": 2110 + }, + { + "epoch": 0.3230298393267024, + "grad_norm": 2.619466419949009, + "learning_rate": 1.9107284728765347e-05, + "loss": 0.9082, + "step": 2111 + }, + { + "epoch": 0.32318286151491965, + "grad_norm": 2.318593309042997, + "learning_rate": 1.910626094557181e-05, + "loss": 0.7603, + "step": 2112 + }, + { + "epoch": 0.32333588370313693, + "grad_norm": 2.5023945668186625, + "learning_rate": 1.9105236603127167e-05, + "loss": 0.8891, + "step": 2113 + }, + { + "epoch": 0.32348890589135426, + "grad_norm": 2.6435011702222755, + "learning_rate": 1.910421170149432e-05, + "loss": 0.8287, + "step": 2114 + }, + { + "epoch": 0.32364192807957154, + "grad_norm": 2.3861826759572966, + "learning_rate": 1.910318624073622e-05, + "loss": 0.7984, + "step": 2115 + }, + { + "epoch": 0.3237949502677888, + "grad_norm": 2.5742746321849106, + "learning_rate": 1.910216022091584e-05, + "loss": 0.9795, + "step": 2116 + }, + { + "epoch": 0.3239479724560061, + "grad_norm": 2.471229350103612, + "learning_rate": 1.910113364209619e-05, + "loss": 0.7576, + "step": 2117 + }, + { + "epoch": 0.32410099464422343, + "grad_norm": 2.4970893298757546, + "learning_rate": 1.9100106504340324e-05, + "loss": 0.7732, + "step": 2118 + }, + { + "epoch": 0.3242540168324407, + "grad_norm": 2.268656403612154, + "learning_rate": 1.9099078807711318e-05, + "loss": 0.7906, + "step": 2119 + }, + { + "epoch": 0.324407039020658, + "grad_norm": 2.4314592035006557, + "learning_rate": 1.9098050552272284e-05, + "loss": 0.9111, + "step": 2120 + }, + { + "epoch": 0.32456006120887526, + "grad_norm": 2.4708369395051046, + "learning_rate": 1.909702173808637e-05, + "loss": 0.8278, + "step": 2121 + }, + { + "epoch": 0.3247130833970926, + "grad_norm": 2.6888386351450113, + "learning_rate": 1.909599236521677e-05, + "loss": 0.785, + "step": 2122 + }, + { + "epoch": 0.3248661055853099, + "grad_norm": 2.530033705647798, + "learning_rate": 1.9094962433726692e-05, + "loss": 0.7601, + "step": 2123 + }, + { + "epoch": 0.32501912777352715, + "grad_norm": 2.577311046308443, + 
"learning_rate": 1.9093931943679395e-05, + "loss": 0.8827, + "step": 2124 + }, + { + "epoch": 0.32517214996174443, + "grad_norm": 2.682264853957065, + "learning_rate": 1.909290089513816e-05, + "loss": 0.8972, + "step": 2125 + }, + { + "epoch": 0.32532517214996176, + "grad_norm": 2.292095938273781, + "learning_rate": 1.9091869288166305e-05, + "loss": 0.8293, + "step": 2126 + }, + { + "epoch": 0.32547819433817904, + "grad_norm": 2.5223280792400704, + "learning_rate": 1.9090837122827195e-05, + "loss": 0.8509, + "step": 2127 + }, + { + "epoch": 0.3256312165263963, + "grad_norm": 2.447119418811232, + "learning_rate": 1.908980439918421e-05, + "loss": 0.7685, + "step": 2128 + }, + { + "epoch": 0.3257842387146136, + "grad_norm": 2.6601569588663048, + "learning_rate": 1.9088771117300778e-05, + "loss": 0.9052, + "step": 2129 + }, + { + "epoch": 0.3259372609028309, + "grad_norm": 2.536018852055662, + "learning_rate": 1.908773727724036e-05, + "loss": 0.8709, + "step": 2130 + }, + { + "epoch": 0.3260902830910482, + "grad_norm": 2.330125546657301, + "learning_rate": 1.908670287906644e-05, + "loss": 0.8163, + "step": 2131 + }, + { + "epoch": 0.3262433052792655, + "grad_norm": 2.368910748289313, + "learning_rate": 1.9085667922842547e-05, + "loss": 0.8446, + "step": 2132 + }, + { + "epoch": 0.32639632746748276, + "grad_norm": 2.460618804621041, + "learning_rate": 1.908463240863225e-05, + "loss": 0.8018, + "step": 2133 + }, + { + "epoch": 0.3265493496557001, + "grad_norm": 2.5082175611570063, + "learning_rate": 1.9083596336499133e-05, + "loss": 0.7482, + "step": 2134 + }, + { + "epoch": 0.32670237184391737, + "grad_norm": 2.8829890496898294, + "learning_rate": 1.908255970650683e-05, + "loss": 0.7766, + "step": 2135 + }, + { + "epoch": 0.32685539403213465, + "grad_norm": 2.6191018339121346, + "learning_rate": 1.9081522518719006e-05, + "loss": 0.8283, + "step": 2136 + }, + { + "epoch": 0.3270084162203519, + "grad_norm": 2.675185201207612, + "learning_rate": 1.9080484773199356e-05, + "loss": 1.0072, + "step": 2137 + }, + { + "epoch": 0.32716143840856926, + "grad_norm": 2.5802166600723293, + "learning_rate": 1.9079446470011615e-05, + "loss": 0.789, + "step": 2138 + }, + { + "epoch": 0.32731446059678654, + "grad_norm": 2.7923015418179107, + "learning_rate": 1.9078407609219543e-05, + "loss": 0.7883, + "step": 2139 + }, + { + "epoch": 0.3274674827850038, + "grad_norm": 2.418163628175102, + "learning_rate": 1.9077368190886946e-05, + "loss": 0.7245, + "step": 2140 + }, + { + "epoch": 0.3276205049732211, + "grad_norm": 2.598496284037849, + "learning_rate": 1.907632821507766e-05, + "loss": 0.7575, + "step": 2141 + }, + { + "epoch": 0.3277735271614384, + "grad_norm": 2.7343117555473846, + "learning_rate": 1.907528768185555e-05, + "loss": 0.874, + "step": 2142 + }, + { + "epoch": 0.3279265493496557, + "grad_norm": 2.642891739247081, + "learning_rate": 1.9074246591284522e-05, + "loss": 0.8379, + "step": 2143 + }, + { + "epoch": 0.328079571537873, + "grad_norm": 2.608322969653126, + "learning_rate": 1.9073204943428512e-05, + "loss": 0.8056, + "step": 2144 + }, + { + "epoch": 0.32823259372609026, + "grad_norm": 2.6873976836473767, + "learning_rate": 1.907216273835149e-05, + "loss": 0.7994, + "step": 2145 + }, + { + "epoch": 0.3283856159143076, + "grad_norm": 2.4514711844365658, + "learning_rate": 1.9071119976117465e-05, + "loss": 0.8881, + "step": 2146 + }, + { + "epoch": 0.32853863810252487, + "grad_norm": 2.5267854318495515, + "learning_rate": 1.9070076656790475e-05, + "loss": 0.8897, + "step": 2147 + }, + { + "epoch": 
0.32869166029074215, + "grad_norm": 2.1105651231627647, + "learning_rate": 1.9069032780434596e-05, + "loss": 0.7618, + "step": 2148 + }, + { + "epoch": 0.3288446824789594, + "grad_norm": 2.3346993598677765, + "learning_rate": 1.9067988347113934e-05, + "loss": 0.785, + "step": 2149 + }, + { + "epoch": 0.32899770466717676, + "grad_norm": 2.4214920221821203, + "learning_rate": 1.9066943356892636e-05, + "loss": 0.8432, + "step": 2150 + }, + { + "epoch": 0.32915072685539404, + "grad_norm": 2.558431202242363, + "learning_rate": 1.9065897809834877e-05, + "loss": 0.9265, + "step": 2151 + }, + { + "epoch": 0.3293037490436113, + "grad_norm": 2.411042024636327, + "learning_rate": 1.9064851706004862e-05, + "loss": 0.8417, + "step": 2152 + }, + { + "epoch": 0.3294567712318286, + "grad_norm": 2.832038379528674, + "learning_rate": 1.9063805045466847e-05, + "loss": 0.8054, + "step": 2153 + }, + { + "epoch": 0.3296097934200459, + "grad_norm": 2.6592576047547123, + "learning_rate": 1.9062757828285105e-05, + "loss": 0.9434, + "step": 2154 + }, + { + "epoch": 0.3297628156082632, + "grad_norm": 2.352342451790287, + "learning_rate": 1.9061710054523953e-05, + "loss": 0.7586, + "step": 2155 + }, + { + "epoch": 0.3299158377964805, + "grad_norm": 2.544209759964932, + "learning_rate": 1.9060661724247734e-05, + "loss": 0.8578, + "step": 2156 + }, + { + "epoch": 0.33006885998469776, + "grad_norm": 2.276504238923556, + "learning_rate": 1.9059612837520834e-05, + "loss": 0.8868, + "step": 2157 + }, + { + "epoch": 0.3302218821729151, + "grad_norm": 2.5733234300853938, + "learning_rate": 1.9058563394407668e-05, + "loss": 0.9171, + "step": 2158 + }, + { + "epoch": 0.33037490436113237, + "grad_norm": 2.431640470434858, + "learning_rate": 1.9057513394972687e-05, + "loss": 0.6812, + "step": 2159 + }, + { + "epoch": 0.33052792654934965, + "grad_norm": 2.603271164903475, + "learning_rate": 1.9056462839280376e-05, + "loss": 0.8457, + "step": 2160 + }, + { + "epoch": 0.3306809487375669, + "grad_norm": 2.731126948671394, + "learning_rate": 1.905541172739525e-05, + "loss": 0.7824, + "step": 2161 + }, + { + "epoch": 0.33083397092578426, + "grad_norm": 2.753119967092687, + "learning_rate": 1.905436005938187e-05, + "loss": 0.8542, + "step": 2162 + }, + { + "epoch": 0.33098699311400154, + "grad_norm": 2.337685299070537, + "learning_rate": 1.9053307835304812e-05, + "loss": 0.8189, + "step": 2163 + }, + { + "epoch": 0.3311400153022188, + "grad_norm": 2.320976989090415, + "learning_rate": 1.9052255055228707e-05, + "loss": 0.6544, + "step": 2164 + }, + { + "epoch": 0.3312930374904361, + "grad_norm": 2.5843477334717653, + "learning_rate": 1.9051201719218207e-05, + "loss": 0.8549, + "step": 2165 + }, + { + "epoch": 0.3314460596786534, + "grad_norm": 2.129202487167354, + "learning_rate": 1.9050147827337996e-05, + "loss": 0.7141, + "step": 2166 + }, + { + "epoch": 0.3315990818668707, + "grad_norm": 2.5652483228964242, + "learning_rate": 1.904909337965281e-05, + "loss": 0.7735, + "step": 2167 + }, + { + "epoch": 0.331752104055088, + "grad_norm": 2.3273524046603633, + "learning_rate": 1.9048038376227392e-05, + "loss": 0.8203, + "step": 2168 + }, + { + "epoch": 0.33190512624330526, + "grad_norm": 2.731759408744939, + "learning_rate": 1.9046982817126545e-05, + "loss": 0.7929, + "step": 2169 + }, + { + "epoch": 0.3320581484315226, + "grad_norm": 2.395296587410217, + "learning_rate": 1.904592670241509e-05, + "loss": 0.9106, + "step": 2170 + }, + { + "epoch": 0.33221117061973987, + "grad_norm": 2.6281801579811566, + "learning_rate": 
1.904487003215789e-05, + "loss": 0.8377, + "step": 2171 + }, + { + "epoch": 0.33236419280795715, + "grad_norm": 2.496937430980088, + "learning_rate": 1.9043812806419833e-05, + "loss": 0.7716, + "step": 2172 + }, + { + "epoch": 0.3325172149961744, + "grad_norm": 2.335221367154538, + "learning_rate": 1.9042755025265858e-05, + "loss": 0.9313, + "step": 2173 + }, + { + "epoch": 0.33267023718439176, + "grad_norm": 2.4705985458013457, + "learning_rate": 1.904169668876092e-05, + "loss": 0.8674, + "step": 2174 + }, + { + "epoch": 0.33282325937260904, + "grad_norm": 2.470941759164986, + "learning_rate": 1.9040637796970013e-05, + "loss": 0.8822, + "step": 2175 + }, + { + "epoch": 0.3329762815608263, + "grad_norm": 2.7471317200021725, + "learning_rate": 1.9039578349958177e-05, + "loss": 0.9441, + "step": 2176 + }, + { + "epoch": 0.3331293037490436, + "grad_norm": 2.561583712652535, + "learning_rate": 1.9038518347790468e-05, + "loss": 0.8602, + "step": 2177 + }, + { + "epoch": 0.3332823259372609, + "grad_norm": 2.636580914055904, + "learning_rate": 1.9037457790531988e-05, + "loss": 0.8297, + "step": 2178 + }, + { + "epoch": 0.3334353481254782, + "grad_norm": 2.874894906101151, + "learning_rate": 1.9036396678247872e-05, + "loss": 0.7911, + "step": 2179 + }, + { + "epoch": 0.3335883703136955, + "grad_norm": 2.449895925283834, + "learning_rate": 1.9035335011003286e-05, + "loss": 0.8366, + "step": 2180 + }, + { + "epoch": 0.33374139250191276, + "grad_norm": 2.294977029076781, + "learning_rate": 1.903427278886343e-05, + "loss": 0.7386, + "step": 2181 + }, + { + "epoch": 0.3338944146901301, + "grad_norm": 3.231281267707513, + "learning_rate": 1.903321001189354e-05, + "loss": 0.8684, + "step": 2182 + }, + { + "epoch": 0.33404743687834737, + "grad_norm": 2.787124174288599, + "learning_rate": 1.9032146680158884e-05, + "loss": 0.8729, + "step": 2183 + }, + { + "epoch": 0.33420045906656465, + "grad_norm": 2.636855875087689, + "learning_rate": 1.903108279372477e-05, + "loss": 0.8681, + "step": 2184 + }, + { + "epoch": 0.3343534812547819, + "grad_norm": 2.3818211247344054, + "learning_rate": 1.9030018352656525e-05, + "loss": 0.9681, + "step": 2185 + }, + { + "epoch": 0.33450650344299926, + "grad_norm": 2.6499490500959655, + "learning_rate": 1.9028953357019534e-05, + "loss": 0.7594, + "step": 2186 + }, + { + "epoch": 0.33465952563121654, + "grad_norm": 2.803284880960482, + "learning_rate": 1.902788780687919e-05, + "loss": 0.8759, + "step": 2187 + }, + { + "epoch": 0.3348125478194338, + "grad_norm": 2.6200835706423558, + "learning_rate": 1.9026821702300942e-05, + "loss": 0.8095, + "step": 2188 + }, + { + "epoch": 0.3349655700076511, + "grad_norm": 2.5244225371364255, + "learning_rate": 1.902575504335026e-05, + "loss": 0.8257, + "step": 2189 + }, + { + "epoch": 0.3351185921958684, + "grad_norm": 2.272322663580629, + "learning_rate": 1.9024687830092653e-05, + "loss": 0.8683, + "step": 2190 + }, + { + "epoch": 0.3352716143840857, + "grad_norm": 2.3507940993495855, + "learning_rate": 1.902362006259366e-05, + "loss": 0.7174, + "step": 2191 + }, + { + "epoch": 0.335424636572303, + "grad_norm": 2.3419236603411786, + "learning_rate": 1.9022551740918858e-05, + "loss": 0.8454, + "step": 2192 + }, + { + "epoch": 0.33557765876052026, + "grad_norm": 2.2407962986631427, + "learning_rate": 1.9021482865133857e-05, + "loss": 0.8129, + "step": 2193 + }, + { + "epoch": 0.3357306809487376, + "grad_norm": 2.829969199508418, + "learning_rate": 1.90204134353043e-05, + "loss": 0.9139, + "step": 2194 + }, + { + "epoch": 0.33588370313695487, 
+ "grad_norm": 2.570297450213411, + "learning_rate": 1.901934345149587e-05, + "loss": 0.8215, + "step": 2195 + }, + { + "epoch": 0.33603672532517215, + "grad_norm": 2.1742508587047356, + "learning_rate": 1.9018272913774272e-05, + "loss": 0.7921, + "step": 2196 + }, + { + "epoch": 0.3361897475133894, + "grad_norm": 2.5516210884648354, + "learning_rate": 1.9017201822205257e-05, + "loss": 0.777, + "step": 2197 + }, + { + "epoch": 0.33634276970160676, + "grad_norm": 2.5965934611015524, + "learning_rate": 1.9016130176854595e-05, + "loss": 0.8242, + "step": 2198 + }, + { + "epoch": 0.33649579188982404, + "grad_norm": 2.633683261887388, + "learning_rate": 1.9015057977788115e-05, + "loss": 0.8029, + "step": 2199 + }, + { + "epoch": 0.3366488140780413, + "grad_norm": 2.6200741185136533, + "learning_rate": 1.9013985225071657e-05, + "loss": 0.8098, + "step": 2200 + }, + { + "epoch": 0.3368018362662586, + "grad_norm": 2.385459716174837, + "learning_rate": 1.90129119187711e-05, + "loss": 0.9182, + "step": 2201 + }, + { + "epoch": 0.3369548584544759, + "grad_norm": 2.8745143161019677, + "learning_rate": 1.901183805895237e-05, + "loss": 0.833, + "step": 2202 + }, + { + "epoch": 0.3371078806426932, + "grad_norm": 2.316707026669433, + "learning_rate": 1.90107636456814e-05, + "loss": 0.8279, + "step": 2203 + }, + { + "epoch": 0.3372609028309105, + "grad_norm": 2.7035829713270187, + "learning_rate": 1.900968867902419e-05, + "loss": 0.8465, + "step": 2204 + }, + { + "epoch": 0.33741392501912776, + "grad_norm": 2.43981629724686, + "learning_rate": 1.9008613159046755e-05, + "loss": 0.8019, + "step": 2205 + }, + { + "epoch": 0.3375669472073451, + "grad_norm": 2.2730341237566107, + "learning_rate": 1.900753708581514e-05, + "loss": 0.8162, + "step": 2206 + }, + { + "epoch": 0.33771996939556237, + "grad_norm": 2.2234217115960955, + "learning_rate": 1.9006460459395436e-05, + "loss": 0.7875, + "step": 2207 + }, + { + "epoch": 0.33787299158377965, + "grad_norm": 2.8015510519189264, + "learning_rate": 1.9005383279853763e-05, + "loss": 0.9112, + "step": 2208 + }, + { + "epoch": 0.3380260137719969, + "grad_norm": 2.5704163897021255, + "learning_rate": 1.900430554725627e-05, + "loss": 0.904, + "step": 2209 + }, + { + "epoch": 0.33817903596021426, + "grad_norm": 2.305524302766331, + "learning_rate": 1.9003227261669154e-05, + "loss": 0.8197, + "step": 2210 + }, + { + "epoch": 0.33833205814843154, + "grad_norm": 2.5338540294728613, + "learning_rate": 1.9002148423158626e-05, + "loss": 0.7963, + "step": 2211 + }, + { + "epoch": 0.3384850803366488, + "grad_norm": 2.767802849237396, + "learning_rate": 1.9001069031790948e-05, + "loss": 0.8008, + "step": 2212 + }, + { + "epoch": 0.3386381025248661, + "grad_norm": 2.6297059670318785, + "learning_rate": 1.899998908763241e-05, + "loss": 0.8164, + "step": 2213 + }, + { + "epoch": 0.3387911247130834, + "grad_norm": 2.5239777756138095, + "learning_rate": 1.8998908590749333e-05, + "loss": 0.7567, + "step": 2214 + }, + { + "epoch": 0.3389441469013007, + "grad_norm": 2.6121503690567582, + "learning_rate": 1.8997827541208073e-05, + "loss": 0.7942, + "step": 2215 + }, + { + "epoch": 0.339097169089518, + "grad_norm": 2.513509488316226, + "learning_rate": 1.8996745939075024e-05, + "loss": 0.8487, + "step": 2216 + }, + { + "epoch": 0.33925019127773526, + "grad_norm": 2.406167309519931, + "learning_rate": 1.8995663784416613e-05, + "loss": 0.8935, + "step": 2217 + }, + { + "epoch": 0.33940321346595254, + "grad_norm": 2.2466325362978314, + "learning_rate": 1.8994581077299303e-05, + "loss": 0.764, 
+ "step": 2218 + }, + { + "epoch": 0.33955623565416987, + "grad_norm": 2.3061195054332155, + "learning_rate": 1.8993497817789574e-05, + "loss": 0.7542, + "step": 2219 + }, + { + "epoch": 0.33970925784238715, + "grad_norm": 2.8382048184352957, + "learning_rate": 1.8992414005953964e-05, + "loss": 0.9037, + "step": 2220 + }, + { + "epoch": 0.3398622800306044, + "grad_norm": 2.3287211280392777, + "learning_rate": 1.8991329641859035e-05, + "loss": 0.8013, + "step": 2221 + }, + { + "epoch": 0.3400153022188217, + "grad_norm": 2.299742943168751, + "learning_rate": 1.8990244725571374e-05, + "loss": 0.7663, + "step": 2222 + }, + { + "epoch": 0.34016832440703904, + "grad_norm": 2.6471103760984764, + "learning_rate": 1.8989159257157616e-05, + "loss": 0.9033, + "step": 2223 + }, + { + "epoch": 0.3403213465952563, + "grad_norm": 2.280805425315999, + "learning_rate": 1.8988073236684422e-05, + "loss": 0.8295, + "step": 2224 + }, + { + "epoch": 0.3404743687834736, + "grad_norm": 2.141508514611218, + "learning_rate": 1.898698666421849e-05, + "loss": 0.7833, + "step": 2225 + }, + { + "epoch": 0.34062739097169087, + "grad_norm": 2.7267105114369685, + "learning_rate": 1.8985899539826547e-05, + "loss": 0.7954, + "step": 2226 + }, + { + "epoch": 0.3407804131599082, + "grad_norm": 2.2761348371968584, + "learning_rate": 1.898481186357536e-05, + "loss": 0.7791, + "step": 2227 + }, + { + "epoch": 0.3409334353481255, + "grad_norm": 2.311320182693125, + "learning_rate": 1.8983723635531733e-05, + "loss": 0.7803, + "step": 2228 + }, + { + "epoch": 0.34108645753634276, + "grad_norm": 2.416674743767647, + "learning_rate": 1.898263485576249e-05, + "loss": 0.7712, + "step": 2229 + }, + { + "epoch": 0.34123947972456004, + "grad_norm": 2.41675063992554, + "learning_rate": 1.8981545524334497e-05, + "loss": 0.8321, + "step": 2230 + }, + { + "epoch": 0.34139250191277737, + "grad_norm": 2.374216831787798, + "learning_rate": 1.8980455641314658e-05, + "loss": 0.759, + "step": 2231 + }, + { + "epoch": 0.34154552410099465, + "grad_norm": 2.5464606847654414, + "learning_rate": 1.897936520676991e-05, + "loss": 0.8562, + "step": 2232 + }, + { + "epoch": 0.3416985462892119, + "grad_norm": 2.138460875090068, + "learning_rate": 1.8978274220767218e-05, + "loss": 0.7936, + "step": 2233 + }, + { + "epoch": 0.3418515684774292, + "grad_norm": 2.430427227076344, + "learning_rate": 1.8977182683373577e-05, + "loss": 0.7254, + "step": 2234 + }, + { + "epoch": 0.34200459066564654, + "grad_norm": 2.4379531574708335, + "learning_rate": 1.8976090594656034e-05, + "loss": 0.7694, + "step": 2235 + }, + { + "epoch": 0.3421576128538638, + "grad_norm": 2.3778128523783413, + "learning_rate": 1.8974997954681652e-05, + "loss": 0.7247, + "step": 2236 + }, + { + "epoch": 0.3423106350420811, + "grad_norm": 2.6390102931039183, + "learning_rate": 1.8973904763517534e-05, + "loss": 0.8236, + "step": 2237 + }, + { + "epoch": 0.34246365723029837, + "grad_norm": 2.298857942777125, + "learning_rate": 1.8972811021230816e-05, + "loss": 0.7822, + "step": 2238 + }, + { + "epoch": 0.3426166794185157, + "grad_norm": 2.1998620792842702, + "learning_rate": 1.8971716727888677e-05, + "loss": 0.7806, + "step": 2239 + }, + { + "epoch": 0.342769701606733, + "grad_norm": 2.656918161912584, + "learning_rate": 1.897062188355831e-05, + "loss": 0.9258, + "step": 2240 + }, + { + "epoch": 0.34292272379495026, + "grad_norm": 2.4514776639915032, + "learning_rate": 1.8969526488306965e-05, + "loss": 0.771, + "step": 2241 + }, + { + "epoch": 0.34307574598316753, + "grad_norm": 2.404064889745198, 
+ "learning_rate": 1.8968430542201905e-05, + "loss": 0.7684, + "step": 2242 + }, + { + "epoch": 0.34322876817138487, + "grad_norm": 2.289169317066614, + "learning_rate": 1.8967334045310445e-05, + "loss": 0.9253, + "step": 2243 + }, + { + "epoch": 0.34338179035960215, + "grad_norm": 2.3330229020319337, + "learning_rate": 1.896623699769992e-05, + "loss": 0.8232, + "step": 2244 + }, + { + "epoch": 0.3435348125478194, + "grad_norm": 2.513086642418052, + "learning_rate": 1.89651393994377e-05, + "loss": 0.8884, + "step": 2245 + }, + { + "epoch": 0.3436878347360367, + "grad_norm": 2.8654016636145743, + "learning_rate": 1.8964041250591203e-05, + "loss": 0.873, + "step": 2246 + }, + { + "epoch": 0.34384085692425403, + "grad_norm": 2.565973612887949, + "learning_rate": 1.8962942551227862e-05, + "loss": 0.7987, + "step": 2247 + }, + { + "epoch": 0.3439938791124713, + "grad_norm": 2.36055185125566, + "learning_rate": 1.896184330141516e-05, + "loss": 0.7303, + "step": 2248 + }, + { + "epoch": 0.3441469013006886, + "grad_norm": 2.3238221779857984, + "learning_rate": 1.8960743501220597e-05, + "loss": 0.797, + "step": 2249 + }, + { + "epoch": 0.34429992348890587, + "grad_norm": 2.455737367611592, + "learning_rate": 1.8959643150711723e-05, + "loss": 0.759, + "step": 2250 + }, + { + "epoch": 0.3444529456771232, + "grad_norm": 2.3782474528258106, + "learning_rate": 1.8958542249956116e-05, + "loss": 0.8592, + "step": 2251 + }, + { + "epoch": 0.3446059678653405, + "grad_norm": 2.278193327077733, + "learning_rate": 1.8957440799021377e-05, + "loss": 0.7989, + "step": 2252 + }, + { + "epoch": 0.34475899005355776, + "grad_norm": 2.299702774333914, + "learning_rate": 1.895633879797516e-05, + "loss": 0.8959, + "step": 2253 + }, + { + "epoch": 0.34491201224177503, + "grad_norm": 2.050241850848599, + "learning_rate": 1.8955236246885143e-05, + "loss": 0.6608, + "step": 2254 + }, + { + "epoch": 0.34506503442999237, + "grad_norm": 2.1911195420404725, + "learning_rate": 1.8954133145819028e-05, + "loss": 0.696, + "step": 2255 + }, + { + "epoch": 0.34521805661820965, + "grad_norm": 2.505315912925584, + "learning_rate": 1.8953029494844575e-05, + "loss": 0.8803, + "step": 2256 + }, + { + "epoch": 0.3453710788064269, + "grad_norm": 2.3649033111770716, + "learning_rate": 1.8951925294029553e-05, + "loss": 0.7616, + "step": 2257 + }, + { + "epoch": 0.3455241009946442, + "grad_norm": 2.3759627706360256, + "learning_rate": 1.895082054344178e-05, + "loss": 0.8723, + "step": 2258 + }, + { + "epoch": 0.34567712318286153, + "grad_norm": 2.4769807539083124, + "learning_rate": 1.89497152431491e-05, + "loss": 0.8755, + "step": 2259 + }, + { + "epoch": 0.3458301453710788, + "grad_norm": 2.372527770894994, + "learning_rate": 1.8948609393219394e-05, + "loss": 0.8229, + "step": 2260 + }, + { + "epoch": 0.3459831675592961, + "grad_norm": 2.30265390487159, + "learning_rate": 1.894750299372058e-05, + "loss": 0.7476, + "step": 2261 + }, + { + "epoch": 0.34613618974751337, + "grad_norm": 2.355500546095671, + "learning_rate": 1.8946396044720607e-05, + "loss": 0.732, + "step": 2262 + }, + { + "epoch": 0.3462892119357307, + "grad_norm": 2.324299123570443, + "learning_rate": 1.8945288546287452e-05, + "loss": 0.803, + "step": 2263 + }, + { + "epoch": 0.346442234123948, + "grad_norm": 2.4044763714157695, + "learning_rate": 1.894418049848913e-05, + "loss": 0.7983, + "step": 2264 + }, + { + "epoch": 0.34659525631216526, + "grad_norm": 2.6759518579838932, + "learning_rate": 1.8943071901393698e-05, + "loss": 0.8131, + "step": 2265 + }, + { + "epoch": 
0.34674827850038253, + "grad_norm": 2.3991278259603535, + "learning_rate": 1.894196275506923e-05, + "loss": 0.8602, + "step": 2266 + }, + { + "epoch": 0.34690130068859987, + "grad_norm": 2.2733901741363542, + "learning_rate": 1.8940853059583853e-05, + "loss": 0.8241, + "step": 2267 + }, + { + "epoch": 0.34705432287681715, + "grad_norm": 3.8759891438201906, + "learning_rate": 1.8939742815005716e-05, + "loss": 0.8396, + "step": 2268 + }, + { + "epoch": 0.3472073450650344, + "grad_norm": 2.3324798219158316, + "learning_rate": 1.8938632021402994e-05, + "loss": 0.676, + "step": 2269 + }, + { + "epoch": 0.3473603672532517, + "grad_norm": 2.2829117471511373, + "learning_rate": 1.893752067884391e-05, + "loss": 0.7266, + "step": 2270 + }, + { + "epoch": 0.34751338944146903, + "grad_norm": 2.544061089313835, + "learning_rate": 1.8936408787396724e-05, + "loss": 0.7567, + "step": 2271 + }, + { + "epoch": 0.3476664116296863, + "grad_norm": 2.8639141424306698, + "learning_rate": 1.893529634712971e-05, + "loss": 0.9786, + "step": 2272 + }, + { + "epoch": 0.3478194338179036, + "grad_norm": 2.2951994146776244, + "learning_rate": 1.8934183358111194e-05, + "loss": 0.7286, + "step": 2273 + }, + { + "epoch": 0.34797245600612087, + "grad_norm": 2.514494070203749, + "learning_rate": 1.8933069820409528e-05, + "loss": 0.8458, + "step": 2274 + }, + { + "epoch": 0.3481254781943382, + "grad_norm": 2.3009159384379516, + "learning_rate": 1.8931955734093096e-05, + "loss": 0.809, + "step": 2275 + }, + { + "epoch": 0.3482785003825555, + "grad_norm": 2.6169208483293187, + "learning_rate": 1.893084109923032e-05, + "loss": 0.7312, + "step": 2276 + }, + { + "epoch": 0.34843152257077276, + "grad_norm": 2.318572642711375, + "learning_rate": 1.8929725915889656e-05, + "loss": 0.8057, + "step": 2277 + }, + { + "epoch": 0.34858454475899003, + "grad_norm": 2.4384080072672605, + "learning_rate": 1.892861018413959e-05, + "loss": 0.7845, + "step": 2278 + }, + { + "epoch": 0.34873756694720737, + "grad_norm": 2.233659076191945, + "learning_rate": 1.8927493904048645e-05, + "loss": 0.7987, + "step": 2279 + }, + { + "epoch": 0.34889058913542464, + "grad_norm": 2.8132353615209555, + "learning_rate": 1.8926377075685375e-05, + "loss": 0.792, + "step": 2280 + }, + { + "epoch": 0.3490436113236419, + "grad_norm": 2.745124612350017, + "learning_rate": 1.8925259699118363e-05, + "loss": 0.8503, + "step": 2281 + }, + { + "epoch": 0.3491966335118592, + "grad_norm": 2.3941958793093483, + "learning_rate": 1.8924141774416243e-05, + "loss": 0.7677, + "step": 2282 + }, + { + "epoch": 0.34934965570007653, + "grad_norm": 2.3379365924269493, + "learning_rate": 1.892302330164766e-05, + "loss": 0.8122, + "step": 2283 + }, + { + "epoch": 0.3495026778882938, + "grad_norm": 2.533816103656603, + "learning_rate": 1.8921904280881316e-05, + "loss": 0.7815, + "step": 2284 + }, + { + "epoch": 0.3496557000765111, + "grad_norm": 2.6931588712567947, + "learning_rate": 1.8920784712185925e-05, + "loss": 0.9097, + "step": 2285 + }, + { + "epoch": 0.34980872226472837, + "grad_norm": 2.270706751809004, + "learning_rate": 1.8919664595630244e-05, + "loss": 0.8721, + "step": 2286 + }, + { + "epoch": 0.3499617444529457, + "grad_norm": 2.645794975569509, + "learning_rate": 1.8918543931283065e-05, + "loss": 0.8858, + "step": 2287 + }, + { + "epoch": 0.350114766641163, + "grad_norm": 2.747285576887863, + "learning_rate": 1.891742271921322e-05, + "loss": 0.87, + "step": 2288 + }, + { + "epoch": 0.35026778882938026, + "grad_norm": 2.363498122115401, + "learning_rate": 
1.8916300959489555e-05, + "loss": 0.7907, + "step": 2289 + }, + { + "epoch": 0.35042081101759753, + "grad_norm": 2.4364836344114784, + "learning_rate": 1.8915178652180968e-05, + "loss": 0.8606, + "step": 2290 + }, + { + "epoch": 0.35057383320581487, + "grad_norm": 2.353223078511656, + "learning_rate": 1.8914055797356385e-05, + "loss": 0.7986, + "step": 2291 + }, + { + "epoch": 0.35072685539403214, + "grad_norm": 2.7270978738275513, + "learning_rate": 1.8912932395084763e-05, + "loss": 0.7931, + "step": 2292 + }, + { + "epoch": 0.3508798775822494, + "grad_norm": 2.537289091735032, + "learning_rate": 1.8911808445435097e-05, + "loss": 0.9002, + "step": 2293 + }, + { + "epoch": 0.3510328997704667, + "grad_norm": 2.6426827278918488, + "learning_rate": 1.8910683948476407e-05, + "loss": 0.7942, + "step": 2294 + }, + { + "epoch": 0.35118592195868403, + "grad_norm": 2.2600513381334273, + "learning_rate": 1.890955890427776e-05, + "loss": 0.7864, + "step": 2295 + }, + { + "epoch": 0.3513389441469013, + "grad_norm": 2.1081811704121085, + "learning_rate": 1.8908433312908245e-05, + "loss": 0.6954, + "step": 2296 + }, + { + "epoch": 0.3514919663351186, + "grad_norm": 2.75647258716163, + "learning_rate": 1.8907307174436993e-05, + "loss": 0.833, + "step": 2297 + }, + { + "epoch": 0.35164498852333587, + "grad_norm": 2.4321268972579246, + "learning_rate": 1.8906180488933157e-05, + "loss": 0.8105, + "step": 2298 + }, + { + "epoch": 0.3517980107115532, + "grad_norm": 2.1495453410926504, + "learning_rate": 1.8905053256465938e-05, + "loss": 0.664, + "step": 2299 + }, + { + "epoch": 0.3519510328997705, + "grad_norm": 2.6676960585373695, + "learning_rate": 1.8903925477104564e-05, + "loss": 0.8758, + "step": 2300 + }, + { + "epoch": 0.35210405508798776, + "grad_norm": 2.7198945884747974, + "learning_rate": 1.8902797150918296e-05, + "loss": 0.7659, + "step": 2301 + }, + { + "epoch": 0.35225707727620503, + "grad_norm": 2.6559777133855365, + "learning_rate": 1.8901668277976418e-05, + "loss": 0.8424, + "step": 2302 + }, + { + "epoch": 0.35241009946442237, + "grad_norm": 2.434513815774383, + "learning_rate": 1.8900538858348278e-05, + "loss": 0.8141, + "step": 2303 + }, + { + "epoch": 0.35256312165263964, + "grad_norm": 2.484738285350647, + "learning_rate": 1.889940889210322e-05, + "loss": 0.735, + "step": 2304 + }, + { + "epoch": 0.3527161438408569, + "grad_norm": 2.2508751836804537, + "learning_rate": 1.889827837931065e-05, + "loss": 0.7989, + "step": 2305 + }, + { + "epoch": 0.3528691660290742, + "grad_norm": 2.640756293380206, + "learning_rate": 1.8897147320039996e-05, + "loss": 0.881, + "step": 2306 + }, + { + "epoch": 0.35302218821729153, + "grad_norm": 2.5263231329934652, + "learning_rate": 1.8896015714360717e-05, + "loss": 0.8671, + "step": 2307 + }, + { + "epoch": 0.3531752104055088, + "grad_norm": 2.085279323673372, + "learning_rate": 1.8894883562342312e-05, + "loss": 0.6073, + "step": 2308 + }, + { + "epoch": 0.3533282325937261, + "grad_norm": 2.691825024482245, + "learning_rate": 1.8893750864054313e-05, + "loss": 0.8882, + "step": 2309 + }, + { + "epoch": 0.35348125478194337, + "grad_norm": 2.4796532361562282, + "learning_rate": 1.8892617619566277e-05, + "loss": 0.859, + "step": 2310 + }, + { + "epoch": 0.3536342769701607, + "grad_norm": 2.498491204421089, + "learning_rate": 1.8891483828947805e-05, + "loss": 0.8075, + "step": 2311 + }, + { + "epoch": 0.353787299158378, + "grad_norm": 2.4618340934724237, + "learning_rate": 1.889034949226853e-05, + "loss": 0.8162, + "step": 2312 + }, + { + "epoch": 
0.35394032134659525, + "grad_norm": 2.3787322282171117, + "learning_rate": 1.8889214609598113e-05, + "loss": 0.7916, + "step": 2313 + }, + { + "epoch": 0.35409334353481253, + "grad_norm": 2.331498599753699, + "learning_rate": 1.888807918100625e-05, + "loss": 0.8632, + "step": 2314 + }, + { + "epoch": 0.35424636572302987, + "grad_norm": 2.221565693142028, + "learning_rate": 1.8886943206562677e-05, + "loss": 0.7716, + "step": 2315 + }, + { + "epoch": 0.35439938791124714, + "grad_norm": 2.7336828188903874, + "learning_rate": 1.8885806686337154e-05, + "loss": 0.8538, + "step": 2316 + }, + { + "epoch": 0.3545524100994644, + "grad_norm": 2.3932636463702948, + "learning_rate": 1.888466962039948e-05, + "loss": 0.8395, + "step": 2317 + }, + { + "epoch": 0.3547054322876817, + "grad_norm": 2.2898696428789775, + "learning_rate": 1.888353200881949e-05, + "loss": 0.7866, + "step": 2318 + }, + { + "epoch": 0.35485845447589903, + "grad_norm": 2.2783308963748494, + "learning_rate": 1.8882393851667046e-05, + "loss": 0.7497, + "step": 2319 + }, + { + "epoch": 0.3550114766641163, + "grad_norm": 2.3945871355573454, + "learning_rate": 1.8881255149012047e-05, + "loss": 0.8037, + "step": 2320 + }, + { + "epoch": 0.3551644988523336, + "grad_norm": 2.4599488406377965, + "learning_rate": 1.8880115900924426e-05, + "loss": 0.7507, + "step": 2321 + }, + { + "epoch": 0.35531752104055087, + "grad_norm": 2.4065925989291492, + "learning_rate": 1.8878976107474147e-05, + "loss": 0.7945, + "step": 2322 + }, + { + "epoch": 0.3554705432287682, + "grad_norm": 2.4302388243500834, + "learning_rate": 1.8877835768731212e-05, + "loss": 0.8201, + "step": 2323 + }, + { + "epoch": 0.3556235654169855, + "grad_norm": 2.446682716888137, + "learning_rate": 1.8876694884765648e-05, + "loss": 0.8339, + "step": 2324 + }, + { + "epoch": 0.35577658760520275, + "grad_norm": 2.656460918351645, + "learning_rate": 1.887555345564753e-05, + "loss": 0.8646, + "step": 2325 + }, + { + "epoch": 0.35592960979342003, + "grad_norm": 2.349546610049587, + "learning_rate": 1.887441148144695e-05, + "loss": 0.7222, + "step": 2326 + }, + { + "epoch": 0.3560826319816373, + "grad_norm": 2.404762586158525, + "learning_rate": 1.887326896223405e-05, + "loss": 0.8084, + "step": 2327 + }, + { + "epoch": 0.35623565416985464, + "grad_norm": 2.283494276451758, + "learning_rate": 1.8872125898078985e-05, + "loss": 0.841, + "step": 2328 + }, + { + "epoch": 0.3563886763580719, + "grad_norm": 2.448512269421924, + "learning_rate": 1.8870982289051962e-05, + "loss": 0.7663, + "step": 2329 + }, + { + "epoch": 0.3565416985462892, + "grad_norm": 2.457783473444631, + "learning_rate": 1.8869838135223214e-05, + "loss": 0.763, + "step": 2330 + }, + { + "epoch": 0.3566947207345065, + "grad_norm": 2.5155960596034785, + "learning_rate": 1.8868693436663006e-05, + "loss": 0.8754, + "step": 2331 + }, + { + "epoch": 0.3568477429227238, + "grad_norm": 2.4245646747227085, + "learning_rate": 1.886754819344164e-05, + "loss": 0.7733, + "step": 2332 + }, + { + "epoch": 0.3570007651109411, + "grad_norm": 2.2376909756083623, + "learning_rate": 1.8866402405629447e-05, + "loss": 0.7319, + "step": 2333 + }, + { + "epoch": 0.35715378729915837, + "grad_norm": 2.4584252279555714, + "learning_rate": 1.88652560732968e-05, + "loss": 0.8691, + "step": 2334 + }, + { + "epoch": 0.35730680948737564, + "grad_norm": 2.541807107424281, + "learning_rate": 1.8864109196514096e-05, + "loss": 0.8073, + "step": 2335 + }, + { + "epoch": 0.357459831675593, + "grad_norm": 2.359184716636419, + "learning_rate": 
1.886296177535177e-05, + "loss": 0.7375, + "step": 2336 + }, + { + "epoch": 0.35761285386381025, + "grad_norm": 2.3320992648966636, + "learning_rate": 1.8861813809880288e-05, + "loss": 0.7953, + "step": 2337 + }, + { + "epoch": 0.35776587605202753, + "grad_norm": 2.304009796345526, + "learning_rate": 1.8860665300170155e-05, + "loss": 0.7176, + "step": 2338 + }, + { + "epoch": 0.3579188982402448, + "grad_norm": 2.3503014104713094, + "learning_rate": 1.8859516246291897e-05, + "loss": 0.8045, + "step": 2339 + }, + { + "epoch": 0.35807192042846214, + "grad_norm": 2.619508152477919, + "learning_rate": 1.8858366648316093e-05, + "loss": 0.908, + "step": 2340 + }, + { + "epoch": 0.3582249426166794, + "grad_norm": 2.4429041886742264, + "learning_rate": 1.8857216506313334e-05, + "loss": 0.7526, + "step": 2341 + }, + { + "epoch": 0.3583779648048967, + "grad_norm": 2.543918354148765, + "learning_rate": 1.8856065820354267e-05, + "loss": 0.9139, + "step": 2342 + }, + { + "epoch": 0.358530986993114, + "grad_norm": 2.6126350593443632, + "learning_rate": 1.8854914590509547e-05, + "loss": 0.8782, + "step": 2343 + }, + { + "epoch": 0.3586840091813313, + "grad_norm": 2.6110295147807583, + "learning_rate": 1.8853762816849882e-05, + "loss": 0.8034, + "step": 2344 + }, + { + "epoch": 0.3588370313695486, + "grad_norm": 2.3093166827506866, + "learning_rate": 1.885261049944601e-05, + "loss": 0.8519, + "step": 2345 + }, + { + "epoch": 0.35899005355776586, + "grad_norm": 2.355282542968486, + "learning_rate": 1.885145763836869e-05, + "loss": 0.8291, + "step": 2346 + }, + { + "epoch": 0.35914307574598314, + "grad_norm": 2.347727191149938, + "learning_rate": 1.8850304233688735e-05, + "loss": 0.8327, + "step": 2347 + }, + { + "epoch": 0.3592960979342005, + "grad_norm": 2.7381027335841504, + "learning_rate": 1.884915028547697e-05, + "loss": 0.8587, + "step": 2348 + }, + { + "epoch": 0.35944912012241775, + "grad_norm": 2.2608152358156977, + "learning_rate": 1.884799579380427e-05, + "loss": 0.7851, + "step": 2349 + }, + { + "epoch": 0.35960214231063503, + "grad_norm": 2.5461028673938064, + "learning_rate": 1.8846840758741533e-05, + "loss": 0.8457, + "step": 2350 + }, + { + "epoch": 0.3597551644988523, + "grad_norm": 2.197749699940499, + "learning_rate": 1.88456851803597e-05, + "loss": 0.8232, + "step": 2351 + }, + { + "epoch": 0.35990818668706964, + "grad_norm": 2.2081723097087247, + "learning_rate": 1.8844529058729732e-05, + "loss": 0.8364, + "step": 2352 + }, + { + "epoch": 0.3600612088752869, + "grad_norm": 2.2167658053229014, + "learning_rate": 1.8843372393922633e-05, + "loss": 0.7587, + "step": 2353 + }, + { + "epoch": 0.3602142310635042, + "grad_norm": 2.5104046616375006, + "learning_rate": 1.8842215186009447e-05, + "loss": 0.8437, + "step": 2354 + }, + { + "epoch": 0.3603672532517215, + "grad_norm": 2.4725723695877857, + "learning_rate": 1.884105743506123e-05, + "loss": 0.7279, + "step": 2355 + }, + { + "epoch": 0.3605202754399388, + "grad_norm": 2.457372115979769, + "learning_rate": 1.883989914114909e-05, + "loss": 0.827, + "step": 2356 + }, + { + "epoch": 0.3606732976281561, + "grad_norm": 2.410585651428132, + "learning_rate": 1.883874030434416e-05, + "loss": 0.7795, + "step": 2357 + }, + { + "epoch": 0.36082631981637336, + "grad_norm": 2.4141145834331135, + "learning_rate": 1.8837580924717614e-05, + "loss": 0.777, + "step": 2358 + }, + { + "epoch": 0.36097934200459064, + "grad_norm": 2.2617813201912673, + "learning_rate": 1.883642100234065e-05, + "loss": 0.7614, + "step": 2359 + }, + { + "epoch": 
0.361132364192808, + "grad_norm": 2.2966321276793735, + "learning_rate": 1.88352605372845e-05, + "loss": 0.7941, + "step": 2360 + }, + { + "epoch": 0.36128538638102525, + "grad_norm": 2.110102263249786, + "learning_rate": 1.8834099529620442e-05, + "loss": 0.6854, + "step": 2361 + }, + { + "epoch": 0.36143840856924253, + "grad_norm": 2.7367322503438754, + "learning_rate": 1.883293797941977e-05, + "loss": 0.8223, + "step": 2362 + }, + { + "epoch": 0.3615914307574598, + "grad_norm": 2.709621371396735, + "learning_rate": 1.8831775886753822e-05, + "loss": 0.7787, + "step": 2363 + }, + { + "epoch": 0.36174445294567714, + "grad_norm": 2.440509867786845, + "learning_rate": 1.883061325169397e-05, + "loss": 0.8569, + "step": 2364 + }, + { + "epoch": 0.3618974751338944, + "grad_norm": 2.504540793639551, + "learning_rate": 1.882945007431161e-05, + "loss": 0.7796, + "step": 2365 + }, + { + "epoch": 0.3620504973221117, + "grad_norm": 2.6996923030735096, + "learning_rate": 1.882828635467818e-05, + "loss": 0.8674, + "step": 2366 + }, + { + "epoch": 0.362203519510329, + "grad_norm": 2.8062029268635267, + "learning_rate": 1.8827122092865147e-05, + "loss": 0.9018, + "step": 2367 + }, + { + "epoch": 0.3623565416985463, + "grad_norm": 2.393990682979781, + "learning_rate": 1.8825957288944017e-05, + "loss": 0.8371, + "step": 2368 + }, + { + "epoch": 0.3625095638867636, + "grad_norm": 2.302645835173265, + "learning_rate": 1.882479194298632e-05, + "loss": 0.7554, + "step": 2369 + }, + { + "epoch": 0.36266258607498086, + "grad_norm": 2.42744947114573, + "learning_rate": 1.882362605506363e-05, + "loss": 0.8365, + "step": 2370 + }, + { + "epoch": 0.36281560826319814, + "grad_norm": 2.541019733868751, + "learning_rate": 1.8822459625247544e-05, + "loss": 0.9123, + "step": 2371 + }, + { + "epoch": 0.3629686304514155, + "grad_norm": 1.9423446059365113, + "learning_rate": 1.8821292653609698e-05, + "loss": 0.6642, + "step": 2372 + }, + { + "epoch": 0.36312165263963275, + "grad_norm": 2.662428175077465, + "learning_rate": 1.8820125140221762e-05, + "loss": 0.934, + "step": 2373 + }, + { + "epoch": 0.36327467482785003, + "grad_norm": 2.3608324759387105, + "learning_rate": 1.8818957085155437e-05, + "loss": 0.8484, + "step": 2374 + }, + { + "epoch": 0.3634276970160673, + "grad_norm": 2.43897927593089, + "learning_rate": 1.881778848848246e-05, + "loss": 0.9507, + "step": 2375 + }, + { + "epoch": 0.36358071920428464, + "grad_norm": 2.2402163894990617, + "learning_rate": 1.881661935027459e-05, + "loss": 0.7647, + "step": 2376 + }, + { + "epoch": 0.3637337413925019, + "grad_norm": 2.4107917616959322, + "learning_rate": 1.881544967060364e-05, + "loss": 0.8487, + "step": 2377 + }, + { + "epoch": 0.3638867635807192, + "grad_norm": 2.291052028819227, + "learning_rate": 1.881427944954144e-05, + "loss": 0.765, + "step": 2378 + }, + { + "epoch": 0.3640397857689365, + "grad_norm": 2.33261212765457, + "learning_rate": 1.8813108687159852e-05, + "loss": 0.7599, + "step": 2379 + }, + { + "epoch": 0.3641928079571538, + "grad_norm": 2.3741799543583517, + "learning_rate": 1.8811937383530786e-05, + "loss": 0.9039, + "step": 2380 + }, + { + "epoch": 0.3643458301453711, + "grad_norm": 2.697621817440523, + "learning_rate": 1.8810765538726175e-05, + "loss": 0.8248, + "step": 2381 + }, + { + "epoch": 0.36449885233358836, + "grad_norm": 2.317964315825029, + "learning_rate": 1.880959315281798e-05, + "loss": 0.6964, + "step": 2382 + }, + { + "epoch": 0.36465187452180564, + "grad_norm": 2.2431665552546427, + "learning_rate": 1.8808420225878207e-05, + 
"loss": 0.8107, + "step": 2383 + }, + { + "epoch": 0.364804896710023, + "grad_norm": 2.4004243356517687, + "learning_rate": 1.8807246757978892e-05, + "loss": 0.7904, + "step": 2384 + }, + { + "epoch": 0.36495791889824025, + "grad_norm": 2.7187226403241267, + "learning_rate": 1.8806072749192096e-05, + "loss": 0.9686, + "step": 2385 + }, + { + "epoch": 0.36511094108645753, + "grad_norm": 2.8316217465961344, + "learning_rate": 1.8804898199589925e-05, + "loss": 0.8722, + "step": 2386 + }, + { + "epoch": 0.3652639632746748, + "grad_norm": 2.3925054684622915, + "learning_rate": 1.8803723109244513e-05, + "loss": 0.9331, + "step": 2387 + }, + { + "epoch": 0.36541698546289214, + "grad_norm": 2.4682270072369636, + "learning_rate": 1.880254747822802e-05, + "loss": 0.7587, + "step": 2388 + }, + { + "epoch": 0.3655700076511094, + "grad_norm": 2.3194225175157093, + "learning_rate": 1.8801371306612652e-05, + "loss": 0.8076, + "step": 2389 + }, + { + "epoch": 0.3657230298393267, + "grad_norm": 2.4399649171572486, + "learning_rate": 1.880019459447064e-05, + "loss": 0.8232, + "step": 2390 + }, + { + "epoch": 0.365876052027544, + "grad_norm": 2.3831569144484126, + "learning_rate": 1.8799017341874255e-05, + "loss": 0.7088, + "step": 2391 + }, + { + "epoch": 0.3660290742157613, + "grad_norm": 2.3319903377417854, + "learning_rate": 1.879783954889579e-05, + "loss": 0.7227, + "step": 2392 + }, + { + "epoch": 0.3661820964039786, + "grad_norm": 2.495408648387555, + "learning_rate": 1.879666121560758e-05, + "loss": 0.8645, + "step": 2393 + }, + { + "epoch": 0.36633511859219586, + "grad_norm": 2.008295423288152, + "learning_rate": 1.8795482342081995e-05, + "loss": 0.7434, + "step": 2394 + }, + { + "epoch": 0.36648814078041314, + "grad_norm": 2.3226281552138484, + "learning_rate": 1.879430292839143e-05, + "loss": 0.7134, + "step": 2395 + }, + { + "epoch": 0.3666411629686305, + "grad_norm": 2.7280664670536074, + "learning_rate": 1.8793122974608318e-05, + "loss": 0.8971, + "step": 2396 + }, + { + "epoch": 0.36679418515684775, + "grad_norm": 2.417968916079334, + "learning_rate": 1.8791942480805127e-05, + "loss": 0.9469, + "step": 2397 + }, + { + "epoch": 0.36694720734506503, + "grad_norm": 2.3321600957081703, + "learning_rate": 1.8790761447054353e-05, + "loss": 0.8373, + "step": 2398 + }, + { + "epoch": 0.3671002295332823, + "grad_norm": 2.9661860123961263, + "learning_rate": 1.8789579873428526e-05, + "loss": 0.8054, + "step": 2399 + }, + { + "epoch": 0.36725325172149964, + "grad_norm": 2.8557791816349822, + "learning_rate": 1.878839776000022e-05, + "loss": 0.8268, + "step": 2400 + }, + { + "epoch": 0.3674062739097169, + "grad_norm": 2.5578308951105853, + "learning_rate": 1.8787215106842022e-05, + "loss": 0.7479, + "step": 2401 + }, + { + "epoch": 0.3675592960979342, + "grad_norm": 2.9152984669056323, + "learning_rate": 1.878603191402657e-05, + "loss": 0.8661, + "step": 2402 + }, + { + "epoch": 0.3677123182861515, + "grad_norm": 2.3207274090171843, + "learning_rate": 1.878484818162653e-05, + "loss": 0.805, + "step": 2403 + }, + { + "epoch": 0.3678653404743688, + "grad_norm": 2.3006454257014557, + "learning_rate": 1.878366390971459e-05, + "loss": 0.7455, + "step": 2404 + }, + { + "epoch": 0.3680183626625861, + "grad_norm": 2.3525946228739545, + "learning_rate": 1.8782479098363494e-05, + "loss": 0.8276, + "step": 2405 + }, + { + "epoch": 0.36817138485080336, + "grad_norm": 2.4311958649745504, + "learning_rate": 1.8781293747645998e-05, + "loss": 0.8442, + "step": 2406 + }, + { + "epoch": 0.36832440703902064, + "grad_norm": 
2.2341059242635364, + "learning_rate": 1.87801078576349e-05, + "loss": 0.7892, + "step": 2407 + }, + { + "epoch": 0.368477429227238, + "grad_norm": 2.4789478015956288, + "learning_rate": 1.877892142840303e-05, + "loss": 0.7717, + "step": 2408 + }, + { + "epoch": 0.36863045141545525, + "grad_norm": 2.3251520271339743, + "learning_rate": 1.877773446002325e-05, + "loss": 0.9049, + "step": 2409 + }, + { + "epoch": 0.36878347360367253, + "grad_norm": 2.4284067911124843, + "learning_rate": 1.8776546952568456e-05, + "loss": 0.8632, + "step": 2410 + }, + { + "epoch": 0.3689364957918898, + "grad_norm": 2.407972854722055, + "learning_rate": 1.8775358906111586e-05, + "loss": 0.6936, + "step": 2411 + }, + { + "epoch": 0.36908951798010714, + "grad_norm": 2.2389103408646136, + "learning_rate": 1.8774170320725592e-05, + "loss": 0.6321, + "step": 2412 + }, + { + "epoch": 0.3692425401683244, + "grad_norm": 2.5869540597200396, + "learning_rate": 1.8772981196483474e-05, + "loss": 0.9377, + "step": 2413 + }, + { + "epoch": 0.3693955623565417, + "grad_norm": 2.752160756736885, + "learning_rate": 1.8771791533458264e-05, + "loss": 0.8435, + "step": 2414 + }, + { + "epoch": 0.369548584544759, + "grad_norm": 2.522539089724177, + "learning_rate": 1.8770601331723017e-05, + "loss": 0.8243, + "step": 2415 + }, + { + "epoch": 0.3697016067329763, + "grad_norm": 2.810550793971596, + "learning_rate": 1.8769410591350832e-05, + "loss": 0.9468, + "step": 2416 + }, + { + "epoch": 0.3698546289211936, + "grad_norm": 2.469864839731524, + "learning_rate": 1.8768219312414838e-05, + "loss": 0.7879, + "step": 2417 + }, + { + "epoch": 0.37000765110941086, + "grad_norm": 2.271877626883605, + "learning_rate": 1.8767027494988194e-05, + "loss": 0.8244, + "step": 2418 + }, + { + "epoch": 0.37016067329762814, + "grad_norm": 2.6112175112151044, + "learning_rate": 1.8765835139144093e-05, + "loss": 0.775, + "step": 2419 + }, + { + "epoch": 0.3703136954858455, + "grad_norm": 2.266093145167797, + "learning_rate": 1.8764642244955767e-05, + "loss": 0.7782, + "step": 2420 + }, + { + "epoch": 0.37046671767406275, + "grad_norm": 2.3075593915599724, + "learning_rate": 1.876344881249647e-05, + "loss": 0.8186, + "step": 2421 + }, + { + "epoch": 0.37061973986228003, + "grad_norm": 2.560441628674782, + "learning_rate": 1.8762254841839503e-05, + "loss": 0.9051, + "step": 2422 + }, + { + "epoch": 0.3707727620504973, + "grad_norm": 2.542019562348531, + "learning_rate": 1.8761060333058183e-05, + "loss": 0.8128, + "step": 2423 + }, + { + "epoch": 0.37092578423871464, + "grad_norm": 2.6372207763697197, + "learning_rate": 1.8759865286225878e-05, + "loss": 0.8735, + "step": 2424 + }, + { + "epoch": 0.3710788064269319, + "grad_norm": 2.5326515666594593, + "learning_rate": 1.8758669701415975e-05, + "loss": 0.7602, + "step": 2425 + }, + { + "epoch": 0.3712318286151492, + "grad_norm": 2.368763311753748, + "learning_rate": 1.8757473578701902e-05, + "loss": 0.7612, + "step": 2426 + }, + { + "epoch": 0.3713848508033665, + "grad_norm": 2.128504255109629, + "learning_rate": 1.8756276918157118e-05, + "loss": 0.7579, + "step": 2427 + }, + { + "epoch": 0.3715378729915838, + "grad_norm": 2.471878320157333, + "learning_rate": 1.8755079719855113e-05, + "loss": 0.8326, + "step": 2428 + }, + { + "epoch": 0.3716908951798011, + "grad_norm": 2.7949857804868428, + "learning_rate": 1.8753881983869414e-05, + "loss": 0.854, + "step": 2429 + }, + { + "epoch": 0.37184391736801836, + "grad_norm": 2.7751068465784123, + "learning_rate": 1.875268371027357e-05, + "loss": 0.8798, + "step": 
2430 + }, + { + "epoch": 0.37199693955623564, + "grad_norm": 2.7780322221768863, + "learning_rate": 1.8751484899141185e-05, + "loss": 0.8468, + "step": 2431 + }, + { + "epoch": 0.372149961744453, + "grad_norm": 2.4139073729391702, + "learning_rate": 1.8750285550545877e-05, + "loss": 0.807, + "step": 2432 + }, + { + "epoch": 0.37230298393267025, + "grad_norm": 2.5798622524914503, + "learning_rate": 1.87490856645613e-05, + "loss": 0.7849, + "step": 2433 + }, + { + "epoch": 0.37245600612088753, + "grad_norm": 2.3845029258074173, + "learning_rate": 1.874788524126115e-05, + "loss": 0.7316, + "step": 2434 + }, + { + "epoch": 0.3726090283091048, + "grad_norm": 2.2918954995904763, + "learning_rate": 1.8746684280719137e-05, + "loss": 0.7754, + "step": 2435 + }, + { + "epoch": 0.37276205049732214, + "grad_norm": 2.8944457285794725, + "learning_rate": 1.874548278300903e-05, + "loss": 0.8968, + "step": 2436 + }, + { + "epoch": 0.3729150726855394, + "grad_norm": 2.512484041846354, + "learning_rate": 1.8744280748204614e-05, + "loss": 0.8099, + "step": 2437 + }, + { + "epoch": 0.3730680948737567, + "grad_norm": 2.652834197290464, + "learning_rate": 1.8743078176379707e-05, + "loss": 0.841, + "step": 2438 + }, + { + "epoch": 0.373221117061974, + "grad_norm": 2.784629393063227, + "learning_rate": 1.8741875067608167e-05, + "loss": 0.8983, + "step": 2439 + }, + { + "epoch": 0.37337413925019125, + "grad_norm": 2.676099241658654, + "learning_rate": 1.874067142196388e-05, + "loss": 0.8388, + "step": 2440 + }, + { + "epoch": 0.3735271614384086, + "grad_norm": 2.5217409667731534, + "learning_rate": 1.8739467239520767e-05, + "loss": 0.818, + "step": 2441 + }, + { + "epoch": 0.37368018362662586, + "grad_norm": 2.1378603895356583, + "learning_rate": 1.8738262520352783e-05, + "loss": 0.7787, + "step": 2442 + }, + { + "epoch": 0.37383320581484314, + "grad_norm": 2.5527568920487917, + "learning_rate": 1.8737057264533915e-05, + "loss": 0.8348, + "step": 2443 + }, + { + "epoch": 0.3739862280030604, + "grad_norm": 2.624247786380471, + "learning_rate": 1.873585147213818e-05, + "loss": 0.7936, + "step": 2444 + }, + { + "epoch": 0.37413925019127775, + "grad_norm": 2.3965554595283836, + "learning_rate": 1.873464514323963e-05, + "loss": 0.7522, + "step": 2445 + }, + { + "epoch": 0.37429227237949503, + "grad_norm": 2.5187732475788396, + "learning_rate": 1.873343827791235e-05, + "loss": 0.753, + "step": 2446 + }, + { + "epoch": 0.3744452945677123, + "grad_norm": 2.415007932936088, + "learning_rate": 1.8732230876230463e-05, + "loss": 0.865, + "step": 2447 + }, + { + "epoch": 0.3745983167559296, + "grad_norm": 2.411623125230876, + "learning_rate": 1.8731022938268114e-05, + "loss": 0.7445, + "step": 2448 + }, + { + "epoch": 0.3747513389441469, + "grad_norm": 2.504678058017117, + "learning_rate": 1.8729814464099492e-05, + "loss": 0.8019, + "step": 2449 + }, + { + "epoch": 0.3749043611323642, + "grad_norm": 2.4121077808768683, + "learning_rate": 1.8728605453798816e-05, + "loss": 0.8243, + "step": 2450 + }, + { + "epoch": 0.37505738332058147, + "grad_norm": 2.3792207835192927, + "learning_rate": 1.8727395907440328e-05, + "loss": 0.7216, + "step": 2451 + }, + { + "epoch": 0.37521040550879875, + "grad_norm": 2.7011902930310323, + "learning_rate": 1.8726185825098317e-05, + "loss": 0.834, + "step": 2452 + }, + { + "epoch": 0.3753634276970161, + "grad_norm": 2.37612362801207, + "learning_rate": 1.87249752068471e-05, + "loss": 0.7309, + "step": 2453 + }, + { + "epoch": 0.37551644988523336, + "grad_norm": 2.5429947552774097, + 
"learning_rate": 1.8723764052761018e-05, + "loss": 0.7774, + "step": 2454 + }, + { + "epoch": 0.37566947207345064, + "grad_norm": 2.399185893707355, + "learning_rate": 1.8722552362914463e-05, + "loss": 0.8234, + "step": 2455 + }, + { + "epoch": 0.3758224942616679, + "grad_norm": 2.495485499967576, + "learning_rate": 1.872134013738184e-05, + "loss": 0.7609, + "step": 2456 + }, + { + "epoch": 0.37597551644988525, + "grad_norm": 2.352447545297215, + "learning_rate": 1.8720127376237606e-05, + "loss": 0.7429, + "step": 2457 + }, + { + "epoch": 0.3761285386381025, + "grad_norm": 2.040482423517943, + "learning_rate": 1.871891407955623e-05, + "loss": 0.7081, + "step": 2458 + }, + { + "epoch": 0.3762815608263198, + "grad_norm": 2.4635143143098253, + "learning_rate": 1.8717700247412237e-05, + "loss": 0.8347, + "step": 2459 + }, + { + "epoch": 0.3764345830145371, + "grad_norm": 2.4888516381763712, + "learning_rate": 1.8716485879880165e-05, + "loss": 0.7666, + "step": 2460 + }, + { + "epoch": 0.3765876052027544, + "grad_norm": 2.3853061470198953, + "learning_rate": 1.87152709770346e-05, + "loss": 0.7858, + "step": 2461 + }, + { + "epoch": 0.3767406273909717, + "grad_norm": 2.8397222753085583, + "learning_rate": 1.8714055538950145e-05, + "loss": 0.7969, + "step": 2462 + }, + { + "epoch": 0.37689364957918897, + "grad_norm": 2.2192558442628387, + "learning_rate": 1.8712839565701455e-05, + "loss": 0.893, + "step": 2463 + }, + { + "epoch": 0.37704667176740625, + "grad_norm": 2.1752035742885645, + "learning_rate": 1.87116230573632e-05, + "loss": 0.7395, + "step": 2464 + }, + { + "epoch": 0.3771996939556236, + "grad_norm": 2.4877185407030167, + "learning_rate": 1.8710406014010094e-05, + "loss": 0.8267, + "step": 2465 + }, + { + "epoch": 0.37735271614384086, + "grad_norm": 2.5339436958201067, + "learning_rate": 1.870918843571688e-05, + "loss": 0.7972, + "step": 2466 + }, + { + "epoch": 0.37750573833205814, + "grad_norm": 2.557231748291859, + "learning_rate": 1.8707970322558328e-05, + "loss": 0.8761, + "step": 2467 + }, + { + "epoch": 0.3776587605202754, + "grad_norm": 2.2694866860882574, + "learning_rate": 1.8706751674609258e-05, + "loss": 0.8071, + "step": 2468 + }, + { + "epoch": 0.37781178270849275, + "grad_norm": 2.3669762986337104, + "learning_rate": 1.8705532491944505e-05, + "loss": 0.8245, + "step": 2469 + }, + { + "epoch": 0.37796480489671, + "grad_norm": 2.3039821470771846, + "learning_rate": 1.8704312774638945e-05, + "loss": 0.6956, + "step": 2470 + }, + { + "epoch": 0.3781178270849273, + "grad_norm": 2.438094533399992, + "learning_rate": 1.8703092522767487e-05, + "loss": 0.8558, + "step": 2471 + }, + { + "epoch": 0.3782708492731446, + "grad_norm": 2.690744376860686, + "learning_rate": 1.870187173640507e-05, + "loss": 0.988, + "step": 2472 + }, + { + "epoch": 0.3784238714613619, + "grad_norm": 2.410059836592625, + "learning_rate": 1.870065041562667e-05, + "loss": 0.7595, + "step": 2473 + }, + { + "epoch": 0.3785768936495792, + "grad_norm": 2.3714407252038523, + "learning_rate": 1.8699428560507288e-05, + "loss": 0.8068, + "step": 2474 + }, + { + "epoch": 0.37872991583779647, + "grad_norm": 2.87103136320842, + "learning_rate": 1.8698206171121963e-05, + "loss": 0.7212, + "step": 2475 + }, + { + "epoch": 0.37888293802601375, + "grad_norm": 2.419031011584517, + "learning_rate": 1.8696983247545776e-05, + "loss": 0.8506, + "step": 2476 + }, + { + "epoch": 0.3790359602142311, + "grad_norm": 2.3590944560091667, + "learning_rate": 1.869575978985382e-05, + "loss": 0.9662, + "step": 2477 + }, + { + "epoch": 
0.37918898240244836, + "grad_norm": 2.4656630507450314, + "learning_rate": 1.8694535798121244e-05, + "loss": 0.8289, + "step": 2478 + }, + { + "epoch": 0.37934200459066564, + "grad_norm": 2.295792371436799, + "learning_rate": 1.8693311272423204e-05, + "loss": 0.8399, + "step": 2479 + }, + { + "epoch": 0.3794950267788829, + "grad_norm": 2.961244599837281, + "learning_rate": 1.8692086212834912e-05, + "loss": 0.8367, + "step": 2480 + }, + { + "epoch": 0.37964804896710025, + "grad_norm": 2.6693969074114365, + "learning_rate": 1.8690860619431604e-05, + "loss": 0.9139, + "step": 2481 + }, + { + "epoch": 0.3798010711553175, + "grad_norm": 2.635077494495987, + "learning_rate": 1.8689634492288547e-05, + "loss": 0.7848, + "step": 2482 + }, + { + "epoch": 0.3799540933435348, + "grad_norm": 2.561238780281626, + "learning_rate": 1.8688407831481037e-05, + "loss": 0.9398, + "step": 2483 + }, + { + "epoch": 0.3801071155317521, + "grad_norm": 2.3136951444719007, + "learning_rate": 1.8687180637084418e-05, + "loss": 0.7758, + "step": 2484 + }, + { + "epoch": 0.3802601377199694, + "grad_norm": 2.504907279331964, + "learning_rate": 1.868595290917405e-05, + "loss": 0.8735, + "step": 2485 + }, + { + "epoch": 0.3804131599081867, + "grad_norm": 2.2085626782374854, + "learning_rate": 1.8684724647825333e-05, + "loss": 0.74, + "step": 2486 + }, + { + "epoch": 0.38056618209640397, + "grad_norm": 2.6191916388341125, + "learning_rate": 1.8683495853113703e-05, + "loss": 0.787, + "step": 2487 + }, + { + "epoch": 0.38071920428462125, + "grad_norm": 2.602426306454107, + "learning_rate": 1.868226652511462e-05, + "loss": 0.7568, + "step": 2488 + }, + { + "epoch": 0.3808722264728386, + "grad_norm": 2.3846879215375076, + "learning_rate": 1.8681036663903585e-05, + "loss": 0.7223, + "step": 2489 + }, + { + "epoch": 0.38102524866105586, + "grad_norm": 2.2431261468268686, + "learning_rate": 1.867980626955613e-05, + "loss": 0.7147, + "step": 2490 + }, + { + "epoch": 0.38117827084927314, + "grad_norm": 2.5618430976248265, + "learning_rate": 1.8678575342147815e-05, + "loss": 0.8167, + "step": 2491 + }, + { + "epoch": 0.3813312930374904, + "grad_norm": 2.7244295536027194, + "learning_rate": 1.867734388175424e-05, + "loss": 0.7683, + "step": 2492 + }, + { + "epoch": 0.38148431522570775, + "grad_norm": 2.6472413539846857, + "learning_rate": 1.8676111888451028e-05, + "loss": 0.8227, + "step": 2493 + }, + { + "epoch": 0.381637337413925, + "grad_norm": 2.308898420544816, + "learning_rate": 1.8674879362313843e-05, + "loss": 0.8609, + "step": 2494 + }, + { + "epoch": 0.3817903596021423, + "grad_norm": 2.563563794604606, + "learning_rate": 1.8673646303418382e-05, + "loss": 0.8591, + "step": 2495 + }, + { + "epoch": 0.3819433817903596, + "grad_norm": 2.327587598177729, + "learning_rate": 1.867241271184037e-05, + "loss": 0.7396, + "step": 2496 + }, + { + "epoch": 0.3820964039785769, + "grad_norm": 2.2452666295710717, + "learning_rate": 1.8671178587655567e-05, + "loss": 0.8333, + "step": 2497 + }, + { + "epoch": 0.3822494261667942, + "grad_norm": 2.358122242033259, + "learning_rate": 1.8669943930939763e-05, + "loss": 0.715, + "step": 2498 + }, + { + "epoch": 0.38240244835501147, + "grad_norm": 2.242928896018524, + "learning_rate": 1.866870874176879e-05, + "loss": 0.8568, + "step": 2499 + }, + { + "epoch": 0.38255547054322875, + "grad_norm": 2.393411774568154, + "learning_rate": 1.8667473020218497e-05, + "loss": 0.8591, + "step": 2500 + }, + { + "epoch": 0.3827084927314461, + "grad_norm": 2.508761300085195, + "learning_rate": 
1.8666236766364778e-05, + "loss": 0.6883, + "step": 2501 + }, + { + "epoch": 0.38286151491966336, + "grad_norm": 2.273285381962697, + "learning_rate": 1.8664999980283558e-05, + "loss": 0.7873, + "step": 2502 + }, + { + "epoch": 0.38301453710788064, + "grad_norm": 2.5456143107904405, + "learning_rate": 1.8663762662050793e-05, + "loss": 0.7534, + "step": 2503 + }, + { + "epoch": 0.3831675592960979, + "grad_norm": 2.5340046766612954, + "learning_rate": 1.8662524811742467e-05, + "loss": 0.9405, + "step": 2504 + }, + { + "epoch": 0.38332058148431525, + "grad_norm": 2.2030346255099857, + "learning_rate": 1.8661286429434607e-05, + "loss": 0.8104, + "step": 2505 + }, + { + "epoch": 0.3834736036725325, + "grad_norm": 2.0899223105089977, + "learning_rate": 1.8660047515203263e-05, + "loss": 0.7298, + "step": 2506 + }, + { + "epoch": 0.3836266258607498, + "grad_norm": 2.091576745913007, + "learning_rate": 1.8658808069124523e-05, + "loss": 0.7127, + "step": 2507 + }, + { + "epoch": 0.3837796480489671, + "grad_norm": 2.442967562778881, + "learning_rate": 1.8657568091274503e-05, + "loss": 0.928, + "step": 2508 + }, + { + "epoch": 0.3839326702371844, + "grad_norm": 2.6676064813396496, + "learning_rate": 1.8656327581729364e-05, + "loss": 0.8753, + "step": 2509 + }, + { + "epoch": 0.3840856924254017, + "grad_norm": 2.9341603959582567, + "learning_rate": 1.865508654056528e-05, + "loss": 0.9654, + "step": 2510 + }, + { + "epoch": 0.38423871461361897, + "grad_norm": 2.4344257952073156, + "learning_rate": 1.8653844967858475e-05, + "loss": 0.7526, + "step": 2511 + }, + { + "epoch": 0.38439173680183625, + "grad_norm": 2.361905617278103, + "learning_rate": 1.8652602863685195e-05, + "loss": 0.8353, + "step": 2512 + }, + { + "epoch": 0.3845447589900536, + "grad_norm": 2.218282200611466, + "learning_rate": 1.8651360228121724e-05, + "loss": 0.6705, + "step": 2513 + }, + { + "epoch": 0.38469778117827086, + "grad_norm": 2.5345150182492953, + "learning_rate": 1.8650117061244378e-05, + "loss": 0.8431, + "step": 2514 + }, + { + "epoch": 0.38485080336648814, + "grad_norm": 2.2846439326524712, + "learning_rate": 1.8648873363129502e-05, + "loss": 0.7474, + "step": 2515 + }, + { + "epoch": 0.3850038255547054, + "grad_norm": 2.3803085886744992, + "learning_rate": 1.864762913385348e-05, + "loss": 0.7584, + "step": 2516 + }, + { + "epoch": 0.38515684774292275, + "grad_norm": 2.257800923753178, + "learning_rate": 1.864638437349272e-05, + "loss": 0.751, + "step": 2517 + }, + { + "epoch": 0.38530986993114, + "grad_norm": 2.4758559918625016, + "learning_rate": 1.8645139082123675e-05, + "loss": 0.8829, + "step": 2518 + }, + { + "epoch": 0.3854628921193573, + "grad_norm": 2.508363591784869, + "learning_rate": 1.8643893259822817e-05, + "loss": 0.8091, + "step": 2519 + }, + { + "epoch": 0.3856159143075746, + "grad_norm": 2.344123531418125, + "learning_rate": 1.864264690666666e-05, + "loss": 0.8086, + "step": 2520 + }, + { + "epoch": 0.3857689364957919, + "grad_norm": 2.6911838867682327, + "learning_rate": 1.8641400022731746e-05, + "loss": 0.8829, + "step": 2521 + }, + { + "epoch": 0.3859219586840092, + "grad_norm": 2.454881184279251, + "learning_rate": 1.864015260809465e-05, + "loss": 0.8059, + "step": 2522 + }, + { + "epoch": 0.38607498087222647, + "grad_norm": 2.3972818653488353, + "learning_rate": 1.8638904662831985e-05, + "loss": 0.7309, + "step": 2523 + }, + { + "epoch": 0.38622800306044375, + "grad_norm": 2.2703191459881573, + "learning_rate": 1.8637656187020385e-05, + "loss": 0.7365, + "step": 2524 + }, + { + "epoch": 
0.3863810252486611, + "grad_norm": 2.480521094293675, + "learning_rate": 1.863640718073653e-05, + "loss": 0.7481, + "step": 2525 + }, + { + "epoch": 0.38653404743687836, + "grad_norm": 2.32410916843462, + "learning_rate": 1.8635157644057124e-05, + "loss": 0.7864, + "step": 2526 + }, + { + "epoch": 0.38668706962509564, + "grad_norm": 2.1555637796345617, + "learning_rate": 1.8633907577058905e-05, + "loss": 0.7126, + "step": 2527 + }, + { + "epoch": 0.3868400918133129, + "grad_norm": 2.0035595797324537, + "learning_rate": 1.8632656979818645e-05, + "loss": 0.7262, + "step": 2528 + }, + { + "epoch": 0.38699311400153025, + "grad_norm": 2.500240065738295, + "learning_rate": 1.863140585241315e-05, + "loss": 0.8232, + "step": 2529 + }, + { + "epoch": 0.3871461361897475, + "grad_norm": 2.182399464955277, + "learning_rate": 1.8630154194919256e-05, + "loss": 0.7309, + "step": 2530 + }, + { + "epoch": 0.3872991583779648, + "grad_norm": 2.547552862072076, + "learning_rate": 1.8628902007413835e-05, + "loss": 0.9328, + "step": 2531 + }, + { + "epoch": 0.3874521805661821, + "grad_norm": 2.4185769090684404, + "learning_rate": 1.8627649289973776e-05, + "loss": 0.8189, + "step": 2532 + }, + { + "epoch": 0.3876052027543994, + "grad_norm": 2.220438695870478, + "learning_rate": 1.862639604267603e-05, + "loss": 0.7749, + "step": 2533 + }, + { + "epoch": 0.3877582249426167, + "grad_norm": 2.578841434734547, + "learning_rate": 1.8625142265597556e-05, + "loss": 0.8771, + "step": 2534 + }, + { + "epoch": 0.38791124713083397, + "grad_norm": 2.2401417785438382, + "learning_rate": 1.862388795881535e-05, + "loss": 0.7575, + "step": 2535 + }, + { + "epoch": 0.38806426931905125, + "grad_norm": 2.478671799326241, + "learning_rate": 1.862263312240645e-05, + "loss": 0.7846, + "step": 2536 + }, + { + "epoch": 0.3882172915072686, + "grad_norm": 2.2303927085740565, + "learning_rate": 1.8621377756447918e-05, + "loss": 0.8156, + "step": 2537 + }, + { + "epoch": 0.38837031369548586, + "grad_norm": 2.247486841768642, + "learning_rate": 1.8620121861016854e-05, + "loss": 0.8024, + "step": 2538 + }, + { + "epoch": 0.38852333588370314, + "grad_norm": 2.41891025833812, + "learning_rate": 1.861886543619038e-05, + "loss": 0.9377, + "step": 2539 + }, + { + "epoch": 0.3886763580719204, + "grad_norm": 2.490839576225125, + "learning_rate": 1.8617608482045662e-05, + "loss": 0.7237, + "step": 2540 + }, + { + "epoch": 0.38882938026013775, + "grad_norm": 2.204949493052091, + "learning_rate": 1.8616350998659895e-05, + "loss": 0.7152, + "step": 2541 + }, + { + "epoch": 0.388982402448355, + "grad_norm": 2.520927297137216, + "learning_rate": 1.8615092986110308e-05, + "loss": 0.5829, + "step": 2542 + }, + { + "epoch": 0.3891354246365723, + "grad_norm": 2.0925039293849834, + "learning_rate": 1.861383444447416e-05, + "loss": 0.6984, + "step": 2543 + }, + { + "epoch": 0.3892884468247896, + "grad_norm": 2.425641347817572, + "learning_rate": 1.8612575373828735e-05, + "loss": 0.8305, + "step": 2544 + }, + { + "epoch": 0.3894414690130069, + "grad_norm": 2.573051475202199, + "learning_rate": 1.8611315774251367e-05, + "loss": 0.7923, + "step": 2545 + }, + { + "epoch": 0.3895944912012242, + "grad_norm": 2.5068821947821625, + "learning_rate": 1.861005564581941e-05, + "loss": 0.9357, + "step": 2546 + }, + { + "epoch": 0.38974751338944147, + "grad_norm": 2.324998200221432, + "learning_rate": 1.8608794988610256e-05, + "loss": 0.8629, + "step": 2547 + }, + { + "epoch": 0.38990053557765875, + "grad_norm": 2.52960849031247, + "learning_rate": 1.8607533802701318e-05, + 
"loss": 0.7974, + "step": 2548 + }, + { + "epoch": 0.390053557765876, + "grad_norm": 2.4815636587464494, + "learning_rate": 1.860627208817006e-05, + "loss": 0.8583, + "step": 2549 + }, + { + "epoch": 0.39020657995409336, + "grad_norm": 2.2653029816484582, + "learning_rate": 1.8605009845093964e-05, + "loss": 0.7958, + "step": 2550 + }, + { + "epoch": 0.39035960214231064, + "grad_norm": 2.3990609889203807, + "learning_rate": 1.8603747073550552e-05, + "loss": 0.81, + "step": 2551 + }, + { + "epoch": 0.3905126243305279, + "grad_norm": 2.32455958546504, + "learning_rate": 1.8602483773617373e-05, + "loss": 0.7526, + "step": 2552 + }, + { + "epoch": 0.3906656465187452, + "grad_norm": 2.3427643871420774, + "learning_rate": 1.8601219945372015e-05, + "loss": 0.8194, + "step": 2553 + }, + { + "epoch": 0.3908186687069625, + "grad_norm": 2.2513504889406226, + "learning_rate": 1.8599955588892086e-05, + "loss": 0.7728, + "step": 2554 + }, + { + "epoch": 0.3909716908951798, + "grad_norm": 2.338401329885293, + "learning_rate": 1.8598690704255245e-05, + "loss": 0.8088, + "step": 2555 + }, + { + "epoch": 0.3911247130833971, + "grad_norm": 2.3014418358202473, + "learning_rate": 1.859742529153917e-05, + "loss": 0.7908, + "step": 2556 + }, + { + "epoch": 0.39127773527161436, + "grad_norm": 2.6220498441628957, + "learning_rate": 1.8596159350821573e-05, + "loss": 0.851, + "step": 2557 + }, + { + "epoch": 0.3914307574598317, + "grad_norm": 2.5560864630883477, + "learning_rate": 1.8594892882180202e-05, + "loss": 0.7645, + "step": 2558 + }, + { + "epoch": 0.39158377964804897, + "grad_norm": 2.4441604741310794, + "learning_rate": 1.8593625885692835e-05, + "loss": 0.8209, + "step": 2559 + }, + { + "epoch": 0.39173680183626625, + "grad_norm": 2.394828726374545, + "learning_rate": 1.8592358361437287e-05, + "loss": 0.7607, + "step": 2560 + }, + { + "epoch": 0.3918898240244835, + "grad_norm": 2.7066788691327055, + "learning_rate": 1.8591090309491397e-05, + "loss": 0.8225, + "step": 2561 + }, + { + "epoch": 0.39204284621270086, + "grad_norm": 2.466903893172232, + "learning_rate": 1.858982172993304e-05, + "loss": 0.7865, + "step": 2562 + }, + { + "epoch": 0.39219586840091814, + "grad_norm": 2.673740227798785, + "learning_rate": 1.858855262284013e-05, + "loss": 0.9003, + "step": 2563 + }, + { + "epoch": 0.3923488905891354, + "grad_norm": 2.655379391443632, + "learning_rate": 1.8587282988290604e-05, + "loss": 0.8515, + "step": 2564 + }, + { + "epoch": 0.3925019127773527, + "grad_norm": 2.387993715416807, + "learning_rate": 1.8586012826362437e-05, + "loss": 0.8344, + "step": 2565 + }, + { + "epoch": 0.39265493496557, + "grad_norm": 2.122389308766076, + "learning_rate": 1.8584742137133635e-05, + "loss": 0.7624, + "step": 2566 + }, + { + "epoch": 0.3928079571537873, + "grad_norm": 2.4675672480936175, + "learning_rate": 1.8583470920682232e-05, + "loss": 0.7786, + "step": 2567 + }, + { + "epoch": 0.3929609793420046, + "grad_norm": 2.1616293938457827, + "learning_rate": 1.8582199177086302e-05, + "loss": 0.682, + "step": 2568 + }, + { + "epoch": 0.39311400153022186, + "grad_norm": 2.424779733933877, + "learning_rate": 1.8580926906423944e-05, + "loss": 0.7539, + "step": 2569 + }, + { + "epoch": 0.3932670237184392, + "grad_norm": 2.468301808496714, + "learning_rate": 1.8579654108773296e-05, + "loss": 0.7523, + "step": 2570 + }, + { + "epoch": 0.39342004590665647, + "grad_norm": 2.5668439800269223, + "learning_rate": 1.857838078421253e-05, + "loss": 0.9001, + "step": 2571 + }, + { + "epoch": 0.39357306809487375, + "grad_norm": 
2.6861943370437285, + "learning_rate": 1.857710693281984e-05, + "loss": 1.0394, + "step": 2572 + }, + { + "epoch": 0.393726090283091, + "grad_norm": 2.45276904000766, + "learning_rate": 1.8575832554673457e-05, + "loss": 0.7552, + "step": 2573 + }, + { + "epoch": 0.39387911247130836, + "grad_norm": 2.355317126126015, + "learning_rate": 1.857455764985165e-05, + "loss": 0.723, + "step": 2574 + }, + { + "epoch": 0.39403213465952563, + "grad_norm": 2.303305717824098, + "learning_rate": 1.8573282218432712e-05, + "loss": 0.8183, + "step": 2575 + }, + { + "epoch": 0.3941851568477429, + "grad_norm": 2.274204248618728, + "learning_rate": 1.8572006260494975e-05, + "loss": 0.7922, + "step": 2576 + }, + { + "epoch": 0.3943381790359602, + "grad_norm": 2.6013486170787905, + "learning_rate": 1.85707297761168e-05, + "loss": 0.8621, + "step": 2577 + }, + { + "epoch": 0.3944912012241775, + "grad_norm": 2.6464460011195183, + "learning_rate": 1.856945276537658e-05, + "loss": 1.0042, + "step": 2578 + }, + { + "epoch": 0.3946442234123948, + "grad_norm": 2.071644475187324, + "learning_rate": 1.856817522835274e-05, + "loss": 0.7901, + "step": 2579 + }, + { + "epoch": 0.3947972456006121, + "grad_norm": 2.1078648872166936, + "learning_rate": 1.8566897165123742e-05, + "loss": 0.6951, + "step": 2580 + }, + { + "epoch": 0.39495026778882936, + "grad_norm": 2.390748321191043, + "learning_rate": 1.8565618575768078e-05, + "loss": 0.7759, + "step": 2581 + }, + { + "epoch": 0.3951032899770467, + "grad_norm": 2.3383337718182724, + "learning_rate": 1.8564339460364268e-05, + "loss": 0.7706, + "step": 2582 + }, + { + "epoch": 0.39525631216526397, + "grad_norm": 2.2256538171713944, + "learning_rate": 1.8563059818990864e-05, + "loss": 0.756, + "step": 2583 + }, + { + "epoch": 0.39540933435348125, + "grad_norm": 2.35477670073321, + "learning_rate": 1.856177965172646e-05, + "loss": 0.8469, + "step": 2584 + }, + { + "epoch": 0.3955623565416985, + "grad_norm": 2.394424568314491, + "learning_rate": 1.8560498958649675e-05, + "loss": 0.8801, + "step": 2585 + }, + { + "epoch": 0.39571537872991586, + "grad_norm": 2.200020298735911, + "learning_rate": 1.8559217739839156e-05, + "loss": 0.7756, + "step": 2586 + }, + { + "epoch": 0.39586840091813313, + "grad_norm": 2.5743268374527686, + "learning_rate": 1.8557935995373593e-05, + "loss": 0.8344, + "step": 2587 + }, + { + "epoch": 0.3960214231063504, + "grad_norm": 2.5434499031469753, + "learning_rate": 1.8556653725331703e-05, + "loss": 0.8237, + "step": 2588 + }, + { + "epoch": 0.3961744452945677, + "grad_norm": 2.2710097319079243, + "learning_rate": 1.8555370929792237e-05, + "loss": 0.7575, + "step": 2589 + }, + { + "epoch": 0.396327467482785, + "grad_norm": 2.1784171165885837, + "learning_rate": 1.8554087608833967e-05, + "loss": 0.7892, + "step": 2590 + }, + { + "epoch": 0.3964804896710023, + "grad_norm": 2.4856988681527072, + "learning_rate": 1.8552803762535717e-05, + "loss": 0.9424, + "step": 2591 + }, + { + "epoch": 0.3966335118592196, + "grad_norm": 2.7840026809571716, + "learning_rate": 1.855151939097633e-05, + "loss": 0.8897, + "step": 2592 + }, + { + "epoch": 0.39678653404743686, + "grad_norm": 2.5930974653909384, + "learning_rate": 1.855023449423468e-05, + "loss": 0.8272, + "step": 2593 + }, + { + "epoch": 0.3969395562356542, + "grad_norm": 2.3916332977591277, + "learning_rate": 1.8548949072389684e-05, + "loss": 0.8303, + "step": 2594 + }, + { + "epoch": 0.39709257842387147, + "grad_norm": 2.465859918701186, + "learning_rate": 1.8547663125520282e-05, + "loss": 0.8453, + "step": 2595 + 
}, + { + "epoch": 0.39724560061208875, + "grad_norm": 2.5220271459852586, + "learning_rate": 1.8546376653705446e-05, + "loss": 0.9473, + "step": 2596 + }, + { + "epoch": 0.397398622800306, + "grad_norm": 2.2621031004903163, + "learning_rate": 1.8545089657024185e-05, + "loss": 0.7321, + "step": 2597 + }, + { + "epoch": 0.39755164498852336, + "grad_norm": 2.21402172022864, + "learning_rate": 1.8543802135555544e-05, + "loss": 0.7898, + "step": 2598 + }, + { + "epoch": 0.39770466717674063, + "grad_norm": 2.5160554926745142, + "learning_rate": 1.854251408937859e-05, + "loss": 0.8523, + "step": 2599 + }, + { + "epoch": 0.3978576893649579, + "grad_norm": 2.472537617150156, + "learning_rate": 1.8541225518572425e-05, + "loss": 0.9072, + "step": 2600 + }, + { + "epoch": 0.3980107115531752, + "grad_norm": 2.43355365679199, + "learning_rate": 1.8539936423216187e-05, + "loss": 0.8955, + "step": 2601 + }, + { + "epoch": 0.3981637337413925, + "grad_norm": 2.352095084295794, + "learning_rate": 1.8538646803389048e-05, + "loss": 0.8204, + "step": 2602 + }, + { + "epoch": 0.3983167559296098, + "grad_norm": 2.4611685288216885, + "learning_rate": 1.8537356659170204e-05, + "loss": 0.8142, + "step": 2603 + }, + { + "epoch": 0.3984697781178271, + "grad_norm": 2.186102788808551, + "learning_rate": 1.8536065990638884e-05, + "loss": 0.7595, + "step": 2604 + }, + { + "epoch": 0.39862280030604436, + "grad_norm": 2.202120235361913, + "learning_rate": 1.8534774797874363e-05, + "loss": 0.7505, + "step": 2605 + }, + { + "epoch": 0.3987758224942617, + "grad_norm": 2.362986852327419, + "learning_rate": 1.8533483080955938e-05, + "loss": 0.7464, + "step": 2606 + }, + { + "epoch": 0.39892884468247897, + "grad_norm": 2.200352843576885, + "learning_rate": 1.8532190839962927e-05, + "loss": 0.696, + "step": 2607 + }, + { + "epoch": 0.39908186687069624, + "grad_norm": 2.255800540526346, + "learning_rate": 1.85308980749747e-05, + "loss": 0.8086, + "step": 2608 + }, + { + "epoch": 0.3992348890589135, + "grad_norm": 2.469916004202286, + "learning_rate": 1.852960478607065e-05, + "loss": 0.8218, + "step": 2609 + }, + { + "epoch": 0.39938791124713086, + "grad_norm": 2.2232561093923064, + "learning_rate": 1.8528310973330202e-05, + "loss": 0.7753, + "step": 2610 + }, + { + "epoch": 0.39954093343534813, + "grad_norm": 2.465870849875202, + "learning_rate": 1.8527016636832812e-05, + "loss": 0.8572, + "step": 2611 + }, + { + "epoch": 0.3996939556235654, + "grad_norm": 2.5829792855369487, + "learning_rate": 1.8525721776657976e-05, + "loss": 0.8348, + "step": 2612 + }, + { + "epoch": 0.3998469778117827, + "grad_norm": 2.3325205099538198, + "learning_rate": 1.8524426392885214e-05, + "loss": 0.8031, + "step": 2613 + }, + { + "epoch": 0.4, + "grad_norm": 2.276647860896121, + "learning_rate": 1.8523130485594073e-05, + "loss": 0.8267, + "step": 2614 + }, + { + "epoch": 0.4001530221882173, + "grad_norm": 2.3928052935724744, + "learning_rate": 1.8521834054864153e-05, + "loss": 0.8535, + "step": 2615 + }, + { + "epoch": 0.4003060443764346, + "grad_norm": 2.511577722772227, + "learning_rate": 1.852053710077506e-05, + "loss": 0.7447, + "step": 2616 + }, + { + "epoch": 0.40045906656465186, + "grad_norm": 2.110248564429131, + "learning_rate": 1.8519239623406458e-05, + "loss": 0.7667, + "step": 2617 + }, + { + "epoch": 0.4006120887528692, + "grad_norm": 2.488627880598713, + "learning_rate": 1.8517941622838017e-05, + "loss": 0.7103, + "step": 2618 + }, + { + "epoch": 0.40076511094108647, + "grad_norm": 2.2009676567345653, + "learning_rate": 
1.851664309914946e-05, + "loss": 0.7582, + "step": 2619 + }, + { + "epoch": 0.40091813312930374, + "grad_norm": 2.1310514990703013, + "learning_rate": 1.8515344052420537e-05, + "loss": 0.7643, + "step": 2620 + }, + { + "epoch": 0.401071155317521, + "grad_norm": 2.6259759882870988, + "learning_rate": 1.851404448273102e-05, + "loss": 0.9047, + "step": 2621 + }, + { + "epoch": 0.40122417750573836, + "grad_norm": 2.2943155620505915, + "learning_rate": 1.8512744390160723e-05, + "loss": 0.8079, + "step": 2622 + }, + { + "epoch": 0.40137719969395563, + "grad_norm": 2.4544127384323327, + "learning_rate": 1.851144377478949e-05, + "loss": 0.8562, + "step": 2623 + }, + { + "epoch": 0.4015302218821729, + "grad_norm": 2.3951768753685965, + "learning_rate": 1.8510142636697206e-05, + "loss": 0.7953, + "step": 2624 + }, + { + "epoch": 0.4016832440703902, + "grad_norm": 2.3766515796882186, + "learning_rate": 1.8508840975963763e-05, + "loss": 0.8557, + "step": 2625 + }, + { + "epoch": 0.4018362662586075, + "grad_norm": 2.5023847534931067, + "learning_rate": 1.850753879266911e-05, + "loss": 0.812, + "step": 2626 + }, + { + "epoch": 0.4019892884468248, + "grad_norm": 2.509151267955778, + "learning_rate": 1.850623608689322e-05, + "loss": 0.7625, + "step": 2627 + }, + { + "epoch": 0.4021423106350421, + "grad_norm": 2.4184906288540073, + "learning_rate": 1.8504932858716097e-05, + "loss": 0.8336, + "step": 2628 + }, + { + "epoch": 0.40229533282325936, + "grad_norm": 2.3168938864394515, + "learning_rate": 1.8503629108217772e-05, + "loss": 0.7943, + "step": 2629 + }, + { + "epoch": 0.4024483550114767, + "grad_norm": 2.3915931853920256, + "learning_rate": 1.8502324835478316e-05, + "loss": 0.8279, + "step": 2630 + }, + { + "epoch": 0.40260137719969397, + "grad_norm": 2.2349446691616706, + "learning_rate": 1.8501020040577838e-05, + "loss": 0.7348, + "step": 2631 + }, + { + "epoch": 0.40275439938791124, + "grad_norm": 2.0151409908112594, + "learning_rate": 1.8499714723596455e-05, + "loss": 0.7296, + "step": 2632 + }, + { + "epoch": 0.4029074215761285, + "grad_norm": 2.5360099818874646, + "learning_rate": 1.8498408884614343e-05, + "loss": 0.8572, + "step": 2633 + }, + { + "epoch": 0.40306044376434585, + "grad_norm": 2.164183952890459, + "learning_rate": 1.8497102523711698e-05, + "loss": 0.6298, + "step": 2634 + }, + { + "epoch": 0.40321346595256313, + "grad_norm": 2.4630834283639, + "learning_rate": 1.849579564096874e-05, + "loss": 0.7419, + "step": 2635 + }, + { + "epoch": 0.4033664881407804, + "grad_norm": 2.350449227312488, + "learning_rate": 1.849448823646574e-05, + "loss": 0.8939, + "step": 2636 + }, + { + "epoch": 0.4035195103289977, + "grad_norm": 2.3427406354106886, + "learning_rate": 1.8493180310282985e-05, + "loss": 0.9118, + "step": 2637 + }, + { + "epoch": 0.403672532517215, + "grad_norm": 2.578044045167987, + "learning_rate": 1.8491871862500805e-05, + "loss": 0.8345, + "step": 2638 + }, + { + "epoch": 0.4038255547054323, + "grad_norm": 2.5999091900545612, + "learning_rate": 1.849056289319955e-05, + "loss": 0.9699, + "step": 2639 + }, + { + "epoch": 0.4039785768936496, + "grad_norm": 2.4447201013727304, + "learning_rate": 1.8489253402459615e-05, + "loss": 0.9392, + "step": 2640 + }, + { + "epoch": 0.40413159908186685, + "grad_norm": 2.232214493716397, + "learning_rate": 1.848794339036142e-05, + "loss": 0.819, + "step": 2641 + }, + { + "epoch": 0.4042846212700842, + "grad_norm": 2.3284688831273765, + "learning_rate": 1.8486632856985413e-05, + "loss": 0.7519, + "step": 2642 + }, + { + "epoch": 
0.40443764345830147, + "grad_norm": 2.363636249635232, + "learning_rate": 1.848532180241208e-05, + "loss": 0.8482, + "step": 2643 + }, + { + "epoch": 0.40459066564651874, + "grad_norm": 2.2295062565276074, + "learning_rate": 1.8484010226721943e-05, + "loss": 0.8025, + "step": 2644 + }, + { + "epoch": 0.404743687834736, + "grad_norm": 2.358853456902877, + "learning_rate": 1.8482698129995552e-05, + "loss": 0.7455, + "step": 2645 + }, + { + "epoch": 0.40489671002295335, + "grad_norm": 2.150877684604761, + "learning_rate": 1.848138551231348e-05, + "loss": 0.7321, + "step": 2646 + }, + { + "epoch": 0.40504973221117063, + "grad_norm": 2.4426513307013624, + "learning_rate": 1.8480072373756344e-05, + "loss": 0.8077, + "step": 2647 + }, + { + "epoch": 0.4052027543993879, + "grad_norm": 2.2088244537437114, + "learning_rate": 1.847875871440479e-05, + "loss": 0.7363, + "step": 2648 + }, + { + "epoch": 0.4053557765876052, + "grad_norm": 2.4812489057806553, + "learning_rate": 1.8477444534339494e-05, + "loss": 0.8045, + "step": 2649 + }, + { + "epoch": 0.4055087987758225, + "grad_norm": 2.540939066788791, + "learning_rate": 1.8476129833641167e-05, + "loss": 0.8186, + "step": 2650 + }, + { + "epoch": 0.4056618209640398, + "grad_norm": 2.212331345125525, + "learning_rate": 1.847481461239055e-05, + "loss": 0.809, + "step": 2651 + }, + { + "epoch": 0.4058148431522571, + "grad_norm": 2.401064578558019, + "learning_rate": 1.847349887066841e-05, + "loss": 0.7607, + "step": 2652 + }, + { + "epoch": 0.40596786534047435, + "grad_norm": 2.3926401890762485, + "learning_rate": 1.8472182608555554e-05, + "loss": 0.8287, + "step": 2653 + }, + { + "epoch": 0.4061208875286917, + "grad_norm": 2.4175965911225785, + "learning_rate": 1.8470865826132823e-05, + "loss": 0.9406, + "step": 2654 + }, + { + "epoch": 0.40627390971690897, + "grad_norm": 2.4696218970242025, + "learning_rate": 1.8469548523481084e-05, + "loss": 0.8894, + "step": 2655 + }, + { + "epoch": 0.40642693190512624, + "grad_norm": 2.6511801074082726, + "learning_rate": 1.846823070068124e-05, + "loss": 0.8922, + "step": 2656 + }, + { + "epoch": 0.4065799540933435, + "grad_norm": 2.314650500785454, + "learning_rate": 1.846691235781422e-05, + "loss": 0.7068, + "step": 2657 + }, + { + "epoch": 0.40673297628156085, + "grad_norm": 2.4866883666060637, + "learning_rate": 1.8465593494960984e-05, + "loss": 0.8283, + "step": 2658 + }, + { + "epoch": 0.40688599846977813, + "grad_norm": 2.1590639354930192, + "learning_rate": 1.846427411220254e-05, + "loss": 0.7625, + "step": 2659 + }, + { + "epoch": 0.4070390206579954, + "grad_norm": 2.3846505274108063, + "learning_rate": 1.8462954209619906e-05, + "loss": 0.9031, + "step": 2660 + }, + { + "epoch": 0.4071920428462127, + "grad_norm": 2.2267645532444775, + "learning_rate": 1.846163378729415e-05, + "loss": 0.7436, + "step": 2661 + }, + { + "epoch": 0.40734506503442997, + "grad_norm": 2.398986928697425, + "learning_rate": 1.8460312845306355e-05, + "loss": 0.8195, + "step": 2662 + }, + { + "epoch": 0.4074980872226473, + "grad_norm": 2.61225008925308, + "learning_rate": 1.8458991383737658e-05, + "loss": 0.7914, + "step": 2663 + }, + { + "epoch": 0.4076511094108646, + "grad_norm": 2.356902302425031, + "learning_rate": 1.8457669402669204e-05, + "loss": 0.8502, + "step": 2664 + }, + { + "epoch": 0.40780413159908185, + "grad_norm": 2.560117797837266, + "learning_rate": 1.8456346902182186e-05, + "loss": 0.781, + "step": 2665 + }, + { + "epoch": 0.40795715378729913, + "grad_norm": 2.4632168985463303, + "learning_rate": 
1.8455023882357828e-05, + "loss": 0.8861, + "step": 2666 + }, + { + "epoch": 0.40811017597551646, + "grad_norm": 2.3380920013005424, + "learning_rate": 1.845370034327737e-05, + "loss": 0.7989, + "step": 2667 + }, + { + "epoch": 0.40826319816373374, + "grad_norm": 2.718768158973965, + "learning_rate": 1.845237628502211e-05, + "loss": 0.8724, + "step": 2668 + }, + { + "epoch": 0.408416220351951, + "grad_norm": 2.2335921822951366, + "learning_rate": 1.8451051707673354e-05, + "loss": 0.786, + "step": 2669 + }, + { + "epoch": 0.4085692425401683, + "grad_norm": 2.2166311232529865, + "learning_rate": 1.8449726611312448e-05, + "loss": 0.7835, + "step": 2670 + }, + { + "epoch": 0.40872226472838563, + "grad_norm": 2.46656392359808, + "learning_rate": 1.844840099602078e-05, + "loss": 0.8221, + "step": 2671 + }, + { + "epoch": 0.4088752869166029, + "grad_norm": 2.222395842583903, + "learning_rate": 1.8447074861879755e-05, + "loss": 0.8084, + "step": 2672 + }, + { + "epoch": 0.4090283091048202, + "grad_norm": 2.4012567726291816, + "learning_rate": 1.8445748208970817e-05, + "loss": 0.8552, + "step": 2673 + }, + { + "epoch": 0.40918133129303746, + "grad_norm": 2.5844110968126883, + "learning_rate": 1.844442103737544e-05, + "loss": 0.8058, + "step": 2674 + }, + { + "epoch": 0.4093343534812548, + "grad_norm": 2.4874765360529185, + "learning_rate": 1.8443093347175136e-05, + "loss": 0.8288, + "step": 2675 + }, + { + "epoch": 0.4094873756694721, + "grad_norm": 2.3078682180408956, + "learning_rate": 1.8441765138451436e-05, + "loss": 0.929, + "step": 2676 + }, + { + "epoch": 0.40964039785768935, + "grad_norm": 2.2629872570756553, + "learning_rate": 1.8440436411285917e-05, + "loss": 0.7427, + "step": 2677 + }, + { + "epoch": 0.40979342004590663, + "grad_norm": 2.4094935497777974, + "learning_rate": 1.843910716576018e-05, + "loss": 0.7792, + "step": 2678 + }, + { + "epoch": 0.40994644223412396, + "grad_norm": 2.2940908840934267, + "learning_rate": 1.8437777401955855e-05, + "loss": 0.7945, + "step": 2679 + }, + { + "epoch": 0.41009946442234124, + "grad_norm": 2.4885993708595526, + "learning_rate": 1.8436447119954614e-05, + "loss": 0.8593, + "step": 2680 + }, + { + "epoch": 0.4102524866105585, + "grad_norm": 2.193032443148415, + "learning_rate": 1.843511631983815e-05, + "loss": 0.7174, + "step": 2681 + }, + { + "epoch": 0.4104055087987758, + "grad_norm": 2.3278540581719143, + "learning_rate": 1.8433785001688193e-05, + "loss": 0.7327, + "step": 2682 + }, + { + "epoch": 0.41055853098699313, + "grad_norm": 3.241194420703757, + "learning_rate": 1.843245316558651e-05, + "loss": 0.6707, + "step": 2683 + }, + { + "epoch": 0.4107115531752104, + "grad_norm": 2.4359746230564747, + "learning_rate": 1.843112081161489e-05, + "loss": 0.8612, + "step": 2684 + }, + { + "epoch": 0.4108645753634277, + "grad_norm": 2.3943737897219366, + "learning_rate": 1.8429787939855153e-05, + "loss": 0.695, + "step": 2685 + }, + { + "epoch": 0.41101759755164496, + "grad_norm": 2.4256681273927287, + "learning_rate": 1.8428454550389167e-05, + "loss": 0.7882, + "step": 2686 + }, + { + "epoch": 0.4111706197398623, + "grad_norm": 3.3705908991777704, + "learning_rate": 1.842712064329881e-05, + "loss": 0.8501, + "step": 2687 + }, + { + "epoch": 0.4113236419280796, + "grad_norm": 2.419989046073004, + "learning_rate": 1.8425786218666008e-05, + "loss": 0.9159, + "step": 2688 + }, + { + "epoch": 0.41147666411629685, + "grad_norm": 2.4530844333412904, + "learning_rate": 1.8424451276572712e-05, + "loss": 0.7439, + "step": 2689 + }, + { + "epoch": 
0.41162968630451413, + "grad_norm": 2.7154613196392163, + "learning_rate": 1.8423115817100908e-05, + "loss": 0.7944, + "step": 2690 + }, + { + "epoch": 0.41178270849273146, + "grad_norm": 2.0993592442625464, + "learning_rate": 1.842177984033261e-05, + "loss": 0.6573, + "step": 2691 + }, + { + "epoch": 0.41193573068094874, + "grad_norm": 2.251977839855377, + "learning_rate": 1.8420443346349866e-05, + "loss": 0.7723, + "step": 2692 + }, + { + "epoch": 0.412088752869166, + "grad_norm": 2.4973317883727355, + "learning_rate": 1.8419106335234757e-05, + "loss": 0.8563, + "step": 2693 + }, + { + "epoch": 0.4122417750573833, + "grad_norm": 2.16198116036908, + "learning_rate": 1.8417768807069388e-05, + "loss": 0.7834, + "step": 2694 + }, + { + "epoch": 0.41239479724560063, + "grad_norm": 2.2356466035192177, + "learning_rate": 1.8416430761935912e-05, + "loss": 0.8431, + "step": 2695 + }, + { + "epoch": 0.4125478194338179, + "grad_norm": 2.3601155991963574, + "learning_rate": 1.8415092199916493e-05, + "loss": 0.7505, + "step": 2696 + }, + { + "epoch": 0.4127008416220352, + "grad_norm": 2.4839307976282297, + "learning_rate": 1.8413753121093342e-05, + "loss": 0.834, + "step": 2697 + }, + { + "epoch": 0.41285386381025246, + "grad_norm": 2.5380141603149733, + "learning_rate": 1.84124135255487e-05, + "loss": 0.8884, + "step": 2698 + }, + { + "epoch": 0.4130068859984698, + "grad_norm": 2.359684411575416, + "learning_rate": 1.841107341336483e-05, + "loss": 0.8136, + "step": 2699 + }, + { + "epoch": 0.4131599081866871, + "grad_norm": 2.246915060713202, + "learning_rate": 1.840973278462404e-05, + "loss": 0.853, + "step": 2700 + }, + { + "epoch": 0.41331293037490435, + "grad_norm": 2.743278992870679, + "learning_rate": 1.8408391639408662e-05, + "loss": 0.9538, + "step": 2701 + }, + { + "epoch": 0.41346595256312163, + "grad_norm": 2.3414868128353445, + "learning_rate": 1.8407049977801057e-05, + "loss": 0.7642, + "step": 2702 + }, + { + "epoch": 0.41361897475133896, + "grad_norm": 2.2150403808055095, + "learning_rate": 1.8405707799883625e-05, + "loss": 0.8568, + "step": 2703 + }, + { + "epoch": 0.41377199693955624, + "grad_norm": 2.1600067523897595, + "learning_rate": 1.8404365105738793e-05, + "loss": 0.7698, + "step": 2704 + }, + { + "epoch": 0.4139250191277735, + "grad_norm": 2.5630873870166027, + "learning_rate": 1.8403021895449022e-05, + "loss": 0.8071, + "step": 2705 + }, + { + "epoch": 0.4140780413159908, + "grad_norm": 2.2015946582709724, + "learning_rate": 1.8401678169096803e-05, + "loss": 0.7721, + "step": 2706 + }, + { + "epoch": 0.41423106350420813, + "grad_norm": 2.3404124500020616, + "learning_rate": 1.840033392676466e-05, + "loss": 0.7017, + "step": 2707 + }, + { + "epoch": 0.4143840856924254, + "grad_norm": 2.5603216912077142, + "learning_rate": 1.8398989168535153e-05, + "loss": 0.7995, + "step": 2708 + }, + { + "epoch": 0.4145371078806427, + "grad_norm": 2.588436369527036, + "learning_rate": 1.8397643894490857e-05, + "loss": 0.8638, + "step": 2709 + }, + { + "epoch": 0.41469013006885996, + "grad_norm": 2.2405037181742418, + "learning_rate": 1.8396298104714404e-05, + "loss": 0.7631, + "step": 2710 + }, + { + "epoch": 0.4148431522570773, + "grad_norm": 2.4837864680181805, + "learning_rate": 1.8394951799288433e-05, + "loss": 0.8381, + "step": 2711 + }, + { + "epoch": 0.4149961744452946, + "grad_norm": 2.434315227598334, + "learning_rate": 1.8393604978295634e-05, + "loss": 0.7599, + "step": 2712 + }, + { + "epoch": 0.41514919663351185, + "grad_norm": 2.3929964810627875, + "learning_rate": 
1.8392257641818717e-05, + "loss": 0.8392, + "step": 2713 + }, + { + "epoch": 0.41530221882172913, + "grad_norm": 2.3769240010713784, + "learning_rate": 1.8390909789940424e-05, + "loss": 0.914, + "step": 2714 + }, + { + "epoch": 0.41545524100994646, + "grad_norm": 2.4156353290483996, + "learning_rate": 1.8389561422743537e-05, + "loss": 0.8067, + "step": 2715 + }, + { + "epoch": 0.41560826319816374, + "grad_norm": 2.3762181100367727, + "learning_rate": 1.838821254031087e-05, + "loss": 0.8145, + "step": 2716 + }, + { + "epoch": 0.415761285386381, + "grad_norm": 2.7032932387797994, + "learning_rate": 1.8386863142725245e-05, + "loss": 0.7724, + "step": 2717 + }, + { + "epoch": 0.4159143075745983, + "grad_norm": 2.4199311872817675, + "learning_rate": 1.8385513230069552e-05, + "loss": 0.796, + "step": 2718 + }, + { + "epoch": 0.41606732976281563, + "grad_norm": 2.611915068916634, + "learning_rate": 1.8384162802426683e-05, + "loss": 0.8, + "step": 2719 + }, + { + "epoch": 0.4162203519510329, + "grad_norm": 2.4719883493550663, + "learning_rate": 1.838281185987958e-05, + "loss": 0.7554, + "step": 2720 + }, + { + "epoch": 0.4163733741392502, + "grad_norm": 6.780799641945747, + "learning_rate": 1.8381460402511206e-05, + "loss": 0.8375, + "step": 2721 + }, + { + "epoch": 0.41652639632746746, + "grad_norm": 2.2876882146123765, + "learning_rate": 1.8380108430404558e-05, + "loss": 0.8064, + "step": 2722 + }, + { + "epoch": 0.4166794185156848, + "grad_norm": 2.451624523809711, + "learning_rate": 1.8378755943642674e-05, + "loss": 0.7935, + "step": 2723 + }, + { + "epoch": 0.4168324407039021, + "grad_norm": 2.8942584708078827, + "learning_rate": 1.8377402942308606e-05, + "loss": 0.7752, + "step": 2724 + }, + { + "epoch": 0.41698546289211935, + "grad_norm": 2.4380212312342024, + "learning_rate": 1.837604942648545e-05, + "loss": 0.8369, + "step": 2725 + }, + { + "epoch": 0.41713848508033663, + "grad_norm": 2.42312875758766, + "learning_rate": 1.8374695396256333e-05, + "loss": 0.8675, + "step": 2726 + }, + { + "epoch": 0.41729150726855396, + "grad_norm": 2.822238056417803, + "learning_rate": 1.837334085170441e-05, + "loss": 0.8107, + "step": 2727 + }, + { + "epoch": 0.41744452945677124, + "grad_norm": 2.7255201999810397, + "learning_rate": 1.8371985792912867e-05, + "loss": 0.8395, + "step": 2728 + }, + { + "epoch": 0.4175975516449885, + "grad_norm": 2.7738793587791295, + "learning_rate": 1.8370630219964924e-05, + "loss": 0.7883, + "step": 2729 + }, + { + "epoch": 0.4177505738332058, + "grad_norm": 3.0113058281040987, + "learning_rate": 1.8369274132943836e-05, + "loss": 0.928, + "step": 2730 + }, + { + "epoch": 0.41790359602142313, + "grad_norm": 2.3332835609228626, + "learning_rate": 1.8367917531932883e-05, + "loss": 0.9316, + "step": 2731 + }, + { + "epoch": 0.4180566182096404, + "grad_norm": 2.52833709344253, + "learning_rate": 1.8366560417015376e-05, + "loss": 0.7773, + "step": 2732 + }, + { + "epoch": 0.4182096403978577, + "grad_norm": 2.776855292461947, + "learning_rate": 1.8365202788274665e-05, + "loss": 0.9908, + "step": 2733 + }, + { + "epoch": 0.41836266258607496, + "grad_norm": 3.1038722511338848, + "learning_rate": 1.8363844645794127e-05, + "loss": 0.7926, + "step": 2734 + }, + { + "epoch": 0.4185156847742923, + "grad_norm": 2.6755227443722966, + "learning_rate": 1.836248598965717e-05, + "loss": 0.7397, + "step": 2735 + }, + { + "epoch": 0.4186687069625096, + "grad_norm": 2.193456846703546, + "learning_rate": 1.836112681994723e-05, + "loss": 0.7661, + "step": 2736 + }, + { + "epoch": 
0.41882172915072685, + "grad_norm": 2.4328277091982704, + "learning_rate": 1.8359767136747787e-05, + "loss": 0.8149, + "step": 2737 + }, + { + "epoch": 0.41897475133894413, + "grad_norm": 2.5640952513553996, + "learning_rate": 1.835840694014234e-05, + "loss": 0.8672, + "step": 2738 + }, + { + "epoch": 0.41912777352716146, + "grad_norm": 2.372258279722657, + "learning_rate": 1.8357046230214424e-05, + "loss": 0.7194, + "step": 2739 + }, + { + "epoch": 0.41928079571537874, + "grad_norm": 2.694024313287626, + "learning_rate": 1.8355685007047602e-05, + "loss": 0.8198, + "step": 2740 + }, + { + "epoch": 0.419433817903596, + "grad_norm": 2.717663032912964, + "learning_rate": 1.8354323270725483e-05, + "loss": 0.754, + "step": 2741 + }, + { + "epoch": 0.4195868400918133, + "grad_norm": 2.1220023192705466, + "learning_rate": 1.8352961021331686e-05, + "loss": 0.7195, + "step": 2742 + }, + { + "epoch": 0.41973986228003063, + "grad_norm": 2.5635730910519428, + "learning_rate": 1.8351598258949878e-05, + "loss": 0.6638, + "step": 2743 + }, + { + "epoch": 0.4198928844682479, + "grad_norm": 2.307124143928578, + "learning_rate": 1.8350234983663745e-05, + "loss": 0.779, + "step": 2744 + }, + { + "epoch": 0.4200459066564652, + "grad_norm": 2.581024604321476, + "learning_rate": 1.834887119555702e-05, + "loss": 0.9084, + "step": 2745 + }, + { + "epoch": 0.42019892884468246, + "grad_norm": 2.486303856122351, + "learning_rate": 1.834750689471345e-05, + "loss": 0.8191, + "step": 2746 + }, + { + "epoch": 0.4203519510328998, + "grad_norm": 2.794251757152545, + "learning_rate": 1.8346142081216828e-05, + "loss": 0.7747, + "step": 2747 + }, + { + "epoch": 0.4205049732211171, + "grad_norm": 2.5361298102326923, + "learning_rate": 1.8344776755150972e-05, + "loss": 0.8419, + "step": 2748 + }, + { + "epoch": 0.42065799540933435, + "grad_norm": 2.453075570214805, + "learning_rate": 1.8343410916599728e-05, + "loss": 0.7913, + "step": 2749 + }, + { + "epoch": 0.42081101759755163, + "grad_norm": 2.658949348190422, + "learning_rate": 1.8342044565646985e-05, + "loss": 0.8207, + "step": 2750 + }, + { + "epoch": 0.42096403978576896, + "grad_norm": 2.6080813836023204, + "learning_rate": 1.8340677702376646e-05, + "loss": 0.9163, + "step": 2751 + }, + { + "epoch": 0.42111706197398624, + "grad_norm": 2.4580878290478765, + "learning_rate": 1.833931032687266e-05, + "loss": 0.7212, + "step": 2752 + }, + { + "epoch": 0.4212700841622035, + "grad_norm": 2.6866209108407992, + "learning_rate": 1.8337942439219002e-05, + "loss": 0.9074, + "step": 2753 + }, + { + "epoch": 0.4214231063504208, + "grad_norm": 2.5069055372513565, + "learning_rate": 1.8336574039499686e-05, + "loss": 0.8655, + "step": 2754 + }, + { + "epoch": 0.42157612853863813, + "grad_norm": 2.8233674919495506, + "learning_rate": 1.8335205127798743e-05, + "loss": 0.8258, + "step": 2755 + }, + { + "epoch": 0.4217291507268554, + "grad_norm": 2.455363331998748, + "learning_rate": 1.8333835704200244e-05, + "loss": 0.7243, + "step": 2756 + }, + { + "epoch": 0.4218821729150727, + "grad_norm": 2.6976338544948764, + "learning_rate": 1.8332465768788294e-05, + "loss": 0.8472, + "step": 2757 + }, + { + "epoch": 0.42203519510328996, + "grad_norm": 2.2772022318054317, + "learning_rate": 1.8331095321647024e-05, + "loss": 0.7334, + "step": 2758 + }, + { + "epoch": 0.4221882172915073, + "grad_norm": 2.332527092805009, + "learning_rate": 1.83297243628606e-05, + "loss": 0.8488, + "step": 2759 + }, + { + "epoch": 0.4223412394797246, + "grad_norm": 2.65635386373047, + "learning_rate": 
1.8328352892513213e-05, + "loss": 0.8725, + "step": 2760 + }, + { + "epoch": 0.42249426166794185, + "grad_norm": 2.595147960241488, + "learning_rate": 1.8326980910689097e-05, + "loss": 0.8825, + "step": 2761 + }, + { + "epoch": 0.42264728385615913, + "grad_norm": 2.4590965620170357, + "learning_rate": 1.832560841747251e-05, + "loss": 0.7872, + "step": 2762 + }, + { + "epoch": 0.42280030604437646, + "grad_norm": 2.105599760748347, + "learning_rate": 1.8324235412947734e-05, + "loss": 0.7948, + "step": 2763 + }, + { + "epoch": 0.42295332823259374, + "grad_norm": 2.1926721287195607, + "learning_rate": 1.83228618971991e-05, + "loss": 0.6939, + "step": 2764 + }, + { + "epoch": 0.423106350420811, + "grad_norm": 2.3227283833432444, + "learning_rate": 1.832148787031096e-05, + "loss": 0.7924, + "step": 2765 + }, + { + "epoch": 0.4232593726090283, + "grad_norm": 2.439579480687344, + "learning_rate": 1.8320113332367695e-05, + "loss": 0.798, + "step": 2766 + }, + { + "epoch": 0.42341239479724563, + "grad_norm": 2.4905389517694707, + "learning_rate": 1.831873828345372e-05, + "loss": 0.9101, + "step": 2767 + }, + { + "epoch": 0.4235654169854629, + "grad_norm": 1.9986218789081267, + "learning_rate": 1.8317362723653486e-05, + "loss": 0.6391, + "step": 2768 + }, + { + "epoch": 0.4237184391736802, + "grad_norm": 2.265857474114456, + "learning_rate": 1.8315986653051466e-05, + "loss": 0.8241, + "step": 2769 + }, + { + "epoch": 0.42387146136189746, + "grad_norm": 2.4080058443982058, + "learning_rate": 1.8314610071732178e-05, + "loss": 0.7932, + "step": 2770 + }, + { + "epoch": 0.42402448355011474, + "grad_norm": 2.3055478158698404, + "learning_rate": 1.8313232979780154e-05, + "loss": 0.7345, + "step": 2771 + }, + { + "epoch": 0.4241775057383321, + "grad_norm": 2.5461071258196064, + "learning_rate": 1.8311855377279974e-05, + "loss": 0.6186, + "step": 2772 + }, + { + "epoch": 0.42433052792654935, + "grad_norm": 2.4759293912836555, + "learning_rate": 1.831047726431624e-05, + "loss": 0.8608, + "step": 2773 + }, + { + "epoch": 0.42448355011476663, + "grad_norm": 2.228142862655671, + "learning_rate": 1.8309098640973584e-05, + "loss": 0.7126, + "step": 2774 + }, + { + "epoch": 0.4246365723029839, + "grad_norm": 2.5373975946691507, + "learning_rate": 1.8307719507336675e-05, + "loss": 0.7881, + "step": 2775 + }, + { + "epoch": 0.42478959449120124, + "grad_norm": 2.33822674451866, + "learning_rate": 1.8306339863490216e-05, + "loss": 0.8431, + "step": 2776 + }, + { + "epoch": 0.4249426166794185, + "grad_norm": 2.7554536395279507, + "learning_rate": 1.8304959709518923e-05, + "loss": 0.8821, + "step": 2777 + }, + { + "epoch": 0.4250956388676358, + "grad_norm": 2.5830841255768244, + "learning_rate": 1.830357904550757e-05, + "loss": 0.8288, + "step": 2778 + }, + { + "epoch": 0.42524866105585307, + "grad_norm": 2.5029825041231204, + "learning_rate": 1.8302197871540945e-05, + "loss": 0.8809, + "step": 2779 + }, + { + "epoch": 0.4254016832440704, + "grad_norm": 2.575665600700266, + "learning_rate": 1.830081618770387e-05, + "loss": 0.8533, + "step": 2780 + }, + { + "epoch": 0.4255547054322877, + "grad_norm": 2.5700770047121124, + "learning_rate": 1.8299433994081197e-05, + "loss": 0.8074, + "step": 2781 + }, + { + "epoch": 0.42570772762050496, + "grad_norm": 2.0623106933950304, + "learning_rate": 1.8298051290757818e-05, + "loss": 0.7773, + "step": 2782 + }, + { + "epoch": 0.42586074980872224, + "grad_norm": 2.18537150872912, + "learning_rate": 1.8296668077818644e-05, + "loss": 0.8344, + "step": 2783 + }, + { + "epoch": 
0.42601377199693957, + "grad_norm": 2.3942580809651477, + "learning_rate": 1.8295284355348628e-05, + "loss": 0.7428, + "step": 2784 + }, + { + "epoch": 0.42616679418515685, + "grad_norm": 2.780737215240603, + "learning_rate": 1.829390012343275e-05, + "loss": 0.8756, + "step": 2785 + }, + { + "epoch": 0.4263198163733741, + "grad_norm": 2.2278400641722795, + "learning_rate": 1.8292515382156017e-05, + "loss": 0.8758, + "step": 2786 + }, + { + "epoch": 0.4264728385615914, + "grad_norm": 2.3121894916116164, + "learning_rate": 1.8291130131603476e-05, + "loss": 0.7272, + "step": 2787 + }, + { + "epoch": 0.42662586074980874, + "grad_norm": 2.69045702826044, + "learning_rate": 1.8289744371860196e-05, + "loss": 0.7238, + "step": 2788 + }, + { + "epoch": 0.426778882938026, + "grad_norm": 2.315130684005548, + "learning_rate": 1.8288358103011286e-05, + "loss": 0.7843, + "step": 2789 + }, + { + "epoch": 0.4269319051262433, + "grad_norm": 2.18678882527759, + "learning_rate": 1.828697132514188e-05, + "loss": 0.8087, + "step": 2790 + }, + { + "epoch": 0.42708492731446057, + "grad_norm": 2.381064095904689, + "learning_rate": 1.828558403833715e-05, + "loss": 0.8279, + "step": 2791 + }, + { + "epoch": 0.4272379495026779, + "grad_norm": 2.577619698682901, + "learning_rate": 1.8284196242682287e-05, + "loss": 0.8842, + "step": 2792 + }, + { + "epoch": 0.4273909716908952, + "grad_norm": 2.4459585233481103, + "learning_rate": 1.8282807938262525e-05, + "loss": 0.7657, + "step": 2793 + }, + { + "epoch": 0.42754399387911246, + "grad_norm": 2.4999096948175894, + "learning_rate": 1.8281419125163124e-05, + "loss": 0.8371, + "step": 2794 + }, + { + "epoch": 0.42769701606732974, + "grad_norm": 2.10695554914119, + "learning_rate": 1.8280029803469383e-05, + "loss": 0.7368, + "step": 2795 + }, + { + "epoch": 0.42785003825554707, + "grad_norm": 2.568702428809719, + "learning_rate": 1.8278639973266614e-05, + "loss": 0.8826, + "step": 2796 + }, + { + "epoch": 0.42800306044376435, + "grad_norm": 2.4571418370595373, + "learning_rate": 1.827724963464018e-05, + "loss": 0.7719, + "step": 2797 + }, + { + "epoch": 0.4281560826319816, + "grad_norm": 2.321507335931533, + "learning_rate": 1.8275858787675464e-05, + "loss": 0.801, + "step": 2798 + }, + { + "epoch": 0.4283091048201989, + "grad_norm": 1.9921891114138417, + "learning_rate": 1.8274467432457888e-05, + "loss": 0.6825, + "step": 2799 + }, + { + "epoch": 0.42846212700841624, + "grad_norm": 2.4680977264565325, + "learning_rate": 1.8273075569072893e-05, + "loss": 0.7475, + "step": 2800 + }, + { + "epoch": 0.4286151491966335, + "grad_norm": 2.3974672040643776, + "learning_rate": 1.8271683197605966e-05, + "loss": 0.79, + "step": 2801 + }, + { + "epoch": 0.4287681713848508, + "grad_norm": 2.448891743243999, + "learning_rate": 1.8270290318142613e-05, + "loss": 0.8843, + "step": 2802 + }, + { + "epoch": 0.42892119357306807, + "grad_norm": 2.9211845208427603, + "learning_rate": 1.8268896930768378e-05, + "loss": 0.9263, + "step": 2803 + }, + { + "epoch": 0.4290742157612854, + "grad_norm": 2.2479016422193916, + "learning_rate": 1.8267503035568834e-05, + "loss": 0.7761, + "step": 2804 + }, + { + "epoch": 0.4292272379495027, + "grad_norm": 2.256379799241374, + "learning_rate": 1.8266108632629585e-05, + "loss": 0.8842, + "step": 2805 + }, + { + "epoch": 0.42938026013771996, + "grad_norm": 2.230328900853721, + "learning_rate": 1.8264713722036272e-05, + "loss": 0.7677, + "step": 2806 + }, + { + "epoch": 0.42953328232593724, + "grad_norm": 2.4734719612301808, + "learning_rate": 
1.8263318303874555e-05, + "loss": 0.8867, + "step": 2807 + }, + { + "epoch": 0.42968630451415457, + "grad_norm": 2.4024715893301445, + "learning_rate": 1.8261922378230134e-05, + "loss": 0.7662, + "step": 2808 + }, + { + "epoch": 0.42983932670237185, + "grad_norm": 2.3019580177421495, + "learning_rate": 1.8260525945188742e-05, + "loss": 0.7525, + "step": 2809 + }, + { + "epoch": 0.4299923488905891, + "grad_norm": 2.268610624691826, + "learning_rate": 1.825912900483613e-05, + "loss": 0.8512, + "step": 2810 + }, + { + "epoch": 0.4301453710788064, + "grad_norm": 2.50834037125462, + "learning_rate": 1.82577315572581e-05, + "loss": 0.7428, + "step": 2811 + }, + { + "epoch": 0.43029839326702374, + "grad_norm": 1.9801505295055397, + "learning_rate": 1.8256333602540472e-05, + "loss": 0.6802, + "step": 2812 + }, + { + "epoch": 0.430451415455241, + "grad_norm": 2.2031680211345988, + "learning_rate": 1.82549351407691e-05, + "loss": 0.7427, + "step": 2813 + }, + { + "epoch": 0.4306044376434583, + "grad_norm": 2.07270281399312, + "learning_rate": 1.8253536172029863e-05, + "loss": 0.7541, + "step": 2814 + }, + { + "epoch": 0.43075745983167557, + "grad_norm": 2.4736090185078616, + "learning_rate": 1.8252136696408683e-05, + "loss": 0.8778, + "step": 2815 + }, + { + "epoch": 0.4309104820198929, + "grad_norm": 2.1084099756052574, + "learning_rate": 1.8250736713991505e-05, + "loss": 0.7589, + "step": 2816 + }, + { + "epoch": 0.4310635042081102, + "grad_norm": 2.377251552697672, + "learning_rate": 1.824933622486431e-05, + "loss": 0.8503, + "step": 2817 + }, + { + "epoch": 0.43121652639632746, + "grad_norm": 2.5075890326248964, + "learning_rate": 1.8247935229113106e-05, + "loss": 0.8015, + "step": 2818 + }, + { + "epoch": 0.43136954858454474, + "grad_norm": 2.4737690463297124, + "learning_rate": 1.824653372682393e-05, + "loss": 0.9242, + "step": 2819 + }, + { + "epoch": 0.43152257077276207, + "grad_norm": 2.5168772301247424, + "learning_rate": 1.824513171808286e-05, + "loss": 0.6882, + "step": 2820 + }, + { + "epoch": 0.43167559296097935, + "grad_norm": 2.4850955909543324, + "learning_rate": 1.8243729202975998e-05, + "loss": 0.8031, + "step": 2821 + }, + { + "epoch": 0.4318286151491966, + "grad_norm": 2.4310792570377364, + "learning_rate": 1.8242326181589472e-05, + "loss": 0.7786, + "step": 2822 + }, + { + "epoch": 0.4319816373374139, + "grad_norm": 2.2162562659795566, + "learning_rate": 1.8240922654009453e-05, + "loss": 0.8593, + "step": 2823 + }, + { + "epoch": 0.43213465952563124, + "grad_norm": 2.1604512997876615, + "learning_rate": 1.823951862032214e-05, + "loss": 0.7262, + "step": 2824 + }, + { + "epoch": 0.4322876817138485, + "grad_norm": 2.2860096253234334, + "learning_rate": 1.8238114080613748e-05, + "loss": 0.8108, + "step": 2825 + }, + { + "epoch": 0.4324407039020658, + "grad_norm": 2.1896817856146926, + "learning_rate": 1.8236709034970547e-05, + "loss": 0.7637, + "step": 2826 + }, + { + "epoch": 0.43259372609028307, + "grad_norm": 2.5100011119270365, + "learning_rate": 1.8235303483478816e-05, + "loss": 0.8717, + "step": 2827 + }, + { + "epoch": 0.4327467482785004, + "grad_norm": 2.2395244764068494, + "learning_rate": 1.8233897426224885e-05, + "loss": 0.8137, + "step": 2828 + }, + { + "epoch": 0.4328997704667177, + "grad_norm": 2.454140156073576, + "learning_rate": 1.8232490863295104e-05, + "loss": 0.7323, + "step": 2829 + }, + { + "epoch": 0.43305279265493496, + "grad_norm": 2.782749694102962, + "learning_rate": 1.8231083794775855e-05, + "loss": 0.8337, + "step": 2830 + }, + { + "epoch": 
0.43320581484315224, + "grad_norm": 2.4708319224270787, + "learning_rate": 1.8229676220753544e-05, + "loss": 0.7952, + "step": 2831 + }, + { + "epoch": 0.43335883703136957, + "grad_norm": 2.450763340830767, + "learning_rate": 1.8228268141314625e-05, + "loss": 0.7822, + "step": 2832 + }, + { + "epoch": 0.43351185921958685, + "grad_norm": 2.6846953429174265, + "learning_rate": 1.8226859556545567e-05, + "loss": 0.9821, + "step": 2833 + }, + { + "epoch": 0.4336648814078041, + "grad_norm": 2.384263297031218, + "learning_rate": 1.8225450466532886e-05, + "loss": 0.7387, + "step": 2834 + }, + { + "epoch": 0.4338179035960214, + "grad_norm": 2.5933253496031314, + "learning_rate": 1.8224040871363108e-05, + "loss": 0.8023, + "step": 2835 + }, + { + "epoch": 0.43397092578423874, + "grad_norm": 2.1470594209931164, + "learning_rate": 1.8222630771122813e-05, + "loss": 0.7342, + "step": 2836 + }, + { + "epoch": 0.434123947972456, + "grad_norm": 2.177838898350675, + "learning_rate": 1.822122016589859e-05, + "loss": 0.7216, + "step": 2837 + }, + { + "epoch": 0.4342769701606733, + "grad_norm": 2.318251382225496, + "learning_rate": 1.8219809055777078e-05, + "loss": 0.8282, + "step": 2838 + }, + { + "epoch": 0.43442999234889057, + "grad_norm": 2.485311148535998, + "learning_rate": 1.8218397440844935e-05, + "loss": 0.9077, + "step": 2839 + }, + { + "epoch": 0.4345830145371079, + "grad_norm": 2.187991898973603, + "learning_rate": 1.8216985321188857e-05, + "loss": 0.6987, + "step": 2840 + }, + { + "epoch": 0.4347360367253252, + "grad_norm": 2.954983399291798, + "learning_rate": 1.821557269689556e-05, + "loss": 0.8829, + "step": 2841 + }, + { + "epoch": 0.43488905891354246, + "grad_norm": 2.3435852188557136, + "learning_rate": 1.8214159568051808e-05, + "loss": 0.7581, + "step": 2842 + }, + { + "epoch": 0.43504208110175974, + "grad_norm": 2.506988531798837, + "learning_rate": 1.821274593474438e-05, + "loss": 0.9141, + "step": 2843 + }, + { + "epoch": 0.43519510328997707, + "grad_norm": 2.376637972611891, + "learning_rate": 1.82113317970601e-05, + "loss": 0.7158, + "step": 2844 + }, + { + "epoch": 0.43534812547819435, + "grad_norm": 2.426861380496052, + "learning_rate": 1.8209917155085807e-05, + "loss": 0.8952, + "step": 2845 + }, + { + "epoch": 0.4355011476664116, + "grad_norm": 2.217284799803432, + "learning_rate": 1.820850200890839e-05, + "loss": 0.7568, + "step": 2846 + }, + { + "epoch": 0.4356541698546289, + "grad_norm": 3.406641979935441, + "learning_rate": 1.8207086358614748e-05, + "loss": 0.7537, + "step": 2847 + }, + { + "epoch": 0.43580719204284624, + "grad_norm": 2.2708735332971397, + "learning_rate": 1.8205670204291827e-05, + "loss": 0.7447, + "step": 2848 + }, + { + "epoch": 0.4359602142310635, + "grad_norm": 2.4379444362854796, + "learning_rate": 1.82042535460266e-05, + "loss": 0.8063, + "step": 2849 + }, + { + "epoch": 0.4361132364192808, + "grad_norm": 2.644713852715883, + "learning_rate": 1.8202836383906065e-05, + "loss": 0.8981, + "step": 2850 + }, + { + "epoch": 0.43626625860749807, + "grad_norm": 2.3110592853637324, + "learning_rate": 1.8201418718017257e-05, + "loss": 0.828, + "step": 2851 + }, + { + "epoch": 0.4364192807957154, + "grad_norm": 2.19094109374234, + "learning_rate": 1.8200000548447242e-05, + "loss": 0.7952, + "step": 2852 + }, + { + "epoch": 0.4365723029839327, + "grad_norm": 2.4567016099138814, + "learning_rate": 1.8198581875283116e-05, + "loss": 0.8291, + "step": 2853 + }, + { + "epoch": 0.43672532517214996, + "grad_norm": 2.5230151299441665, + "learning_rate": 
1.8197162698612008e-05, + "loss": 0.8983, + "step": 2854 + }, + { + "epoch": 0.43687834736036724, + "grad_norm": 2.2014429530314477, + "learning_rate": 1.8195743018521067e-05, + "loss": 0.8416, + "step": 2855 + }, + { + "epoch": 0.43703136954858457, + "grad_norm": 2.798858313321146, + "learning_rate": 1.8194322835097484e-05, + "loss": 0.6999, + "step": 2856 + }, + { + "epoch": 0.43718439173680185, + "grad_norm": 2.559519260396577, + "learning_rate": 1.8192902148428482e-05, + "loss": 1.0143, + "step": 2857 + }, + { + "epoch": 0.4373374139250191, + "grad_norm": 2.3153587914032254, + "learning_rate": 1.8191480958601307e-05, + "loss": 0.8853, + "step": 2858 + }, + { + "epoch": 0.4374904361132364, + "grad_norm": 2.6106845850972498, + "learning_rate": 1.819005926570324e-05, + "loss": 0.8967, + "step": 2859 + }, + { + "epoch": 0.43764345830145374, + "grad_norm": 2.6529185075899244, + "learning_rate": 1.8188637069821595e-05, + "loss": 0.7725, + "step": 2860 + }, + { + "epoch": 0.437796480489671, + "grad_norm": 2.4152913946558248, + "learning_rate": 1.8187214371043717e-05, + "loss": 0.7995, + "step": 2861 + }, + { + "epoch": 0.4379495026778883, + "grad_norm": 2.7423652149993165, + "learning_rate": 1.818579116945697e-05, + "loss": 0.9248, + "step": 2862 + }, + { + "epoch": 0.43810252486610557, + "grad_norm": 2.1886173607384904, + "learning_rate": 1.8184367465148768e-05, + "loss": 0.769, + "step": 2863 + }, + { + "epoch": 0.4382555470543229, + "grad_norm": 2.2565851532333654, + "learning_rate": 1.818294325820654e-05, + "loss": 0.8638, + "step": 2864 + }, + { + "epoch": 0.4384085692425402, + "grad_norm": 2.476506526281387, + "learning_rate": 1.8181518548717757e-05, + "loss": 0.8676, + "step": 2865 + }, + { + "epoch": 0.43856159143075746, + "grad_norm": 2.4366507731535068, + "learning_rate": 1.818009333676991e-05, + "loss": 0.785, + "step": 2866 + }, + { + "epoch": 0.43871461361897474, + "grad_norm": 2.4976448948903784, + "learning_rate": 1.8178667622450536e-05, + "loss": 0.8044, + "step": 2867 + }, + { + "epoch": 0.43886763580719207, + "grad_norm": 2.2797810019467306, + "learning_rate": 1.8177241405847185e-05, + "loss": 0.7425, + "step": 2868 + }, + { + "epoch": 0.43902065799540935, + "grad_norm": 2.189360272061294, + "learning_rate": 1.817581468704745e-05, + "loss": 0.8035, + "step": 2869 + }, + { + "epoch": 0.4391736801836266, + "grad_norm": 2.189311640496233, + "learning_rate": 1.817438746613895e-05, + "loss": 0.7365, + "step": 2870 + }, + { + "epoch": 0.4393267023718439, + "grad_norm": 2.35659633222008, + "learning_rate": 1.817295974320934e-05, + "loss": 0.8272, + "step": 2871 + }, + { + "epoch": 0.43947972456006124, + "grad_norm": 2.093541114804993, + "learning_rate": 1.8171531518346296e-05, + "loss": 0.7011, + "step": 2872 + }, + { + "epoch": 0.4396327467482785, + "grad_norm": 2.3219154246904203, + "learning_rate": 1.8170102791637538e-05, + "loss": 0.6946, + "step": 2873 + }, + { + "epoch": 0.4397857689364958, + "grad_norm": 2.578669440132516, + "learning_rate": 1.81686735631708e-05, + "loss": 0.8617, + "step": 2874 + }, + { + "epoch": 0.43993879112471307, + "grad_norm": 2.194341864294571, + "learning_rate": 1.8167243833033866e-05, + "loss": 0.7764, + "step": 2875 + }, + { + "epoch": 0.4400918133129304, + "grad_norm": 2.543107156092695, + "learning_rate": 1.8165813601314538e-05, + "loss": 0.8232, + "step": 2876 + }, + { + "epoch": 0.4402448355011477, + "grad_norm": 2.1321281234592964, + "learning_rate": 1.8164382868100655e-05, + "loss": 0.8494, + "step": 2877 + }, + { + "epoch": 
0.44039785768936496, + "grad_norm": 2.1116804083957996, + "learning_rate": 1.8162951633480076e-05, + "loss": 0.689, + "step": 2878 + }, + { + "epoch": 0.44055087987758224, + "grad_norm": 2.6610632363025735, + "learning_rate": 1.8161519897540706e-05, + "loss": 0.9145, + "step": 2879 + }, + { + "epoch": 0.4407039020657995, + "grad_norm": 2.4608039381193283, + "learning_rate": 1.816008766037047e-05, + "loss": 0.8626, + "step": 2880 + }, + { + "epoch": 0.44085692425401685, + "grad_norm": 2.0608838256285527, + "learning_rate": 1.815865492205733e-05, + "loss": 0.7088, + "step": 2881 + }, + { + "epoch": 0.4410099464422341, + "grad_norm": 2.266950103228107, + "learning_rate": 1.8157221682689274e-05, + "loss": 0.7986, + "step": 2882 + }, + { + "epoch": 0.4411629686304514, + "grad_norm": 2.320138111880372, + "learning_rate": 1.8155787942354323e-05, + "loss": 0.7952, + "step": 2883 + }, + { + "epoch": 0.4413159908186687, + "grad_norm": 2.030179057500993, + "learning_rate": 1.815435370114053e-05, + "loss": 0.7375, + "step": 2884 + }, + { + "epoch": 0.441469013006886, + "grad_norm": 2.277446909757116, + "learning_rate": 1.8152918959135972e-05, + "loss": 0.7952, + "step": 2885 + }, + { + "epoch": 0.4416220351951033, + "grad_norm": 2.336938364081513, + "learning_rate": 1.815148371642877e-05, + "loss": 0.7348, + "step": 2886 + }, + { + "epoch": 0.44177505738332057, + "grad_norm": 2.068616185340274, + "learning_rate": 1.8150047973107062e-05, + "loss": 0.7344, + "step": 2887 + }, + { + "epoch": 0.44192807957153785, + "grad_norm": 2.378424795705215, + "learning_rate": 1.8148611729259028e-05, + "loss": 0.8116, + "step": 2888 + }, + { + "epoch": 0.4420811017597552, + "grad_norm": 2.315366839645358, + "learning_rate": 1.8147174984972866e-05, + "loss": 0.8843, + "step": 2889 + }, + { + "epoch": 0.44223412394797246, + "grad_norm": 2.282370051057871, + "learning_rate": 1.814573774033682e-05, + "loss": 0.6782, + "step": 2890 + }, + { + "epoch": 0.44238714613618974, + "grad_norm": 2.3368147120736054, + "learning_rate": 1.814429999543915e-05, + "loss": 0.7882, + "step": 2891 + }, + { + "epoch": 0.442540168324407, + "grad_norm": 2.1649407313959386, + "learning_rate": 1.8142861750368157e-05, + "loss": 0.697, + "step": 2892 + }, + { + "epoch": 0.44269319051262435, + "grad_norm": 2.5351682813389043, + "learning_rate": 1.8141423005212168e-05, + "loss": 0.8544, + "step": 2893 + }, + { + "epoch": 0.4428462127008416, + "grad_norm": 2.4679098802704553, + "learning_rate": 1.8139983760059546e-05, + "loss": 0.8602, + "step": 2894 + }, + { + "epoch": 0.4429992348890589, + "grad_norm": 2.4500507125200657, + "learning_rate": 1.8138544014998676e-05, + "loss": 0.8638, + "step": 2895 + }, + { + "epoch": 0.4431522570772762, + "grad_norm": 2.907302704924948, + "learning_rate": 1.813710377011798e-05, + "loss": 0.8541, + "step": 2896 + }, + { + "epoch": 0.4433052792654935, + "grad_norm": 2.668616754615131, + "learning_rate": 1.813566302550591e-05, + "loss": 0.8274, + "step": 2897 + }, + { + "epoch": 0.4434583014537108, + "grad_norm": 2.3327329427740255, + "learning_rate": 1.8134221781250946e-05, + "loss": 0.785, + "step": 2898 + }, + { + "epoch": 0.44361132364192807, + "grad_norm": 2.276328368333612, + "learning_rate": 1.81327800374416e-05, + "loss": 0.7404, + "step": 2899 + }, + { + "epoch": 0.44376434583014535, + "grad_norm": 2.196279587222736, + "learning_rate": 1.8131337794166417e-05, + "loss": 0.7991, + "step": 2900 + }, + { + "epoch": 0.4439173680183627, + "grad_norm": 2.4529440496173684, + "learning_rate": 1.812989505151397e-05, + 
"loss": 0.8399, + "step": 2901 + }, + { + "epoch": 0.44407039020657996, + "grad_norm": 2.2537061718280325, + "learning_rate": 1.8128451809572865e-05, + "loss": 0.7269, + "step": 2902 + }, + { + "epoch": 0.44422341239479723, + "grad_norm": 2.322994259587146, + "learning_rate": 1.8127008068431734e-05, + "loss": 0.7474, + "step": 2903 + }, + { + "epoch": 0.4443764345830145, + "grad_norm": 2.3975452426239405, + "learning_rate": 1.8125563828179244e-05, + "loss": 0.8157, + "step": 2904 + }, + { + "epoch": 0.44452945677123185, + "grad_norm": 2.0799712070502197, + "learning_rate": 1.8124119088904094e-05, + "loss": 0.7465, + "step": 2905 + }, + { + "epoch": 0.4446824789594491, + "grad_norm": 2.445692299342647, + "learning_rate": 1.8122673850695007e-05, + "loss": 0.833, + "step": 2906 + }, + { + "epoch": 0.4448355011476664, + "grad_norm": 2.347753895749705, + "learning_rate": 1.8121228113640744e-05, + "loss": 0.6599, + "step": 2907 + }, + { + "epoch": 0.4449885233358837, + "grad_norm": 2.294931489132973, + "learning_rate": 1.8119781877830093e-05, + "loss": 0.901, + "step": 2908 + }, + { + "epoch": 0.445141545524101, + "grad_norm": 2.2286810517605655, + "learning_rate": 1.8118335143351868e-05, + "loss": 0.7316, + "step": 2909 + }, + { + "epoch": 0.4452945677123183, + "grad_norm": 2.4912510299578057, + "learning_rate": 1.8116887910294925e-05, + "loss": 0.8443, + "step": 2910 + }, + { + "epoch": 0.44544758990053557, + "grad_norm": 2.459604830145821, + "learning_rate": 1.8115440178748142e-05, + "loss": 0.7226, + "step": 2911 + }, + { + "epoch": 0.44560061208875285, + "grad_norm": 2.6383148521577873, + "learning_rate": 1.811399194880043e-05, + "loss": 0.8549, + "step": 2912 + }, + { + "epoch": 0.4457536342769702, + "grad_norm": 2.280528485015101, + "learning_rate": 1.8112543220540733e-05, + "loss": 0.8502, + "step": 2913 + }, + { + "epoch": 0.44590665646518746, + "grad_norm": 2.4402696603587426, + "learning_rate": 1.8111093994058017e-05, + "loss": 0.8576, + "step": 2914 + }, + { + "epoch": 0.44605967865340473, + "grad_norm": 2.2925670057229977, + "learning_rate": 1.810964426944129e-05, + "loss": 0.7551, + "step": 2915 + }, + { + "epoch": 0.446212700841622, + "grad_norm": 2.2902795548425723, + "learning_rate": 1.8108194046779584e-05, + "loss": 0.7498, + "step": 2916 + }, + { + "epoch": 0.44636572302983935, + "grad_norm": 2.510625241344344, + "learning_rate": 1.8106743326161957e-05, + "loss": 0.826, + "step": 2917 + }, + { + "epoch": 0.4465187452180566, + "grad_norm": 2.4701981018218793, + "learning_rate": 1.810529210767751e-05, + "loss": 0.8336, + "step": 2918 + }, + { + "epoch": 0.4466717674062739, + "grad_norm": 2.5259770545154865, + "learning_rate": 1.8103840391415372e-05, + "loss": 0.7502, + "step": 2919 + }, + { + "epoch": 0.4468247895944912, + "grad_norm": 2.5249562507172287, + "learning_rate": 1.810238817746469e-05, + "loss": 0.7999, + "step": 2920 + }, + { + "epoch": 0.4469778117827085, + "grad_norm": 2.4396115738869444, + "learning_rate": 1.8100935465914654e-05, + "loss": 0.8055, + "step": 2921 + }, + { + "epoch": 0.4471308339709258, + "grad_norm": 2.3065700824046282, + "learning_rate": 1.8099482256854477e-05, + "loss": 0.8083, + "step": 2922 + }, + { + "epoch": 0.44728385615914307, + "grad_norm": 2.349949003979491, + "learning_rate": 1.8098028550373415e-05, + "loss": 0.8084, + "step": 2923 + }, + { + "epoch": 0.44743687834736035, + "grad_norm": 2.389430202044748, + "learning_rate": 1.8096574346560736e-05, + "loss": 0.782, + "step": 2924 + }, + { + "epoch": 0.4475899005355777, + "grad_norm": 
2.210746221263167, + "learning_rate": 1.809511964550575e-05, + "loss": 0.6941, + "step": 2925 + }, + { + "epoch": 0.44774292272379496, + "grad_norm": 2.1626760051564897, + "learning_rate": 1.8093664447297802e-05, + "loss": 0.796, + "step": 2926 + }, + { + "epoch": 0.44789594491201223, + "grad_norm": 2.251030501988546, + "learning_rate": 1.809220875202626e-05, + "loss": 0.8277, + "step": 2927 + }, + { + "epoch": 0.4480489671002295, + "grad_norm": 2.1946018916323435, + "learning_rate": 1.8090752559780515e-05, + "loss": 0.7804, + "step": 2928 + }, + { + "epoch": 0.44820198928844684, + "grad_norm": 2.139406362787451, + "learning_rate": 1.8089295870650007e-05, + "loss": 0.7893, + "step": 2929 + }, + { + "epoch": 0.4483550114766641, + "grad_norm": 2.648584053744639, + "learning_rate": 1.8087838684724196e-05, + "loss": 0.7844, + "step": 2930 + }, + { + "epoch": 0.4485080336648814, + "grad_norm": 2.8523250188221625, + "learning_rate": 1.8086381002092573e-05, + "loss": 0.8421, + "step": 2931 + }, + { + "epoch": 0.4486610558530987, + "grad_norm": 2.316275777085334, + "learning_rate": 1.8084922822844655e-05, + "loss": 0.8628, + "step": 2932 + }, + { + "epoch": 0.448814078041316, + "grad_norm": 2.7344218627285857, + "learning_rate": 1.8083464147069997e-05, + "loss": 0.7968, + "step": 2933 + }, + { + "epoch": 0.4489671002295333, + "grad_norm": 2.5515179425547747, + "learning_rate": 1.8082004974858186e-05, + "loss": 0.6988, + "step": 2934 + }, + { + "epoch": 0.44912012241775057, + "grad_norm": 2.274009925043369, + "learning_rate": 1.808054530629883e-05, + "loss": 0.8237, + "step": 2935 + }, + { + "epoch": 0.44927314460596784, + "grad_norm": 2.2219946969109006, + "learning_rate": 1.8079085141481577e-05, + "loss": 0.8523, + "step": 2936 + }, + { + "epoch": 0.4494261667941852, + "grad_norm": 2.2150504476863295, + "learning_rate": 1.8077624480496102e-05, + "loss": 0.8053, + "step": 2937 + }, + { + "epoch": 0.44957918898240246, + "grad_norm": 2.419606454998503, + "learning_rate": 1.8076163323432104e-05, + "loss": 0.8921, + "step": 2938 + }, + { + "epoch": 0.44973221117061973, + "grad_norm": 2.36212827728317, + "learning_rate": 1.8074701670379324e-05, + "loss": 0.7622, + "step": 2939 + }, + { + "epoch": 0.449885233358837, + "grad_norm": 2.3272707660273726, + "learning_rate": 1.8073239521427524e-05, + "loss": 0.8535, + "step": 2940 + }, + { + "epoch": 0.45003825554705434, + "grad_norm": 2.516712688408001, + "learning_rate": 1.8071776876666506e-05, + "loss": 0.8504, + "step": 2941 + }, + { + "epoch": 0.4501912777352716, + "grad_norm": 2.223349887585654, + "learning_rate": 1.807031373618609e-05, + "loss": 0.8479, + "step": 2942 + }, + { + "epoch": 0.4503442999234889, + "grad_norm": 2.251986025412135, + "learning_rate": 1.806885010007614e-05, + "loss": 0.7571, + "step": 2943 + }, + { + "epoch": 0.4504973221117062, + "grad_norm": 2.910578161536275, + "learning_rate": 1.8067385968426533e-05, + "loss": 0.8832, + "step": 2944 + }, + { + "epoch": 0.4506503442999235, + "grad_norm": 2.5658246641317435, + "learning_rate": 1.8065921341327203e-05, + "loss": 0.8364, + "step": 2945 + }, + { + "epoch": 0.4508033664881408, + "grad_norm": 2.5441684589337923, + "learning_rate": 1.806445621886808e-05, + "loss": 0.8535, + "step": 2946 + }, + { + "epoch": 0.45095638867635807, + "grad_norm": 2.346003869170097, + "learning_rate": 1.806299060113916e-05, + "loss": 0.7937, + "step": 2947 + }, + { + "epoch": 0.45110941086457534, + "grad_norm": 2.3686802916882264, + "learning_rate": 1.8061524488230443e-05, + "loss": 0.9208, + "step": 2948 
+ }, + { + "epoch": 0.4512624330527927, + "grad_norm": 2.315318897085286, + "learning_rate": 1.8060057880231966e-05, + "loss": 0.7576, + "step": 2949 + }, + { + "epoch": 0.45141545524100996, + "grad_norm": 2.304204698051078, + "learning_rate": 1.8058590777233807e-05, + "loss": 0.8123, + "step": 2950 + }, + { + "epoch": 0.45156847742922723, + "grad_norm": 2.4172138592186547, + "learning_rate": 1.805712317932606e-05, + "loss": 0.7145, + "step": 2951 + }, + { + "epoch": 0.4517214996174445, + "grad_norm": 2.1016129747785213, + "learning_rate": 1.8055655086598864e-05, + "loss": 0.7545, + "step": 2952 + }, + { + "epoch": 0.45187452180566184, + "grad_norm": 2.49976385009864, + "learning_rate": 1.8054186499142373e-05, + "loss": 0.8997, + "step": 2953 + }, + { + "epoch": 0.4520275439938791, + "grad_norm": 2.1015201909120953, + "learning_rate": 1.8052717417046776e-05, + "loss": 0.7667, + "step": 2954 + }, + { + "epoch": 0.4521805661820964, + "grad_norm": 2.191104113892472, + "learning_rate": 1.8051247840402305e-05, + "loss": 0.7319, + "step": 2955 + }, + { + "epoch": 0.4523335883703137, + "grad_norm": 2.4834077756294, + "learning_rate": 1.8049777769299207e-05, + "loss": 0.8395, + "step": 2956 + }, + { + "epoch": 0.452486610558531, + "grad_norm": 2.6233714667161534, + "learning_rate": 1.8048307203827766e-05, + "loss": 0.7377, + "step": 2957 + }, + { + "epoch": 0.4526396327467483, + "grad_norm": 2.3852562290363855, + "learning_rate": 1.804683614407829e-05, + "loss": 0.8138, + "step": 2958 + }, + { + "epoch": 0.45279265493496557, + "grad_norm": 2.405802744082427, + "learning_rate": 1.8045364590141132e-05, + "loss": 0.7731, + "step": 2959 + }, + { + "epoch": 0.45294567712318284, + "grad_norm": 2.281624831951309, + "learning_rate": 1.804389254210666e-05, + "loss": 0.7172, + "step": 2960 + }, + { + "epoch": 0.4530986993114002, + "grad_norm": 2.5650189896049183, + "learning_rate": 1.8042420000065276e-05, + "loss": 0.8563, + "step": 2961 + }, + { + "epoch": 0.45325172149961745, + "grad_norm": 2.186990313937297, + "learning_rate": 1.8040946964107423e-05, + "loss": 0.7669, + "step": 2962 + }, + { + "epoch": 0.45340474368783473, + "grad_norm": 2.5993364039741147, + "learning_rate": 1.8039473434323555e-05, + "loss": 0.8328, + "step": 2963 + }, + { + "epoch": 0.453557765876052, + "grad_norm": 2.690424314638924, + "learning_rate": 1.803799941080418e-05, + "loss": 0.7906, + "step": 2964 + }, + { + "epoch": 0.45371078806426934, + "grad_norm": 2.3615307843715803, + "learning_rate": 1.803652489363981e-05, + "loss": 0.8846, + "step": 2965 + }, + { + "epoch": 0.4538638102524866, + "grad_norm": 2.3028327800647923, + "learning_rate": 1.8035049882921008e-05, + "loss": 0.7632, + "step": 2966 + }, + { + "epoch": 0.4540168324407039, + "grad_norm": 2.6992236760908104, + "learning_rate": 1.803357437873836e-05, + "loss": 0.8173, + "step": 2967 + }, + { + "epoch": 0.4541698546289212, + "grad_norm": 2.1987188003454072, + "learning_rate": 1.8032098381182483e-05, + "loss": 0.8117, + "step": 2968 + }, + { + "epoch": 0.4543228768171385, + "grad_norm": 2.6263119915845605, + "learning_rate": 1.8030621890344023e-05, + "loss": 0.8381, + "step": 2969 + }, + { + "epoch": 0.4544758990053558, + "grad_norm": 2.5134499182053296, + "learning_rate": 1.8029144906313653e-05, + "loss": 0.8841, + "step": 2970 + }, + { + "epoch": 0.45462892119357307, + "grad_norm": 2.3417192495256867, + "learning_rate": 1.8027667429182087e-05, + "loss": 0.765, + "step": 2971 + }, + { + "epoch": 0.45478194338179034, + "grad_norm": 2.3521797349686455, + 
"learning_rate": 1.8026189459040063e-05, + "loss": 0.7582, + "step": 2972 + }, + { + "epoch": 0.4549349655700077, + "grad_norm": 2.2634494087856964, + "learning_rate": 1.802471099597834e-05, + "loss": 0.7566, + "step": 2973 + }, + { + "epoch": 0.45508798775822495, + "grad_norm": 2.757423631425634, + "learning_rate": 1.8023232040087723e-05, + "loss": 0.7558, + "step": 2974 + }, + { + "epoch": 0.45524100994644223, + "grad_norm": 2.4974299117094536, + "learning_rate": 1.802175259145904e-05, + "loss": 0.904, + "step": 2975 + }, + { + "epoch": 0.4553940321346595, + "grad_norm": 2.4164896648530365, + "learning_rate": 1.8020272650183152e-05, + "loss": 0.7928, + "step": 2976 + }, + { + "epoch": 0.45554705432287684, + "grad_norm": 2.278461747453805, + "learning_rate": 1.801879221635094e-05, + "loss": 0.7486, + "step": 2977 + }, + { + "epoch": 0.4557000765110941, + "grad_norm": 2.325475613320422, + "learning_rate": 1.8017311290053335e-05, + "loss": 0.7439, + "step": 2978 + }, + { + "epoch": 0.4558530986993114, + "grad_norm": 2.5560265554303316, + "learning_rate": 1.8015829871381272e-05, + "loss": 0.8628, + "step": 2979 + }, + { + "epoch": 0.4560061208875287, + "grad_norm": 2.7690977507099417, + "learning_rate": 1.8014347960425742e-05, + "loss": 0.8506, + "step": 2980 + }, + { + "epoch": 0.456159143075746, + "grad_norm": 2.4905184954891544, + "learning_rate": 1.801286555727775e-05, + "loss": 0.7848, + "step": 2981 + }, + { + "epoch": 0.4563121652639633, + "grad_norm": 2.7241255638703317, + "learning_rate": 1.8011382662028343e-05, + "loss": 0.7943, + "step": 2982 + }, + { + "epoch": 0.45646518745218057, + "grad_norm": 2.3442771277370404, + "learning_rate": 1.800989927476858e-05, + "loss": 0.8593, + "step": 2983 + }, + { + "epoch": 0.45661820964039784, + "grad_norm": 2.3831996561185234, + "learning_rate": 1.800841539558957e-05, + "loss": 0.7683, + "step": 2984 + }, + { + "epoch": 0.4567712318286152, + "grad_norm": 2.529846692344636, + "learning_rate": 1.800693102458244e-05, + "loss": 0.7843, + "step": 2985 + }, + { + "epoch": 0.45692425401683245, + "grad_norm": 2.5994620405027686, + "learning_rate": 1.8005446161838355e-05, + "loss": 0.7773, + "step": 2986 + }, + { + "epoch": 0.45707727620504973, + "grad_norm": 2.1902994542558947, + "learning_rate": 1.8003960807448505e-05, + "loss": 0.8147, + "step": 2987 + }, + { + "epoch": 0.457230298393267, + "grad_norm": 2.666362830144391, + "learning_rate": 1.800247496150411e-05, + "loss": 0.8922, + "step": 2988 + }, + { + "epoch": 0.45738332058148434, + "grad_norm": 2.348016100092245, + "learning_rate": 1.800098862409642e-05, + "loss": 0.8225, + "step": 2989 + }, + { + "epoch": 0.4575363427697016, + "grad_norm": 2.6178249141604364, + "learning_rate": 1.7999501795316712e-05, + "loss": 0.7883, + "step": 2990 + }, + { + "epoch": 0.4576893649579189, + "grad_norm": 2.6567570604699315, + "learning_rate": 1.7998014475256313e-05, + "loss": 0.8205, + "step": 2991 + }, + { + "epoch": 0.4578423871461362, + "grad_norm": 2.1047805071412014, + "learning_rate": 1.7996526664006553e-05, + "loss": 0.6739, + "step": 2992 + }, + { + "epoch": 0.45799540933435345, + "grad_norm": 2.404156937766429, + "learning_rate": 1.7995038361658813e-05, + "loss": 0.687, + "step": 2993 + }, + { + "epoch": 0.4581484315225708, + "grad_norm": 2.3096394834735277, + "learning_rate": 1.7993549568304485e-05, + "loss": 0.8982, + "step": 2994 + }, + { + "epoch": 0.45830145371078806, + "grad_norm": 2.3445700625391535, + "learning_rate": 1.799206028403501e-05, + "loss": 0.8137, + "step": 2995 + }, + { + 
"epoch": 0.45845447589900534, + "grad_norm": 2.370222594067083, + "learning_rate": 1.7990570508941845e-05, + "loss": 0.6931, + "step": 2996 + }, + { + "epoch": 0.4586074980872226, + "grad_norm": 2.1936059754346706, + "learning_rate": 1.7989080243116486e-05, + "loss": 0.6917, + "step": 2997 + }, + { + "epoch": 0.45876052027543995, + "grad_norm": 2.139823927624093, + "learning_rate": 1.7987589486650457e-05, + "loss": 0.7353, + "step": 2998 + }, + { + "epoch": 0.45891354246365723, + "grad_norm": 2.418548770240328, + "learning_rate": 1.7986098239635313e-05, + "loss": 0.7797, + "step": 2999 + }, + { + "epoch": 0.4590665646518745, + "grad_norm": 2.427079798960257, + "learning_rate": 1.798460650216263e-05, + "loss": 0.8436, + "step": 3000 + }, + { + "epoch": 0.4592195868400918, + "grad_norm": 3.182514965849544, + "learning_rate": 1.7983114274324026e-05, + "loss": 0.9386, + "step": 3001 + }, + { + "epoch": 0.4593726090283091, + "grad_norm": 2.396757540197267, + "learning_rate": 1.7981621556211146e-05, + "loss": 0.7159, + "step": 3002 + }, + { + "epoch": 0.4595256312165264, + "grad_norm": 2.5679978233582004, + "learning_rate": 1.7980128347915663e-05, + "loss": 0.8365, + "step": 3003 + }, + { + "epoch": 0.4596786534047437, + "grad_norm": 2.38552520369705, + "learning_rate": 1.7978634649529276e-05, + "loss": 0.9041, + "step": 3004 + }, + { + "epoch": 0.45983167559296095, + "grad_norm": 2.2016042252587416, + "learning_rate": 1.7977140461143724e-05, + "loss": 0.7491, + "step": 3005 + }, + { + "epoch": 0.4599846977811783, + "grad_norm": 2.2356987100497134, + "learning_rate": 1.797564578285077e-05, + "loss": 0.7265, + "step": 3006 + }, + { + "epoch": 0.46013771996939556, + "grad_norm": 2.2342192382296, + "learning_rate": 1.797415061474221e-05, + "loss": 0.8228, + "step": 3007 + }, + { + "epoch": 0.46029074215761284, + "grad_norm": 2.4500043399004334, + "learning_rate": 1.7972654956909864e-05, + "loss": 0.755, + "step": 3008 + }, + { + "epoch": 0.4604437643458301, + "grad_norm": 2.0413908159279393, + "learning_rate": 1.7971158809445586e-05, + "loss": 0.6604, + "step": 3009 + }, + { + "epoch": 0.46059678653404745, + "grad_norm": 2.4438109905320067, + "learning_rate": 1.7969662172441266e-05, + "loss": 0.7801, + "step": 3010 + }, + { + "epoch": 0.46074980872226473, + "grad_norm": 2.5796430716040493, + "learning_rate": 1.796816504598881e-05, + "loss": 0.7737, + "step": 3011 + }, + { + "epoch": 0.460902830910482, + "grad_norm": 2.4072337562016366, + "learning_rate": 1.7966667430180168e-05, + "loss": 0.7351, + "step": 3012 + }, + { + "epoch": 0.4610558530986993, + "grad_norm": 2.4134621653012025, + "learning_rate": 1.7965169325107316e-05, + "loss": 0.679, + "step": 3013 + }, + { + "epoch": 0.4612088752869166, + "grad_norm": 2.382826189730748, + "learning_rate": 1.7963670730862253e-05, + "loss": 0.8454, + "step": 3014 + }, + { + "epoch": 0.4613618974751339, + "grad_norm": 2.461863361146385, + "learning_rate": 1.796217164753702e-05, + "loss": 0.8673, + "step": 3015 + }, + { + "epoch": 0.4615149196633512, + "grad_norm": 2.162119030473225, + "learning_rate": 1.7960672075223674e-05, + "loss": 0.7094, + "step": 3016 + }, + { + "epoch": 0.46166794185156845, + "grad_norm": 2.4495640775338123, + "learning_rate": 1.7959172014014315e-05, + "loss": 0.8153, + "step": 3017 + }, + { + "epoch": 0.4618209640397858, + "grad_norm": 2.8686927876563653, + "learning_rate": 1.7957671464001064e-05, + "loss": 0.7448, + "step": 3018 + }, + { + "epoch": 0.46197398622800306, + "grad_norm": 2.375874766610349, + "learning_rate": 
1.7956170425276084e-05, + "loss": 0.7054, + "step": 3019 + }, + { + "epoch": 0.46212700841622034, + "grad_norm": 2.2043885507178445, + "learning_rate": 1.7954668897931545e-05, + "loss": 0.8179, + "step": 3020 + }, + { + "epoch": 0.4622800306044376, + "grad_norm": 2.2472971973036433, + "learning_rate": 1.7953166882059677e-05, + "loss": 0.8027, + "step": 3021 + }, + { + "epoch": 0.46243305279265495, + "grad_norm": 2.1792083036915866, + "learning_rate": 1.7951664377752714e-05, + "loss": 0.6854, + "step": 3022 + }, + { + "epoch": 0.46258607498087223, + "grad_norm": 2.4282694085901917, + "learning_rate": 1.7950161385102936e-05, + "loss": 0.8682, + "step": 3023 + }, + { + "epoch": 0.4627390971690895, + "grad_norm": 2.5164884020527114, + "learning_rate": 1.7948657904202645e-05, + "loss": 0.7298, + "step": 3024 + }, + { + "epoch": 0.4628921193573068, + "grad_norm": 2.2296415956211653, + "learning_rate": 1.7947153935144182e-05, + "loss": 0.7645, + "step": 3025 + }, + { + "epoch": 0.4630451415455241, + "grad_norm": 2.4863277949035556, + "learning_rate": 1.79456494780199e-05, + "loss": 0.8104, + "step": 3026 + }, + { + "epoch": 0.4631981637337414, + "grad_norm": 2.5801820662776995, + "learning_rate": 1.79441445329222e-05, + "loss": 0.8021, + "step": 3027 + }, + { + "epoch": 0.4633511859219587, + "grad_norm": 2.6857176336923536, + "learning_rate": 1.794263909994351e-05, + "loss": 0.8011, + "step": 3028 + }, + { + "epoch": 0.46350420811017595, + "grad_norm": 2.5215164941271895, + "learning_rate": 1.794113317917628e-05, + "loss": 0.9214, + "step": 3029 + }, + { + "epoch": 0.4636572302983933, + "grad_norm": 2.3776208831265877, + "learning_rate": 1.7939626770712996e-05, + "loss": 0.7993, + "step": 3030 + }, + { + "epoch": 0.46381025248661056, + "grad_norm": 2.3370137755310134, + "learning_rate": 1.7938119874646173e-05, + "loss": 0.9326, + "step": 3031 + }, + { + "epoch": 0.46396327467482784, + "grad_norm": 2.2444810838602196, + "learning_rate": 1.793661249106835e-05, + "loss": 0.78, + "step": 3032 + }, + { + "epoch": 0.4641162968630451, + "grad_norm": 2.626977361377603, + "learning_rate": 1.793510462007211e-05, + "loss": 0.8295, + "step": 3033 + }, + { + "epoch": 0.46426931905126245, + "grad_norm": 2.279663603934519, + "learning_rate": 1.793359626175005e-05, + "loss": 0.7931, + "step": 3034 + }, + { + "epoch": 0.46442234123947973, + "grad_norm": 2.2209566116817783, + "learning_rate": 1.793208741619481e-05, + "loss": 0.8991, + "step": 3035 + }, + { + "epoch": 0.464575363427697, + "grad_norm": 2.3367089021530467, + "learning_rate": 1.793057808349905e-05, + "loss": 0.6924, + "step": 3036 + }, + { + "epoch": 0.4647283856159143, + "grad_norm": 2.416662478259207, + "learning_rate": 1.7929068263755465e-05, + "loss": 0.9007, + "step": 3037 + }, + { + "epoch": 0.4648814078041316, + "grad_norm": 2.3202236234576756, + "learning_rate": 1.792755795705678e-05, + "loss": 0.8085, + "step": 3038 + }, + { + "epoch": 0.4650344299923489, + "grad_norm": 2.2844609011360815, + "learning_rate": 1.792604716349575e-05, + "loss": 0.5956, + "step": 3039 + }, + { + "epoch": 0.4651874521805662, + "grad_norm": 2.1019501713247006, + "learning_rate": 1.7924535883165154e-05, + "loss": 0.7306, + "step": 3040 + }, + { + "epoch": 0.46534047436878345, + "grad_norm": 2.371476838905052, + "learning_rate": 1.7923024116157806e-05, + "loss": 0.7912, + "step": 3041 + }, + { + "epoch": 0.4654934965570008, + "grad_norm": 2.2453509974503847, + "learning_rate": 1.7921511862566557e-05, + "loss": 0.7831, + "step": 3042 + }, + { + "epoch": 
0.46564651874521806, + "grad_norm": 2.516526580358889, + "learning_rate": 1.7919999122484273e-05, + "loss": 0.8007, + "step": 3043 + }, + { + "epoch": 0.46579954093343534, + "grad_norm": 2.3714993400159288, + "learning_rate": 1.791848589600386e-05, + "loss": 0.8676, + "step": 3044 + }, + { + "epoch": 0.4659525631216526, + "grad_norm": 2.4338479274309766, + "learning_rate": 1.791697218321825e-05, + "loss": 0.7427, + "step": 3045 + }, + { + "epoch": 0.46610558530986995, + "grad_norm": 2.377571211070433, + "learning_rate": 1.7915457984220406e-05, + "loss": 0.8151, + "step": 3046 + }, + { + "epoch": 0.46625860749808723, + "grad_norm": 2.3621546987748907, + "learning_rate": 1.7913943299103325e-05, + "loss": 0.7547, + "step": 3047 + }, + { + "epoch": 0.4664116296863045, + "grad_norm": 2.339112355183538, + "learning_rate": 1.7912428127960025e-05, + "loss": 0.83, + "step": 3048 + }, + { + "epoch": 0.4665646518745218, + "grad_norm": 2.685139879857841, + "learning_rate": 1.791091247088356e-05, + "loss": 0.8156, + "step": 3049 + }, + { + "epoch": 0.4667176740627391, + "grad_norm": 2.531331145019246, + "learning_rate": 1.790939632796701e-05, + "loss": 0.7796, + "step": 3050 + }, + { + "epoch": 0.4668706962509564, + "grad_norm": 2.4861225552288864, + "learning_rate": 1.7907879699303494e-05, + "loss": 0.7717, + "step": 3051 + }, + { + "epoch": 0.4670237184391737, + "grad_norm": 2.310760243025036, + "learning_rate": 1.790636258498615e-05, + "loss": 0.7763, + "step": 3052 + }, + { + "epoch": 0.46717674062739095, + "grad_norm": 2.3903087000299084, + "learning_rate": 1.7904844985108146e-05, + "loss": 0.7714, + "step": 3053 + }, + { + "epoch": 0.4673297628156083, + "grad_norm": 2.2206279036603482, + "learning_rate": 1.790332689976269e-05, + "loss": 0.7524, + "step": 3054 + }, + { + "epoch": 0.46748278500382556, + "grad_norm": 2.3187476587864095, + "learning_rate": 1.790180832904301e-05, + "loss": 0.8059, + "step": 3055 + }, + { + "epoch": 0.46763580719204284, + "grad_norm": 2.710134518322272, + "learning_rate": 1.7900289273042367e-05, + "loss": 0.7208, + "step": 3056 + }, + { + "epoch": 0.4677888293802601, + "grad_norm": 2.35421789922996, + "learning_rate": 1.7898769731854058e-05, + "loss": 0.7486, + "step": 3057 + }, + { + "epoch": 0.46794185156847745, + "grad_norm": 2.286655013492438, + "learning_rate": 1.7897249705571397e-05, + "loss": 0.7359, + "step": 3058 + }, + { + "epoch": 0.46809487375669473, + "grad_norm": 2.11297135275565, + "learning_rate": 1.7895729194287733e-05, + "loss": 0.6781, + "step": 3059 + }, + { + "epoch": 0.468247895944912, + "grad_norm": 2.406508933681986, + "learning_rate": 1.789420819809646e-05, + "loss": 0.7598, + "step": 3060 + }, + { + "epoch": 0.4684009181331293, + "grad_norm": 2.6366510236837075, + "learning_rate": 1.7892686717090972e-05, + "loss": 0.9273, + "step": 3061 + }, + { + "epoch": 0.4685539403213466, + "grad_norm": 2.532411121338478, + "learning_rate": 1.789116475136472e-05, + "loss": 0.7786, + "step": 3062 + }, + { + "epoch": 0.4687069625095639, + "grad_norm": 2.594310936151825, + "learning_rate": 1.788964230101117e-05, + "loss": 0.8882, + "step": 3063 + }, + { + "epoch": 0.4688599846977812, + "grad_norm": 2.358180777677221, + "learning_rate": 1.788811936612382e-05, + "loss": 0.835, + "step": 3064 + }, + { + "epoch": 0.46901300688599845, + "grad_norm": 2.2454409418760903, + "learning_rate": 1.7886595946796202e-05, + "loss": 0.8277, + "step": 3065 + }, + { + "epoch": 0.4691660290742158, + "grad_norm": 2.329341381250108, + "learning_rate": 1.7885072043121874e-05, + 
"loss": 0.8119, + "step": 3066 + }, + { + "epoch": 0.46931905126243306, + "grad_norm": 1.9843107842254468, + "learning_rate": 1.7883547655194426e-05, + "loss": 0.7151, + "step": 3067 + }, + { + "epoch": 0.46947207345065034, + "grad_norm": 2.3293478291000755, + "learning_rate": 1.7882022783107475e-05, + "loss": 0.7741, + "step": 3068 + }, + { + "epoch": 0.4696250956388676, + "grad_norm": 2.4350391900241997, + "learning_rate": 1.7880497426954673e-05, + "loss": 0.8019, + "step": 3069 + }, + { + "epoch": 0.46977811782708495, + "grad_norm": 2.144141917751492, + "learning_rate": 1.787897158682969e-05, + "loss": 0.7007, + "step": 3070 + }, + { + "epoch": 0.46993114001530223, + "grad_norm": 2.174973634706149, + "learning_rate": 1.787744526282625e-05, + "loss": 0.792, + "step": 3071 + }, + { + "epoch": 0.4700841622035195, + "grad_norm": 2.618268443070123, + "learning_rate": 1.7875918455038068e-05, + "loss": 0.9176, + "step": 3072 + }, + { + "epoch": 0.4702371843917368, + "grad_norm": 2.128847386701305, + "learning_rate": 1.7874391163558926e-05, + "loss": 0.7339, + "step": 3073 + }, + { + "epoch": 0.4703902065799541, + "grad_norm": 2.4522718758498314, + "learning_rate": 1.787286338848262e-05, + "loss": 0.8111, + "step": 3074 + }, + { + "epoch": 0.4705432287681714, + "grad_norm": 2.2474549670739736, + "learning_rate": 1.7871335129902974e-05, + "loss": 0.8114, + "step": 3075 + }, + { + "epoch": 0.4706962509563887, + "grad_norm": 2.4366628479396724, + "learning_rate": 1.7869806387913845e-05, + "loss": 0.8638, + "step": 3076 + }, + { + "epoch": 0.47084927314460595, + "grad_norm": 2.220474175033701, + "learning_rate": 1.7868277162609116e-05, + "loss": 0.7199, + "step": 3077 + }, + { + "epoch": 0.4710022953328233, + "grad_norm": 2.4551268926851, + "learning_rate": 1.786674745408271e-05, + "loss": 0.7692, + "step": 3078 + }, + { + "epoch": 0.47115531752104056, + "grad_norm": 2.464234476027355, + "learning_rate": 1.7865217262428564e-05, + "loss": 0.8443, + "step": 3079 + }, + { + "epoch": 0.47130833970925784, + "grad_norm": 2.422126071311725, + "learning_rate": 1.786368658774066e-05, + "loss": 0.742, + "step": 3080 + }, + { + "epoch": 0.4714613618974751, + "grad_norm": 2.5067019319029566, + "learning_rate": 1.7862155430112997e-05, + "loss": 0.7451, + "step": 3081 + }, + { + "epoch": 0.47161438408569245, + "grad_norm": 2.260090103969428, + "learning_rate": 1.7860623789639614e-05, + "loss": 0.715, + "step": 3082 + }, + { + "epoch": 0.47176740627390973, + "grad_norm": 2.57018335192414, + "learning_rate": 1.7859091666414574e-05, + "loss": 0.8114, + "step": 3083 + }, + { + "epoch": 0.471920428462127, + "grad_norm": 2.3585692516115517, + "learning_rate": 1.7857559060531967e-05, + "loss": 0.7992, + "step": 3084 + }, + { + "epoch": 0.4720734506503443, + "grad_norm": 2.35928380554881, + "learning_rate": 1.785602597208592e-05, + "loss": 0.8346, + "step": 3085 + }, + { + "epoch": 0.4722264728385616, + "grad_norm": 2.462727119352491, + "learning_rate": 1.7854492401170587e-05, + "loss": 0.6754, + "step": 3086 + }, + { + "epoch": 0.4723794950267789, + "grad_norm": 2.196818002617913, + "learning_rate": 1.7852958347880147e-05, + "loss": 0.832, + "step": 3087 + }, + { + "epoch": 0.4725325172149962, + "grad_norm": 2.449395292618889, + "learning_rate": 1.7851423812308814e-05, + "loss": 0.8474, + "step": 3088 + }, + { + "epoch": 0.47268553940321345, + "grad_norm": 2.0127666982797643, + "learning_rate": 1.784988879455083e-05, + "loss": 0.6382, + "step": 3089 + }, + { + "epoch": 0.4728385615914308, + "grad_norm": 
2.339434631709559, + "learning_rate": 1.7848353294700467e-05, + "loss": 0.7453, + "step": 3090 + }, + { + "epoch": 0.47299158377964806, + "grad_norm": 2.36812382885095, + "learning_rate": 1.7846817312852025e-05, + "loss": 0.7293, + "step": 3091 + }, + { + "epoch": 0.47314460596786534, + "grad_norm": 2.3238439044790975, + "learning_rate": 1.7845280849099835e-05, + "loss": 0.8259, + "step": 3092 + }, + { + "epoch": 0.4732976281560826, + "grad_norm": 2.26683061558248, + "learning_rate": 1.7843743903538255e-05, + "loss": 0.7869, + "step": 3093 + }, + { + "epoch": 0.47345065034429995, + "grad_norm": 2.2685721302716657, + "learning_rate": 1.784220647626168e-05, + "loss": 0.7982, + "step": 3094 + }, + { + "epoch": 0.47360367253251723, + "grad_norm": 2.1515562913610893, + "learning_rate": 1.7840668567364524e-05, + "loss": 0.7836, + "step": 3095 + }, + { + "epoch": 0.4737566947207345, + "grad_norm": 2.2986525906713404, + "learning_rate": 1.783913017694124e-05, + "loss": 0.7999, + "step": 3096 + }, + { + "epoch": 0.4739097169089518, + "grad_norm": 2.14804110147054, + "learning_rate": 1.7837591305086304e-05, + "loss": 0.8341, + "step": 3097 + }, + { + "epoch": 0.4740627390971691, + "grad_norm": 2.032811223621796, + "learning_rate": 1.7836051951894226e-05, + "loss": 0.7648, + "step": 3098 + }, + { + "epoch": 0.4742157612853864, + "grad_norm": 2.366229763538631, + "learning_rate": 1.7834512117459542e-05, + "loss": 0.7466, + "step": 3099 + }, + { + "epoch": 0.4743687834736037, + "grad_norm": 2.4477203066920836, + "learning_rate": 1.783297180187682e-05, + "loss": 0.8449, + "step": 3100 + }, + { + "epoch": 0.47452180566182095, + "grad_norm": 2.4758677333507015, + "learning_rate": 1.7831431005240655e-05, + "loss": 0.879, + "step": 3101 + }, + { + "epoch": 0.47467482785003823, + "grad_norm": 2.175130815541815, + "learning_rate": 1.782988972764568e-05, + "loss": 0.8596, + "step": 3102 + }, + { + "epoch": 0.47482785003825556, + "grad_norm": 2.6280119624613936, + "learning_rate": 1.7828347969186542e-05, + "loss": 0.8391, + "step": 3103 + }, + { + "epoch": 0.47498087222647284, + "grad_norm": 2.3033026539480175, + "learning_rate": 1.782680572995793e-05, + "loss": 0.8477, + "step": 3104 + }, + { + "epoch": 0.4751338944146901, + "grad_norm": 2.3324067024311006, + "learning_rate": 1.7825263010054565e-05, + "loss": 0.9281, + "step": 3105 + }, + { + "epoch": 0.4752869166029074, + "grad_norm": 2.4416725086218873, + "learning_rate": 1.7823719809571183e-05, + "loss": 0.7544, + "step": 3106 + }, + { + "epoch": 0.4754399387911247, + "grad_norm": 2.3461897741365663, + "learning_rate": 1.782217612860256e-05, + "loss": 0.7901, + "step": 3107 + }, + { + "epoch": 0.475592960979342, + "grad_norm": 2.258825700505424, + "learning_rate": 1.7820631967243502e-05, + "loss": 0.74, + "step": 3108 + }, + { + "epoch": 0.4757459831675593, + "grad_norm": 2.236350731521977, + "learning_rate": 1.781908732558884e-05, + "loss": 0.8043, + "step": 3109 + }, + { + "epoch": 0.47589900535577656, + "grad_norm": 2.306541883073562, + "learning_rate": 1.7817542203733435e-05, + "loss": 0.7783, + "step": 3110 + }, + { + "epoch": 0.4760520275439939, + "grad_norm": 2.132024776419034, + "learning_rate": 1.781599660177218e-05, + "loss": 0.7408, + "step": 3111 + }, + { + "epoch": 0.47620504973221117, + "grad_norm": 2.3182129805418006, + "learning_rate": 1.7814450519800003e-05, + "loss": 0.8, + "step": 3112 + }, + { + "epoch": 0.47635807192042845, + "grad_norm": 2.7774847658958324, + "learning_rate": 1.7812903957911845e-05, + "loss": 0.8339, + "step": 3113 + 
}, + { + "epoch": 0.4765110941086457, + "grad_norm": 2.4081664390527022, + "learning_rate": 1.781135691620269e-05, + "loss": 0.8104, + "step": 3114 + }, + { + "epoch": 0.47666411629686306, + "grad_norm": 2.2680414415315977, + "learning_rate": 1.7809809394767553e-05, + "loss": 0.7734, + "step": 3115 + }, + { + "epoch": 0.47681713848508034, + "grad_norm": 2.5455145450874874, + "learning_rate": 1.7808261393701466e-05, + "loss": 0.6636, + "step": 3116 + }, + { + "epoch": 0.4769701606732976, + "grad_norm": 2.402374800475588, + "learning_rate": 1.78067129130995e-05, + "loss": 0.8384, + "step": 3117 + }, + { + "epoch": 0.4771231828615149, + "grad_norm": 2.3897889539372366, + "learning_rate": 1.7805163953056755e-05, + "loss": 0.8397, + "step": 3118 + }, + { + "epoch": 0.4772762050497322, + "grad_norm": 2.4403644458499887, + "learning_rate": 1.7803614513668357e-05, + "loss": 0.8328, + "step": 3119 + }, + { + "epoch": 0.4774292272379495, + "grad_norm": 2.298660833843685, + "learning_rate": 1.7802064595029463e-05, + "loss": 0.7801, + "step": 3120 + }, + { + "epoch": 0.4775822494261668, + "grad_norm": 2.488715580053673, + "learning_rate": 1.7800514197235262e-05, + "loss": 0.9101, + "step": 3121 + }, + { + "epoch": 0.47773527161438406, + "grad_norm": 2.377244202377784, + "learning_rate": 1.7798963320380965e-05, + "loss": 0.7036, + "step": 3122 + }, + { + "epoch": 0.4778882938026014, + "grad_norm": 2.4289693996129165, + "learning_rate": 1.7797411964561825e-05, + "loss": 0.814, + "step": 3123 + }, + { + "epoch": 0.47804131599081867, + "grad_norm": 2.5574585527951905, + "learning_rate": 1.7795860129873113e-05, + "loss": 0.8414, + "step": 3124 + }, + { + "epoch": 0.47819433817903595, + "grad_norm": 2.145881998828633, + "learning_rate": 1.779430781641013e-05, + "loss": 0.7363, + "step": 3125 + }, + { + "epoch": 0.4783473603672532, + "grad_norm": 2.2443790813070317, + "learning_rate": 1.7792755024268213e-05, + "loss": 0.8263, + "step": 3126 + }, + { + "epoch": 0.47850038255547056, + "grad_norm": 2.4804357389915483, + "learning_rate": 1.7791201753542725e-05, + "loss": 0.9297, + "step": 3127 + }, + { + "epoch": 0.47865340474368784, + "grad_norm": 2.409155080634445, + "learning_rate": 1.778964800432906e-05, + "loss": 0.8459, + "step": 3128 + }, + { + "epoch": 0.4788064269319051, + "grad_norm": 2.0586342296667213, + "learning_rate": 1.7788093776722634e-05, + "loss": 0.7397, + "step": 3129 + }, + { + "epoch": 0.4789594491201224, + "grad_norm": 2.3936689258666943, + "learning_rate": 1.7786539070818903e-05, + "loss": 0.8269, + "step": 3130 + }, + { + "epoch": 0.4791124713083397, + "grad_norm": 2.359822665681357, + "learning_rate": 1.778498388671335e-05, + "loss": 0.7837, + "step": 3131 + }, + { + "epoch": 0.479265493496557, + "grad_norm": 2.6016378686683455, + "learning_rate": 1.778342822450148e-05, + "loss": 0.8188, + "step": 3132 + }, + { + "epoch": 0.4794185156847743, + "grad_norm": 2.2391289244689783, + "learning_rate": 1.778187208427883e-05, + "loss": 0.7139, + "step": 3133 + }, + { + "epoch": 0.47957153787299156, + "grad_norm": 2.4395509973947904, + "learning_rate": 1.7780315466140975e-05, + "loss": 0.6954, + "step": 3134 + }, + { + "epoch": 0.4797245600612089, + "grad_norm": 2.3167840415266783, + "learning_rate": 1.777875837018351e-05, + "loss": 0.8495, + "step": 3135 + }, + { + "epoch": 0.47987758224942617, + "grad_norm": 2.207197768229709, + "learning_rate": 1.7777200796502065e-05, + "loss": 0.7324, + "step": 3136 + }, + { + "epoch": 0.48003060443764345, + "grad_norm": 2.3388895918578405, + 
"learning_rate": 1.7775642745192293e-05, + "loss": 0.8092, + "step": 3137 + }, + { + "epoch": 0.4801836266258607, + "grad_norm": 2.5991436129658134, + "learning_rate": 1.777408421634988e-05, + "loss": 0.7951, + "step": 3138 + }, + { + "epoch": 0.48033664881407806, + "grad_norm": 2.614271482790651, + "learning_rate": 1.7772525210070545e-05, + "loss": 0.9615, + "step": 3139 + }, + { + "epoch": 0.48048967100229534, + "grad_norm": 2.280529742595915, + "learning_rate": 1.777096572645003e-05, + "loss": 0.8814, + "step": 3140 + }, + { + "epoch": 0.4806426931905126, + "grad_norm": 2.22455747968137, + "learning_rate": 1.7769405765584112e-05, + "loss": 0.7447, + "step": 3141 + }, + { + "epoch": 0.4807957153787299, + "grad_norm": 1.9704088622500777, + "learning_rate": 1.7767845327568585e-05, + "loss": 0.6619, + "step": 3142 + }, + { + "epoch": 0.4809487375669472, + "grad_norm": 2.440214460615727, + "learning_rate": 1.7766284412499295e-05, + "loss": 0.829, + "step": 3143 + }, + { + "epoch": 0.4811017597551645, + "grad_norm": 2.474091480639532, + "learning_rate": 1.7764723020472098e-05, + "loss": 0.8021, + "step": 3144 + }, + { + "epoch": 0.4812547819433818, + "grad_norm": 2.159308906503231, + "learning_rate": 1.7763161151582878e-05, + "loss": 0.7575, + "step": 3145 + }, + { + "epoch": 0.48140780413159906, + "grad_norm": 2.2634318435235814, + "learning_rate": 1.7761598805927564e-05, + "loss": 0.7508, + "step": 3146 + }, + { + "epoch": 0.4815608263198164, + "grad_norm": 2.2808387798097574, + "learning_rate": 1.7760035983602107e-05, + "loss": 0.8432, + "step": 3147 + }, + { + "epoch": 0.48171384850803367, + "grad_norm": 2.2333734359872213, + "learning_rate": 1.775847268470248e-05, + "loss": 0.7977, + "step": 3148 + }, + { + "epoch": 0.48186687069625095, + "grad_norm": 2.175497185361418, + "learning_rate": 1.7756908909324697e-05, + "loss": 0.8576, + "step": 3149 + }, + { + "epoch": 0.4820198928844682, + "grad_norm": 2.141222377290015, + "learning_rate": 1.7755344657564792e-05, + "loss": 0.7083, + "step": 3150 + }, + { + "epoch": 0.48217291507268556, + "grad_norm": 2.204712165087772, + "learning_rate": 1.7753779929518834e-05, + "loss": 0.7488, + "step": 3151 + }, + { + "epoch": 0.48232593726090284, + "grad_norm": 2.6909546478854023, + "learning_rate": 1.7752214725282912e-05, + "loss": 0.8871, + "step": 3152 + }, + { + "epoch": 0.4824789594491201, + "grad_norm": 2.4007662255888182, + "learning_rate": 1.775064904495316e-05, + "loss": 0.8097, + "step": 3153 + }, + { + "epoch": 0.4826319816373374, + "grad_norm": 2.281531360791035, + "learning_rate": 1.774908288862573e-05, + "loss": 0.6746, + "step": 3154 + }, + { + "epoch": 0.4827850038255547, + "grad_norm": 2.2461468476777973, + "learning_rate": 1.7747516256396805e-05, + "loss": 0.7814, + "step": 3155 + }, + { + "epoch": 0.482938026013772, + "grad_norm": 2.8940583997852145, + "learning_rate": 1.7745949148362598e-05, + "loss": 0.9052, + "step": 3156 + }, + { + "epoch": 0.4830910482019893, + "grad_norm": 2.151146909327901, + "learning_rate": 1.774438156461935e-05, + "loss": 0.6908, + "step": 3157 + }, + { + "epoch": 0.48324407039020656, + "grad_norm": 2.215843800992492, + "learning_rate": 1.7742813505263337e-05, + "loss": 0.7155, + "step": 3158 + }, + { + "epoch": 0.4833970925784239, + "grad_norm": 2.7562581153783814, + "learning_rate": 1.7741244970390854e-05, + "loss": 0.8132, + "step": 3159 + }, + { + "epoch": 0.48355011476664117, + "grad_norm": 2.151781872160964, + "learning_rate": 1.7739675960098233e-05, + "loss": 0.6925, + "step": 3160 + }, + { + "epoch": 
0.48370313695485845, + "grad_norm": 2.31679038784991, + "learning_rate": 1.7738106474481838e-05, + "loss": 0.8587, + "step": 3161 + }, + { + "epoch": 0.4838561591430757, + "grad_norm": 2.2757888463384135, + "learning_rate": 1.773653651363805e-05, + "loss": 0.7059, + "step": 3162 + }, + { + "epoch": 0.48400918133129306, + "grad_norm": 2.6012748147156315, + "learning_rate": 1.7734966077663288e-05, + "loss": 0.7631, + "step": 3163 + }, + { + "epoch": 0.48416220351951034, + "grad_norm": 2.27116501374633, + "learning_rate": 1.7733395166654e-05, + "loss": 0.7921, + "step": 3164 + }, + { + "epoch": 0.4843152257077276, + "grad_norm": 2.463737754185167, + "learning_rate": 1.7731823780706664e-05, + "loss": 0.8805, + "step": 3165 + }, + { + "epoch": 0.4844682478959449, + "grad_norm": 2.457864756479185, + "learning_rate": 1.773025191991778e-05, + "loss": 0.8466, + "step": 3166 + }, + { + "epoch": 0.4846212700841622, + "grad_norm": 2.3691514278135695, + "learning_rate": 1.7728679584383884e-05, + "loss": 0.7611, + "step": 3167 + }, + { + "epoch": 0.4847742922723795, + "grad_norm": 2.6690788196830746, + "learning_rate": 1.7727106774201542e-05, + "loss": 0.7784, + "step": 3168 + }, + { + "epoch": 0.4849273144605968, + "grad_norm": 3.050441302794346, + "learning_rate": 1.7725533489467345e-05, + "loss": 0.9023, + "step": 3169 + }, + { + "epoch": 0.48508033664881406, + "grad_norm": 2.242247145906977, + "learning_rate": 1.7723959730277912e-05, + "loss": 0.6486, + "step": 3170 + }, + { + "epoch": 0.4852333588370314, + "grad_norm": 2.171151620961211, + "learning_rate": 1.7722385496729896e-05, + "loss": 0.7588, + "step": 3171 + }, + { + "epoch": 0.48538638102524867, + "grad_norm": 2.1872229724795673, + "learning_rate": 1.7720810788919974e-05, + "loss": 0.7352, + "step": 3172 + }, + { + "epoch": 0.48553940321346595, + "grad_norm": 2.2381038298991487, + "learning_rate": 1.7719235606944864e-05, + "loss": 0.777, + "step": 3173 + }, + { + "epoch": 0.4856924254016832, + "grad_norm": 2.1952939393056026, + "learning_rate": 1.7717659950901293e-05, + "loss": 0.738, + "step": 3174 + }, + { + "epoch": 0.48584544758990056, + "grad_norm": 2.4037553430751863, + "learning_rate": 1.771608382088603e-05, + "loss": 0.7518, + "step": 3175 + }, + { + "epoch": 0.48599846977811784, + "grad_norm": 2.298974912728606, + "learning_rate": 1.771450721699588e-05, + "loss": 0.7406, + "step": 3176 + }, + { + "epoch": 0.4861514919663351, + "grad_norm": 2.4675400243903214, + "learning_rate": 1.7712930139327663e-05, + "loss": 0.8112, + "step": 3177 + }, + { + "epoch": 0.4863045141545524, + "grad_norm": 2.332643028056019, + "learning_rate": 1.771135258797823e-05, + "loss": 0.7814, + "step": 3178 + }, + { + "epoch": 0.4864575363427697, + "grad_norm": 2.3206125208022685, + "learning_rate": 1.7709774563044467e-05, + "loss": 0.7689, + "step": 3179 + }, + { + "epoch": 0.486610558530987, + "grad_norm": 2.2311701929496537, + "learning_rate": 1.7708196064623288e-05, + "loss": 0.8517, + "step": 3180 + }, + { + "epoch": 0.4867635807192043, + "grad_norm": 1.9798751484367991, + "learning_rate": 1.7706617092811634e-05, + "loss": 0.7107, + "step": 3181 + }, + { + "epoch": 0.48691660290742156, + "grad_norm": 2.665140548734543, + "learning_rate": 1.7705037647706476e-05, + "loss": 0.8483, + "step": 3182 + }, + { + "epoch": 0.4870696250956389, + "grad_norm": 2.3060955142676525, + "learning_rate": 1.7703457729404814e-05, + "loss": 0.7595, + "step": 3183 + }, + { + "epoch": 0.48722264728385617, + "grad_norm": 2.201706180521868, + "learning_rate": 
1.7701877338003677e-05, + "loss": 0.6579, + "step": 3184 + }, + { + "epoch": 0.48737566947207345, + "grad_norm": 2.4863780424056436, + "learning_rate": 1.7700296473600122e-05, + "loss": 0.8218, + "step": 3185 + }, + { + "epoch": 0.4875286916602907, + "grad_norm": 2.5234475881120972, + "learning_rate": 1.7698715136291238e-05, + "loss": 0.9158, + "step": 3186 + }, + { + "epoch": 0.48768171384850806, + "grad_norm": 2.230782070237414, + "learning_rate": 1.769713332617414e-05, + "loss": 0.8134, + "step": 3187 + }, + { + "epoch": 0.48783473603672534, + "grad_norm": 2.555470250956034, + "learning_rate": 1.7695551043345972e-05, + "loss": 0.7862, + "step": 3188 + }, + { + "epoch": 0.4879877582249426, + "grad_norm": 2.435982366105735, + "learning_rate": 1.7693968287903906e-05, + "loss": 0.8112, + "step": 3189 + }, + { + "epoch": 0.4881407804131599, + "grad_norm": 2.079276970212424, + "learning_rate": 1.7692385059945154e-05, + "loss": 0.7436, + "step": 3190 + }, + { + "epoch": 0.4882938026013772, + "grad_norm": 2.503362168438321, + "learning_rate": 1.769080135956694e-05, + "loss": 0.8012, + "step": 3191 + }, + { + "epoch": 0.4884468247895945, + "grad_norm": 2.3646623422395905, + "learning_rate": 1.768921718686653e-05, + "loss": 0.8065, + "step": 3192 + }, + { + "epoch": 0.4885998469778118, + "grad_norm": 2.428668920391647, + "learning_rate": 1.7687632541941208e-05, + "loss": 0.8703, + "step": 3193 + }, + { + "epoch": 0.48875286916602906, + "grad_norm": 2.592725000921327, + "learning_rate": 1.7686047424888303e-05, + "loss": 0.9091, + "step": 3194 + }, + { + "epoch": 0.4889058913542464, + "grad_norm": 2.3455071603658335, + "learning_rate": 1.7684461835805153e-05, + "loss": 0.7535, + "step": 3195 + }, + { + "epoch": 0.48905891354246367, + "grad_norm": 2.7711528247671837, + "learning_rate": 1.7682875774789143e-05, + "loss": 0.8585, + "step": 3196 + }, + { + "epoch": 0.48921193573068095, + "grad_norm": 2.5378645714466836, + "learning_rate": 1.7681289241937675e-05, + "loss": 0.806, + "step": 3197 + }, + { + "epoch": 0.4893649579188982, + "grad_norm": 2.302542926603842, + "learning_rate": 1.7679702237348185e-05, + "loss": 0.775, + "step": 3198 + }, + { + "epoch": 0.48951798010711556, + "grad_norm": 2.5459949407122777, + "learning_rate": 1.7678114761118136e-05, + "loss": 0.8727, + "step": 3199 + }, + { + "epoch": 0.48967100229533284, + "grad_norm": 2.448262018505535, + "learning_rate": 1.7676526813345024e-05, + "loss": 0.7077, + "step": 3200 + }, + { + "epoch": 0.4898240244835501, + "grad_norm": 2.622412414805754, + "learning_rate": 1.7674938394126368e-05, + "loss": 1.0291, + "step": 3201 + }, + { + "epoch": 0.4899770466717674, + "grad_norm": 2.4738400913098495, + "learning_rate": 1.7673349503559728e-05, + "loss": 0.857, + "step": 3202 + }, + { + "epoch": 0.4901300688599847, + "grad_norm": 2.5134079518079253, + "learning_rate": 1.767176014174267e-05, + "loss": 0.8002, + "step": 3203 + }, + { + "epoch": 0.490283091048202, + "grad_norm": 13.524370615262251, + "learning_rate": 1.767017030877281e-05, + "loss": 0.8024, + "step": 3204 + }, + { + "epoch": 0.4904361132364193, + "grad_norm": 2.404277059545446, + "learning_rate": 1.7668580004747787e-05, + "loss": 0.769, + "step": 3205 + }, + { + "epoch": 0.49058913542463656, + "grad_norm": 2.2606428101347795, + "learning_rate": 1.7666989229765265e-05, + "loss": 0.7535, + "step": 3206 + }, + { + "epoch": 0.4907421576128539, + "grad_norm": 2.64388311392485, + "learning_rate": 1.7665397983922942e-05, + "loss": 0.906, + "step": 3207 + }, + { + "epoch": 
0.49089517980107117, + "grad_norm": 2.5906710909288977, + "learning_rate": 1.7663806267318538e-05, + "loss": 0.7953, + "step": 3208 + }, + { + "epoch": 0.49104820198928845, + "grad_norm": 2.272625903337256, + "learning_rate": 1.7662214080049814e-05, + "loss": 0.7149, + "step": 3209 + }, + { + "epoch": 0.4912012241775057, + "grad_norm": 2.078589092936746, + "learning_rate": 1.7660621422214546e-05, + "loss": 0.6869, + "step": 3210 + }, + { + "epoch": 0.49135424636572306, + "grad_norm": 2.1478637518748904, + "learning_rate": 1.765902829391055e-05, + "loss": 0.7486, + "step": 3211 + }, + { + "epoch": 0.49150726855394034, + "grad_norm": 2.3289060874748637, + "learning_rate": 1.7657434695235666e-05, + "loss": 0.7998, + "step": 3212 + }, + { + "epoch": 0.4916602907421576, + "grad_norm": 2.4882931922578484, + "learning_rate": 1.7655840626287756e-05, + "loss": 0.7622, + "step": 3213 + }, + { + "epoch": 0.4918133129303749, + "grad_norm": 2.1951542390318712, + "learning_rate": 1.765424608716473e-05, + "loss": 0.7396, + "step": 3214 + }, + { + "epoch": 0.49196633511859217, + "grad_norm": 2.418546776586094, + "learning_rate": 1.7652651077964504e-05, + "loss": 0.8735, + "step": 3215 + }, + { + "epoch": 0.4921193573068095, + "grad_norm": 2.258923232410249, + "learning_rate": 1.7651055598785037e-05, + "loss": 0.8429, + "step": 3216 + }, + { + "epoch": 0.4922723794950268, + "grad_norm": 2.212119341949836, + "learning_rate": 1.7649459649724315e-05, + "loss": 0.691, + "step": 3217 + }, + { + "epoch": 0.49242540168324406, + "grad_norm": 2.5536534558659456, + "learning_rate": 1.7647863230880354e-05, + "loss": 0.7298, + "step": 3218 + }, + { + "epoch": 0.49257842387146134, + "grad_norm": 2.3804911347467574, + "learning_rate": 1.764626634235119e-05, + "loss": 0.857, + "step": 3219 + }, + { + "epoch": 0.49273144605967867, + "grad_norm": 2.1041892044354715, + "learning_rate": 1.76446689842349e-05, + "loss": 0.7818, + "step": 3220 + }, + { + "epoch": 0.49288446824789595, + "grad_norm": 2.3149475586078623, + "learning_rate": 1.764307115662958e-05, + "loss": 0.8303, + "step": 3221 + }, + { + "epoch": 0.4930374904361132, + "grad_norm": 2.404275202101493, + "learning_rate": 1.7641472859633362e-05, + "loss": 0.8074, + "step": 3222 + }, + { + "epoch": 0.4931905126243305, + "grad_norm": 2.7182580165570585, + "learning_rate": 1.76398740933444e-05, + "loss": 0.6691, + "step": 3223 + }, + { + "epoch": 0.49334353481254783, + "grad_norm": 2.2542269012648757, + "learning_rate": 1.7638274857860884e-05, + "loss": 0.7576, + "step": 3224 + }, + { + "epoch": 0.4934965570007651, + "grad_norm": 2.6790158205282273, + "learning_rate": 1.763667515328103e-05, + "loss": 0.7793, + "step": 3225 + }, + { + "epoch": 0.4936495791889824, + "grad_norm": 2.3313374760580494, + "learning_rate": 1.7635074979703076e-05, + "loss": 0.8102, + "step": 3226 + }, + { + "epoch": 0.49380260137719967, + "grad_norm": 2.307545812369083, + "learning_rate": 1.76334743372253e-05, + "loss": 0.7408, + "step": 3227 + }, + { + "epoch": 0.493955623565417, + "grad_norm": 2.6064789126655907, + "learning_rate": 1.7631873225946004e-05, + "loss": 0.9119, + "step": 3228 + }, + { + "epoch": 0.4941086457536343, + "grad_norm": 2.3624050343096727, + "learning_rate": 1.7630271645963518e-05, + "loss": 0.829, + "step": 3229 + }, + { + "epoch": 0.49426166794185156, + "grad_norm": 2.554313730527207, + "learning_rate": 1.7628669597376198e-05, + "loss": 0.9271, + "step": 3230 + }, + { + "epoch": 0.49441469013006883, + "grad_norm": 2.1998616907501995, + "learning_rate": 
1.7627067080282432e-05, + "loss": 0.8112, + "step": 3231 + }, + { + "epoch": 0.49456771231828617, + "grad_norm": 2.363883278824949, + "learning_rate": 1.7625464094780646e-05, + "loss": 0.7588, + "step": 3232 + }, + { + "epoch": 0.49472073450650345, + "grad_norm": 2.4905261389828506, + "learning_rate": 1.7623860640969275e-05, + "loss": 0.7716, + "step": 3233 + }, + { + "epoch": 0.4948737566947207, + "grad_norm": 2.3865027559186447, + "learning_rate": 1.76222567189468e-05, + "loss": 0.7401, + "step": 3234 + }, + { + "epoch": 0.495026778882938, + "grad_norm": 2.483958830160686, + "learning_rate": 1.762065232881172e-05, + "loss": 0.8404, + "step": 3235 + }, + { + "epoch": 0.49517980107115533, + "grad_norm": 2.1902122015963448, + "learning_rate": 1.7619047470662565e-05, + "loss": 0.8056, + "step": 3236 + }, + { + "epoch": 0.4953328232593726, + "grad_norm": 2.399968282559192, + "learning_rate": 1.7617442144597902e-05, + "loss": 0.8102, + "step": 3237 + }, + { + "epoch": 0.4954858454475899, + "grad_norm": 2.1498314028722554, + "learning_rate": 1.7615836350716316e-05, + "loss": 0.7334, + "step": 3238 + }, + { + "epoch": 0.49563886763580717, + "grad_norm": 2.6122160629141424, + "learning_rate": 1.7614230089116428e-05, + "loss": 0.8052, + "step": 3239 + }, + { + "epoch": 0.4957918898240245, + "grad_norm": 2.293516449978853, + "learning_rate": 1.7612623359896884e-05, + "loss": 0.7155, + "step": 3240 + }, + { + "epoch": 0.4959449120122418, + "grad_norm": 2.3151533649954965, + "learning_rate": 1.7611016163156355e-05, + "loss": 0.7752, + "step": 3241 + }, + { + "epoch": 0.49609793420045906, + "grad_norm": 2.794173016381072, + "learning_rate": 1.7609408498993553e-05, + "loss": 0.9031, + "step": 3242 + }, + { + "epoch": 0.49625095638867633, + "grad_norm": 2.588196116806027, + "learning_rate": 1.7607800367507204e-05, + "loss": 0.8531, + "step": 3243 + }, + { + "epoch": 0.49640397857689367, + "grad_norm": 2.3351088057592615, + "learning_rate": 1.7606191768796074e-05, + "loss": 0.8684, + "step": 3244 + }, + { + "epoch": 0.49655700076511095, + "grad_norm": 2.32274229856413, + "learning_rate": 1.7604582702958953e-05, + "loss": 0.7669, + "step": 3245 + }, + { + "epoch": 0.4967100229533282, + "grad_norm": 2.477436788357222, + "learning_rate": 1.7602973170094658e-05, + "loss": 0.8381, + "step": 3246 + }, + { + "epoch": 0.4968630451415455, + "grad_norm": 2.6129214133461343, + "learning_rate": 1.7601363170302035e-05, + "loss": 0.793, + "step": 3247 + }, + { + "epoch": 0.49701606732976283, + "grad_norm": 2.172244742262806, + "learning_rate": 1.7599752703679966e-05, + "loss": 0.624, + "step": 3248 + }, + { + "epoch": 0.4971690895179801, + "grad_norm": 2.298575304630846, + "learning_rate": 1.7598141770327353e-05, + "loss": 0.7008, + "step": 3249 + }, + { + "epoch": 0.4973221117061974, + "grad_norm": 2.268141893500064, + "learning_rate": 1.759653037034313e-05, + "loss": 0.7647, + "step": 3250 + }, + { + "epoch": 0.49747513389441467, + "grad_norm": 2.222876437768816, + "learning_rate": 1.7594918503826255e-05, + "loss": 0.6546, + "step": 3251 + }, + { + "epoch": 0.497628156082632, + "grad_norm": 2.752810421202548, + "learning_rate": 1.7593306170875727e-05, + "loss": 0.9053, + "step": 3252 + }, + { + "epoch": 0.4977811782708493, + "grad_norm": 2.4122049038132767, + "learning_rate": 1.7591693371590563e-05, + "loss": 0.8456, + "step": 3253 + }, + { + "epoch": 0.49793420045906656, + "grad_norm": 2.1764013379355425, + "learning_rate": 1.7590080106069807e-05, + "loss": 0.8315, + "step": 3254 + }, + { + "epoch": 
0.49808722264728383, + "grad_norm": 2.375189125899373, + "learning_rate": 1.758846637441254e-05, + "loss": 0.7593, + "step": 3255 + }, + { + "epoch": 0.49824024483550117, + "grad_norm": 2.3610123855811675, + "learning_rate": 1.7586852176717867e-05, + "loss": 0.8311, + "step": 3256 + }, + { + "epoch": 0.49839326702371844, + "grad_norm": 2.6043618815139635, + "learning_rate": 1.7585237513084922e-05, + "loss": 0.7291, + "step": 3257 + }, + { + "epoch": 0.4985462892119357, + "grad_norm": 2.3211779791516522, + "learning_rate": 1.758362238361287e-05, + "loss": 0.769, + "step": 3258 + }, + { + "epoch": 0.498699311400153, + "grad_norm": 2.2649520601965607, + "learning_rate": 1.7582006788400895e-05, + "loss": 0.705, + "step": 3259 + }, + { + "epoch": 0.49885233358837033, + "grad_norm": 2.439781656230531, + "learning_rate": 1.7580390727548228e-05, + "loss": 0.653, + "step": 3260 + }, + { + "epoch": 0.4990053557765876, + "grad_norm": 2.448128881631369, + "learning_rate": 1.7578774201154107e-05, + "loss": 0.8099, + "step": 3261 + }, + { + "epoch": 0.4991583779648049, + "grad_norm": 2.4505488863057416, + "learning_rate": 1.7577157209317817e-05, + "loss": 0.8893, + "step": 3262 + }, + { + "epoch": 0.49931140015302217, + "grad_norm": 2.5299595454688135, + "learning_rate": 1.757553975213866e-05, + "loss": 0.7847, + "step": 3263 + }, + { + "epoch": 0.4994644223412395, + "grad_norm": 2.7098718662487524, + "learning_rate": 1.757392182971597e-05, + "loss": 0.7942, + "step": 3264 + }, + { + "epoch": 0.4996174445294568, + "grad_norm": 2.3535479403390354, + "learning_rate": 1.757230344214911e-05, + "loss": 0.8274, + "step": 3265 + }, + { + "epoch": 0.49977046671767406, + "grad_norm": 2.1897548479389557, + "learning_rate": 1.7570684589537473e-05, + "loss": 0.8113, + "step": 3266 + }, + { + "epoch": 0.49992348890589133, + "grad_norm": 2.321407918265366, + "learning_rate": 1.7569065271980482e-05, + "loss": 0.8089, + "step": 3267 + }, + { + "epoch": 0.5000765110941087, + "grad_norm": 2.5519453674642194, + "learning_rate": 1.756744548957758e-05, + "loss": 0.7425, + "step": 3268 + }, + { + "epoch": 0.5002295332823259, + "grad_norm": 2.425437004623082, + "learning_rate": 1.756582524242825e-05, + "loss": 0.7746, + "step": 3269 + }, + { + "epoch": 0.5003825554705432, + "grad_norm": 2.3121885815367405, + "learning_rate": 1.756420453063199e-05, + "loss": 0.8158, + "step": 3270 + }, + { + "epoch": 0.5005355776587606, + "grad_norm": 2.378431932054567, + "learning_rate": 1.756258335428834e-05, + "loss": 0.7868, + "step": 3271 + }, + { + "epoch": 0.5006885998469778, + "grad_norm": 2.007280263141698, + "learning_rate": 1.756096171349686e-05, + "loss": 0.8533, + "step": 3272 + }, + { + "epoch": 0.5008416220351951, + "grad_norm": 2.522900136714207, + "learning_rate": 1.7559339608357145e-05, + "loss": 0.7948, + "step": 3273 + }, + { + "epoch": 0.5009946442234124, + "grad_norm": 2.2461761449648905, + "learning_rate": 1.7557717038968807e-05, + "loss": 0.724, + "step": 3274 + }, + { + "epoch": 0.5011476664116297, + "grad_norm": 2.492589978811216, + "learning_rate": 1.7556094005431507e-05, + "loss": 0.8158, + "step": 3275 + }, + { + "epoch": 0.501300688599847, + "grad_norm": 2.779932572876539, + "learning_rate": 1.7554470507844906e-05, + "loss": 0.7971, + "step": 3276 + }, + { + "epoch": 0.5014537107880642, + "grad_norm": 2.726781264906075, + "learning_rate": 1.7552846546308724e-05, + "loss": 0.882, + "step": 3277 + }, + { + "epoch": 0.5016067329762816, + "grad_norm": 2.750726908807502, + "learning_rate": 1.7551222120922687e-05, + 
"loss": 0.7143, + "step": 3278 + }, + { + "epoch": 0.5017597551644989, + "grad_norm": 2.611633220188425, + "learning_rate": 1.7549597231786562e-05, + "loss": 0.9365, + "step": 3279 + }, + { + "epoch": 0.5019127773527161, + "grad_norm": 2.193683141102787, + "learning_rate": 1.7547971879000133e-05, + "loss": 0.7702, + "step": 3280 + }, + { + "epoch": 0.5020657995409334, + "grad_norm": 2.1799111635263384, + "learning_rate": 1.7546346062663224e-05, + "loss": 0.767, + "step": 3281 + }, + { + "epoch": 0.5022188217291508, + "grad_norm": 2.2272737839801846, + "learning_rate": 1.754471978287568e-05, + "loss": 0.7561, + "step": 3282 + }, + { + "epoch": 0.502371843917368, + "grad_norm": 2.3756225531817337, + "learning_rate": 1.7543093039737384e-05, + "loss": 0.7519, + "step": 3283 + }, + { + "epoch": 0.5025248661055853, + "grad_norm": 2.6892279006528206, + "learning_rate": 1.754146583334823e-05, + "loss": 0.7317, + "step": 3284 + }, + { + "epoch": 0.5026778882938026, + "grad_norm": 2.167783051454944, + "learning_rate": 1.7539838163808164e-05, + "loss": 0.6927, + "step": 3285 + }, + { + "epoch": 0.5028309104820199, + "grad_norm": 2.497432083363853, + "learning_rate": 1.7538210031217133e-05, + "loss": 0.7813, + "step": 3286 + }, + { + "epoch": 0.5029839326702372, + "grad_norm": 2.42569547096876, + "learning_rate": 1.7536581435675136e-05, + "loss": 0.6925, + "step": 3287 + }, + { + "epoch": 0.5031369548584544, + "grad_norm": 2.2400882700793336, + "learning_rate": 1.753495237728219e-05, + "loss": 0.828, + "step": 3288 + }, + { + "epoch": 0.5032899770466718, + "grad_norm": 2.247722482445047, + "learning_rate": 1.7533322856138345e-05, + "loss": 0.7002, + "step": 3289 + }, + { + "epoch": 0.5034429992348891, + "grad_norm": 2.507215137207525, + "learning_rate": 1.7531692872343672e-05, + "loss": 0.811, + "step": 3290 + }, + { + "epoch": 0.5035960214231063, + "grad_norm": 2.3787834208881655, + "learning_rate": 1.753006242599827e-05, + "loss": 0.8742, + "step": 3291 + }, + { + "epoch": 0.5037490436113237, + "grad_norm": 2.3822113508515392, + "learning_rate": 1.7528431517202283e-05, + "loss": 0.8242, + "step": 3292 + }, + { + "epoch": 0.5039020657995409, + "grad_norm": 2.1981554588331984, + "learning_rate": 1.7526800146055866e-05, + "loss": 0.932, + "step": 3293 + }, + { + "epoch": 0.5040550879877582, + "grad_norm": 2.2509071078466643, + "learning_rate": 1.7525168312659202e-05, + "loss": 0.7999, + "step": 3294 + }, + { + "epoch": 0.5042081101759756, + "grad_norm": 2.4597949432094417, + "learning_rate": 1.7523536017112516e-05, + "loss": 0.7423, + "step": 3295 + }, + { + "epoch": 0.5043611323641928, + "grad_norm": 2.2297846203516305, + "learning_rate": 1.7521903259516045e-05, + "loss": 0.6966, + "step": 3296 + }, + { + "epoch": 0.5045141545524101, + "grad_norm": 2.4466722284928677, + "learning_rate": 1.752027003997008e-05, + "loss": 0.8328, + "step": 3297 + }, + { + "epoch": 0.5046671767406274, + "grad_norm": 2.240130726906534, + "learning_rate": 1.7518636358574905e-05, + "loss": 0.7626, + "step": 3298 + }, + { + "epoch": 0.5048201989288447, + "grad_norm": 2.4645099563125554, + "learning_rate": 1.7517002215430857e-05, + "loss": 0.8377, + "step": 3299 + }, + { + "epoch": 0.504973221117062, + "grad_norm": 2.173867927524907, + "learning_rate": 1.75153676106383e-05, + "loss": 0.7712, + "step": 3300 + }, + { + "epoch": 0.5051262433052792, + "grad_norm": 2.538823450141907, + "learning_rate": 1.7513732544297618e-05, + "loss": 0.8464, + "step": 3301 + }, + { + "epoch": 0.5052792654934966, + "grad_norm": 2.462672051119006, 
+ "learning_rate": 1.7512097016509224e-05, + "loss": 0.8667, + "step": 3302 + }, + { + "epoch": 0.5054322876817139, + "grad_norm": 2.4697382148859597, + "learning_rate": 1.7510461027373565e-05, + "loss": 0.726, + "step": 3303 + }, + { + "epoch": 0.5055853098699311, + "grad_norm": 2.2938339035117727, + "learning_rate": 1.7508824576991115e-05, + "loss": 0.7386, + "step": 3304 + }, + { + "epoch": 0.5057383320581484, + "grad_norm": 2.157524022621634, + "learning_rate": 1.7507187665462374e-05, + "loss": 0.7496, + "step": 3305 + }, + { + "epoch": 0.5058913542463658, + "grad_norm": 2.1429953471957015, + "learning_rate": 1.7505550292887866e-05, + "loss": 0.7888, + "step": 3306 + }, + { + "epoch": 0.506044376434583, + "grad_norm": 2.1704731421991394, + "learning_rate": 1.7503912459368156e-05, + "loss": 0.8105, + "step": 3307 + }, + { + "epoch": 0.5061973986228003, + "grad_norm": 2.5892926483115772, + "learning_rate": 1.7502274165003826e-05, + "loss": 0.8, + "step": 3308 + }, + { + "epoch": 0.5063504208110176, + "grad_norm": 2.4993920181365654, + "learning_rate": 1.7500635409895488e-05, + "loss": 0.8512, + "step": 3309 + }, + { + "epoch": 0.5065034429992349, + "grad_norm": 2.340329315102193, + "learning_rate": 1.7498996194143792e-05, + "loss": 0.7955, + "step": 3310 + }, + { + "epoch": 0.5066564651874522, + "grad_norm": 2.3758682343089905, + "learning_rate": 1.7497356517849397e-05, + "loss": 0.7777, + "step": 3311 + }, + { + "epoch": 0.5068094873756694, + "grad_norm": 2.430405212589665, + "learning_rate": 1.749571638111301e-05, + "loss": 0.9005, + "step": 3312 + }, + { + "epoch": 0.5069625095638868, + "grad_norm": 2.295985134609166, + "learning_rate": 1.749407578403536e-05, + "loss": 0.6991, + "step": 3313 + }, + { + "epoch": 0.5071155317521041, + "grad_norm": 2.2080978558305646, + "learning_rate": 1.74924347267172e-05, + "loss": 0.6966, + "step": 3314 + }, + { + "epoch": 0.5072685539403213, + "grad_norm": 2.3125199843341386, + "learning_rate": 1.749079320925931e-05, + "loss": 0.9435, + "step": 3315 + }, + { + "epoch": 0.5074215761285387, + "grad_norm": 2.3758545793328394, + "learning_rate": 1.7489151231762503e-05, + "loss": 0.7937, + "step": 3316 + }, + { + "epoch": 0.5075745983167559, + "grad_norm": 2.327374821187704, + "learning_rate": 1.7487508794327627e-05, + "loss": 0.6983, + "step": 3317 + }, + { + "epoch": 0.5077276205049732, + "grad_norm": 2.3625341694144413, + "learning_rate": 1.7485865897055537e-05, + "loss": 0.797, + "step": 3318 + }, + { + "epoch": 0.5078806426931906, + "grad_norm": 2.4367362368564836, + "learning_rate": 1.748422254004714e-05, + "loss": 0.8072, + "step": 3319 + }, + { + "epoch": 0.5080336648814078, + "grad_norm": 2.3174896950785433, + "learning_rate": 1.7482578723403363e-05, + "loss": 0.7571, + "step": 3320 + }, + { + "epoch": 0.5081866870696251, + "grad_norm": 1.8791679422020524, + "learning_rate": 1.748093444722515e-05, + "loss": 0.7182, + "step": 3321 + }, + { + "epoch": 0.5083397092578424, + "grad_norm": 2.1616486568516096, + "learning_rate": 1.7479289711613487e-05, + "loss": 0.7051, + "step": 3322 + }, + { + "epoch": 0.5084927314460597, + "grad_norm": 2.230631601951075, + "learning_rate": 1.7477644516669387e-05, + "loss": 0.7449, + "step": 3323 + }, + { + "epoch": 0.508645753634277, + "grad_norm": 2.0062049354602225, + "learning_rate": 1.747599886249388e-05, + "loss": 0.7714, + "step": 3324 + }, + { + "epoch": 0.5087987758224942, + "grad_norm": 2.5277809968626417, + "learning_rate": 1.747435274918804e-05, + "loss": 0.9102, + "step": 3325 + }, + { + "epoch": 
0.5089517980107116, + "grad_norm": 2.2964812751680723, + "learning_rate": 1.7472706176852957e-05, + "loss": 0.8331, + "step": 3326 + }, + { + "epoch": 0.5091048201989289, + "grad_norm": 2.4469684135554983, + "learning_rate": 1.7471059145589755e-05, + "loss": 0.8294, + "step": 3327 + }, + { + "epoch": 0.5092578423871461, + "grad_norm": 2.197412860546768, + "learning_rate": 1.7469411655499583e-05, + "loss": 0.8342, + "step": 3328 + }, + { + "epoch": 0.5094108645753634, + "grad_norm": 2.5275013829253212, + "learning_rate": 1.746776370668362e-05, + "loss": 0.8247, + "step": 3329 + }, + { + "epoch": 0.5095638867635807, + "grad_norm": 2.1610874400415803, + "learning_rate": 1.7466115299243072e-05, + "loss": 0.7758, + "step": 3330 + }, + { + "epoch": 0.509716908951798, + "grad_norm": 2.2429685448279733, + "learning_rate": 1.746446643327918e-05, + "loss": 0.7493, + "step": 3331 + }, + { + "epoch": 0.5098699311400153, + "grad_norm": 2.4143447333401435, + "learning_rate": 1.7462817108893202e-05, + "loss": 0.7255, + "step": 3332 + }, + { + "epoch": 0.5100229533282326, + "grad_norm": 2.3857507284179795, + "learning_rate": 1.746116732618643e-05, + "loss": 0.8139, + "step": 3333 + }, + { + "epoch": 0.5101759755164499, + "grad_norm": 2.2535908652855094, + "learning_rate": 1.7459517085260186e-05, + "loss": 0.7173, + "step": 3334 + }, + { + "epoch": 0.5103289977046672, + "grad_norm": 2.182942551591896, + "learning_rate": 1.7457866386215814e-05, + "loss": 0.6888, + "step": 3335 + }, + { + "epoch": 0.5104820198928844, + "grad_norm": 2.297465645975313, + "learning_rate": 1.7456215229154693e-05, + "loss": 0.7634, + "step": 3336 + }, + { + "epoch": 0.5106350420811018, + "grad_norm": 2.3307719054135156, + "learning_rate": 1.7454563614178223e-05, + "loss": 0.789, + "step": 3337 + }, + { + "epoch": 0.510788064269319, + "grad_norm": 2.380246890813595, + "learning_rate": 1.7452911541387844e-05, + "loss": 0.8229, + "step": 3338 + }, + { + "epoch": 0.5109410864575363, + "grad_norm": 2.214607579600976, + "learning_rate": 1.7451259010885008e-05, + "loss": 0.7872, + "step": 3339 + }, + { + "epoch": 0.5110941086457537, + "grad_norm": 2.5175143675456826, + "learning_rate": 1.7449606022771206e-05, + "loss": 0.9167, + "step": 3340 + }, + { + "epoch": 0.5112471308339709, + "grad_norm": 2.2963224878966475, + "learning_rate": 1.7447952577147957e-05, + "loss": 0.8955, + "step": 3341 + }, + { + "epoch": 0.5114001530221882, + "grad_norm": 2.6007771172167304, + "learning_rate": 1.7446298674116803e-05, + "loss": 0.7827, + "step": 3342 + }, + { + "epoch": 0.5115531752104056, + "grad_norm": 2.619381696692319, + "learning_rate": 1.7444644313779316e-05, + "loss": 0.8316, + "step": 3343 + }, + { + "epoch": 0.5117061973986228, + "grad_norm": 2.422295060929172, + "learning_rate": 1.74429894962371e-05, + "loss": 0.7311, + "step": 3344 + }, + { + "epoch": 0.5118592195868401, + "grad_norm": 2.3663869854583544, + "learning_rate": 1.7441334221591783e-05, + "loss": 0.7173, + "step": 3345 + }, + { + "epoch": 0.5120122417750573, + "grad_norm": 2.1570402298204225, + "learning_rate": 1.7439678489945017e-05, + "loss": 0.7325, + "step": 3346 + }, + { + "epoch": 0.5121652639632747, + "grad_norm": 3.0964806046696944, + "learning_rate": 1.7438022301398495e-05, + "loss": 0.8387, + "step": 3347 + }, + { + "epoch": 0.512318286151492, + "grad_norm": 2.431537047730991, + "learning_rate": 1.743636565605392e-05, + "loss": 0.7897, + "step": 3348 + }, + { + "epoch": 0.5124713083397092, + "grad_norm": 2.091834176573096, + "learning_rate": 1.7434708554013046e-05, + 
"loss": 0.7902, + "step": 3349 + }, + { + "epoch": 0.5126243305279266, + "grad_norm": 2.1331625981738775, + "learning_rate": 1.743305099537763e-05, + "loss": 0.679, + "step": 3350 + }, + { + "epoch": 0.5127773527161439, + "grad_norm": 2.5746781871427182, + "learning_rate": 1.7431392980249477e-05, + "loss": 0.8677, + "step": 3351 + }, + { + "epoch": 0.5129303749043611, + "grad_norm": 2.30631341726066, + "learning_rate": 1.7429734508730404e-05, + "loss": 0.7585, + "step": 3352 + }, + { + "epoch": 0.5130833970925784, + "grad_norm": 2.426958964202821, + "learning_rate": 1.7428075580922278e-05, + "loss": 0.8328, + "step": 3353 + }, + { + "epoch": 0.5132364192807957, + "grad_norm": 2.4386540509387076, + "learning_rate": 1.7426416196926965e-05, + "loss": 0.7015, + "step": 3354 + }, + { + "epoch": 0.513389441469013, + "grad_norm": 2.3459506785389475, + "learning_rate": 1.742475635684638e-05, + "loss": 0.7653, + "step": 3355 + }, + { + "epoch": 0.5135424636572303, + "grad_norm": 2.576562934756025, + "learning_rate": 1.7423096060782466e-05, + "loss": 0.8647, + "step": 3356 + }, + { + "epoch": 0.5136954858454476, + "grad_norm": 2.508717422355279, + "learning_rate": 1.7421435308837186e-05, + "loss": 0.7108, + "step": 3357 + }, + { + "epoch": 0.5138485080336649, + "grad_norm": 2.1278984177818536, + "learning_rate": 1.7419774101112526e-05, + "loss": 0.7988, + "step": 3358 + }, + { + "epoch": 0.5140015302218822, + "grad_norm": 2.376063781505229, + "learning_rate": 1.741811243771051e-05, + "loss": 0.7497, + "step": 3359 + }, + { + "epoch": 0.5141545524100994, + "grad_norm": 2.401044243211275, + "learning_rate": 1.7416450318733194e-05, + "loss": 0.8131, + "step": 3360 + }, + { + "epoch": 0.5143075745983168, + "grad_norm": 2.3899515726116958, + "learning_rate": 1.741478774428265e-05, + "loss": 0.855, + "step": 3361 + }, + { + "epoch": 0.514460596786534, + "grad_norm": 2.6320361366457314, + "learning_rate": 1.741312471446098e-05, + "loss": 0.7491, + "step": 3362 + }, + { + "epoch": 0.5146136189747513, + "grad_norm": 2.214272223614381, + "learning_rate": 1.7411461229370326e-05, + "loss": 0.7406, + "step": 3363 + }, + { + "epoch": 0.5147666411629687, + "grad_norm": 2.446861139903529, + "learning_rate": 1.7409797289112842e-05, + "loss": 0.7961, + "step": 3364 + }, + { + "epoch": 0.5149196633511859, + "grad_norm": 2.6934573971392064, + "learning_rate": 1.740813289379072e-05, + "loss": 0.7586, + "step": 3365 + }, + { + "epoch": 0.5150726855394032, + "grad_norm": 2.2123795928387264, + "learning_rate": 1.7406468043506176e-05, + "loss": 0.7273, + "step": 3366 + }, + { + "epoch": 0.5152257077276206, + "grad_norm": 2.243404809270673, + "learning_rate": 1.740480273836145e-05, + "loss": 0.7537, + "step": 3367 + }, + { + "epoch": 0.5153787299158378, + "grad_norm": 2.250245346866966, + "learning_rate": 1.7403136978458828e-05, + "loss": 0.7529, + "step": 3368 + }, + { + "epoch": 0.5155317521040551, + "grad_norm": 2.3461943594132966, + "learning_rate": 1.74014707639006e-05, + "loss": 0.78, + "step": 3369 + }, + { + "epoch": 0.5156847742922723, + "grad_norm": 2.2217893071169224, + "learning_rate": 1.7399804094789096e-05, + "loss": 0.7165, + "step": 3370 + }, + { + "epoch": 0.5158377964804897, + "grad_norm": 2.2631879850970464, + "learning_rate": 1.7398136971226677e-05, + "loss": 0.7602, + "step": 3371 + }, + { + "epoch": 0.515990818668707, + "grad_norm": 2.431585278848265, + "learning_rate": 1.739646939331572e-05, + "loss": 0.771, + "step": 3372 + }, + { + "epoch": 0.5161438408569242, + "grad_norm": 2.4202710096861506, + 
"learning_rate": 1.7394801361158648e-05, + "loss": 0.7569, + "step": 3373 + }, + { + "epoch": 0.5162968630451416, + "grad_norm": 2.535564085626204, + "learning_rate": 1.7393132874857894e-05, + "loss": 0.8373, + "step": 3374 + }, + { + "epoch": 0.5164498852333589, + "grad_norm": 2.5135595738903467, + "learning_rate": 1.739146393451593e-05, + "loss": 0.8715, + "step": 3375 + }, + { + "epoch": 0.5166029074215761, + "grad_norm": 2.095406374528688, + "learning_rate": 1.7389794540235246e-05, + "loss": 0.7633, + "step": 3376 + }, + { + "epoch": 0.5167559296097934, + "grad_norm": 2.686993076718003, + "learning_rate": 1.7388124692118373e-05, + "loss": 0.7197, + "step": 3377 + }, + { + "epoch": 0.5169089517980107, + "grad_norm": 2.4490117625560517, + "learning_rate": 1.7386454390267857e-05, + "loss": 0.8004, + "step": 3378 + }, + { + "epoch": 0.517061973986228, + "grad_norm": 2.301469614522371, + "learning_rate": 1.7384783634786284e-05, + "loss": 0.7179, + "step": 3379 + }, + { + "epoch": 0.5172149961744453, + "grad_norm": 2.432510545681375, + "learning_rate": 1.738311242577626e-05, + "loss": 0.8886, + "step": 3380 + }, + { + "epoch": 0.5173680183626626, + "grad_norm": 2.4190454792921963, + "learning_rate": 1.7381440763340416e-05, + "loss": 0.867, + "step": 3381 + }, + { + "epoch": 0.5175210405508799, + "grad_norm": 2.5166664142471924, + "learning_rate": 1.737976864758142e-05, + "loss": 0.8787, + "step": 3382 + }, + { + "epoch": 0.5176740627390972, + "grad_norm": 2.4588195664926578, + "learning_rate": 1.7378096078601962e-05, + "loss": 0.8622, + "step": 3383 + }, + { + "epoch": 0.5178270849273144, + "grad_norm": 2.365906374128987, + "learning_rate": 1.7376423056504756e-05, + "loss": 0.7523, + "step": 3384 + }, + { + "epoch": 0.5179801071155318, + "grad_norm": 2.2284179451077035, + "learning_rate": 1.7374749581392558e-05, + "loss": 0.7016, + "step": 3385 + }, + { + "epoch": 0.518133129303749, + "grad_norm": 2.252164274842456, + "learning_rate": 1.7373075653368137e-05, + "loss": 0.76, + "step": 3386 + }, + { + "epoch": 0.5182861514919663, + "grad_norm": 2.3064049524967674, + "learning_rate": 1.7371401272534295e-05, + "loss": 0.8086, + "step": 3387 + }, + { + "epoch": 0.5184391736801837, + "grad_norm": 2.3002293014541673, + "learning_rate": 1.7369726438993865e-05, + "loss": 0.8244, + "step": 3388 + }, + { + "epoch": 0.5185921958684009, + "grad_norm": 2.495992362678814, + "learning_rate": 1.7368051152849704e-05, + "loss": 0.8959, + "step": 3389 + }, + { + "epoch": 0.5187452180566182, + "grad_norm": 2.3947045552206445, + "learning_rate": 1.73663754142047e-05, + "loss": 0.8003, + "step": 3390 + }, + { + "epoch": 0.5188982402448356, + "grad_norm": 2.31492330849426, + "learning_rate": 1.736469922316176e-05, + "loss": 0.6984, + "step": 3391 + }, + { + "epoch": 0.5190512624330528, + "grad_norm": 2.237344088919252, + "learning_rate": 1.736302257982383e-05, + "loss": 0.8371, + "step": 3392 + }, + { + "epoch": 0.5192042846212701, + "grad_norm": 2.7400758010484645, + "learning_rate": 1.7361345484293882e-05, + "loss": 0.9044, + "step": 3393 + }, + { + "epoch": 0.5193573068094873, + "grad_norm": 2.33269965816581, + "learning_rate": 1.7359667936674907e-05, + "loss": 0.7688, + "step": 3394 + }, + { + "epoch": 0.5195103289977047, + "grad_norm": 2.429817087511107, + "learning_rate": 1.7357989937069936e-05, + "loss": 0.8199, + "step": 3395 + }, + { + "epoch": 0.519663351185922, + "grad_norm": 2.3330509991957933, + "learning_rate": 1.735631148558202e-05, + "loss": 0.7509, + "step": 3396 + }, + { + "epoch": 
0.5198163733741392, + "grad_norm": 2.116684148235966, + "learning_rate": 1.7354632582314232e-05, + "loss": 0.7309, + "step": 3397 + }, + { + "epoch": 0.5199693955623566, + "grad_norm": 2.4278111061121836, + "learning_rate": 1.735295322736969e-05, + "loss": 0.8164, + "step": 3398 + }, + { + "epoch": 0.5201224177505739, + "grad_norm": 2.2950130329897944, + "learning_rate": 1.7351273420851522e-05, + "loss": 0.7755, + "step": 3399 + }, + { + "epoch": 0.5202754399387911, + "grad_norm": 2.3013900005039796, + "learning_rate": 1.7349593162862896e-05, + "loss": 0.7645, + "step": 3400 + }, + { + "epoch": 0.5204284621270084, + "grad_norm": 2.131498202133757, + "learning_rate": 1.7347912453507003e-05, + "loss": 0.7436, + "step": 3401 + }, + { + "epoch": 0.5205814843152257, + "grad_norm": 2.0513736202538104, + "learning_rate": 1.734623129288706e-05, + "loss": 0.7005, + "step": 3402 + }, + { + "epoch": 0.520734506503443, + "grad_norm": 2.458358694468482, + "learning_rate": 1.734454968110632e-05, + "loss": 0.7273, + "step": 3403 + }, + { + "epoch": 0.5208875286916603, + "grad_norm": 2.492268056512554, + "learning_rate": 1.7342867618268043e-05, + "loss": 0.7453, + "step": 3404 + }, + { + "epoch": 0.5210405508798776, + "grad_norm": 2.2243137346199444, + "learning_rate": 1.7341185104475546e-05, + "loss": 0.7455, + "step": 3405 + }, + { + "epoch": 0.5211935730680949, + "grad_norm": 2.3585772891771684, + "learning_rate": 1.733950213983215e-05, + "loss": 0.8491, + "step": 3406 + }, + { + "epoch": 0.5213465952563122, + "grad_norm": 2.301805032044261, + "learning_rate": 1.733781872444122e-05, + "loss": 0.8319, + "step": 3407 + }, + { + "epoch": 0.5214996174445294, + "grad_norm": 2.2982512317533246, + "learning_rate": 1.7336134858406134e-05, + "loss": 0.7436, + "step": 3408 + }, + { + "epoch": 0.5216526396327468, + "grad_norm": 2.335280353022862, + "learning_rate": 1.7334450541830305e-05, + "loss": 0.7517, + "step": 3409 + }, + { + "epoch": 0.521805661820964, + "grad_norm": 2.5502609428808394, + "learning_rate": 1.7332765774817177e-05, + "loss": 0.8186, + "step": 3410 + }, + { + "epoch": 0.5219586840091813, + "grad_norm": 2.415137367526865, + "learning_rate": 1.7331080557470214e-05, + "loss": 0.7861, + "step": 3411 + }, + { + "epoch": 0.5221117061973987, + "grad_norm": 2.720231789733179, + "learning_rate": 1.732939488989292e-05, + "loss": 0.8509, + "step": 3412 + }, + { + "epoch": 0.5222647283856159, + "grad_norm": 2.2490141250767057, + "learning_rate": 1.732770877218881e-05, + "loss": 0.6168, + "step": 3413 + }, + { + "epoch": 0.5224177505738332, + "grad_norm": 2.255322676967083, + "learning_rate": 1.7326022204461438e-05, + "loss": 0.8309, + "step": 3414 + }, + { + "epoch": 0.5225707727620506, + "grad_norm": 2.403136772714949, + "learning_rate": 1.7324335186814382e-05, + "loss": 0.7862, + "step": 3415 + }, + { + "epoch": 0.5227237949502678, + "grad_norm": 2.4871382173190963, + "learning_rate": 1.7322647719351253e-05, + "loss": 0.7874, + "step": 3416 + }, + { + "epoch": 0.5228768171384851, + "grad_norm": 2.0456564998057143, + "learning_rate": 1.732095980217568e-05, + "loss": 0.7629, + "step": 3417 + }, + { + "epoch": 0.5230298393267023, + "grad_norm": 2.140116669546573, + "learning_rate": 1.7319271435391326e-05, + "loss": 0.7619, + "step": 3418 + }, + { + "epoch": 0.5231828615149197, + "grad_norm": 2.2570260777984443, + "learning_rate": 1.7317582619101876e-05, + "loss": 0.7535, + "step": 3419 + }, + { + "epoch": 0.523335883703137, + "grad_norm": 2.3392895682248445, + "learning_rate": 1.7315893353411052e-05, + 
"loss": 0.824, + "step": 3420 + }, + { + "epoch": 0.5234889058913542, + "grad_norm": 2.543168315321036, + "learning_rate": 1.73142036384226e-05, + "loss": 0.7372, + "step": 3421 + }, + { + "epoch": 0.5236419280795716, + "grad_norm": 2.1197882622093034, + "learning_rate": 1.731251347424029e-05, + "loss": 0.772, + "step": 3422 + }, + { + "epoch": 0.5237949502677889, + "grad_norm": 2.5217333583866535, + "learning_rate": 1.731082286096792e-05, + "loss": 0.7934, + "step": 3423 + }, + { + "epoch": 0.5239479724560061, + "grad_norm": 2.308111842163604, + "learning_rate": 1.7309131798709313e-05, + "loss": 0.8, + "step": 3424 + }, + { + "epoch": 0.5241009946442234, + "grad_norm": 2.1948172683467866, + "learning_rate": 1.7307440287568335e-05, + "loss": 0.7586, + "step": 3425 + }, + { + "epoch": 0.5242540168324407, + "grad_norm": 2.349110205686944, + "learning_rate": 1.730574832764886e-05, + "loss": 0.8168, + "step": 3426 + }, + { + "epoch": 0.524407039020658, + "grad_norm": 2.478957873778854, + "learning_rate": 1.73040559190548e-05, + "loss": 0.7624, + "step": 3427 + }, + { + "epoch": 0.5245600612088753, + "grad_norm": 2.2363519273765573, + "learning_rate": 1.730236306189009e-05, + "loss": 0.7556, + "step": 3428 + }, + { + "epoch": 0.5247130833970926, + "grad_norm": 2.4000273014466638, + "learning_rate": 1.73006697562587e-05, + "loss": 0.631, + "step": 3429 + }, + { + "epoch": 0.5248661055853099, + "grad_norm": 2.296172924033883, + "learning_rate": 1.729897600226462e-05, + "loss": 0.6752, + "step": 3430 + }, + { + "epoch": 0.5250191277735272, + "grad_norm": 2.4068767186514926, + "learning_rate": 1.729728180001187e-05, + "loss": 0.768, + "step": 3431 + }, + { + "epoch": 0.5251721499617444, + "grad_norm": 2.0931991854527574, + "learning_rate": 1.7295587149604496e-05, + "loss": 0.7415, + "step": 3432 + }, + { + "epoch": 0.5253251721499618, + "grad_norm": 2.2446215315731455, + "learning_rate": 1.7293892051146575e-05, + "loss": 0.7315, + "step": 3433 + }, + { + "epoch": 0.525478194338179, + "grad_norm": 2.31206674583333, + "learning_rate": 1.729219650474221e-05, + "loss": 0.7596, + "step": 3434 + }, + { + "epoch": 0.5256312165263963, + "grad_norm": 2.4780596800982195, + "learning_rate": 1.7290500510495532e-05, + "loss": 0.8166, + "step": 3435 + }, + { + "epoch": 0.5257842387146137, + "grad_norm": 2.2182491100816977, + "learning_rate": 1.7288804068510695e-05, + "loss": 0.7766, + "step": 3436 + }, + { + "epoch": 0.5259372609028309, + "grad_norm": 2.5993499183042323, + "learning_rate": 1.7287107178891886e-05, + "loss": 0.7479, + "step": 3437 + }, + { + "epoch": 0.5260902830910482, + "grad_norm": 2.6164904758032295, + "learning_rate": 1.7285409841743318e-05, + "loss": 0.8316, + "step": 3438 + }, + { + "epoch": 0.5262433052792654, + "grad_norm": 2.1482006073526083, + "learning_rate": 1.728371205716923e-05, + "loss": 0.7556, + "step": 3439 + }, + { + "epoch": 0.5263963274674828, + "grad_norm": 2.2992580395879236, + "learning_rate": 1.7282013825273894e-05, + "loss": 0.8617, + "step": 3440 + }, + { + "epoch": 0.5265493496557001, + "grad_norm": 2.760122409809691, + "learning_rate": 1.7280315146161595e-05, + "loss": 0.8638, + "step": 3441 + }, + { + "epoch": 0.5267023718439173, + "grad_norm": 2.413735791441003, + "learning_rate": 1.727861601993667e-05, + "loss": 0.7804, + "step": 3442 + }, + { + "epoch": 0.5268553940321347, + "grad_norm": 3.149252704804255, + "learning_rate": 1.727691644670346e-05, + "loss": 0.7932, + "step": 3443 + }, + { + "epoch": 0.527008416220352, + "grad_norm": 2.2225478603436235, + 
"learning_rate": 1.727521642656634e-05, + "loss": 0.7971, + "step": 3444 + }, + { + "epoch": 0.5271614384085692, + "grad_norm": 2.2044358309518524, + "learning_rate": 1.7273515959629715e-05, + "loss": 0.6773, + "step": 3445 + }, + { + "epoch": 0.5273144605967865, + "grad_norm": 2.4651261717517774, + "learning_rate": 1.727181504599803e-05, + "loss": 0.8615, + "step": 3446 + }, + { + "epoch": 0.5274674827850038, + "grad_norm": 2.5505959614561786, + "learning_rate": 1.7270113685775728e-05, + "loss": 0.8561, + "step": 3447 + }, + { + "epoch": 0.5276205049732211, + "grad_norm": 2.2154727272181036, + "learning_rate": 1.7268411879067305e-05, + "loss": 0.7426, + "step": 3448 + }, + { + "epoch": 0.5277735271614384, + "grad_norm": 2.226037165166472, + "learning_rate": 1.7266709625977277e-05, + "loss": 0.7895, + "step": 3449 + }, + { + "epoch": 0.5279265493496557, + "grad_norm": 2.533881833418401, + "learning_rate": 1.7265006926610183e-05, + "loss": 0.7938, + "step": 3450 + }, + { + "epoch": 0.528079571537873, + "grad_norm": 2.4668535803573213, + "learning_rate": 1.726330378107059e-05, + "loss": 0.7951, + "step": 3451 + }, + { + "epoch": 0.5282325937260903, + "grad_norm": 2.213391599044407, + "learning_rate": 1.72616001894631e-05, + "loss": 0.6765, + "step": 3452 + }, + { + "epoch": 0.5283856159143075, + "grad_norm": 2.231713283875013, + "learning_rate": 1.7259896151892336e-05, + "loss": 0.7737, + "step": 3453 + }, + { + "epoch": 0.5285386381025249, + "grad_norm": 2.203632199838888, + "learning_rate": 1.7258191668462948e-05, + "loss": 0.7826, + "step": 3454 + }, + { + "epoch": 0.5286916602907421, + "grad_norm": 2.2665233994578076, + "learning_rate": 1.725648673927961e-05, + "loss": 0.7484, + "step": 3455 + }, + { + "epoch": 0.5288446824789594, + "grad_norm": 2.2420393983606353, + "learning_rate": 1.725478136444704e-05, + "loss": 0.8268, + "step": 3456 + }, + { + "epoch": 0.5289977046671768, + "grad_norm": 2.6467064473764834, + "learning_rate": 1.7253075544069964e-05, + "loss": 0.8971, + "step": 3457 + }, + { + "epoch": 0.529150726855394, + "grad_norm": 2.8179506973708786, + "learning_rate": 1.7251369278253146e-05, + "loss": 0.7285, + "step": 3458 + }, + { + "epoch": 0.5293037490436113, + "grad_norm": 2.0736781024821838, + "learning_rate": 1.724966256710137e-05, + "loss": 0.7716, + "step": 3459 + }, + { + "epoch": 0.5294567712318287, + "grad_norm": 2.3075858700032037, + "learning_rate": 1.7247955410719454e-05, + "loss": 0.7077, + "step": 3460 + }, + { + "epoch": 0.5296097934200459, + "grad_norm": 2.325588149675756, + "learning_rate": 1.7246247809212248e-05, + "loss": 0.753, + "step": 3461 + }, + { + "epoch": 0.5297628156082632, + "grad_norm": 2.3361579258165883, + "learning_rate": 1.7244539762684607e-05, + "loss": 0.8598, + "step": 3462 + }, + { + "epoch": 0.5299158377964804, + "grad_norm": 2.105536512175094, + "learning_rate": 1.7242831271241446e-05, + "loss": 0.8154, + "step": 3463 + }, + { + "epoch": 0.5300688599846978, + "grad_norm": 2.3968996850665794, + "learning_rate": 1.7241122334987684e-05, + "loss": 0.8799, + "step": 3464 + }, + { + "epoch": 0.5302218821729151, + "grad_norm": 2.484649805300681, + "learning_rate": 1.7239412954028268e-05, + "loss": 0.8878, + "step": 3465 + }, + { + "epoch": 0.5303749043611323, + "grad_norm": 2.41188080989505, + "learning_rate": 1.723770312846818e-05, + "loss": 0.8617, + "step": 3466 + }, + { + "epoch": 0.5305279265493497, + "grad_norm": 2.5004806531008734, + "learning_rate": 1.7235992858412432e-05, + "loss": 0.8951, + "step": 3467 + }, + { + "epoch": 
0.530680948737567, + "grad_norm": 2.3251879051654063, + "learning_rate": 1.7234282143966056e-05, + "loss": 0.7946, + "step": 3468 + }, + { + "epoch": 0.5308339709257842, + "grad_norm": 2.270779230163958, + "learning_rate": 1.723257098523411e-05, + "loss": 0.814, + "step": 3469 + }, + { + "epoch": 0.5309869931140015, + "grad_norm": 2.318798194460353, + "learning_rate": 1.7230859382321686e-05, + "loss": 0.8017, + "step": 3470 + }, + { + "epoch": 0.5311400153022188, + "grad_norm": 2.3221217836842274, + "learning_rate": 1.7229147335333906e-05, + "loss": 0.7764, + "step": 3471 + }, + { + "epoch": 0.5312930374904361, + "grad_norm": 2.1502170801701377, + "learning_rate": 1.72274348443759e-05, + "loss": 0.7496, + "step": 3472 + }, + { + "epoch": 0.5314460596786534, + "grad_norm": 2.2513830788602442, + "learning_rate": 1.722572190955285e-05, + "loss": 0.8413, + "step": 3473 + }, + { + "epoch": 0.5315990818668707, + "grad_norm": 2.3165428770546392, + "learning_rate": 1.722400853096995e-05, + "loss": 0.7304, + "step": 3474 + }, + { + "epoch": 0.531752104055088, + "grad_norm": 2.022193837277281, + "learning_rate": 1.7222294708732423e-05, + "loss": 0.8124, + "step": 3475 + }, + { + "epoch": 0.5319051262433053, + "grad_norm": 2.240338201650682, + "learning_rate": 1.7220580442945527e-05, + "loss": 0.7717, + "step": 3476 + }, + { + "epoch": 0.5320581484315225, + "grad_norm": 2.279369131283573, + "learning_rate": 1.7218865733714538e-05, + "loss": 0.8536, + "step": 3477 + }, + { + "epoch": 0.5322111706197399, + "grad_norm": 2.684390602188598, + "learning_rate": 1.7217150581144767e-05, + "loss": 0.8194, + "step": 3478 + }, + { + "epoch": 0.5323641928079571, + "grad_norm": 1.9666810860473458, + "learning_rate": 1.721543498534154e-05, + "loss": 0.7184, + "step": 3479 + }, + { + "epoch": 0.5325172149961744, + "grad_norm": 2.1966123033514364, + "learning_rate": 1.7213718946410227e-05, + "loss": 0.6808, + "step": 3480 + }, + { + "epoch": 0.5326702371843918, + "grad_norm": 2.839505940014808, + "learning_rate": 1.721200246445621e-05, + "loss": 0.7687, + "step": 3481 + }, + { + "epoch": 0.532823259372609, + "grad_norm": 2.396567722242438, + "learning_rate": 1.7210285539584913e-05, + "loss": 0.7691, + "step": 3482 + }, + { + "epoch": 0.5329762815608263, + "grad_norm": 2.4196703683338625, + "learning_rate": 1.7208568171901768e-05, + "loss": 0.7263, + "step": 3483 + }, + { + "epoch": 0.5331293037490437, + "grad_norm": 2.3478722383711967, + "learning_rate": 1.720685036151226e-05, + "loss": 0.7947, + "step": 3484 + }, + { + "epoch": 0.5332823259372609, + "grad_norm": 2.463647890976903, + "learning_rate": 1.7205132108521868e-05, + "loss": 0.776, + "step": 3485 + }, + { + "epoch": 0.5334353481254782, + "grad_norm": 2.465606762967954, + "learning_rate": 1.7203413413036132e-05, + "loss": 0.8351, + "step": 3486 + }, + { + "epoch": 0.5335883703136954, + "grad_norm": 2.4476074906053005, + "learning_rate": 1.72016942751606e-05, + "loss": 0.8652, + "step": 3487 + }, + { + "epoch": 0.5337413925019128, + "grad_norm": 2.3371820769390217, + "learning_rate": 1.7199974695000844e-05, + "loss": 0.7619, + "step": 3488 + }, + { + "epoch": 0.5338944146901301, + "grad_norm": 2.016240201556696, + "learning_rate": 1.7198254672662482e-05, + "loss": 0.7056, + "step": 3489 + }, + { + "epoch": 0.5340474368783473, + "grad_norm": 2.1501317642039823, + "learning_rate": 1.7196534208251138e-05, + "loss": 0.7003, + "step": 3490 + }, + { + "epoch": 0.5342004590665647, + "grad_norm": 2.428358228929068, + "learning_rate": 1.7194813301872475e-05, + "loss": 
0.6957, + "step": 3491 + }, + { + "epoch": 0.534353481254782, + "grad_norm": 2.3692668417947824, + "learning_rate": 1.719309195363218e-05, + "loss": 0.833, + "step": 3492 + }, + { + "epoch": 0.5345065034429992, + "grad_norm": 2.2518561612504544, + "learning_rate": 1.7191370163635968e-05, + "loss": 0.8864, + "step": 3493 + }, + { + "epoch": 0.5346595256312165, + "grad_norm": 2.1806620122914433, + "learning_rate": 1.7189647931989584e-05, + "loss": 0.7775, + "step": 3494 + }, + { + "epoch": 0.5348125478194338, + "grad_norm": 2.1202465956947107, + "learning_rate": 1.7187925258798795e-05, + "loss": 0.6904, + "step": 3495 + }, + { + "epoch": 0.5349655700076511, + "grad_norm": 2.293147683612584, + "learning_rate": 1.7186202144169396e-05, + "loss": 0.7109, + "step": 3496 + }, + { + "epoch": 0.5351185921958684, + "grad_norm": 2.387009081930552, + "learning_rate": 1.7184478588207208e-05, + "loss": 0.8928, + "step": 3497 + }, + { + "epoch": 0.5352716143840857, + "grad_norm": 2.036587619586427, + "learning_rate": 1.7182754591018084e-05, + "loss": 0.7308, + "step": 3498 + }, + { + "epoch": 0.535424636572303, + "grad_norm": 2.23617394688373, + "learning_rate": 1.7181030152707903e-05, + "loss": 0.8534, + "step": 3499 + }, + { + "epoch": 0.5355776587605203, + "grad_norm": 2.4902832725810464, + "learning_rate": 1.7179305273382567e-05, + "loss": 0.898, + "step": 3500 + }, + { + "epoch": 0.5357306809487375, + "grad_norm": 2.2712921361457936, + "learning_rate": 1.717757995314801e-05, + "loss": 0.7424, + "step": 3501 + }, + { + "epoch": 0.5358837031369549, + "grad_norm": 2.1869051131580823, + "learning_rate": 1.717585419211019e-05, + "loss": 0.8083, + "step": 3502 + }, + { + "epoch": 0.5360367253251721, + "grad_norm": 2.0173761674315616, + "learning_rate": 1.7174127990375092e-05, + "loss": 0.7098, + "step": 3503 + }, + { + "epoch": 0.5361897475133894, + "grad_norm": 2.7480203050102974, + "learning_rate": 1.7172401348048726e-05, + "loss": 0.8463, + "step": 3504 + }, + { + "epoch": 0.5363427697016068, + "grad_norm": 2.2412224884608185, + "learning_rate": 1.7170674265237136e-05, + "loss": 0.7697, + "step": 3505 + }, + { + "epoch": 0.536495791889824, + "grad_norm": 2.317814020727311, + "learning_rate": 1.7168946742046388e-05, + "loss": 0.7449, + "step": 3506 + }, + { + "epoch": 0.5366488140780413, + "grad_norm": 2.60268449435134, + "learning_rate": 1.7167218778582575e-05, + "loss": 0.7636, + "step": 3507 + }, + { + "epoch": 0.5368018362662587, + "grad_norm": 2.4034307976445, + "learning_rate": 1.716549037495182e-05, + "loss": 0.7584, + "step": 3508 + }, + { + "epoch": 0.5369548584544759, + "grad_norm": 2.181778255040697, + "learning_rate": 1.7163761531260267e-05, + "loss": 0.8163, + "step": 3509 + }, + { + "epoch": 0.5371078806426932, + "grad_norm": 2.144458168411783, + "learning_rate": 1.7162032247614097e-05, + "loss": 0.7615, + "step": 3510 + }, + { + "epoch": 0.5372609028309104, + "grad_norm": 2.277975562378708, + "learning_rate": 1.7160302524119506e-05, + "loss": 0.7225, + "step": 3511 + }, + { + "epoch": 0.5374139250191278, + "grad_norm": 2.1812185639721324, + "learning_rate": 1.7158572360882723e-05, + "loss": 0.7146, + "step": 3512 + }, + { + "epoch": 0.5375669472073451, + "grad_norm": 2.487858008956128, + "learning_rate": 1.7156841758010015e-05, + "loss": 0.8443, + "step": 3513 + }, + { + "epoch": 0.5377199693955623, + "grad_norm": 2.451809471203112, + "learning_rate": 1.715511071560765e-05, + "loss": 0.8188, + "step": 3514 + }, + { + "epoch": 0.5378729915837797, + "grad_norm": 2.4906401899132127, + 
"learning_rate": 1.7153379233781947e-05, + "loss": 0.7324, + "step": 3515 + }, + { + "epoch": 0.538026013771997, + "grad_norm": 2.3799176640369293, + "learning_rate": 1.715164731263924e-05, + "loss": 0.7876, + "step": 3516 + }, + { + "epoch": 0.5381790359602142, + "grad_norm": 2.256999436428998, + "learning_rate": 1.7149914952285896e-05, + "loss": 0.704, + "step": 3517 + }, + { + "epoch": 0.5383320581484315, + "grad_norm": 2.375566540112142, + "learning_rate": 1.7148182152828302e-05, + "loss": 0.7552, + "step": 3518 + }, + { + "epoch": 0.5384850803366488, + "grad_norm": 2.396394674063, + "learning_rate": 1.714644891437288e-05, + "loss": 0.8285, + "step": 3519 + }, + { + "epoch": 0.5386381025248661, + "grad_norm": 2.214449654913485, + "learning_rate": 1.7144715237026073e-05, + "loss": 0.8153, + "step": 3520 + }, + { + "epoch": 0.5387911247130834, + "grad_norm": 2.440366421084005, + "learning_rate": 1.714298112089435e-05, + "loss": 0.7928, + "step": 3521 + }, + { + "epoch": 0.5389441469013007, + "grad_norm": 2.038324486460608, + "learning_rate": 1.7141246566084218e-05, + "loss": 0.6482, + "step": 3522 + }, + { + "epoch": 0.539097169089518, + "grad_norm": 2.718259058349394, + "learning_rate": 1.7139511572702195e-05, + "loss": 0.8081, + "step": 3523 + }, + { + "epoch": 0.5392501912777353, + "grad_norm": 2.5128057012131255, + "learning_rate": 1.7137776140854838e-05, + "loss": 0.8716, + "step": 3524 + }, + { + "epoch": 0.5394032134659525, + "grad_norm": 2.484955535252938, + "learning_rate": 1.713604027064872e-05, + "loss": 0.8966, + "step": 3525 + }, + { + "epoch": 0.5395562356541699, + "grad_norm": 2.329587732262935, + "learning_rate": 1.713430396219046e-05, + "loss": 0.7159, + "step": 3526 + }, + { + "epoch": 0.5397092578423871, + "grad_norm": 2.2577950281919956, + "learning_rate": 1.713256721558668e-05, + "loss": 0.7494, + "step": 3527 + }, + { + "epoch": 0.5398622800306044, + "grad_norm": 2.2125935730221107, + "learning_rate": 1.7130830030944042e-05, + "loss": 0.743, + "step": 3528 + }, + { + "epoch": 0.5400153022188218, + "grad_norm": 2.4576951883990286, + "learning_rate": 1.712909240836924e-05, + "loss": 0.7244, + "step": 3529 + }, + { + "epoch": 0.540168324407039, + "grad_norm": 2.4385213443689078, + "learning_rate": 1.7127354347968984e-05, + "loss": 0.8586, + "step": 3530 + }, + { + "epoch": 0.5403213465952563, + "grad_norm": 2.281079066866731, + "learning_rate": 1.712561584985001e-05, + "loss": 0.8375, + "step": 3531 + }, + { + "epoch": 0.5404743687834737, + "grad_norm": 2.114376916900364, + "learning_rate": 1.7123876914119097e-05, + "loss": 0.7872, + "step": 3532 + }, + { + "epoch": 0.5406273909716909, + "grad_norm": 2.3214355956058057, + "learning_rate": 1.7122137540883034e-05, + "loss": 0.8094, + "step": 3533 + }, + { + "epoch": 0.5407804131599082, + "grad_norm": 2.2774113626957377, + "learning_rate": 1.712039773024864e-05, + "loss": 0.7912, + "step": 3534 + }, + { + "epoch": 0.5409334353481254, + "grad_norm": 2.3514449429395685, + "learning_rate": 1.7118657482322768e-05, + "loss": 0.8295, + "step": 3535 + }, + { + "epoch": 0.5410864575363428, + "grad_norm": 2.388457809727973, + "learning_rate": 1.7116916797212288e-05, + "loss": 0.8479, + "step": 3536 + }, + { + "epoch": 0.5412394797245601, + "grad_norm": 2.4052542394521623, + "learning_rate": 1.7115175675024107e-05, + "loss": 0.8573, + "step": 3537 + }, + { + "epoch": 0.5413925019127773, + "grad_norm": 2.281123645319344, + "learning_rate": 1.7113434115865157e-05, + "loss": 0.7488, + "step": 3538 + }, + { + "epoch": 
0.5415455241009947, + "grad_norm": 2.0479313985657197, + "learning_rate": 1.7111692119842388e-05, + "loss": 0.7098, + "step": 3539 + }, + { + "epoch": 0.541698546289212, + "grad_norm": 2.503934028957903, + "learning_rate": 1.7109949687062784e-05, + "loss": 0.7766, + "step": 3540 + }, + { + "epoch": 0.5418515684774292, + "grad_norm": 2.2920623120918076, + "learning_rate": 1.7108206817633355e-05, + "loss": 0.7058, + "step": 3541 + }, + { + "epoch": 0.5420045906656465, + "grad_norm": 2.3190499618303715, + "learning_rate": 1.7106463511661136e-05, + "loss": 0.8249, + "step": 3542 + }, + { + "epoch": 0.5421576128538638, + "grad_norm": 2.3367282852844093, + "learning_rate": 1.7104719769253196e-05, + "loss": 0.8424, + "step": 3543 + }, + { + "epoch": 0.5423106350420811, + "grad_norm": 2.459648815663863, + "learning_rate": 1.7102975590516622e-05, + "loss": 0.7337, + "step": 3544 + }, + { + "epoch": 0.5424636572302984, + "grad_norm": 2.1848741538774243, + "learning_rate": 1.710123097555853e-05, + "loss": 0.6866, + "step": 3545 + }, + { + "epoch": 0.5426166794185157, + "grad_norm": 2.2894374221802813, + "learning_rate": 1.709948592448606e-05, + "loss": 1.0706, + "step": 3546 + }, + { + "epoch": 0.542769701606733, + "grad_norm": 2.1592314348992354, + "learning_rate": 1.7097740437406388e-05, + "loss": 0.79, + "step": 3547 + }, + { + "epoch": 0.5429227237949502, + "grad_norm": 2.4608898527446863, + "learning_rate": 1.7095994514426704e-05, + "loss": 0.8597, + "step": 3548 + }, + { + "epoch": 0.5430757459831675, + "grad_norm": 2.3433110444464886, + "learning_rate": 1.7094248155654245e-05, + "loss": 0.8365, + "step": 3549 + }, + { + "epoch": 0.5432287681713849, + "grad_norm": 2.15572265762958, + "learning_rate": 1.709250136119625e-05, + "loss": 0.7956, + "step": 3550 + }, + { + "epoch": 0.5433817903596021, + "grad_norm": 2.3729171950028247, + "learning_rate": 1.7090754131160002e-05, + "loss": 0.816, + "step": 3551 + }, + { + "epoch": 0.5435348125478194, + "grad_norm": 2.2130874362562074, + "learning_rate": 1.7089006465652803e-05, + "loss": 0.776, + "step": 3552 + }, + { + "epoch": 0.5436878347360368, + "grad_norm": 2.0911755353057626, + "learning_rate": 1.7087258364781983e-05, + "loss": 0.7587, + "step": 3553 + }, + { + "epoch": 0.543840856924254, + "grad_norm": 2.4086231529019995, + "learning_rate": 1.7085509828654903e-05, + "loss": 0.7506, + "step": 3554 + }, + { + "epoch": 0.5439938791124713, + "grad_norm": 2.472931732242185, + "learning_rate": 1.7083760857378944e-05, + "loss": 0.8069, + "step": 3555 + }, + { + "epoch": 0.5441469013006885, + "grad_norm": 2.2070865464878966, + "learning_rate": 1.708201145106152e-05, + "loss": 0.7785, + "step": 3556 + }, + { + "epoch": 0.5442999234889059, + "grad_norm": 2.2713818289159096, + "learning_rate": 1.708026160981007e-05, + "loss": 0.7081, + "step": 3557 + }, + { + "epoch": 0.5444529456771232, + "grad_norm": 2.092372395572879, + "learning_rate": 1.7078511333732055e-05, + "loss": 0.8032, + "step": 3558 + }, + { + "epoch": 0.5446059678653404, + "grad_norm": 2.4299843117798483, + "learning_rate": 1.7076760622934968e-05, + "loss": 0.8316, + "step": 3559 + }, + { + "epoch": 0.5447589900535578, + "grad_norm": 2.3366226179643514, + "learning_rate": 1.7075009477526326e-05, + "loss": 0.7562, + "step": 3560 + }, + { + "epoch": 0.5449120122417751, + "grad_norm": 2.1968180719323662, + "learning_rate": 1.7073257897613676e-05, + "loss": 0.8084, + "step": 3561 + }, + { + "epoch": 0.5450650344299923, + "grad_norm": 2.2329839134922658, + "learning_rate": 1.7071505883304583e-05, + 
"loss": 0.7897, + "step": 3562 + }, + { + "epoch": 0.5452180566182097, + "grad_norm": 2.1279812197279533, + "learning_rate": 1.706975343470665e-05, + "loss": 0.7008, + "step": 3563 + }, + { + "epoch": 0.5453710788064269, + "grad_norm": 2.457635225867449, + "learning_rate": 1.706800055192751e-05, + "loss": 0.8133, + "step": 3564 + }, + { + "epoch": 0.5455241009946442, + "grad_norm": 2.5452787153874588, + "learning_rate": 1.70662472350748e-05, + "loss": 0.8601, + "step": 3565 + }, + { + "epoch": 0.5456771231828615, + "grad_norm": 2.2487105216791896, + "learning_rate": 1.7064493484256203e-05, + "loss": 0.7402, + "step": 3566 + }, + { + "epoch": 0.5458301453710788, + "grad_norm": 1.9842725323794457, + "learning_rate": 1.7062739299579428e-05, + "loss": 0.7388, + "step": 3567 + }, + { + "epoch": 0.5459831675592961, + "grad_norm": 2.235134268947892, + "learning_rate": 1.7060984681152198e-05, + "loss": 0.7448, + "step": 3568 + }, + { + "epoch": 0.5461361897475134, + "grad_norm": 2.3743457059092252, + "learning_rate": 1.705922962908228e-05, + "loss": 0.6938, + "step": 3569 + }, + { + "epoch": 0.5462892119357307, + "grad_norm": 2.2343627337316208, + "learning_rate": 1.7057474143477453e-05, + "loss": 0.7406, + "step": 3570 + }, + { + "epoch": 0.546442234123948, + "grad_norm": 2.0820331103238745, + "learning_rate": 1.7055718224445536e-05, + "loss": 0.7538, + "step": 3571 + }, + { + "epoch": 0.5465952563121652, + "grad_norm": 2.45795322798732, + "learning_rate": 1.705396187209435e-05, + "loss": 0.8631, + "step": 3572 + }, + { + "epoch": 0.5467482785003825, + "grad_norm": 2.0978156116712694, + "learning_rate": 1.7052205086531777e-05, + "loss": 0.7448, + "step": 3573 + }, + { + "epoch": 0.5469013006885999, + "grad_norm": 2.8515911974508024, + "learning_rate": 1.7050447867865703e-05, + "loss": 0.7892, + "step": 3574 + }, + { + "epoch": 0.5470543228768171, + "grad_norm": 2.4076632991660487, + "learning_rate": 1.704869021620404e-05, + "loss": 0.8385, + "step": 3575 + }, + { + "epoch": 0.5472073450650344, + "grad_norm": 2.0957414053029826, + "learning_rate": 1.7046932131654736e-05, + "loss": 0.739, + "step": 3576 + }, + { + "epoch": 0.5473603672532518, + "grad_norm": 2.5671839888269194, + "learning_rate": 1.704517361432576e-05, + "loss": 0.8024, + "step": 3577 + }, + { + "epoch": 0.547513389441469, + "grad_norm": 2.3047558703626185, + "learning_rate": 1.7043414664325117e-05, + "loss": 0.7954, + "step": 3578 + }, + { + "epoch": 0.5476664116296863, + "grad_norm": 2.3600811561008017, + "learning_rate": 1.7041655281760824e-05, + "loss": 0.7037, + "step": 3579 + }, + { + "epoch": 0.5478194338179035, + "grad_norm": 2.3556484582162147, + "learning_rate": 1.703989546674093e-05, + "loss": 0.8624, + "step": 3580 + }, + { + "epoch": 0.5479724560061209, + "grad_norm": 2.275299767646912, + "learning_rate": 1.7038135219373518e-05, + "loss": 0.8452, + "step": 3581 + }, + { + "epoch": 0.5481254781943382, + "grad_norm": 2.4187029345883224, + "learning_rate": 1.7036374539766688e-05, + "loss": 0.7792, + "step": 3582 + }, + { + "epoch": 0.5482785003825554, + "grad_norm": 2.206175619502996, + "learning_rate": 1.703461342802857e-05, + "loss": 0.8213, + "step": 3583 + }, + { + "epoch": 0.5484315225707728, + "grad_norm": 2.6504692224635553, + "learning_rate": 1.7032851884267323e-05, + "loss": 0.8608, + "step": 3584 + }, + { + "epoch": 0.5485845447589901, + "grad_norm": 4.131748832937652, + "learning_rate": 1.703108990859113e-05, + "loss": 0.8366, + "step": 3585 + }, + { + "epoch": 0.5487375669472073, + "grad_norm": 
2.2101905827124373, + "learning_rate": 1.70293275011082e-05, + "loss": 0.7445, + "step": 3586 + }, + { + "epoch": 0.5488905891354247, + "grad_norm": 2.4726466589537517, + "learning_rate": 1.7027564661926766e-05, + "loss": 0.7431, + "step": 3587 + }, + { + "epoch": 0.5490436113236419, + "grad_norm": 2.4049526878760585, + "learning_rate": 1.7025801391155097e-05, + "loss": 0.6889, + "step": 3588 + }, + { + "epoch": 0.5491966335118592, + "grad_norm": 2.364932204717252, + "learning_rate": 1.7024037688901477e-05, + "loss": 0.7787, + "step": 3589 + }, + { + "epoch": 0.5493496557000765, + "grad_norm": 2.1140087888729324, + "learning_rate": 1.7022273555274228e-05, + "loss": 0.7424, + "step": 3590 + }, + { + "epoch": 0.5495026778882938, + "grad_norm": 2.3098634667372884, + "learning_rate": 1.7020508990381685e-05, + "loss": 0.7408, + "step": 3591 + }, + { + "epoch": 0.5496557000765111, + "grad_norm": 2.4846746652950555, + "learning_rate": 1.7018743994332222e-05, + "loss": 0.9127, + "step": 3592 + }, + { + "epoch": 0.5498087222647284, + "grad_norm": 2.475667889661541, + "learning_rate": 1.7016978567234234e-05, + "loss": 0.855, + "step": 3593 + }, + { + "epoch": 0.5499617444529457, + "grad_norm": 2.391009217771575, + "learning_rate": 1.7015212709196142e-05, + "loss": 0.7626, + "step": 3594 + }, + { + "epoch": 0.550114766641163, + "grad_norm": 2.7017637632302116, + "learning_rate": 1.7013446420326393e-05, + "loss": 0.7821, + "step": 3595 + }, + { + "epoch": 0.5502677888293802, + "grad_norm": 2.302889942316845, + "learning_rate": 1.701167970073346e-05, + "loss": 0.7432, + "step": 3596 + }, + { + "epoch": 0.5504208110175975, + "grad_norm": 2.0950840579929983, + "learning_rate": 1.700991255052585e-05, + "loss": 0.6559, + "step": 3597 + }, + { + "epoch": 0.5505738332058149, + "grad_norm": 2.0143471965497652, + "learning_rate": 1.7008144969812085e-05, + "loss": 0.6854, + "step": 3598 + }, + { + "epoch": 0.5507268553940321, + "grad_norm": 2.153960943563205, + "learning_rate": 1.7006376958700724e-05, + "loss": 0.722, + "step": 3599 + }, + { + "epoch": 0.5508798775822494, + "grad_norm": 2.252566815376236, + "learning_rate": 1.7004608517300343e-05, + "loss": 0.7874, + "step": 3600 + }, + { + "epoch": 0.5510328997704668, + "grad_norm": 2.306690084716881, + "learning_rate": 1.700283964571955e-05, + "loss": 0.6778, + "step": 3601 + }, + { + "epoch": 0.551185921958684, + "grad_norm": 2.0446451450569083, + "learning_rate": 1.7001070344066982e-05, + "loss": 0.729, + "step": 3602 + }, + { + "epoch": 0.5513389441469013, + "grad_norm": 2.249420963070831, + "learning_rate": 1.6999300612451294e-05, + "loss": 0.6717, + "step": 3603 + }, + { + "epoch": 0.5514919663351185, + "grad_norm": 2.3610035001536853, + "learning_rate": 1.6997530450981175e-05, + "loss": 0.8365, + "step": 3604 + }, + { + "epoch": 0.5516449885233359, + "grad_norm": 2.301984232252648, + "learning_rate": 1.6995759859765332e-05, + "loss": 0.828, + "step": 3605 + }, + { + "epoch": 0.5517980107115532, + "grad_norm": 2.242175879504416, + "learning_rate": 1.6993988838912514e-05, + "loss": 0.7475, + "step": 3606 + }, + { + "epoch": 0.5519510328997704, + "grad_norm": 2.3666687002372737, + "learning_rate": 1.699221738853148e-05, + "loss": 0.5838, + "step": 3607 + }, + { + "epoch": 0.5521040550879878, + "grad_norm": 2.3339878325157417, + "learning_rate": 1.6990445508731023e-05, + "loss": 0.8548, + "step": 3608 + }, + { + "epoch": 0.5522570772762051, + "grad_norm": 2.2915212295829672, + "learning_rate": 1.6988673199619955e-05, + "loss": 0.7516, + "step": 3609 + }, + 
{ + "epoch": 0.5524100994644223, + "grad_norm": 2.6222980931777418, + "learning_rate": 1.698690046130713e-05, + "loss": 0.7324, + "step": 3610 + }, + { + "epoch": 0.5525631216526397, + "grad_norm": 2.591361000180004, + "learning_rate": 1.6985127293901417e-05, + "loss": 0.8047, + "step": 3611 + }, + { + "epoch": 0.5527161438408569, + "grad_norm": 2.5309834401100066, + "learning_rate": 1.698335369751171e-05, + "loss": 0.7602, + "step": 3612 + }, + { + "epoch": 0.5528691660290742, + "grad_norm": 2.332044775123289, + "learning_rate": 1.6981579672246932e-05, + "loss": 0.8728, + "step": 3613 + }, + { + "epoch": 0.5530221882172915, + "grad_norm": 2.3418254681031527, + "learning_rate": 1.6979805218216033e-05, + "loss": 0.7037, + "step": 3614 + }, + { + "epoch": 0.5531752104055088, + "grad_norm": 2.2946913600869783, + "learning_rate": 1.697803033552799e-05, + "loss": 0.6968, + "step": 3615 + }, + { + "epoch": 0.5533282325937261, + "grad_norm": 2.345597481651466, + "learning_rate": 1.697625502429181e-05, + "loss": 0.8231, + "step": 3616 + }, + { + "epoch": 0.5534812547819434, + "grad_norm": 2.158189225934225, + "learning_rate": 1.6974479284616512e-05, + "loss": 0.6546, + "step": 3617 + }, + { + "epoch": 0.5536342769701607, + "grad_norm": 2.1114929224062187, + "learning_rate": 1.697270311661116e-05, + "loss": 0.7127, + "step": 3618 + }, + { + "epoch": 0.553787299158378, + "grad_norm": 2.1596841291083453, + "learning_rate": 1.6970926520384833e-05, + "loss": 0.7419, + "step": 3619 + }, + { + "epoch": 0.5539403213465952, + "grad_norm": 2.400828622459582, + "learning_rate": 1.696914949604664e-05, + "loss": 0.7161, + "step": 3620 + }, + { + "epoch": 0.5540933435348125, + "grad_norm": 2.323163384739257, + "learning_rate": 1.6967372043705707e-05, + "loss": 0.8356, + "step": 3621 + }, + { + "epoch": 0.5542463657230299, + "grad_norm": 2.4295012870436152, + "learning_rate": 1.6965594163471202e-05, + "loss": 0.8251, + "step": 3622 + }, + { + "epoch": 0.5543993879112471, + "grad_norm": 2.2234601452756864, + "learning_rate": 1.6963815855452312e-05, + "loss": 0.6826, + "step": 3623 + }, + { + "epoch": 0.5545524100994644, + "grad_norm": 1.9956514045453755, + "learning_rate": 1.6962037119758247e-05, + "loss": 0.7141, + "step": 3624 + }, + { + "epoch": 0.5547054322876818, + "grad_norm": 2.3909418043326545, + "learning_rate": 1.696025795649825e-05, + "loss": 0.8332, + "step": 3625 + }, + { + "epoch": 0.554858454475899, + "grad_norm": 2.453103266939776, + "learning_rate": 1.6958478365781576e-05, + "loss": 0.8726, + "step": 3626 + }, + { + "epoch": 0.5550114766641163, + "grad_norm": 2.2746636360271855, + "learning_rate": 1.695669834771753e-05, + "loss": 0.8042, + "step": 3627 + }, + { + "epoch": 0.5551644988523335, + "grad_norm": 2.2324321290351796, + "learning_rate": 1.6954917902415423e-05, + "loss": 0.7195, + "step": 3628 + }, + { + "epoch": 0.5553175210405509, + "grad_norm": 2.2940189638814728, + "learning_rate": 1.6953137029984597e-05, + "loss": 0.7609, + "step": 3629 + }, + { + "epoch": 0.5554705432287682, + "grad_norm": 2.3775879354384677, + "learning_rate": 1.6951355730534426e-05, + "loss": 0.6981, + "step": 3630 + }, + { + "epoch": 0.5556235654169854, + "grad_norm": 2.201686255606368, + "learning_rate": 1.6949574004174304e-05, + "loss": 0.6593, + "step": 3631 + }, + { + "epoch": 0.5557765876052028, + "grad_norm": 2.1482634948848554, + "learning_rate": 1.694779185101366e-05, + "loss": 0.7824, + "step": 3632 + }, + { + "epoch": 0.5559296097934201, + "grad_norm": 2.173950406005488, + "learning_rate": 
1.6946009271161936e-05, + "loss": 0.5722, + "step": 3633 + }, + { + "epoch": 0.5560826319816373, + "grad_norm": 2.1168563259507946, + "learning_rate": 1.6944226264728612e-05, + "loss": 0.703, + "step": 3634 + }, + { + "epoch": 0.5562356541698547, + "grad_norm": 2.267913969094195, + "learning_rate": 1.6942442831823184e-05, + "loss": 0.8432, + "step": 3635 + }, + { + "epoch": 0.5563886763580719, + "grad_norm": 2.6212417618753983, + "learning_rate": 1.6940658972555184e-05, + "loss": 0.8025, + "step": 3636 + }, + { + "epoch": 0.5565416985462892, + "grad_norm": 2.5846678141954467, + "learning_rate": 1.6938874687034163e-05, + "loss": 0.8314, + "step": 3637 + }, + { + "epoch": 0.5566947207345065, + "grad_norm": 2.334021760479528, + "learning_rate": 1.6937089975369705e-05, + "loss": 0.6797, + "step": 3638 + }, + { + "epoch": 0.5568477429227238, + "grad_norm": 2.107816213370517, + "learning_rate": 1.693530483767141e-05, + "loss": 0.6848, + "step": 3639 + }, + { + "epoch": 0.5570007651109411, + "grad_norm": 2.356072011721238, + "learning_rate": 1.693351927404892e-05, + "loss": 0.7297, + "step": 3640 + }, + { + "epoch": 0.5571537872991584, + "grad_norm": 2.169257056229893, + "learning_rate": 1.6931733284611882e-05, + "loss": 0.8036, + "step": 3641 + }, + { + "epoch": 0.5573068094873757, + "grad_norm": 2.2003860483288302, + "learning_rate": 1.6929946869469987e-05, + "loss": 0.6347, + "step": 3642 + }, + { + "epoch": 0.557459831675593, + "grad_norm": 2.170921678736145, + "learning_rate": 1.6928160028732942e-05, + "loss": 0.6982, + "step": 3643 + }, + { + "epoch": 0.5576128538638102, + "grad_norm": 2.29010650862317, + "learning_rate": 1.6926372762510492e-05, + "loss": 0.8291, + "step": 3644 + }, + { + "epoch": 0.5577658760520275, + "grad_norm": 2.4638778340052565, + "learning_rate": 1.692458507091239e-05, + "loss": 0.7613, + "step": 3645 + }, + { + "epoch": 0.5579188982402449, + "grad_norm": 2.054167992139388, + "learning_rate": 1.6922796954048434e-05, + "loss": 0.8133, + "step": 3646 + }, + { + "epoch": 0.5580719204284621, + "grad_norm": 2.7024998915374643, + "learning_rate": 1.6921008412028435e-05, + "loss": 0.8554, + "step": 3647 + }, + { + "epoch": 0.5582249426166794, + "grad_norm": 2.797028845762761, + "learning_rate": 1.691921944496223e-05, + "loss": 0.8489, + "step": 3648 + }, + { + "epoch": 0.5583779648048968, + "grad_norm": 2.1668006488701166, + "learning_rate": 1.6917430052959692e-05, + "loss": 0.7914, + "step": 3649 + }, + { + "epoch": 0.558530986993114, + "grad_norm": 2.17202935119517, + "learning_rate": 1.6915640236130716e-05, + "loss": 0.6803, + "step": 3650 + }, + { + "epoch": 0.5586840091813313, + "grad_norm": 2.2835949452400257, + "learning_rate": 1.6913849994585217e-05, + "loss": 0.7817, + "step": 3651 + }, + { + "epoch": 0.5588370313695485, + "grad_norm": 2.3830929379834656, + "learning_rate": 1.6912059328433144e-05, + "loss": 0.7796, + "step": 3652 + }, + { + "epoch": 0.5589900535577659, + "grad_norm": 2.3925932080902808, + "learning_rate": 1.6910268237784465e-05, + "loss": 0.8567, + "step": 3653 + }, + { + "epoch": 0.5591430757459832, + "grad_norm": 2.0860786207614894, + "learning_rate": 1.6908476722749182e-05, + "loss": 0.7333, + "step": 3654 + }, + { + "epoch": 0.5592960979342004, + "grad_norm": 2.307073599086852, + "learning_rate": 1.6906684783437315e-05, + "loss": 0.8835, + "step": 3655 + }, + { + "epoch": 0.5594491201224178, + "grad_norm": 2.552393727596661, + "learning_rate": 1.6904892419958918e-05, + "loss": 0.832, + "step": 3656 + }, + { + "epoch": 0.559602142310635, + 
"grad_norm": 2.1716406728706112, + "learning_rate": 1.6903099632424067e-05, + "loss": 0.655, + "step": 3657 + }, + { + "epoch": 0.5597551644988523, + "grad_norm": 2.431452976880131, + "learning_rate": 1.6901306420942862e-05, + "loss": 0.812, + "step": 3658 + }, + { + "epoch": 0.5599081866870697, + "grad_norm": 2.2599738041832333, + "learning_rate": 1.689951278562543e-05, + "loss": 0.7497, + "step": 3659 + }, + { + "epoch": 0.5600612088752869, + "grad_norm": 2.3432851680364095, + "learning_rate": 1.6897718726581926e-05, + "loss": 0.7247, + "step": 3660 + }, + { + "epoch": 0.5602142310635042, + "grad_norm": 2.385858830011283, + "learning_rate": 1.6895924243922535e-05, + "loss": 0.8256, + "step": 3661 + }, + { + "epoch": 0.5603672532517215, + "grad_norm": 2.2732882617133487, + "learning_rate": 1.6894129337757458e-05, + "loss": 0.8056, + "step": 3662 + }, + { + "epoch": 0.5605202754399388, + "grad_norm": 2.3518883584912156, + "learning_rate": 1.6892334008196925e-05, + "loss": 0.7344, + "step": 3663 + }, + { + "epoch": 0.5606732976281561, + "grad_norm": 2.3340522250983273, + "learning_rate": 1.68905382553512e-05, + "loss": 0.8086, + "step": 3664 + }, + { + "epoch": 0.5608263198163733, + "grad_norm": 2.256805069227207, + "learning_rate": 1.6888742079330566e-05, + "loss": 0.7789, + "step": 3665 + }, + { + "epoch": 0.5609793420045907, + "grad_norm": 2.3041015030287415, + "learning_rate": 1.688694548024533e-05, + "loss": 0.749, + "step": 3666 + }, + { + "epoch": 0.561132364192808, + "grad_norm": 2.601621539475915, + "learning_rate": 1.688514845820583e-05, + "loss": 0.8205, + "step": 3667 + }, + { + "epoch": 0.5612853863810252, + "grad_norm": 2.4104490877496945, + "learning_rate": 1.6883351013322427e-05, + "loss": 0.9014, + "step": 3668 + }, + { + "epoch": 0.5614384085692425, + "grad_norm": 2.1468517678220076, + "learning_rate": 1.6881553145705512e-05, + "loss": 0.7408, + "step": 3669 + }, + { + "epoch": 0.5615914307574599, + "grad_norm": 2.0830615950634783, + "learning_rate": 1.68797548554655e-05, + "loss": 0.6995, + "step": 3670 + }, + { + "epoch": 0.5617444529456771, + "grad_norm": 2.6874796790277204, + "learning_rate": 1.6877956142712827e-05, + "loss": 0.7714, + "step": 3671 + }, + { + "epoch": 0.5618974751338944, + "grad_norm": 2.4850035657522955, + "learning_rate": 1.6876157007557958e-05, + "loss": 0.9281, + "step": 3672 + }, + { + "epoch": 0.5620504973221117, + "grad_norm": 2.273469641065428, + "learning_rate": 1.6874357450111392e-05, + "loss": 0.7668, + "step": 3673 + }, + { + "epoch": 0.562203519510329, + "grad_norm": 2.2483399413604674, + "learning_rate": 1.6872557470483638e-05, + "loss": 0.689, + "step": 3674 + }, + { + "epoch": 0.5623565416985463, + "grad_norm": 2.1512441247663845, + "learning_rate": 1.687075706878525e-05, + "loss": 0.7961, + "step": 3675 + }, + { + "epoch": 0.5625095638867635, + "grad_norm": 2.348647850963959, + "learning_rate": 1.6868956245126785e-05, + "loss": 0.8583, + "step": 3676 + }, + { + "epoch": 0.5626625860749809, + "grad_norm": 2.4585945763130486, + "learning_rate": 1.686715499961885e-05, + "loss": 0.7551, + "step": 3677 + }, + { + "epoch": 0.5628156082631982, + "grad_norm": 2.227822096524331, + "learning_rate": 1.686535333237206e-05, + "loss": 0.7732, + "step": 3678 + }, + { + "epoch": 0.5629686304514154, + "grad_norm": 2.241153131196082, + "learning_rate": 1.6863551243497064e-05, + "loss": 0.7789, + "step": 3679 + }, + { + "epoch": 0.5631216526396328, + "grad_norm": 2.493229819494121, + "learning_rate": 1.686174873310454e-05, + "loss": 0.7737, + "step": 
3680 + }, + { + "epoch": 0.56327467482785, + "grad_norm": 2.204665508561256, + "learning_rate": 1.6859945801305176e-05, + "loss": 0.7376, + "step": 3681 + }, + { + "epoch": 0.5634276970160673, + "grad_norm": 2.46220972958079, + "learning_rate": 1.685814244820971e-05, + "loss": 0.7946, + "step": 3682 + }, + { + "epoch": 0.5635807192042847, + "grad_norm": 2.145178057791112, + "learning_rate": 1.6856338673928883e-05, + "loss": 0.686, + "step": 3683 + }, + { + "epoch": 0.5637337413925019, + "grad_norm": 2.4499506242368474, + "learning_rate": 1.6854534478573475e-05, + "loss": 0.842, + "step": 3684 + }, + { + "epoch": 0.5638867635807192, + "grad_norm": 2.056117094582968, + "learning_rate": 1.6852729862254297e-05, + "loss": 0.7311, + "step": 3685 + }, + { + "epoch": 0.5640397857689365, + "grad_norm": 2.6838442750214786, + "learning_rate": 1.6850924825082164e-05, + "loss": 0.9012, + "step": 3686 + }, + { + "epoch": 0.5641928079571538, + "grad_norm": 2.3160834319934858, + "learning_rate": 1.6849119367167935e-05, + "loss": 0.758, + "step": 3687 + }, + { + "epoch": 0.5643458301453711, + "grad_norm": 2.3640101619510423, + "learning_rate": 1.6847313488622495e-05, + "loss": 0.7606, + "step": 3688 + }, + { + "epoch": 0.5644988523335883, + "grad_norm": 2.121683684858718, + "learning_rate": 1.684550718955675e-05, + "loss": 0.6647, + "step": 3689 + }, + { + "epoch": 0.5646518745218057, + "grad_norm": 2.2062012781293823, + "learning_rate": 1.6843700470081625e-05, + "loss": 0.8113, + "step": 3690 + }, + { + "epoch": 0.564804896710023, + "grad_norm": 2.3360486325838505, + "learning_rate": 1.684189333030808e-05, + "loss": 0.6079, + "step": 3691 + }, + { + "epoch": 0.5649579188982402, + "grad_norm": 2.197760459122702, + "learning_rate": 1.68400857703471e-05, + "loss": 0.6825, + "step": 3692 + }, + { + "epoch": 0.5651109410864575, + "grad_norm": 2.1093971294742326, + "learning_rate": 1.6838277790309697e-05, + "loss": 0.7355, + "step": 3693 + }, + { + "epoch": 0.5652639632746749, + "grad_norm": 2.3687841166774524, + "learning_rate": 1.6836469390306903e-05, + "loss": 0.7743, + "step": 3694 + }, + { + "epoch": 0.5654169854628921, + "grad_norm": 3.161887730066366, + "learning_rate": 1.683466057044978e-05, + "loss": 0.827, + "step": 3695 + }, + { + "epoch": 0.5655700076511094, + "grad_norm": 2.216397568711454, + "learning_rate": 1.683285133084941e-05, + "loss": 0.8549, + "step": 3696 + }, + { + "epoch": 0.5657230298393267, + "grad_norm": 1.9890486097667865, + "learning_rate": 1.6831041671616914e-05, + "loss": 0.7672, + "step": 3697 + }, + { + "epoch": 0.565876052027544, + "grad_norm": 2.493183723503674, + "learning_rate": 1.6829231592863425e-05, + "loss": 0.8644, + "step": 3698 + }, + { + "epoch": 0.5660290742157613, + "grad_norm": 2.2253445860436685, + "learning_rate": 1.6827421094700108e-05, + "loss": 0.7744, + "step": 3699 + }, + { + "epoch": 0.5661820964039785, + "grad_norm": 2.5572294147324848, + "learning_rate": 1.682561017723815e-05, + "loss": 0.7337, + "step": 3700 + }, + { + "epoch": 0.5663351185921959, + "grad_norm": 2.3673349447586554, + "learning_rate": 1.682379884058877e-05, + "loss": 0.7592, + "step": 3701 + }, + { + "epoch": 0.5664881407804132, + "grad_norm": 2.231170609581366, + "learning_rate": 1.6821987084863208e-05, + "loss": 0.7259, + "step": 3702 + }, + { + "epoch": 0.5666411629686304, + "grad_norm": 2.312622201395522, + "learning_rate": 1.6820174910172732e-05, + "loss": 0.7672, + "step": 3703 + }, + { + "epoch": 0.5667941851568478, + "grad_norm": 2.5794283462785215, + "learning_rate": 
1.6818362316628635e-05, + "loss": 0.7342, + "step": 3704 + }, + { + "epoch": 0.566947207345065, + "grad_norm": 2.2887391825650525, + "learning_rate": 1.6816549304342233e-05, + "loss": 0.7277, + "step": 3705 + }, + { + "epoch": 0.5671002295332823, + "grad_norm": 2.3369358739869197, + "learning_rate": 1.6814735873424874e-05, + "loss": 0.7064, + "step": 3706 + }, + { + "epoch": 0.5672532517214997, + "grad_norm": 2.1653393567600103, + "learning_rate": 1.6812922023987922e-05, + "loss": 0.837, + "step": 3707 + }, + { + "epoch": 0.5674062739097169, + "grad_norm": 2.3845278093061384, + "learning_rate": 1.6811107756142776e-05, + "loss": 0.8588, + "step": 3708 + }, + { + "epoch": 0.5675592960979342, + "grad_norm": 2.495972598753815, + "learning_rate": 1.6809293070000862e-05, + "loss": 0.7833, + "step": 3709 + }, + { + "epoch": 0.5677123182861515, + "grad_norm": 2.247267803617609, + "learning_rate": 1.680747796567362e-05, + "loss": 0.9386, + "step": 3710 + }, + { + "epoch": 0.5678653404743688, + "grad_norm": 2.5184558160832107, + "learning_rate": 1.6805662443272525e-05, + "loss": 0.8083, + "step": 3711 + }, + { + "epoch": 0.5680183626625861, + "grad_norm": 2.165389680357376, + "learning_rate": 1.6803846502909074e-05, + "loss": 0.7876, + "step": 3712 + }, + { + "epoch": 0.5681713848508033, + "grad_norm": 2.370201233814579, + "learning_rate": 1.6802030144694798e-05, + "loss": 0.7525, + "step": 3713 + }, + { + "epoch": 0.5683244070390207, + "grad_norm": 2.4514170401052175, + "learning_rate": 1.6800213368741236e-05, + "loss": 0.816, + "step": 3714 + }, + { + "epoch": 0.568477429227238, + "grad_norm": 2.2577834479368963, + "learning_rate": 1.679839617515997e-05, + "loss": 0.7959, + "step": 3715 + }, + { + "epoch": 0.5686304514154552, + "grad_norm": 2.5236928956082605, + "learning_rate": 1.67965785640626e-05, + "loss": 0.7833, + "step": 3716 + }, + { + "epoch": 0.5687834736036725, + "grad_norm": 2.188326961484387, + "learning_rate": 1.679476053556075e-05, + "loss": 0.7064, + "step": 3717 + }, + { + "epoch": 0.5689364957918899, + "grad_norm": 2.3822901414350306, + "learning_rate": 1.6792942089766076e-05, + "loss": 0.8851, + "step": 3718 + }, + { + "epoch": 0.5690895179801071, + "grad_norm": 2.1457955667343134, + "learning_rate": 1.6791123226790255e-05, + "loss": 0.7607, + "step": 3719 + }, + { + "epoch": 0.5692425401683244, + "grad_norm": 2.323996375369015, + "learning_rate": 1.6789303946744985e-05, + "loss": 0.7703, + "step": 3720 + }, + { + "epoch": 0.5693955623565417, + "grad_norm": 2.3656552591801523, + "learning_rate": 1.6787484249742004e-05, + "loss": 0.768, + "step": 3721 + }, + { + "epoch": 0.569548584544759, + "grad_norm": 2.3223483884094143, + "learning_rate": 1.678566413589306e-05, + "loss": 0.7593, + "step": 3722 + }, + { + "epoch": 0.5697016067329763, + "grad_norm": 2.2639870672318296, + "learning_rate": 1.678384360530994e-05, + "loss": 0.733, + "step": 3723 + }, + { + "epoch": 0.5698546289211935, + "grad_norm": 2.3355673048906254, + "learning_rate": 1.6782022658104444e-05, + "loss": 0.7424, + "step": 3724 + }, + { + "epoch": 0.5700076511094109, + "grad_norm": 2.218856200972419, + "learning_rate": 1.67802012943884e-05, + "loss": 0.769, + "step": 3725 + }, + { + "epoch": 0.5701606732976282, + "grad_norm": 2.6124220750851506, + "learning_rate": 1.6778379514273677e-05, + "loss": 0.8804, + "step": 3726 + }, + { + "epoch": 0.5703136954858454, + "grad_norm": 2.3903567269634087, + "learning_rate": 1.6776557317872146e-05, + "loss": 0.7813, + "step": 3727 + }, + { + "epoch": 0.5704667176740628, + 
"grad_norm": 2.5179833679939247, + "learning_rate": 1.6774734705295723e-05, + "loss": 0.7096, + "step": 3728 + }, + { + "epoch": 0.57061973986228, + "grad_norm": 2.1922598523160928, + "learning_rate": 1.677291167665634e-05, + "loss": 0.7195, + "step": 3729 + }, + { + "epoch": 0.5707727620504973, + "grad_norm": 2.4942133453405333, + "learning_rate": 1.677108823206595e-05, + "loss": 0.7428, + "step": 3730 + }, + { + "epoch": 0.5709257842387147, + "grad_norm": 2.2384937500275104, + "learning_rate": 1.6769264371636546e-05, + "loss": 0.8616, + "step": 3731 + }, + { + "epoch": 0.5710788064269319, + "grad_norm": 2.2191384349159002, + "learning_rate": 1.6767440095480136e-05, + "loss": 0.7403, + "step": 3732 + }, + { + "epoch": 0.5712318286151492, + "grad_norm": 2.380505958844727, + "learning_rate": 1.6765615403708756e-05, + "loss": 0.748, + "step": 3733 + }, + { + "epoch": 0.5713848508033665, + "grad_norm": 2.2650779038130793, + "learning_rate": 1.6763790296434463e-05, + "loss": 0.7306, + "step": 3734 + }, + { + "epoch": 0.5715378729915838, + "grad_norm": 2.6913536745347866, + "learning_rate": 1.6761964773769352e-05, + "loss": 0.8774, + "step": 3735 + }, + { + "epoch": 0.5716908951798011, + "grad_norm": 2.079473361927895, + "learning_rate": 1.676013883582553e-05, + "loss": 0.6435, + "step": 3736 + }, + { + "epoch": 0.5718439173680183, + "grad_norm": 2.236787333970397, + "learning_rate": 1.6758312482715137e-05, + "loss": 0.8045, + "step": 3737 + }, + { + "epoch": 0.5719969395562357, + "grad_norm": 2.1951274280494206, + "learning_rate": 1.6756485714550333e-05, + "loss": 0.7215, + "step": 3738 + }, + { + "epoch": 0.572149961744453, + "grad_norm": 2.2622052474326733, + "learning_rate": 1.6754658531443314e-05, + "loss": 0.703, + "step": 3739 + }, + { + "epoch": 0.5723029839326702, + "grad_norm": 2.3735375982121707, + "learning_rate": 1.6752830933506285e-05, + "loss": 0.7512, + "step": 3740 + }, + { + "epoch": 0.5724560061208875, + "grad_norm": 2.0811855879934225, + "learning_rate": 1.6751002920851494e-05, + "loss": 0.6298, + "step": 3741 + }, + { + "epoch": 0.5726090283091049, + "grad_norm": 2.5859503146517446, + "learning_rate": 1.6749174493591205e-05, + "loss": 0.7602, + "step": 3742 + }, + { + "epoch": 0.5727620504973221, + "grad_norm": 2.1294854580052487, + "learning_rate": 1.6747345651837706e-05, + "loss": 0.799, + "step": 3743 + }, + { + "epoch": 0.5729150726855394, + "grad_norm": 2.2886074445311517, + "learning_rate": 1.6745516395703315e-05, + "loss": 0.8139, + "step": 3744 + }, + { + "epoch": 0.5730680948737567, + "grad_norm": 2.6893542581228806, + "learning_rate": 1.6743686725300373e-05, + "loss": 0.7434, + "step": 3745 + }, + { + "epoch": 0.573221117061974, + "grad_norm": 2.3216860488738424, + "learning_rate": 1.6741856640741247e-05, + "loss": 0.6948, + "step": 3746 + }, + { + "epoch": 0.5733741392501913, + "grad_norm": 2.2001897317732615, + "learning_rate": 1.6740026142138332e-05, + "loss": 0.6766, + "step": 3747 + }, + { + "epoch": 0.5735271614384085, + "grad_norm": 2.2336227878098693, + "learning_rate": 1.6738195229604043e-05, + "loss": 0.8209, + "step": 3748 + }, + { + "epoch": 0.5736801836266259, + "grad_norm": 2.240603429890706, + "learning_rate": 1.673636390325083e-05, + "loss": 0.7174, + "step": 3749 + }, + { + "epoch": 0.5738332058148432, + "grad_norm": 2.3651344356855035, + "learning_rate": 1.673453216319115e-05, + "loss": 0.791, + "step": 3750 + }, + { + "epoch": 0.5739862280030604, + "grad_norm": 2.557825145409085, + "learning_rate": 1.6732700009537506e-05, + "loss": 0.8281, + 
"step": 3751 + }, + { + "epoch": 0.5741392501912778, + "grad_norm": 2.2939591266531587, + "learning_rate": 1.6730867442402417e-05, + "loss": 0.7533, + "step": 3752 + }, + { + "epoch": 0.574292272379495, + "grad_norm": 2.4369272889263467, + "learning_rate": 1.6729034461898428e-05, + "loss": 0.7564, + "step": 3753 + }, + { + "epoch": 0.5744452945677123, + "grad_norm": 2.383742765723668, + "learning_rate": 1.6727201068138106e-05, + "loss": 0.8455, + "step": 3754 + }, + { + "epoch": 0.5745983167559297, + "grad_norm": 2.1235204798335325, + "learning_rate": 1.6725367261234052e-05, + "loss": 0.7219, + "step": 3755 + }, + { + "epoch": 0.5747513389441469, + "grad_norm": 2.0759737406177035, + "learning_rate": 1.6723533041298884e-05, + "loss": 0.7086, + "step": 3756 + }, + { + "epoch": 0.5749043611323642, + "grad_norm": 2.1343424495745587, + "learning_rate": 1.6721698408445248e-05, + "loss": 0.8257, + "step": 3757 + }, + { + "epoch": 0.5750573833205815, + "grad_norm": 2.0830999607795135, + "learning_rate": 1.671986336278582e-05, + "loss": 0.7539, + "step": 3758 + }, + { + "epoch": 0.5752104055087988, + "grad_norm": 2.3324609080288066, + "learning_rate": 1.6718027904433292e-05, + "loss": 0.8393, + "step": 3759 + }, + { + "epoch": 0.5753634276970161, + "grad_norm": 2.9857624065747403, + "learning_rate": 1.671619203350039e-05, + "loss": 0.8469, + "step": 3760 + }, + { + "epoch": 0.5755164498852333, + "grad_norm": 2.3340876823270857, + "learning_rate": 1.6714355750099863e-05, + "loss": 0.7146, + "step": 3761 + }, + { + "epoch": 0.5756694720734506, + "grad_norm": 2.296826476694342, + "learning_rate": 1.6712519054344477e-05, + "loss": 0.8085, + "step": 3762 + }, + { + "epoch": 0.575822494261668, + "grad_norm": 2.398681606627602, + "learning_rate": 1.6710681946347046e-05, + "loss": 0.8752, + "step": 3763 + }, + { + "epoch": 0.5759755164498852, + "grad_norm": 2.1659581982539042, + "learning_rate": 1.6708844426220378e-05, + "loss": 0.7433, + "step": 3764 + }, + { + "epoch": 0.5761285386381025, + "grad_norm": 2.2458337659004317, + "learning_rate": 1.6707006494077328e-05, + "loss": 0.7098, + "step": 3765 + }, + { + "epoch": 0.5762815608263198, + "grad_norm": 2.2737668614456337, + "learning_rate": 1.670516815003077e-05, + "loss": 0.7792, + "step": 3766 + }, + { + "epoch": 0.5764345830145371, + "grad_norm": 2.305720566758425, + "learning_rate": 1.6703329394193612e-05, + "loss": 0.6744, + "step": 3767 + }, + { + "epoch": 0.5765876052027544, + "grad_norm": 2.557917734239461, + "learning_rate": 1.6701490226678768e-05, + "loss": 0.6602, + "step": 3768 + }, + { + "epoch": 0.5767406273909716, + "grad_norm": 2.22313464861298, + "learning_rate": 1.6699650647599194e-05, + "loss": 0.7318, + "step": 3769 + }, + { + "epoch": 0.576893649579189, + "grad_norm": 2.3570935722006143, + "learning_rate": 1.669781065706786e-05, + "loss": 0.6865, + "step": 3770 + }, + { + "epoch": 0.5770466717674063, + "grad_norm": 2.328764118860341, + "learning_rate": 1.6695970255197775e-05, + "loss": 0.8174, + "step": 3771 + }, + { + "epoch": 0.5771996939556235, + "grad_norm": 2.206086498235264, + "learning_rate": 1.669412944210196e-05, + "loss": 0.7195, + "step": 3772 + }, + { + "epoch": 0.5773527161438409, + "grad_norm": 2.4574715151597206, + "learning_rate": 1.6692288217893473e-05, + "loss": 0.6601, + "step": 3773 + }, + { + "epoch": 0.5775057383320581, + "grad_norm": 2.5554333693809337, + "learning_rate": 1.6690446582685384e-05, + "loss": 0.9636, + "step": 3774 + }, + { + "epoch": 0.5776587605202754, + "grad_norm": 2.0830553360852653, + 
"learning_rate": 1.6688604536590797e-05, + "loss": 0.688, + "step": 3775 + }, + { + "epoch": 0.5778117827084928, + "grad_norm": 2.1131326036234825, + "learning_rate": 1.668676207972284e-05, + "loss": 0.7221, + "step": 3776 + }, + { + "epoch": 0.57796480489671, + "grad_norm": 2.055677955743591, + "learning_rate": 1.6684919212194664e-05, + "loss": 0.6651, + "step": 3777 + }, + { + "epoch": 0.5781178270849273, + "grad_norm": 2.2701708775009135, + "learning_rate": 1.6683075934119448e-05, + "loss": 0.7376, + "step": 3778 + }, + { + "epoch": 0.5782708492731446, + "grad_norm": 2.0336407250984827, + "learning_rate": 1.6681232245610393e-05, + "loss": 0.76, + "step": 3779 + }, + { + "epoch": 0.5784238714613619, + "grad_norm": 2.1902709950009633, + "learning_rate": 1.6679388146780732e-05, + "loss": 0.7113, + "step": 3780 + }, + { + "epoch": 0.5785768936495792, + "grad_norm": 2.283177953498535, + "learning_rate": 1.667754363774371e-05, + "loss": 0.8212, + "step": 3781 + }, + { + "epoch": 0.5787299158377964, + "grad_norm": 2.5358371176198005, + "learning_rate": 1.6675698718612613e-05, + "loss": 0.7745, + "step": 3782 + }, + { + "epoch": 0.5788829380260138, + "grad_norm": 2.4035073241578404, + "learning_rate": 1.6673853389500746e-05, + "loss": 0.7967, + "step": 3783 + }, + { + "epoch": 0.5790359602142311, + "grad_norm": 1.9008232222332522, + "learning_rate": 1.667200765052143e-05, + "loss": 0.5186, + "step": 3784 + }, + { + "epoch": 0.5791889824024483, + "grad_norm": 2.660572345490886, + "learning_rate": 1.6670161501788025e-05, + "loss": 0.7472, + "step": 3785 + }, + { + "epoch": 0.5793420045906656, + "grad_norm": 2.3119643292610608, + "learning_rate": 1.666831494341391e-05, + "loss": 0.7326, + "step": 3786 + }, + { + "epoch": 0.579495026778883, + "grad_norm": 2.2646005923282346, + "learning_rate": 1.666646797551248e-05, + "loss": 0.7679, + "step": 3787 + }, + { + "epoch": 0.5796480489671002, + "grad_norm": 2.158895112775971, + "learning_rate": 1.666462059819718e-05, + "loss": 0.7638, + "step": 3788 + }, + { + "epoch": 0.5798010711553175, + "grad_norm": 2.155066404690859, + "learning_rate": 1.6662772811581453e-05, + "loss": 0.6579, + "step": 3789 + }, + { + "epoch": 0.5799540933435348, + "grad_norm": 2.524080120158428, + "learning_rate": 1.6660924615778784e-05, + "loss": 0.7391, + "step": 3790 + }, + { + "epoch": 0.5801071155317521, + "grad_norm": 2.3536130897830856, + "learning_rate": 1.6659076010902678e-05, + "loss": 0.7922, + "step": 3791 + }, + { + "epoch": 0.5802601377199694, + "grad_norm": 2.139282136745696, + "learning_rate": 1.665722699706666e-05, + "loss": 0.7086, + "step": 3792 + }, + { + "epoch": 0.5804131599081866, + "grad_norm": 2.0993206873081407, + "learning_rate": 1.6655377574384294e-05, + "loss": 0.6831, + "step": 3793 + }, + { + "epoch": 0.580566182096404, + "grad_norm": 2.0100115117297492, + "learning_rate": 1.6653527742969152e-05, + "loss": 0.7001, + "step": 3794 + }, + { + "epoch": 0.5807192042846213, + "grad_norm": 2.3782783324435783, + "learning_rate": 1.665167750293484e-05, + "loss": 0.7298, + "step": 3795 + }, + { + "epoch": 0.5808722264728385, + "grad_norm": 2.040780849933868, + "learning_rate": 1.6649826854394997e-05, + "loss": 0.7327, + "step": 3796 + }, + { + "epoch": 0.5810252486610559, + "grad_norm": 2.5334375804404625, + "learning_rate": 1.664797579746327e-05, + "loss": 0.6432, + "step": 3797 + }, + { + "epoch": 0.5811782708492731, + "grad_norm": 2.2545821441478933, + "learning_rate": 1.664612433225334e-05, + "loss": 0.7446, + "step": 3798 + }, + { + "epoch": 
0.5813312930374904, + "grad_norm": 2.8266389872037845, + "learning_rate": 1.664427245887892e-05, + "loss": 0.8615, + "step": 3799 + }, + { + "epoch": 0.5814843152257078, + "grad_norm": 2.357933150397747, + "learning_rate": 1.6642420177453728e-05, + "loss": 0.7502, + "step": 3800 + }, + { + "epoch": 0.581637337413925, + "grad_norm": 2.4860458336444364, + "learning_rate": 1.6640567488091536e-05, + "loss": 0.7914, + "step": 3801 + }, + { + "epoch": 0.5817903596021423, + "grad_norm": 2.5167304694736052, + "learning_rate": 1.663871439090611e-05, + "loss": 0.7699, + "step": 3802 + }, + { + "epoch": 0.5819433817903596, + "grad_norm": 2.336138556851833, + "learning_rate": 1.6636860886011266e-05, + "loss": 0.7217, + "step": 3803 + }, + { + "epoch": 0.5820964039785769, + "grad_norm": 2.3654993443453485, + "learning_rate": 1.663500697352083e-05, + "loss": 0.738, + "step": 3804 + }, + { + "epoch": 0.5822494261667942, + "grad_norm": 2.4112699878549546, + "learning_rate": 1.663315265354866e-05, + "loss": 0.7535, + "step": 3805 + }, + { + "epoch": 0.5824024483550114, + "grad_norm": 2.381433431863148, + "learning_rate": 1.6631297926208637e-05, + "loss": 0.8315, + "step": 3806 + }, + { + "epoch": 0.5825554705432288, + "grad_norm": 2.540843627525302, + "learning_rate": 1.6629442791614667e-05, + "loss": 0.8621, + "step": 3807 + }, + { + "epoch": 0.5827084927314461, + "grad_norm": 2.301553310120931, + "learning_rate": 1.6627587249880677e-05, + "loss": 0.721, + "step": 3808 + }, + { + "epoch": 0.5828615149196633, + "grad_norm": 2.3190169251043353, + "learning_rate": 1.662573130112063e-05, + "loss": 0.6618, + "step": 3809 + }, + { + "epoch": 0.5830145371078806, + "grad_norm": 2.3010873402470033, + "learning_rate": 1.6623874945448503e-05, + "loss": 0.8682, + "step": 3810 + }, + { + "epoch": 0.583167559296098, + "grad_norm": 2.1375246030186035, + "learning_rate": 1.6622018182978304e-05, + "loss": 0.7486, + "step": 3811 + }, + { + "epoch": 0.5833205814843152, + "grad_norm": 2.293568821408462, + "learning_rate": 1.662016101382406e-05, + "loss": 0.7397, + "step": 3812 + }, + { + "epoch": 0.5834736036725325, + "grad_norm": 2.417223850861851, + "learning_rate": 1.6618303438099834e-05, + "loss": 0.7969, + "step": 3813 + }, + { + "epoch": 0.5836266258607498, + "grad_norm": 2.419752913361654, + "learning_rate": 1.6616445455919704e-05, + "loss": 0.7874, + "step": 3814 + }, + { + "epoch": 0.5837796480489671, + "grad_norm": 2.297683203684462, + "learning_rate": 1.6614587067397767e-05, + "loss": 0.7571, + "step": 3815 + }, + { + "epoch": 0.5839326702371844, + "grad_norm": 2.1676060389805043, + "learning_rate": 1.661272827264817e-05, + "loss": 0.6983, + "step": 3816 + }, + { + "epoch": 0.5840856924254016, + "grad_norm": 2.2039655542368712, + "learning_rate": 1.6610869071785055e-05, + "loss": 0.7334, + "step": 3817 + }, + { + "epoch": 0.584238714613619, + "grad_norm": 2.0194472601786146, + "learning_rate": 1.6609009464922613e-05, + "loss": 0.6393, + "step": 3818 + }, + { + "epoch": 0.5843917368018363, + "grad_norm": 2.515878768807154, + "learning_rate": 1.660714945217504e-05, + "loss": 0.7962, + "step": 3819 + }, + { + "epoch": 0.5845447589900535, + "grad_norm": 2.5345715323872806, + "learning_rate": 1.6605289033656575e-05, + "loss": 0.7854, + "step": 3820 + }, + { + "epoch": 0.5846977811782709, + "grad_norm": 2.2418644854452334, + "learning_rate": 1.660342820948147e-05, + "loss": 0.8298, + "step": 3821 + }, + { + "epoch": 0.5848508033664881, + "grad_norm": 2.4365303706928603, + "learning_rate": 1.6601566979764007e-05, + 
"loss": 0.7993, + "step": 3822 + }, + { + "epoch": 0.5850038255547054, + "grad_norm": 2.4005844284928415, + "learning_rate": 1.659970534461849e-05, + "loss": 0.7509, + "step": 3823 + }, + { + "epoch": 0.5851568477429228, + "grad_norm": 2.406705848873317, + "learning_rate": 1.6597843304159248e-05, + "loss": 0.7272, + "step": 3824 + }, + { + "epoch": 0.58530986993114, + "grad_norm": 2.3041140030698615, + "learning_rate": 1.659598085850064e-05, + "loss": 0.7891, + "step": 3825 + }, + { + "epoch": 0.5854628921193573, + "grad_norm": 2.379151533908481, + "learning_rate": 1.659411800775704e-05, + "loss": 0.7599, + "step": 3826 + }, + { + "epoch": 0.5856159143075746, + "grad_norm": 2.287716081596106, + "learning_rate": 1.659225475204286e-05, + "loss": 0.8061, + "step": 3827 + }, + { + "epoch": 0.5857689364957919, + "grad_norm": 2.2414433200668227, + "learning_rate": 1.6590391091472526e-05, + "loss": 0.7691, + "step": 3828 + }, + { + "epoch": 0.5859219586840092, + "grad_norm": 2.5086014363960554, + "learning_rate": 1.6588527026160493e-05, + "loss": 0.8479, + "step": 3829 + }, + { + "epoch": 0.5860749808722264, + "grad_norm": 2.419448089243427, + "learning_rate": 1.658666255622124e-05, + "loss": 0.8236, + "step": 3830 + }, + { + "epoch": 0.5862280030604438, + "grad_norm": 2.3163306355498157, + "learning_rate": 1.6584797681769273e-05, + "loss": 0.8364, + "step": 3831 + }, + { + "epoch": 0.5863810252486611, + "grad_norm": 2.1990484963992585, + "learning_rate": 1.6582932402919124e-05, + "loss": 0.7264, + "step": 3832 + }, + { + "epoch": 0.5865340474368783, + "grad_norm": 2.257065590223458, + "learning_rate": 1.658106671978534e-05, + "loss": 0.7227, + "step": 3833 + }, + { + "epoch": 0.5866870696250956, + "grad_norm": 2.3568110415125805, + "learning_rate": 1.6579200632482502e-05, + "loss": 0.8643, + "step": 3834 + }, + { + "epoch": 0.586840091813313, + "grad_norm": 2.2997320714448537, + "learning_rate": 1.6577334141125217e-05, + "loss": 0.804, + "step": 3835 + }, + { + "epoch": 0.5869931140015302, + "grad_norm": 2.4492067663950596, + "learning_rate": 1.657546724582811e-05, + "loss": 0.7471, + "step": 3836 + }, + { + "epoch": 0.5871461361897475, + "grad_norm": 2.2367454813232346, + "learning_rate": 1.657359994670584e-05, + "loss": 0.7728, + "step": 3837 + }, + { + "epoch": 0.5872991583779648, + "grad_norm": 2.387170074669952, + "learning_rate": 1.657173224387308e-05, + "loss": 0.7294, + "step": 3838 + }, + { + "epoch": 0.5874521805661821, + "grad_norm": 2.6611783518744367, + "learning_rate": 1.6569864137444533e-05, + "loss": 0.8531, + "step": 3839 + }, + { + "epoch": 0.5876052027543994, + "grad_norm": 2.5795969282876507, + "learning_rate": 1.6567995627534927e-05, + "loss": 0.7192, + "step": 3840 + }, + { + "epoch": 0.5877582249426166, + "grad_norm": 2.228751239656882, + "learning_rate": 1.6566126714259017e-05, + "loss": 0.7878, + "step": 3841 + }, + { + "epoch": 0.587911247130834, + "grad_norm": 2.418610511457032, + "learning_rate": 1.6564257397731577e-05, + "loss": 0.8162, + "step": 3842 + }, + { + "epoch": 0.5880642693190513, + "grad_norm": 2.1098189523920357, + "learning_rate": 1.656238767806741e-05, + "loss": 0.7597, + "step": 3843 + }, + { + "epoch": 0.5882172915072685, + "grad_norm": 2.4049855489791803, + "learning_rate": 1.6560517555381348e-05, + "loss": 0.8416, + "step": 3844 + }, + { + "epoch": 0.5883703136954859, + "grad_norm": 2.1864495709233913, + "learning_rate": 1.6558647029788234e-05, + "loss": 0.6129, + "step": 3845 + }, + { + "epoch": 0.5885233358837031, + "grad_norm": 2.571107206508435, 
+ "learning_rate": 1.655677610140295e-05, + "loss": 0.8715, + "step": 3846 + }, + { + "epoch": 0.5886763580719204, + "grad_norm": 2.2997919979778016, + "learning_rate": 1.6554904770340393e-05, + "loss": 0.8151, + "step": 3847 + }, + { + "epoch": 0.5888293802601378, + "grad_norm": 2.2281125845467495, + "learning_rate": 1.6553033036715493e-05, + "loss": 0.8202, + "step": 3848 + }, + { + "epoch": 0.588982402448355, + "grad_norm": 2.1321131423457773, + "learning_rate": 1.6551160900643203e-05, + "loss": 0.6831, + "step": 3849 + }, + { + "epoch": 0.5891354246365723, + "grad_norm": 2.527648900196719, + "learning_rate": 1.654928836223849e-05, + "loss": 0.7904, + "step": 3850 + }, + { + "epoch": 0.5892884468247896, + "grad_norm": 2.28923012533992, + "learning_rate": 1.6547415421616353e-05, + "loss": 0.7622, + "step": 3851 + }, + { + "epoch": 0.5894414690130069, + "grad_norm": 2.7923038554466846, + "learning_rate": 1.6545542078891826e-05, + "loss": 0.8117, + "step": 3852 + }, + { + "epoch": 0.5895944912012242, + "grad_norm": 2.4396804219014423, + "learning_rate": 1.654366833417995e-05, + "loss": 0.8149, + "step": 3853 + }, + { + "epoch": 0.5897475133894414, + "grad_norm": 2.2650099970410795, + "learning_rate": 1.654179418759581e-05, + "loss": 0.7232, + "step": 3854 + }, + { + "epoch": 0.5899005355776588, + "grad_norm": 2.2479396320602865, + "learning_rate": 1.6539919639254494e-05, + "loss": 0.7672, + "step": 3855 + }, + { + "epoch": 0.5900535577658761, + "grad_norm": 2.345314819643059, + "learning_rate": 1.6538044689271126e-05, + "loss": 0.8204, + "step": 3856 + }, + { + "epoch": 0.5902065799540933, + "grad_norm": 2.0268666678732443, + "learning_rate": 1.653616933776086e-05, + "loss": 0.7722, + "step": 3857 + }, + { + "epoch": 0.5903596021423106, + "grad_norm": 2.3791503614863108, + "learning_rate": 1.653429358483886e-05, + "loss": 0.8463, + "step": 3858 + }, + { + "epoch": 0.590512624330528, + "grad_norm": 2.342414633790742, + "learning_rate": 1.6532417430620337e-05, + "loss": 0.77, + "step": 3859 + }, + { + "epoch": 0.5906656465187452, + "grad_norm": 2.327695041795651, + "learning_rate": 1.65305408752205e-05, + "loss": 0.8732, + "step": 3860 + }, + { + "epoch": 0.5908186687069625, + "grad_norm": 2.437108756237025, + "learning_rate": 1.6528663918754597e-05, + "loss": 0.8092, + "step": 3861 + }, + { + "epoch": 0.5909716908951798, + "grad_norm": 2.2774300160190046, + "learning_rate": 1.652678656133791e-05, + "loss": 0.7207, + "step": 3862 + }, + { + "epoch": 0.5911247130833971, + "grad_norm": 2.4901626265698886, + "learning_rate": 1.652490880308572e-05, + "loss": 0.8836, + "step": 3863 + }, + { + "epoch": 0.5912777352716144, + "grad_norm": 2.49244263349783, + "learning_rate": 1.6523030644113357e-05, + "loss": 0.7631, + "step": 3864 + }, + { + "epoch": 0.5914307574598316, + "grad_norm": 2.166963403745061, + "learning_rate": 1.6521152084536164e-05, + "loss": 0.7394, + "step": 3865 + }, + { + "epoch": 0.591583779648049, + "grad_norm": 2.3942979926831196, + "learning_rate": 1.6519273124469512e-05, + "loss": 0.8902, + "step": 3866 + }, + { + "epoch": 0.5917368018362663, + "grad_norm": 2.1880020195653764, + "learning_rate": 1.6517393764028793e-05, + "loss": 0.7427, + "step": 3867 + }, + { + "epoch": 0.5918898240244835, + "grad_norm": 2.106975274430694, + "learning_rate": 1.651551400332943e-05, + "loss": 0.7053, + "step": 3868 + }, + { + "epoch": 0.5920428462127009, + "grad_norm": 2.377254747191573, + "learning_rate": 1.6513633842486858e-05, + "loss": 0.8433, + "step": 3869 + }, + { + "epoch": 
0.5921958684009181, + "grad_norm": 2.1159960070343753, + "learning_rate": 1.6511753281616552e-05, + "loss": 0.6735, + "step": 3870 + }, + { + "epoch": 0.5923488905891354, + "grad_norm": 1.9438413653670783, + "learning_rate": 1.6509872320834003e-05, + "loss": 0.6788, + "step": 3871 + }, + { + "epoch": 0.5925019127773528, + "grad_norm": 2.0955062116382943, + "learning_rate": 1.6507990960254728e-05, + "loss": 0.7541, + "step": 3872 + }, + { + "epoch": 0.59265493496557, + "grad_norm": 2.3820312322134187, + "learning_rate": 1.650610919999427e-05, + "loss": 0.8307, + "step": 3873 + }, + { + "epoch": 0.5928079571537873, + "grad_norm": 2.2354983100993446, + "learning_rate": 1.6504227040168194e-05, + "loss": 0.772, + "step": 3874 + }, + { + "epoch": 0.5929609793420046, + "grad_norm": 2.2484229287109625, + "learning_rate": 1.650234448089209e-05, + "loss": 0.8463, + "step": 3875 + }, + { + "epoch": 0.5931140015302219, + "grad_norm": 2.1798032783600783, + "learning_rate": 1.6500461522281575e-05, + "loss": 0.6458, + "step": 3876 + }, + { + "epoch": 0.5932670237184392, + "grad_norm": 2.2753556732841402, + "learning_rate": 1.6498578164452285e-05, + "loss": 0.7703, + "step": 3877 + }, + { + "epoch": 0.5934200459066564, + "grad_norm": 2.4188609922833737, + "learning_rate": 1.6496694407519888e-05, + "loss": 0.8333, + "step": 3878 + }, + { + "epoch": 0.5935730680948738, + "grad_norm": 2.3421062788752307, + "learning_rate": 1.6494810251600075e-05, + "loss": 0.7059, + "step": 3879 + }, + { + "epoch": 0.5937260902830911, + "grad_norm": 2.1227249655673717, + "learning_rate": 1.6492925696808555e-05, + "loss": 0.8143, + "step": 3880 + }, + { + "epoch": 0.5938791124713083, + "grad_norm": 2.308489799932691, + "learning_rate": 1.6491040743261065e-05, + "loss": 0.8233, + "step": 3881 + }, + { + "epoch": 0.5940321346595256, + "grad_norm": 2.4998079895811687, + "learning_rate": 1.6489155391073375e-05, + "loss": 0.7824, + "step": 3882 + }, + { + "epoch": 0.5941851568477429, + "grad_norm": 2.135136759805042, + "learning_rate": 1.6487269640361264e-05, + "loss": 0.789, + "step": 3883 + }, + { + "epoch": 0.5943381790359602, + "grad_norm": 2.0920550383410474, + "learning_rate": 1.6485383491240546e-05, + "loss": 0.743, + "step": 3884 + }, + { + "epoch": 0.5944912012241775, + "grad_norm": 2.355458998837855, + "learning_rate": 1.6483496943827056e-05, + "loss": 0.728, + "step": 3885 + }, + { + "epoch": 0.5946442234123948, + "grad_norm": 2.1107113168909004, + "learning_rate": 1.6481609998236656e-05, + "loss": 0.6521, + "step": 3886 + }, + { + "epoch": 0.5947972456006121, + "grad_norm": 2.116378932065588, + "learning_rate": 1.647972265458523e-05, + "loss": 0.765, + "step": 3887 + }, + { + "epoch": 0.5949502677888294, + "grad_norm": 2.195681235916977, + "learning_rate": 1.647783491298869e-05, + "loss": 0.7999, + "step": 3888 + }, + { + "epoch": 0.5951032899770466, + "grad_norm": 2.65848496667216, + "learning_rate": 1.647594677356296e-05, + "loss": 0.7752, + "step": 3889 + }, + { + "epoch": 0.595256312165264, + "grad_norm": 2.3804867928581412, + "learning_rate": 1.647405823642401e-05, + "loss": 0.7683, + "step": 3890 + }, + { + "epoch": 0.5954093343534812, + "grad_norm": 2.2256140777105893, + "learning_rate": 1.6472169301687816e-05, + "loss": 0.6946, + "step": 3891 + }, + { + "epoch": 0.5955623565416985, + "grad_norm": 2.259429617574836, + "learning_rate": 1.6470279969470384e-05, + "loss": 0.8007, + "step": 3892 + }, + { + "epoch": 0.5957153787299159, + "grad_norm": 2.4244045113345667, + "learning_rate": 1.646839023988775e-05, + 
"loss": 0.8071, + "step": 3893 + }, + { + "epoch": 0.5958684009181331, + "grad_norm": 2.21180016617028, + "learning_rate": 1.6466500113055963e-05, + "loss": 0.6792, + "step": 3894 + }, + { + "epoch": 0.5960214231063504, + "grad_norm": 2.1941067525426727, + "learning_rate": 1.646460958909111e-05, + "loss": 0.8207, + "step": 3895 + }, + { + "epoch": 0.5961744452945678, + "grad_norm": 2.1232749753013667, + "learning_rate": 1.646271866810929e-05, + "loss": 0.7, + "step": 3896 + }, + { + "epoch": 0.596327467482785, + "grad_norm": 2.190229920133476, + "learning_rate": 1.6460827350226637e-05, + "loss": 0.6335, + "step": 3897 + }, + { + "epoch": 0.5964804896710023, + "grad_norm": 2.2639688761323944, + "learning_rate": 1.64589356355593e-05, + "loss": 0.7466, + "step": 3898 + }, + { + "epoch": 0.5966335118592195, + "grad_norm": 2.460362228450079, + "learning_rate": 1.645704352422346e-05, + "loss": 0.8105, + "step": 3899 + }, + { + "epoch": 0.5967865340474369, + "grad_norm": 2.2461116435022377, + "learning_rate": 1.6455151016335312e-05, + "loss": 0.697, + "step": 3900 + }, + { + "epoch": 0.5969395562356542, + "grad_norm": 2.3594462673289947, + "learning_rate": 1.6453258112011094e-05, + "loss": 0.7736, + "step": 3901 + }, + { + "epoch": 0.5970925784238714, + "grad_norm": 2.106693165284362, + "learning_rate": 1.6451364811367044e-05, + "loss": 0.5927, + "step": 3902 + }, + { + "epoch": 0.5972456006120888, + "grad_norm": 2.247234785821779, + "learning_rate": 1.6449471114519447e-05, + "loss": 0.7286, + "step": 3903 + }, + { + "epoch": 0.5973986228003061, + "grad_norm": 2.0774276621075454, + "learning_rate": 1.6447577021584597e-05, + "loss": 0.6991, + "step": 3904 + }, + { + "epoch": 0.5975516449885233, + "grad_norm": 2.430421209143389, + "learning_rate": 1.644568253267882e-05, + "loss": 0.7457, + "step": 3905 + }, + { + "epoch": 0.5977046671767406, + "grad_norm": 2.330147887370278, + "learning_rate": 1.6443787647918464e-05, + "loss": 0.7581, + "step": 3906 + }, + { + "epoch": 0.5978576893649579, + "grad_norm": 2.0030331146685665, + "learning_rate": 1.6441892367419895e-05, + "loss": 0.5928, + "step": 3907 + }, + { + "epoch": 0.5980107115531752, + "grad_norm": 2.3916619016698166, + "learning_rate": 1.643999669129952e-05, + "loss": 0.781, + "step": 3908 + }, + { + "epoch": 0.5981637337413925, + "grad_norm": 2.399757751453464, + "learning_rate": 1.6438100619673757e-05, + "loss": 0.7236, + "step": 3909 + }, + { + "epoch": 0.5983167559296098, + "grad_norm": 2.5738835936498603, + "learning_rate": 1.6436204152659042e-05, + "loss": 0.7948, + "step": 3910 + }, + { + "epoch": 0.5984697781178271, + "grad_norm": 2.3044877765883913, + "learning_rate": 1.6434307290371855e-05, + "loss": 0.6954, + "step": 3911 + }, + { + "epoch": 0.5986228003060444, + "grad_norm": 2.2008661487646046, + "learning_rate": 1.6432410032928686e-05, + "loss": 0.7159, + "step": 3912 + }, + { + "epoch": 0.5987758224942616, + "grad_norm": 2.3895382666991876, + "learning_rate": 1.6430512380446052e-05, + "loss": 0.8146, + "step": 3913 + }, + { + "epoch": 0.598928844682479, + "grad_norm": 2.065309607267929, + "learning_rate": 1.64286143330405e-05, + "loss": 0.8357, + "step": 3914 + }, + { + "epoch": 0.5990818668706962, + "grad_norm": 2.3359874366462914, + "learning_rate": 1.642671589082859e-05, + "loss": 0.7237, + "step": 3915 + }, + { + "epoch": 0.5992348890589135, + "grad_norm": 2.4962956403966934, + "learning_rate": 1.6424817053926917e-05, + "loss": 0.8363, + "step": 3916 + }, + { + "epoch": 0.5993879112471309, + "grad_norm": 2.468183246991395, + 
"learning_rate": 1.6422917822452093e-05, + "loss": 0.8228, + "step": 3917 + }, + { + "epoch": 0.5995409334353481, + "grad_norm": 2.5397453332829016, + "learning_rate": 1.6421018196520758e-05, + "loss": 0.9307, + "step": 3918 + }, + { + "epoch": 0.5996939556235654, + "grad_norm": 2.225012213725225, + "learning_rate": 1.641911817624958e-05, + "loss": 0.76, + "step": 3919 + }, + { + "epoch": 0.5998469778117828, + "grad_norm": 2.3966394488811016, + "learning_rate": 1.641721776175524e-05, + "loss": 0.7146, + "step": 3920 + }, + { + "epoch": 0.6, + "grad_norm": 2.303920410231337, + "learning_rate": 1.6415316953154455e-05, + "loss": 0.8073, + "step": 3921 + }, + { + "epoch": 0.6001530221882173, + "grad_norm": 2.211302259446293, + "learning_rate": 1.6413415750563957e-05, + "loss": 0.7277, + "step": 3922 + }, + { + "epoch": 0.6003060443764345, + "grad_norm": 2.2345826594193166, + "learning_rate": 1.6411514154100513e-05, + "loss": 0.6917, + "step": 3923 + }, + { + "epoch": 0.6004590665646519, + "grad_norm": 2.201839243236848, + "learning_rate": 1.6409612163880898e-05, + "loss": 0.704, + "step": 3924 + }, + { + "epoch": 0.6006120887528692, + "grad_norm": 2.2928406532733194, + "learning_rate": 1.6407709780021925e-05, + "loss": 0.754, + "step": 3925 + }, + { + "epoch": 0.6007651109410864, + "grad_norm": 2.3920891472308736, + "learning_rate": 1.640580700264043e-05, + "loss": 0.8704, + "step": 3926 + }, + { + "epoch": 0.6009181331293038, + "grad_norm": 2.1292889428240294, + "learning_rate": 1.6403903831853265e-05, + "loss": 0.7045, + "step": 3927 + }, + { + "epoch": 0.6010711553175211, + "grad_norm": 2.379304273256747, + "learning_rate": 1.640200026777732e-05, + "loss": 0.8551, + "step": 3928 + }, + { + "epoch": 0.6012241775057383, + "grad_norm": 2.282693014805505, + "learning_rate": 1.6400096310529487e-05, + "loss": 0.7696, + "step": 3929 + }, + { + "epoch": 0.6013771996939556, + "grad_norm": 2.206776247521118, + "learning_rate": 1.63981919602267e-05, + "loss": 0.6743, + "step": 3930 + }, + { + "epoch": 0.6015302218821729, + "grad_norm": 2.608385419308765, + "learning_rate": 1.639628721698592e-05, + "loss": 0.8967, + "step": 3931 + }, + { + "epoch": 0.6016832440703902, + "grad_norm": 2.455187279779116, + "learning_rate": 1.639438208092412e-05, + "loss": 0.7461, + "step": 3932 + }, + { + "epoch": 0.6018362662586075, + "grad_norm": 2.2605743335848274, + "learning_rate": 1.63924765521583e-05, + "loss": 0.7199, + "step": 3933 + }, + { + "epoch": 0.6019892884468248, + "grad_norm": 2.445481051817927, + "learning_rate": 1.6390570630805487e-05, + "loss": 0.767, + "step": 3934 + }, + { + "epoch": 0.6021423106350421, + "grad_norm": 2.583444641012235, + "learning_rate": 1.638866431698273e-05, + "loss": 0.8196, + "step": 3935 + }, + { + "epoch": 0.6022953328232594, + "grad_norm": 2.5086801152333646, + "learning_rate": 1.6386757610807106e-05, + "loss": 0.816, + "step": 3936 + }, + { + "epoch": 0.6024483550114766, + "grad_norm": 2.3705704487061348, + "learning_rate": 1.6384850512395715e-05, + "loss": 0.7676, + "step": 3937 + }, + { + "epoch": 0.602601377199694, + "grad_norm": 2.1626859005056236, + "learning_rate": 1.638294302186567e-05, + "loss": 0.7177, + "step": 3938 + }, + { + "epoch": 0.6027543993879112, + "grad_norm": 2.46703763158888, + "learning_rate": 1.6381035139334128e-05, + "loss": 0.7297, + "step": 3939 + }, + { + "epoch": 0.6029074215761285, + "grad_norm": 2.5716263838061666, + "learning_rate": 1.6379126864918256e-05, + "loss": 0.9171, + "step": 3940 + }, + { + "epoch": 0.6030604437643459, + "grad_norm": 
2.193123379792748, + "learning_rate": 1.6377218198735246e-05, + "loss": 0.7157, + "step": 3941 + }, + { + "epoch": 0.6032134659525631, + "grad_norm": 2.3421215315975807, + "learning_rate": 1.637530914090232e-05, + "loss": 0.748, + "step": 3942 + }, + { + "epoch": 0.6033664881407804, + "grad_norm": 2.20680704151605, + "learning_rate": 1.637339969153672e-05, + "loss": 0.7783, + "step": 3943 + }, + { + "epoch": 0.6035195103289978, + "grad_norm": 2.5368724067005823, + "learning_rate": 1.6371489850755712e-05, + "loss": 0.7137, + "step": 3944 + }, + { + "epoch": 0.603672532517215, + "grad_norm": 2.347964186524102, + "learning_rate": 1.6369579618676584e-05, + "loss": 0.7715, + "step": 3945 + }, + { + "epoch": 0.6038255547054323, + "grad_norm": 2.4869195064086487, + "learning_rate": 1.636766899541666e-05, + "loss": 0.8303, + "step": 3946 + }, + { + "epoch": 0.6039785768936495, + "grad_norm": 2.2848097041733326, + "learning_rate": 1.6365757981093266e-05, + "loss": 0.8186, + "step": 3947 + }, + { + "epoch": 0.6041315990818669, + "grad_norm": 2.4440133764573035, + "learning_rate": 1.6363846575823772e-05, + "loss": 0.8781, + "step": 3948 + }, + { + "epoch": 0.6042846212700842, + "grad_norm": 2.527286580207838, + "learning_rate": 1.6361934779725564e-05, + "loss": 0.7026, + "step": 3949 + }, + { + "epoch": 0.6044376434583014, + "grad_norm": 2.6768839444382713, + "learning_rate": 1.6360022592916056e-05, + "loss": 0.7902, + "step": 3950 + }, + { + "epoch": 0.6045906656465188, + "grad_norm": 2.563244839367599, + "learning_rate": 1.635811001551268e-05, + "loss": 0.7985, + "step": 3951 + }, + { + "epoch": 0.6047436878347361, + "grad_norm": 2.435160345656961, + "learning_rate": 1.6356197047632894e-05, + "loss": 0.8541, + "step": 3952 + }, + { + "epoch": 0.6048967100229533, + "grad_norm": 2.660440652538607, + "learning_rate": 1.635428368939418e-05, + "loss": 0.7915, + "step": 3953 + }, + { + "epoch": 0.6050497322111706, + "grad_norm": 2.302355575360249, + "learning_rate": 1.635236994091405e-05, + "loss": 0.7916, + "step": 3954 + }, + { + "epoch": 0.6052027543993879, + "grad_norm": 2.5678687348149127, + "learning_rate": 1.6350455802310027e-05, + "loss": 0.8768, + "step": 3955 + }, + { + "epoch": 0.6053557765876052, + "grad_norm": 2.4133307260880015, + "learning_rate": 1.6348541273699672e-05, + "loss": 0.8734, + "step": 3956 + }, + { + "epoch": 0.6055087987758225, + "grad_norm": 2.1442604005186716, + "learning_rate": 1.6346626355200564e-05, + "loss": 0.8057, + "step": 3957 + }, + { + "epoch": 0.6056618209640398, + "grad_norm": 2.2777122530462344, + "learning_rate": 1.63447110469303e-05, + "loss": 0.6944, + "step": 3958 + }, + { + "epoch": 0.6058148431522571, + "grad_norm": 2.2702458426898118, + "learning_rate": 1.6342795349006514e-05, + "loss": 0.7787, + "step": 3959 + }, + { + "epoch": 0.6059678653404744, + "grad_norm": 2.4138860942835665, + "learning_rate": 1.6340879261546848e-05, + "loss": 0.7319, + "step": 3960 + }, + { + "epoch": 0.6061208875286916, + "grad_norm": 2.406986355064948, + "learning_rate": 1.6338962784668984e-05, + "loss": 0.8305, + "step": 3961 + }, + { + "epoch": 0.606273909716909, + "grad_norm": 2.1320662468950053, + "learning_rate": 1.633704591849061e-05, + "loss": 0.7107, + "step": 3962 + }, + { + "epoch": 0.6064269319051262, + "grad_norm": 2.170118161412104, + "learning_rate": 1.6335128663129466e-05, + "loss": 0.6988, + "step": 3963 + }, + { + "epoch": 0.6065799540933435, + "grad_norm": 2.288231162241274, + "learning_rate": 1.6333211018703282e-05, + "loss": 0.7682, + "step": 3964 + }, + { 
+ "epoch": 0.6067329762815609, + "grad_norm": 2.0736456947567863, + "learning_rate": 1.6331292985329835e-05, + "loss": 0.7372, + "step": 3965 + }, + { + "epoch": 0.6068859984697781, + "grad_norm": 2.107363700837136, + "learning_rate": 1.632937456312692e-05, + "loss": 0.7282, + "step": 3966 + }, + { + "epoch": 0.6070390206579954, + "grad_norm": 2.0881937408877524, + "learning_rate": 1.6327455752212348e-05, + "loss": 0.7685, + "step": 3967 + }, + { + "epoch": 0.6071920428462128, + "grad_norm": 2.1645877937641425, + "learning_rate": 1.6325536552703963e-05, + "loss": 0.834, + "step": 3968 + }, + { + "epoch": 0.60734506503443, + "grad_norm": 2.377608992372084, + "learning_rate": 1.6323616964719642e-05, + "loss": 0.7594, + "step": 3969 + }, + { + "epoch": 0.6074980872226473, + "grad_norm": 2.028436733602878, + "learning_rate": 1.632169698837726e-05, + "loss": 0.7597, + "step": 3970 + }, + { + "epoch": 0.6076511094108645, + "grad_norm": 2.113181958173014, + "learning_rate": 1.631977662379473e-05, + "loss": 0.8032, + "step": 3971 + }, + { + "epoch": 0.6078041315990819, + "grad_norm": 2.5583012059184402, + "learning_rate": 1.6317855871090003e-05, + "loss": 0.7125, + "step": 3972 + }, + { + "epoch": 0.6079571537872992, + "grad_norm": 2.3758675047618008, + "learning_rate": 1.6315934730381027e-05, + "loss": 0.81, + "step": 3973 + }, + { + "epoch": 0.6081101759755164, + "grad_norm": 2.2335382453442225, + "learning_rate": 1.631401320178579e-05, + "loss": 0.7042, + "step": 3974 + }, + { + "epoch": 0.6082631981637338, + "grad_norm": 2.388399327130747, + "learning_rate": 1.6312091285422305e-05, + "loss": 0.8333, + "step": 3975 + }, + { + "epoch": 0.6084162203519511, + "grad_norm": 2.545928594962447, + "learning_rate": 1.63101689814086e-05, + "loss": 0.8372, + "step": 3976 + }, + { + "epoch": 0.6085692425401683, + "grad_norm": 2.0449415964423037, + "learning_rate": 1.630824628986273e-05, + "loss": 0.7397, + "step": 3977 + }, + { + "epoch": 0.6087222647283856, + "grad_norm": 2.567039032923848, + "learning_rate": 1.6306323210902784e-05, + "loss": 0.7466, + "step": 3978 + }, + { + "epoch": 0.6088752869166029, + "grad_norm": 2.4160435851987905, + "learning_rate": 1.6304399744646854e-05, + "loss": 0.7566, + "step": 3979 + }, + { + "epoch": 0.6090283091048202, + "grad_norm": 2.536239714103674, + "learning_rate": 1.630247589121307e-05, + "loss": 0.9314, + "step": 3980 + }, + { + "epoch": 0.6091813312930375, + "grad_norm": 2.278829958141022, + "learning_rate": 1.630055165071959e-05, + "loss": 0.7122, + "step": 3981 + }, + { + "epoch": 0.6093343534812548, + "grad_norm": 2.387173473026114, + "learning_rate": 1.6298627023284584e-05, + "loss": 0.7642, + "step": 3982 + }, + { + "epoch": 0.6094873756694721, + "grad_norm": 2.3274832439959856, + "learning_rate": 1.6296702009026256e-05, + "loss": 0.7805, + "step": 3983 + }, + { + "epoch": 0.6096403978576894, + "grad_norm": 2.413713126301562, + "learning_rate": 1.6294776608062818e-05, + "loss": 0.8906, + "step": 3984 + }, + { + "epoch": 0.6097934200459066, + "grad_norm": 2.2657003983898134, + "learning_rate": 1.629285082051253e-05, + "loss": 0.759, + "step": 3985 + }, + { + "epoch": 0.609946442234124, + "grad_norm": 2.059377996694824, + "learning_rate": 1.6290924646493654e-05, + "loss": 0.6946, + "step": 3986 + }, + { + "epoch": 0.6100994644223412, + "grad_norm": 2.240469903234145, + "learning_rate": 1.6288998086124478e-05, + "loss": 0.7731, + "step": 3987 + }, + { + "epoch": 0.6102524866105585, + "grad_norm": 2.1045040115900506, + "learning_rate": 1.6287071139523334e-05, + 
"loss": 0.7427, + "step": 3988 + }, + { + "epoch": 0.6104055087987759, + "grad_norm": 2.2817203881792567, + "learning_rate": 1.6285143806808554e-05, + "loss": 0.7773, + "step": 3989 + }, + { + "epoch": 0.6105585309869931, + "grad_norm": 2.445802561201833, + "learning_rate": 1.628321608809851e-05, + "loss": 0.838, + "step": 3990 + }, + { + "epoch": 0.6107115531752104, + "grad_norm": 2.5828642223393214, + "learning_rate": 1.628128798351158e-05, + "loss": 0.6483, + "step": 3991 + }, + { + "epoch": 0.6108645753634276, + "grad_norm": 2.4672233905559477, + "learning_rate": 1.6279359493166183e-05, + "loss": 0.8611, + "step": 3992 + }, + { + "epoch": 0.611017597551645, + "grad_norm": 2.219671457282571, + "learning_rate": 1.6277430617180755e-05, + "loss": 0.7172, + "step": 3993 + }, + { + "epoch": 0.6111706197398623, + "grad_norm": 2.4789990648031153, + "learning_rate": 1.6275501355673756e-05, + "loss": 0.8063, + "step": 3994 + }, + { + "epoch": 0.6113236419280795, + "grad_norm": 2.3551450398894467, + "learning_rate": 1.6273571708763665e-05, + "loss": 0.7361, + "step": 3995 + }, + { + "epoch": 0.6114766641162969, + "grad_norm": 2.529516672420973, + "learning_rate": 1.6271641676569e-05, + "loss": 0.8252, + "step": 3996 + }, + { + "epoch": 0.6116296863045142, + "grad_norm": 2.1194774939225143, + "learning_rate": 1.626971125920828e-05, + "loss": 0.7994, + "step": 3997 + }, + { + "epoch": 0.6117827084927314, + "grad_norm": 2.130797951705835, + "learning_rate": 1.6267780456800066e-05, + "loss": 0.7897, + "step": 3998 + }, + { + "epoch": 0.6119357306809488, + "grad_norm": 2.3216160730004636, + "learning_rate": 1.6265849269462936e-05, + "loss": 0.8488, + "step": 3999 + }, + { + "epoch": 0.612088752869166, + "grad_norm": 2.7626139147850224, + "learning_rate": 1.6263917697315488e-05, + "loss": 0.7936, + "step": 4000 + }, + { + "epoch": 0.6122417750573833, + "grad_norm": 2.082553450963893, + "learning_rate": 1.6261985740476348e-05, + "loss": 0.7721, + "step": 4001 + }, + { + "epoch": 0.6123947972456006, + "grad_norm": 2.3936912056745965, + "learning_rate": 1.626005339906417e-05, + "loss": 0.8584, + "step": 4002 + }, + { + "epoch": 0.6125478194338179, + "grad_norm": 2.2087315724400036, + "learning_rate": 1.6258120673197623e-05, + "loss": 0.7804, + "step": 4003 + }, + { + "epoch": 0.6127008416220352, + "grad_norm": 2.354583524321165, + "learning_rate": 1.6256187562995403e-05, + "loss": 0.7383, + "step": 4004 + }, + { + "epoch": 0.6128538638102525, + "grad_norm": 2.098041581723552, + "learning_rate": 1.6254254068576227e-05, + "loss": 0.7599, + "step": 4005 + }, + { + "epoch": 0.6130068859984698, + "grad_norm": 2.0327806604154754, + "learning_rate": 1.6252320190058847e-05, + "loss": 0.7705, + "step": 4006 + }, + { + "epoch": 0.6131599081866871, + "grad_norm": 2.496718004055835, + "learning_rate": 1.625038592756202e-05, + "loss": 0.7543, + "step": 4007 + }, + { + "epoch": 0.6133129303749043, + "grad_norm": 2.4936165790493554, + "learning_rate": 1.6248451281204545e-05, + "loss": 0.8754, + "step": 4008 + }, + { + "epoch": 0.6134659525631216, + "grad_norm": 2.2205096614417075, + "learning_rate": 1.624651625110523e-05, + "loss": 0.7938, + "step": 4009 + }, + { + "epoch": 0.613618974751339, + "grad_norm": 2.1301811356198046, + "learning_rate": 1.624458083738292e-05, + "loss": 0.7472, + "step": 4010 + }, + { + "epoch": 0.6137719969395562, + "grad_norm": 2.3619766841954326, + "learning_rate": 1.624264504015647e-05, + "loss": 0.7185, + "step": 4011 + }, + { + "epoch": 0.6139250191277735, + "grad_norm": 
2.6369274563223524, + "learning_rate": 1.6240708859544766e-05, + "loss": 0.8762, + "step": 4012 + }, + { + "epoch": 0.6140780413159909, + "grad_norm": 2.3391139454523877, + "learning_rate": 1.6238772295666718e-05, + "loss": 0.8134, + "step": 4013 + }, + { + "epoch": 0.6142310635042081, + "grad_norm": 2.472559033054485, + "learning_rate": 1.6236835348641254e-05, + "loss": 0.8442, + "step": 4014 + }, + { + "epoch": 0.6143840856924254, + "grad_norm": 2.30552269826598, + "learning_rate": 1.6234898018587336e-05, + "loss": 0.7564, + "step": 4015 + }, + { + "epoch": 0.6145371078806426, + "grad_norm": 2.2725183178366644, + "learning_rate": 1.623296030562394e-05, + "loss": 0.8004, + "step": 4016 + }, + { + "epoch": 0.61469013006886, + "grad_norm": 2.1530496291189736, + "learning_rate": 1.6231022209870063e-05, + "loss": 0.7653, + "step": 4017 + }, + { + "epoch": 0.6148431522570773, + "grad_norm": 2.1620242897683797, + "learning_rate": 1.6229083731444745e-05, + "loss": 0.7129, + "step": 4018 + }, + { + "epoch": 0.6149961744452945, + "grad_norm": 2.25085442826096, + "learning_rate": 1.622714487046702e-05, + "loss": 0.7554, + "step": 4019 + }, + { + "epoch": 0.6151491966335119, + "grad_norm": 2.169103357595413, + "learning_rate": 1.622520562705597e-05, + "loss": 0.7607, + "step": 4020 + }, + { + "epoch": 0.6153022188217292, + "grad_norm": 2.3292061531576955, + "learning_rate": 1.622326600133069e-05, + "loss": 0.8104, + "step": 4021 + }, + { + "epoch": 0.6154552410099464, + "grad_norm": 2.170753862893607, + "learning_rate": 1.6221325993410295e-05, + "loss": 0.745, + "step": 4022 + }, + { + "epoch": 0.6156082631981638, + "grad_norm": 2.4632365927830207, + "learning_rate": 1.6219385603413937e-05, + "loss": 0.68, + "step": 4023 + }, + { + "epoch": 0.615761285386381, + "grad_norm": 2.085184298715942, + "learning_rate": 1.6217444831460777e-05, + "loss": 0.7114, + "step": 4024 + }, + { + "epoch": 0.6159143075745983, + "grad_norm": 2.7017932076046374, + "learning_rate": 1.621550367767001e-05, + "loss": 0.8658, + "step": 4025 + }, + { + "epoch": 0.6160673297628156, + "grad_norm": 2.66819564271173, + "learning_rate": 1.6213562142160842e-05, + "loss": 0.8669, + "step": 4026 + }, + { + "epoch": 0.6162203519510329, + "grad_norm": 2.137016223484698, + "learning_rate": 1.621162022505252e-05, + "loss": 0.7017, + "step": 4027 + }, + { + "epoch": 0.6163733741392502, + "grad_norm": 2.3297061769686174, + "learning_rate": 1.6209677926464297e-05, + "loss": 0.7102, + "step": 4028 + }, + { + "epoch": 0.6165263963274675, + "grad_norm": 2.2418631040587287, + "learning_rate": 1.620773524651546e-05, + "loss": 0.8347, + "step": 4029 + }, + { + "epoch": 0.6166794185156848, + "grad_norm": 2.2617571464792423, + "learning_rate": 1.6205792185325318e-05, + "loss": 0.8687, + "step": 4030 + }, + { + "epoch": 0.6168324407039021, + "grad_norm": 2.5724992577181705, + "learning_rate": 1.6203848743013202e-05, + "loss": 0.724, + "step": 4031 + }, + { + "epoch": 0.6169854628921193, + "grad_norm": 2.227267112590208, + "learning_rate": 1.6201904919698463e-05, + "loss": 0.6963, + "step": 4032 + }, + { + "epoch": 0.6171384850803366, + "grad_norm": 2.172006678534527, + "learning_rate": 1.6199960715500482e-05, + "loss": 0.7051, + "step": 4033 + }, + { + "epoch": 0.617291507268554, + "grad_norm": 2.1373746947180012, + "learning_rate": 1.619801613053866e-05, + "loss": 0.7327, + "step": 4034 + }, + { + "epoch": 0.6174445294567712, + "grad_norm": 2.11476592544764, + "learning_rate": 1.6196071164932415e-05, + "loss": 0.7032, + "step": 4035 + }, + { + 
"epoch": 0.6175975516449885, + "grad_norm": 2.300625622423079, + "learning_rate": 1.6194125818801207e-05, + "loss": 0.7691, + "step": 4036 + }, + { + "epoch": 0.6177505738332059, + "grad_norm": 2.428594165574761, + "learning_rate": 1.61921800922645e-05, + "loss": 0.723, + "step": 4037 + }, + { + "epoch": 0.6179035960214231, + "grad_norm": 2.169473308137043, + "learning_rate": 1.6190233985441786e-05, + "loss": 0.8408, + "step": 4038 + }, + { + "epoch": 0.6180566182096404, + "grad_norm": 2.5360559820737376, + "learning_rate": 1.618828749845259e-05, + "loss": 0.8811, + "step": 4039 + }, + { + "epoch": 0.6182096403978576, + "grad_norm": 2.39178836667541, + "learning_rate": 1.6186340631416452e-05, + "loss": 0.788, + "step": 4040 + }, + { + "epoch": 0.618362662586075, + "grad_norm": 2.3918450316007376, + "learning_rate": 1.6184393384452928e-05, + "loss": 0.7039, + "step": 4041 + }, + { + "epoch": 0.6185156847742923, + "grad_norm": 2.338128319850209, + "learning_rate": 1.6182445757681616e-05, + "loss": 0.8499, + "step": 4042 + }, + { + "epoch": 0.6186687069625095, + "grad_norm": 2.3173269279379136, + "learning_rate": 1.6180497751222127e-05, + "loss": 0.7303, + "step": 4043 + }, + { + "epoch": 0.6188217291507269, + "grad_norm": 2.438141149323398, + "learning_rate": 1.6178549365194086e-05, + "loss": 0.8503, + "step": 4044 + }, + { + "epoch": 0.6189747513389442, + "grad_norm": 2.2321932250097354, + "learning_rate": 1.6176600599717165e-05, + "loss": 0.7228, + "step": 4045 + }, + { + "epoch": 0.6191277735271614, + "grad_norm": 2.3226574409768825, + "learning_rate": 1.6174651454911034e-05, + "loss": 0.7527, + "step": 4046 + }, + { + "epoch": 0.6192807957153788, + "grad_norm": 2.0741031426015915, + "learning_rate": 1.6172701930895404e-05, + "loss": 0.7485, + "step": 4047 + }, + { + "epoch": 0.619433817903596, + "grad_norm": 2.4837597123114494, + "learning_rate": 1.6170752027790002e-05, + "loss": 0.7971, + "step": 4048 + }, + { + "epoch": 0.6195868400918133, + "grad_norm": 2.349665609451401, + "learning_rate": 1.6168801745714576e-05, + "loss": 0.7738, + "step": 4049 + }, + { + "epoch": 0.6197398622800306, + "grad_norm": 2.3536660431739045, + "learning_rate": 1.6166851084788898e-05, + "loss": 0.7474, + "step": 4050 + }, + { + "epoch": 0.6198928844682479, + "grad_norm": 2.516752609598269, + "learning_rate": 1.6164900045132777e-05, + "loss": 0.7539, + "step": 4051 + }, + { + "epoch": 0.6200459066564652, + "grad_norm": 2.5523705841883824, + "learning_rate": 1.6162948626866023e-05, + "loss": 0.8509, + "step": 4052 + }, + { + "epoch": 0.6201989288446825, + "grad_norm": 2.2026496418341495, + "learning_rate": 1.6160996830108486e-05, + "loss": 0.8403, + "step": 4053 + }, + { + "epoch": 0.6203519510328998, + "grad_norm": 2.017536909380819, + "learning_rate": 1.615904465498003e-05, + "loss": 0.6502, + "step": 4054 + }, + { + "epoch": 0.6205049732211171, + "grad_norm": 2.174952113668595, + "learning_rate": 1.6157092101600548e-05, + "loss": 0.7717, + "step": 4055 + }, + { + "epoch": 0.6206579954093343, + "grad_norm": 1.8763628970458017, + "learning_rate": 1.6155139170089956e-05, + "loss": 0.757, + "step": 4056 + }, + { + "epoch": 0.6208110175975516, + "grad_norm": 2.322317658519997, + "learning_rate": 1.6153185860568187e-05, + "loss": 0.7672, + "step": 4057 + }, + { + "epoch": 0.620964039785769, + "grad_norm": 2.3623718327673764, + "learning_rate": 1.61512321731552e-05, + "loss": 0.7172, + "step": 4058 + }, + { + "epoch": 0.6211170619739862, + "grad_norm": 2.1276117730331805, + "learning_rate": 
1.6149278107970983e-05, + "loss": 0.7645, + "step": 4059 + }, + { + "epoch": 0.6212700841622035, + "grad_norm": 2.564749256711884, + "learning_rate": 1.6147323665135542e-05, + "loss": 0.7936, + "step": 4060 + }, + { + "epoch": 0.6214231063504209, + "grad_norm": 2.413120918684202, + "learning_rate": 1.6145368844768908e-05, + "loss": 0.6705, + "step": 4061 + }, + { + "epoch": 0.6215761285386381, + "grad_norm": 2.445889193129111, + "learning_rate": 1.6143413646991132e-05, + "loss": 0.8038, + "step": 4062 + }, + { + "epoch": 0.6217291507268554, + "grad_norm": 2.117825343139216, + "learning_rate": 1.6141458071922285e-05, + "loss": 0.7411, + "step": 4063 + }, + { + "epoch": 0.6218821729150726, + "grad_norm": 2.344357755958016, + "learning_rate": 1.6139502119682478e-05, + "loss": 0.855, + "step": 4064 + }, + { + "epoch": 0.62203519510329, + "grad_norm": 2.275018128282366, + "learning_rate": 1.613754579039183e-05, + "loss": 0.7594, + "step": 4065 + }, + { + "epoch": 0.6221882172915073, + "grad_norm": 2.123207758811535, + "learning_rate": 1.613558908417048e-05, + "loss": 0.831, + "step": 4066 + }, + { + "epoch": 0.6223412394797245, + "grad_norm": 2.318206412634962, + "learning_rate": 1.6133632001138604e-05, + "loss": 0.7474, + "step": 4067 + }, + { + "epoch": 0.6224942616679419, + "grad_norm": 2.3643768786552535, + "learning_rate": 1.613167454141639e-05, + "loss": 0.728, + "step": 4068 + }, + { + "epoch": 0.6226472838561592, + "grad_norm": 2.1021830426618084, + "learning_rate": 1.6129716705124055e-05, + "loss": 0.7078, + "step": 4069 + }, + { + "epoch": 0.6228003060443764, + "grad_norm": 2.1384768150213858, + "learning_rate": 1.612775849238184e-05, + "loss": 0.7496, + "step": 4070 + }, + { + "epoch": 0.6229533282325938, + "grad_norm": 2.0607954889554434, + "learning_rate": 1.612579990331e-05, + "loss": 0.6618, + "step": 4071 + }, + { + "epoch": 0.623106350420811, + "grad_norm": 2.370009379278904, + "learning_rate": 1.612384093802883e-05, + "loss": 0.8368, + "step": 4072 + }, + { + "epoch": 0.6232593726090283, + "grad_norm": 2.0723916821807427, + "learning_rate": 1.612188159665863e-05, + "loss": 0.6796, + "step": 4073 + }, + { + "epoch": 0.6234123947972456, + "grad_norm": 2.7584617297969425, + "learning_rate": 1.6119921879319728e-05, + "loss": 0.8882, + "step": 4074 + }, + { + "epoch": 0.6235654169854629, + "grad_norm": 2.436001429058616, + "learning_rate": 1.6117961786132485e-05, + "loss": 0.7565, + "step": 4075 + }, + { + "epoch": 0.6237184391736802, + "grad_norm": 2.4908275513686626, + "learning_rate": 1.6116001317217277e-05, + "loss": 0.8403, + "step": 4076 + }, + { + "epoch": 0.6238714613618975, + "grad_norm": 2.1720916230556515, + "learning_rate": 1.6114040472694496e-05, + "loss": 0.7549, + "step": 4077 + }, + { + "epoch": 0.6240244835501148, + "grad_norm": 2.1638352336847517, + "learning_rate": 1.611207925268458e-05, + "loss": 0.6232, + "step": 4078 + }, + { + "epoch": 0.6241775057383321, + "grad_norm": 2.254748294563211, + "learning_rate": 1.611011765730796e-05, + "loss": 0.8494, + "step": 4079 + }, + { + "epoch": 0.6243305279265493, + "grad_norm": 2.051169583162102, + "learning_rate": 1.610815568668511e-05, + "loss": 0.7465, + "step": 4080 + }, + { + "epoch": 0.6244835501147666, + "grad_norm": 2.480860331299139, + "learning_rate": 1.610619334093653e-05, + "loss": 0.9505, + "step": 4081 + }, + { + "epoch": 0.624636572302984, + "grad_norm": 2.581081394347132, + "learning_rate": 1.6104230620182724e-05, + "loss": 0.8689, + "step": 4082 + }, + { + "epoch": 0.6247895944912012, + "grad_norm": 
2.4647455923728416, + "learning_rate": 1.610226752454424e-05, + "loss": 0.7986, + "step": 4083 + }, + { + "epoch": 0.6249426166794185, + "grad_norm": 2.3084776587149376, + "learning_rate": 1.6100304054141634e-05, + "loss": 0.8543, + "step": 4084 + }, + { + "epoch": 0.6250956388676359, + "grad_norm": 2.278351087731257, + "learning_rate": 1.609834020909549e-05, + "loss": 0.8122, + "step": 4085 + }, + { + "epoch": 0.6252486610558531, + "grad_norm": 2.276115015814235, + "learning_rate": 1.609637598952642e-05, + "loss": 0.647, + "step": 4086 + }, + { + "epoch": 0.6254016832440704, + "grad_norm": 2.4313627499110235, + "learning_rate": 1.609441139555505e-05, + "loss": 0.7891, + "step": 4087 + }, + { + "epoch": 0.6255547054322876, + "grad_norm": 2.4151414352242297, + "learning_rate": 1.6092446427302028e-05, + "loss": 0.7811, + "step": 4088 + }, + { + "epoch": 0.625707727620505, + "grad_norm": 2.1618931472619822, + "learning_rate": 1.609048108488805e-05, + "loss": 0.8202, + "step": 4089 + }, + { + "epoch": 0.6258607498087223, + "grad_norm": 2.2892118398805485, + "learning_rate": 1.6088515368433794e-05, + "loss": 0.7341, + "step": 4090 + }, + { + "epoch": 0.6260137719969395, + "grad_norm": 2.319986784666118, + "learning_rate": 1.6086549278059988e-05, + "loss": 0.7378, + "step": 4091 + }, + { + "epoch": 0.6261667941851569, + "grad_norm": 2.388332513852767, + "learning_rate": 1.6084582813887385e-05, + "loss": 0.8177, + "step": 4092 + }, + { + "epoch": 0.6263198163733742, + "grad_norm": 2.4953449296501113, + "learning_rate": 1.608261597603675e-05, + "loss": 0.7417, + "step": 4093 + }, + { + "epoch": 0.6264728385615914, + "grad_norm": 2.2534640175752516, + "learning_rate": 1.608064876462887e-05, + "loss": 0.6792, + "step": 4094 + }, + { + "epoch": 0.6266258607498087, + "grad_norm": 2.1813003582366566, + "learning_rate": 1.6078681179784563e-05, + "loss": 0.7858, + "step": 4095 + }, + { + "epoch": 0.626778882938026, + "grad_norm": 2.265021488746793, + "learning_rate": 1.6076713221624663e-05, + "loss": 0.7861, + "step": 4096 + }, + { + "epoch": 0.6269319051262433, + "grad_norm": 2.394134574972118, + "learning_rate": 1.6074744890270033e-05, + "loss": 0.7879, + "step": 4097 + }, + { + "epoch": 0.6270849273144606, + "grad_norm": 2.324105771909182, + "learning_rate": 1.6072776185841553e-05, + "loss": 0.8359, + "step": 4098 + }, + { + "epoch": 0.6272379495026779, + "grad_norm": 2.238004787754105, + "learning_rate": 1.6070807108460134e-05, + "loss": 0.7473, + "step": 4099 + }, + { + "epoch": 0.6273909716908952, + "grad_norm": 2.4442647854498007, + "learning_rate": 1.60688376582467e-05, + "loss": 0.7507, + "step": 4100 + }, + { + "epoch": 0.6275439938791124, + "grad_norm": 2.2586968036292965, + "learning_rate": 1.6066867835322205e-05, + "loss": 0.7854, + "step": 4101 + }, + { + "epoch": 0.6276970160673297, + "grad_norm": 2.1196358650293647, + "learning_rate": 1.606489763980762e-05, + "loss": 0.6373, + "step": 4102 + }, + { + "epoch": 0.6278500382555471, + "grad_norm": 2.211548528141079, + "learning_rate": 1.6062927071823944e-05, + "loss": 0.6638, + "step": 4103 + }, + { + "epoch": 0.6280030604437643, + "grad_norm": 2.4010811990195124, + "learning_rate": 1.6060956131492203e-05, + "loss": 0.7585, + "step": 4104 + }, + { + "epoch": 0.6281560826319816, + "grad_norm": 2.5356718330834416, + "learning_rate": 1.605898481893343e-05, + "loss": 0.8063, + "step": 4105 + }, + { + "epoch": 0.628309104820199, + "grad_norm": 2.205202390214095, + "learning_rate": 1.6057013134268703e-05, + "loss": 0.7701, + "step": 4106 + }, + { + 
"epoch": 0.6284621270084162, + "grad_norm": 2.2819514096743228, + "learning_rate": 1.6055041077619094e-05, + "loss": 0.7002, + "step": 4107 + }, + { + "epoch": 0.6286151491966335, + "grad_norm": 2.3022349264292745, + "learning_rate": 1.6053068649105735e-05, + "loss": 0.9141, + "step": 4108 + }, + { + "epoch": 0.6287681713848507, + "grad_norm": 2.277035553809249, + "learning_rate": 1.6051095848849747e-05, + "loss": 0.8087, + "step": 4109 + }, + { + "epoch": 0.6289211935730681, + "grad_norm": 2.4443791729171997, + "learning_rate": 1.604912267697229e-05, + "loss": 0.7876, + "step": 4110 + }, + { + "epoch": 0.6290742157612854, + "grad_norm": 2.1725975257265784, + "learning_rate": 1.6047149133594546e-05, + "loss": 0.7552, + "step": 4111 + }, + { + "epoch": 0.6292272379495026, + "grad_norm": 2.328562803605211, + "learning_rate": 1.6045175218837716e-05, + "loss": 0.7316, + "step": 4112 + }, + { + "epoch": 0.62938026013772, + "grad_norm": 2.247396465495871, + "learning_rate": 1.604320093282303e-05, + "loss": 0.6445, + "step": 4113 + }, + { + "epoch": 0.6295332823259373, + "grad_norm": 2.173139920348392, + "learning_rate": 1.6041226275671727e-05, + "loss": 0.7427, + "step": 4114 + }, + { + "epoch": 0.6296863045141545, + "grad_norm": 2.099285404139486, + "learning_rate": 1.603925124750509e-05, + "loss": 0.7111, + "step": 4115 + }, + { + "epoch": 0.6298393267023719, + "grad_norm": 1.9917731708573834, + "learning_rate": 1.603727584844441e-05, + "loss": 0.6858, + "step": 4116 + }, + { + "epoch": 0.6299923488905891, + "grad_norm": 2.215400393635466, + "learning_rate": 1.6035300078611e-05, + "loss": 0.7705, + "step": 4117 + }, + { + "epoch": 0.6301453710788064, + "grad_norm": 2.0245212493288447, + "learning_rate": 1.6033323938126198e-05, + "loss": 0.6323, + "step": 4118 + }, + { + "epoch": 0.6302983932670237, + "grad_norm": 2.3197733532871885, + "learning_rate": 1.603134742711138e-05, + "loss": 0.7391, + "step": 4119 + }, + { + "epoch": 0.630451415455241, + "grad_norm": 2.4936860390189213, + "learning_rate": 1.6029370545687912e-05, + "loss": 0.8139, + "step": 4120 + }, + { + "epoch": 0.6306044376434583, + "grad_norm": 2.181252151265491, + "learning_rate": 1.6027393293977217e-05, + "loss": 0.7221, + "step": 4121 + }, + { + "epoch": 0.6307574598316756, + "grad_norm": 2.1530316724351755, + "learning_rate": 1.602541567210072e-05, + "loss": 0.7455, + "step": 4122 + }, + { + "epoch": 0.6309104820198929, + "grad_norm": 2.215998435857359, + "learning_rate": 1.6023437680179875e-05, + "loss": 0.7335, + "step": 4123 + }, + { + "epoch": 0.6310635042081102, + "grad_norm": 2.4792874976399717, + "learning_rate": 1.6021459318336154e-05, + "loss": 0.7476, + "step": 4124 + }, + { + "epoch": 0.6312165263963274, + "grad_norm": 2.026954969477471, + "learning_rate": 1.6019480586691062e-05, + "loss": 0.5765, + "step": 4125 + }, + { + "epoch": 0.6313695485845447, + "grad_norm": 2.595157669941229, + "learning_rate": 1.6017501485366125e-05, + "loss": 0.8033, + "step": 4126 + }, + { + "epoch": 0.6315225707727621, + "grad_norm": 2.3899715610542094, + "learning_rate": 1.6015522014482877e-05, + "loss": 0.7496, + "step": 4127 + }, + { + "epoch": 0.6316755929609793, + "grad_norm": 2.234234590699503, + "learning_rate": 1.601354217416289e-05, + "loss": 0.7119, + "step": 4128 + }, + { + "epoch": 0.6318286151491966, + "grad_norm": 2.359124165929824, + "learning_rate": 1.6011561964527748e-05, + "loss": 0.8503, + "step": 4129 + }, + { + "epoch": 0.631981637337414, + "grad_norm": 2.3720387035217527, + "learning_rate": 1.6009581385699076e-05, 
+ "loss": 0.7733, + "step": 4130 + }, + { + "epoch": 0.6321346595256312, + "grad_norm": 2.134605187970971, + "learning_rate": 1.6007600437798495e-05, + "loss": 0.7318, + "step": 4131 + }, + { + "epoch": 0.6322876817138485, + "grad_norm": 2.129685324766079, + "learning_rate": 1.6005619120947672e-05, + "loss": 0.7498, + "step": 4132 + }, + { + "epoch": 0.6324407039020657, + "grad_norm": 2.3785700225702224, + "learning_rate": 1.6003637435268285e-05, + "loss": 0.7669, + "step": 4133 + }, + { + "epoch": 0.6325937260902831, + "grad_norm": 2.2794316845639355, + "learning_rate": 1.6001655380882036e-05, + "loss": 0.8118, + "step": 4134 + }, + { + "epoch": 0.6327467482785004, + "grad_norm": 2.7039235062628566, + "learning_rate": 1.599967295791065e-05, + "loss": 0.8262, + "step": 4135 + }, + { + "epoch": 0.6328997704667176, + "grad_norm": 2.3953376457651268, + "learning_rate": 1.599769016647588e-05, + "loss": 0.6636, + "step": 4136 + }, + { + "epoch": 0.633052792654935, + "grad_norm": 2.4461597091487954, + "learning_rate": 1.599570700669949e-05, + "loss": 0.8773, + "step": 4137 + }, + { + "epoch": 0.6332058148431523, + "grad_norm": 2.2968863879054484, + "learning_rate": 1.599372347870328e-05, + "loss": 0.7605, + "step": 4138 + }, + { + "epoch": 0.6333588370313695, + "grad_norm": 2.319391750065703, + "learning_rate": 1.5991739582609066e-05, + "loss": 0.6621, + "step": 4139 + }, + { + "epoch": 0.6335118592195869, + "grad_norm": 2.0426803522722903, + "learning_rate": 1.598975531853868e-05, + "loss": 0.7313, + "step": 4140 + }, + { + "epoch": 0.6336648814078041, + "grad_norm": 2.2009394418994286, + "learning_rate": 1.598777068661399e-05, + "loss": 0.8457, + "step": 4141 + }, + { + "epoch": 0.6338179035960214, + "grad_norm": 2.265228617457404, + "learning_rate": 1.5985785686956877e-05, + "loss": 0.7932, + "step": 4142 + }, + { + "epoch": 0.6339709257842387, + "grad_norm": 2.2768626036052595, + "learning_rate": 1.598380031968925e-05, + "loss": 0.6946, + "step": 4143 + }, + { + "epoch": 0.634123947972456, + "grad_norm": 2.2367807099521615, + "learning_rate": 1.5981814584933037e-05, + "loss": 0.7776, + "step": 4144 + }, + { + "epoch": 0.6342769701606733, + "grad_norm": 2.174040440407306, + "learning_rate": 1.597982848281019e-05, + "loss": 0.6848, + "step": 4145 + }, + { + "epoch": 0.6344299923488906, + "grad_norm": 2.2403010646934223, + "learning_rate": 1.597784201344268e-05, + "loss": 0.7351, + "step": 4146 + }, + { + "epoch": 0.6345830145371079, + "grad_norm": 2.313387309452502, + "learning_rate": 1.5975855176952505e-05, + "loss": 0.8039, + "step": 4147 + }, + { + "epoch": 0.6347360367253252, + "grad_norm": 2.2112838415716225, + "learning_rate": 1.5973867973461687e-05, + "loss": 0.7345, + "step": 4148 + }, + { + "epoch": 0.6348890589135424, + "grad_norm": 2.1155329059767105, + "learning_rate": 1.5971880403092267e-05, + "loss": 0.6863, + "step": 4149 + }, + { + "epoch": 0.6350420811017597, + "grad_norm": 2.3988266632172373, + "learning_rate": 1.5969892465966315e-05, + "loss": 0.7015, + "step": 4150 + }, + { + "epoch": 0.6351951032899771, + "grad_norm": 2.5735735190206968, + "learning_rate": 1.5967904162205906e-05, + "loss": 0.7143, + "step": 4151 + }, + { + "epoch": 0.6353481254781943, + "grad_norm": 2.276882522860152, + "learning_rate": 1.5965915491933154e-05, + "loss": 0.7921, + "step": 4152 + }, + { + "epoch": 0.6355011476664116, + "grad_norm": 2.3309317873483577, + "learning_rate": 1.5963926455270196e-05, + "loss": 0.7485, + "step": 4153 + }, + { + "epoch": 0.635654169854629, + "grad_norm": 
2.1790982554114797, + "learning_rate": 1.5961937052339184e-05, + "loss": 0.5373, + "step": 4154 + }, + { + "epoch": 0.6358071920428462, + "grad_norm": 2.666278766782337, + "learning_rate": 1.5959947283262296e-05, + "loss": 0.7133, + "step": 4155 + }, + { + "epoch": 0.6359602142310635, + "grad_norm": 2.2551622266417404, + "learning_rate": 1.5957957148161727e-05, + "loss": 0.7513, + "step": 4156 + }, + { + "epoch": 0.6361132364192807, + "grad_norm": 2.6383466391094563, + "learning_rate": 1.5955966647159706e-05, + "loss": 0.8115, + "step": 4157 + }, + { + "epoch": 0.6362662586074981, + "grad_norm": 2.1419449882341124, + "learning_rate": 1.5953975780378466e-05, + "loss": 0.8361, + "step": 4158 + }, + { + "epoch": 0.6364192807957154, + "grad_norm": 2.574103577501881, + "learning_rate": 1.5951984547940286e-05, + "loss": 0.8792, + "step": 4159 + }, + { + "epoch": 0.6365723029839326, + "grad_norm": 2.1063506557932348, + "learning_rate": 1.594999294996745e-05, + "loss": 0.7475, + "step": 4160 + }, + { + "epoch": 0.63672532517215, + "grad_norm": 2.321127303574338, + "learning_rate": 1.594800098658227e-05, + "loss": 0.7782, + "step": 4161 + }, + { + "epoch": 0.6368783473603673, + "grad_norm": 2.1374323400738633, + "learning_rate": 1.594600865790708e-05, + "loss": 0.6834, + "step": 4162 + }, + { + "epoch": 0.6370313695485845, + "grad_norm": 2.260939834196185, + "learning_rate": 1.594401596406424e-05, + "loss": 0.7039, + "step": 4163 + }, + { + "epoch": 0.6371843917368019, + "grad_norm": 2.0757294361561622, + "learning_rate": 1.5942022905176126e-05, + "loss": 0.6579, + "step": 4164 + }, + { + "epoch": 0.6373374139250191, + "grad_norm": 2.437640815822106, + "learning_rate": 1.5940029481365137e-05, + "loss": 0.7543, + "step": 4165 + }, + { + "epoch": 0.6374904361132364, + "grad_norm": 2.1674678167376484, + "learning_rate": 1.59380356927537e-05, + "loss": 0.6833, + "step": 4166 + }, + { + "epoch": 0.6376434583014537, + "grad_norm": 2.3065774043485363, + "learning_rate": 1.5936041539464266e-05, + "loss": 0.8161, + "step": 4167 + }, + { + "epoch": 0.637796480489671, + "grad_norm": 2.3020071837379663, + "learning_rate": 1.5934047021619295e-05, + "loss": 0.7168, + "step": 4168 + }, + { + "epoch": 0.6379495026778883, + "grad_norm": 2.379831436116885, + "learning_rate": 1.5932052139341285e-05, + "loss": 0.8322, + "step": 4169 + }, + { + "epoch": 0.6381025248661056, + "grad_norm": 2.215752580256027, + "learning_rate": 1.5930056892752745e-05, + "loss": 0.789, + "step": 4170 + }, + { + "epoch": 0.6382555470543229, + "grad_norm": 2.0602789500986307, + "learning_rate": 1.5928061281976215e-05, + "loss": 0.7685, + "step": 4171 + }, + { + "epoch": 0.6384085692425402, + "grad_norm": 2.249026928393553, + "learning_rate": 1.592606530713425e-05, + "loss": 0.6997, + "step": 4172 + }, + { + "epoch": 0.6385615914307574, + "grad_norm": 2.2625016939238534, + "learning_rate": 1.592406896834943e-05, + "loss": 0.8841, + "step": 4173 + }, + { + "epoch": 0.6387146136189747, + "grad_norm": 2.7844803793506427, + "learning_rate": 1.592207226574436e-05, + "loss": 0.8634, + "step": 4174 + }, + { + "epoch": 0.6388676358071921, + "grad_norm": 2.2100873097986526, + "learning_rate": 1.5920075199441665e-05, + "loss": 0.7085, + "step": 4175 + }, + { + "epoch": 0.6390206579954093, + "grad_norm": 2.339271594267563, + "learning_rate": 1.591807776956399e-05, + "loss": 0.6688, + "step": 4176 + }, + { + "epoch": 0.6391736801836266, + "grad_norm": 2.1828890019441634, + "learning_rate": 1.591607997623401e-05, + "loss": 0.7627, + "step": 4177 + }, + { 
+ "epoch": 0.639326702371844, + "grad_norm": 2.4192884203876304, + "learning_rate": 1.5914081819574415e-05, + "loss": 0.8232, + "step": 4178 + }, + { + "epoch": 0.6394797245600612, + "grad_norm": 2.235385282434219, + "learning_rate": 1.5912083299707924e-05, + "loss": 0.6711, + "step": 4179 + }, + { + "epoch": 0.6396327467482785, + "grad_norm": 2.4216983966310823, + "learning_rate": 1.5910084416757264e-05, + "loss": 0.8284, + "step": 4180 + }, + { + "epoch": 0.6397857689364957, + "grad_norm": 2.326093648518836, + "learning_rate": 1.59080851708452e-05, + "loss": 0.8516, + "step": 4181 + }, + { + "epoch": 0.6399387911247131, + "grad_norm": 2.4974683641820157, + "learning_rate": 1.5906085562094512e-05, + "loss": 0.7125, + "step": 4182 + }, + { + "epoch": 0.6400918133129304, + "grad_norm": 2.2514434183700467, + "learning_rate": 1.590408559062801e-05, + "loss": 0.7538, + "step": 4183 + }, + { + "epoch": 0.6402448355011476, + "grad_norm": 2.2360998818579705, + "learning_rate": 1.5902085256568513e-05, + "loss": 0.7095, + "step": 4184 + }, + { + "epoch": 0.640397857689365, + "grad_norm": 2.4038858226459743, + "learning_rate": 1.5900084560038866e-05, + "loss": 0.8299, + "step": 4185 + }, + { + "epoch": 0.6405508798775823, + "grad_norm": 2.1972831229466845, + "learning_rate": 1.589808350116195e-05, + "loss": 0.7232, + "step": 4186 + }, + { + "epoch": 0.6407039020657995, + "grad_norm": 2.482921431242963, + "learning_rate": 1.5896082080060652e-05, + "loss": 0.7147, + "step": 4187 + }, + { + "epoch": 0.6408569242540169, + "grad_norm": 2.2355154693159904, + "learning_rate": 1.5894080296857888e-05, + "loss": 0.7198, + "step": 4188 + }, + { + "epoch": 0.6410099464422341, + "grad_norm": 2.2241296107205524, + "learning_rate": 1.5892078151676594e-05, + "loss": 0.7909, + "step": 4189 + }, + { + "epoch": 0.6411629686304514, + "grad_norm": 2.605633584782082, + "learning_rate": 1.589007564463973e-05, + "loss": 0.8084, + "step": 4190 + }, + { + "epoch": 0.6413159908186687, + "grad_norm": 2.454508772411002, + "learning_rate": 1.588807277587028e-05, + "loss": 0.8879, + "step": 4191 + }, + { + "epoch": 0.641469013006886, + "grad_norm": 2.4298870166317212, + "learning_rate": 1.5886069545491246e-05, + "loss": 0.6854, + "step": 4192 + }, + { + "epoch": 0.6416220351951033, + "grad_norm": 2.4906106059246733, + "learning_rate": 1.5884065953625657e-05, + "loss": 0.8755, + "step": 4193 + }, + { + "epoch": 0.6417750573833206, + "grad_norm": 2.1516796449874516, + "learning_rate": 1.5882062000396558e-05, + "loss": 0.7066, + "step": 4194 + }, + { + "epoch": 0.6419280795715379, + "grad_norm": 2.493491767448356, + "learning_rate": 1.588005768592702e-05, + "loss": 0.7756, + "step": 4195 + }, + { + "epoch": 0.6420811017597552, + "grad_norm": 2.6459928697516086, + "learning_rate": 1.587805301034014e-05, + "loss": 0.8202, + "step": 4196 + }, + { + "epoch": 0.6422341239479724, + "grad_norm": 2.4018200071295177, + "learning_rate": 1.587604797375902e-05, + "loss": 0.6751, + "step": 4197 + }, + { + "epoch": 0.6423871461361897, + "grad_norm": 2.367904569625447, + "learning_rate": 1.5874042576306817e-05, + "loss": 0.7709, + "step": 4198 + }, + { + "epoch": 0.6425401683244071, + "grad_norm": 2.3896747175232327, + "learning_rate": 1.5872036818106672e-05, + "loss": 0.8643, + "step": 4199 + }, + { + "epoch": 0.6426931905126243, + "grad_norm": 2.7365450842405306, + "learning_rate": 1.5870030699281783e-05, + "loss": 0.709, + "step": 4200 + }, + { + "epoch": 0.6428462127008416, + "grad_norm": 1.9490071277493157, + "learning_rate": 
1.5868024219955337e-05, + "loss": 0.61, + "step": 4201 + }, + { + "epoch": 0.642999234889059, + "grad_norm": 2.2493824782464, + "learning_rate": 1.5866017380250573e-05, + "loss": 0.6998, + "step": 4202 + }, + { + "epoch": 0.6431522570772762, + "grad_norm": 2.078835252832914, + "learning_rate": 1.5864010180290732e-05, + "loss": 0.6524, + "step": 4203 + }, + { + "epoch": 0.6433052792654935, + "grad_norm": 2.295853031190946, + "learning_rate": 1.5862002620199086e-05, + "loss": 0.7786, + "step": 4204 + }, + { + "epoch": 0.6434583014537107, + "grad_norm": 2.5791704393175507, + "learning_rate": 1.5859994700098925e-05, + "loss": 0.7799, + "step": 4205 + }, + { + "epoch": 0.6436113236419281, + "grad_norm": 2.288695724711449, + "learning_rate": 1.5857986420113568e-05, + "loss": 0.7069, + "step": 4206 + }, + { + "epoch": 0.6437643458301454, + "grad_norm": 2.0640414343256586, + "learning_rate": 1.5855977780366347e-05, + "loss": 0.6614, + "step": 4207 + }, + { + "epoch": 0.6439173680183626, + "grad_norm": 2.687001513306309, + "learning_rate": 1.5853968780980624e-05, + "loss": 0.6861, + "step": 4208 + }, + { + "epoch": 0.64407039020658, + "grad_norm": 2.3769771682509444, + "learning_rate": 1.585195942207977e-05, + "loss": 0.7767, + "step": 4209 + }, + { + "epoch": 0.6442234123947972, + "grad_norm": 2.5169919724774026, + "learning_rate": 1.58499497037872e-05, + "loss": 0.7743, + "step": 4210 + }, + { + "epoch": 0.6443764345830145, + "grad_norm": 2.3993993027725664, + "learning_rate": 1.5847939626226336e-05, + "loss": 0.7773, + "step": 4211 + }, + { + "epoch": 0.6445294567712319, + "grad_norm": 2.37377422024334, + "learning_rate": 1.5845929189520623e-05, + "loss": 0.8059, + "step": 4212 + }, + { + "epoch": 0.6446824789594491, + "grad_norm": 2.4955691858759743, + "learning_rate": 1.5843918393793523e-05, + "loss": 0.6907, + "step": 4213 + }, + { + "epoch": 0.6448355011476664, + "grad_norm": 2.4733738421996296, + "learning_rate": 1.5841907239168535e-05, + "loss": 0.909, + "step": 4214 + }, + { + "epoch": 0.6449885233358837, + "grad_norm": 2.1763820948064976, + "learning_rate": 1.5839895725769173e-05, + "loss": 0.7843, + "step": 4215 + }, + { + "epoch": 0.645141545524101, + "grad_norm": 2.1398197268300114, + "learning_rate": 1.5837883853718964e-05, + "loss": 0.6951, + "step": 4216 + }, + { + "epoch": 0.6452945677123183, + "grad_norm": 2.5145027467966528, + "learning_rate": 1.583587162314147e-05, + "loss": 0.8311, + "step": 4217 + }, + { + "epoch": 0.6454475899005355, + "grad_norm": 2.4465579575147656, + "learning_rate": 1.5833859034160275e-05, + "loss": 0.6852, + "step": 4218 + }, + { + "epoch": 0.6456006120887529, + "grad_norm": 2.3378074319427453, + "learning_rate": 1.583184608689897e-05, + "loss": 0.7895, + "step": 4219 + }, + { + "epoch": 0.6457536342769702, + "grad_norm": 1.9247135984095447, + "learning_rate": 1.5829832781481186e-05, + "loss": 0.6891, + "step": 4220 + }, + { + "epoch": 0.6459066564651874, + "grad_norm": 2.2282892978052735, + "learning_rate": 1.582781911803056e-05, + "loss": 0.736, + "step": 4221 + }, + { + "epoch": 0.6460596786534047, + "grad_norm": 2.285669724890519, + "learning_rate": 1.5825805096670766e-05, + "loss": 0.7469, + "step": 4222 + }, + { + "epoch": 0.6462127008416221, + "grad_norm": 2.581987099916605, + "learning_rate": 1.5823790717525487e-05, + "loss": 0.8131, + "step": 4223 + }, + { + "epoch": 0.6463657230298393, + "grad_norm": 2.00862726818773, + "learning_rate": 1.5821775980718443e-05, + "loss": 0.7074, + "step": 4224 + }, + { + "epoch": 0.6465187452180566, + 
"grad_norm": 2.3151539318728314, + "learning_rate": 1.581976088637336e-05, + "loss": 0.711, + "step": 4225 + }, + { + "epoch": 0.6466717674062739, + "grad_norm": 2.159998848069135, + "learning_rate": 1.581774543461399e-05, + "loss": 0.7569, + "step": 4226 + }, + { + "epoch": 0.6468247895944912, + "grad_norm": 2.2315436312344916, + "learning_rate": 1.5815729625564116e-05, + "loss": 0.6982, + "step": 4227 + }, + { + "epoch": 0.6469778117827085, + "grad_norm": 2.3296082075550983, + "learning_rate": 1.5813713459347532e-05, + "loss": 0.6977, + "step": 4228 + }, + { + "epoch": 0.6471308339709257, + "grad_norm": 2.3650900215329758, + "learning_rate": 1.5811696936088066e-05, + "loss": 0.7675, + "step": 4229 + }, + { + "epoch": 0.6472838561591431, + "grad_norm": 2.0510993157963515, + "learning_rate": 1.5809680055909552e-05, + "loss": 0.5907, + "step": 4230 + }, + { + "epoch": 0.6474368783473604, + "grad_norm": 2.392826682725871, + "learning_rate": 1.580766281893586e-05, + "loss": 0.756, + "step": 4231 + }, + { + "epoch": 0.6475899005355776, + "grad_norm": 2.159486959255893, + "learning_rate": 1.5805645225290872e-05, + "loss": 0.7136, + "step": 4232 + }, + { + "epoch": 0.647742922723795, + "grad_norm": 2.13632556895698, + "learning_rate": 1.58036272750985e-05, + "loss": 0.7156, + "step": 4233 + }, + { + "epoch": 0.6478959449120122, + "grad_norm": 2.2967568833902283, + "learning_rate": 1.5801608968482676e-05, + "loss": 0.8513, + "step": 4234 + }, + { + "epoch": 0.6480489671002295, + "grad_norm": 2.1817154806929016, + "learning_rate": 1.5799590305567345e-05, + "loss": 0.7536, + "step": 4235 + }, + { + "epoch": 0.6482019892884469, + "grad_norm": 2.0809234386044073, + "learning_rate": 1.5797571286476484e-05, + "loss": 0.7957, + "step": 4236 + }, + { + "epoch": 0.6483550114766641, + "grad_norm": 2.6517476323715115, + "learning_rate": 1.5795551911334096e-05, + "loss": 0.7822, + "step": 4237 + }, + { + "epoch": 0.6485080336648814, + "grad_norm": 2.350343075077592, + "learning_rate": 1.579353218026419e-05, + "loss": 0.7427, + "step": 4238 + }, + { + "epoch": 0.6486610558530987, + "grad_norm": 2.1243287340078383, + "learning_rate": 1.579151209339081e-05, + "loss": 0.8415, + "step": 4239 + }, + { + "epoch": 0.648814078041316, + "grad_norm": 1.989695986982748, + "learning_rate": 1.5789491650838013e-05, + "loss": 0.733, + "step": 4240 + }, + { + "epoch": 0.6489671002295333, + "grad_norm": 2.215200133687346, + "learning_rate": 1.5787470852729886e-05, + "loss": 0.7286, + "step": 4241 + }, + { + "epoch": 0.6491201224177505, + "grad_norm": 2.002861466308425, + "learning_rate": 1.5785449699190533e-05, + "loss": 0.708, + "step": 4242 + }, + { + "epoch": 0.6492731446059679, + "grad_norm": 2.198265404670774, + "learning_rate": 1.578342819034408e-05, + "loss": 0.6877, + "step": 4243 + }, + { + "epoch": 0.6494261667941852, + "grad_norm": 2.2444980427910064, + "learning_rate": 1.578140632631468e-05, + "loss": 0.7127, + "step": 4244 + }, + { + "epoch": 0.6495791889824024, + "grad_norm": 2.3743717173706855, + "learning_rate": 1.57793841072265e-05, + "loss": 0.7277, + "step": 4245 + }, + { + "epoch": 0.6497322111706197, + "grad_norm": 2.094820997637052, + "learning_rate": 1.5777361533203733e-05, + "loss": 0.6453, + "step": 4246 + }, + { + "epoch": 0.6498852333588371, + "grad_norm": 2.359769995436231, + "learning_rate": 1.577533860437059e-05, + "loss": 0.7156, + "step": 4247 + }, + { + "epoch": 0.6500382555470543, + "grad_norm": 2.4827444410396127, + "learning_rate": 1.577331532085131e-05, + "loss": 0.8702, + "step": 4248 + 
}, + { + "epoch": 0.6501912777352716, + "grad_norm": 2.392016169668508, + "learning_rate": 1.5771291682770156e-05, + "loss": 0.8331, + "step": 4249 + }, + { + "epoch": 0.6503442999234889, + "grad_norm": 2.3940592611494895, + "learning_rate": 1.57692676902514e-05, + "loss": 0.7551, + "step": 4250 + }, + { + "epoch": 0.6504973221117062, + "grad_norm": 2.3839301346084025, + "learning_rate": 1.5767243343419342e-05, + "loss": 0.8463, + "step": 4251 + }, + { + "epoch": 0.6506503442999235, + "grad_norm": 2.4202585155038787, + "learning_rate": 1.5765218642398314e-05, + "loss": 0.7652, + "step": 4252 + }, + { + "epoch": 0.6508033664881407, + "grad_norm": 2.1053361807836937, + "learning_rate": 1.5763193587312655e-05, + "loss": 0.6554, + "step": 4253 + }, + { + "epoch": 0.6509563886763581, + "grad_norm": 1.8735355733426593, + "learning_rate": 1.5761168178286727e-05, + "loss": 0.691, + "step": 4254 + }, + { + "epoch": 0.6511094108645754, + "grad_norm": 2.224692423547609, + "learning_rate": 1.5759142415444925e-05, + "loss": 0.696, + "step": 4255 + }, + { + "epoch": 0.6512624330527926, + "grad_norm": 2.0753222236814683, + "learning_rate": 1.575711629891166e-05, + "loss": 0.7598, + "step": 4256 + }, + { + "epoch": 0.65141545524101, + "grad_norm": 2.2697210222802306, + "learning_rate": 1.5755089828811362e-05, + "loss": 0.6358, + "step": 4257 + }, + { + "epoch": 0.6515684774292272, + "grad_norm": 2.3751008200480275, + "learning_rate": 1.5753063005268483e-05, + "loss": 0.7119, + "step": 4258 + }, + { + "epoch": 0.6517214996174445, + "grad_norm": 2.124262512388635, + "learning_rate": 1.5751035828407494e-05, + "loss": 0.6876, + "step": 4259 + }, + { + "epoch": 0.6518745218056619, + "grad_norm": 2.2054070929703276, + "learning_rate": 1.57490082983529e-05, + "loss": 0.7775, + "step": 4260 + }, + { + "epoch": 0.6520275439938791, + "grad_norm": 2.077240839261722, + "learning_rate": 1.5746980415229217e-05, + "loss": 0.6499, + "step": 4261 + }, + { + "epoch": 0.6521805661820964, + "grad_norm": 2.2208562054207053, + "learning_rate": 1.5744952179160985e-05, + "loss": 0.8069, + "step": 4262 + }, + { + "epoch": 0.6523335883703137, + "grad_norm": 2.135417668181956, + "learning_rate": 1.5742923590272762e-05, + "loss": 0.7635, + "step": 4263 + }, + { + "epoch": 0.652486610558531, + "grad_norm": 2.2710496430797926, + "learning_rate": 1.5740894648689138e-05, + "loss": 0.8612, + "step": 4264 + }, + { + "epoch": 0.6526396327467483, + "grad_norm": 2.097980111255635, + "learning_rate": 1.5738865354534714e-05, + "loss": 0.7313, + "step": 4265 + }, + { + "epoch": 0.6527926549349655, + "grad_norm": 2.3366510066682835, + "learning_rate": 1.573683570793412e-05, + "loss": 0.7146, + "step": 4266 + }, + { + "epoch": 0.6529456771231829, + "grad_norm": 2.162653896948852, + "learning_rate": 1.5734805709012e-05, + "loss": 0.7029, + "step": 4267 + }, + { + "epoch": 0.6530986993114002, + "grad_norm": 2.3656466257792323, + "learning_rate": 1.5732775357893024e-05, + "loss": 0.8957, + "step": 4268 + }, + { + "epoch": 0.6532517214996174, + "grad_norm": 2.098111046017186, + "learning_rate": 1.573074465470189e-05, + "loss": 0.6822, + "step": 4269 + }, + { + "epoch": 0.6534047436878347, + "grad_norm": 2.4117741397672408, + "learning_rate": 1.5728713599563306e-05, + "loss": 0.7866, + "step": 4270 + }, + { + "epoch": 0.6535577658760521, + "grad_norm": 2.174338870646714, + "learning_rate": 1.5726682192602015e-05, + "loss": 0.8173, + "step": 4271 + }, + { + "epoch": 0.6537107880642693, + "grad_norm": 2.186513283043854, + "learning_rate": 
1.572465043394276e-05, + "loss": 0.776, + "step": 4272 + }, + { + "epoch": 0.6538638102524866, + "grad_norm": 2.4290604461156273, + "learning_rate": 1.572261832371033e-05, + "loss": 0.8385, + "step": 4273 + }, + { + "epoch": 0.6540168324407039, + "grad_norm": 2.204819777026553, + "learning_rate": 1.5720585862029522e-05, + "loss": 0.7703, + "step": 4274 + }, + { + "epoch": 0.6541698546289212, + "grad_norm": 2.3510224710248004, + "learning_rate": 1.571855304902516e-05, + "loss": 0.7174, + "step": 4275 + }, + { + "epoch": 0.6543228768171385, + "grad_norm": 2.5013178502584084, + "learning_rate": 1.571651988482208e-05, + "loss": 0.7605, + "step": 4276 + }, + { + "epoch": 0.6544758990053557, + "grad_norm": 2.4612060334374517, + "learning_rate": 1.5714486369545152e-05, + "loss": 0.7707, + "step": 4277 + }, + { + "epoch": 0.6546289211935731, + "grad_norm": 2.2725366255063655, + "learning_rate": 1.571245250331926e-05, + "loss": 0.8654, + "step": 4278 + }, + { + "epoch": 0.6547819433817904, + "grad_norm": 2.216534816275824, + "learning_rate": 1.5710418286269313e-05, + "loss": 0.68, + "step": 4279 + }, + { + "epoch": 0.6549349655700076, + "grad_norm": 2.2998627589268352, + "learning_rate": 1.570838371852024e-05, + "loss": 0.7354, + "step": 4280 + }, + { + "epoch": 0.655087987758225, + "grad_norm": 2.559430498766767, + "learning_rate": 1.5706348800196993e-05, + "loss": 0.7744, + "step": 4281 + }, + { + "epoch": 0.6552410099464422, + "grad_norm": 1.9862496757382806, + "learning_rate": 1.570431353142454e-05, + "loss": 0.674, + "step": 4282 + }, + { + "epoch": 0.6553940321346595, + "grad_norm": 2.315587355397246, + "learning_rate": 1.570227791232788e-05, + "loss": 0.7833, + "step": 4283 + }, + { + "epoch": 0.6555470543228769, + "grad_norm": 2.175044828127138, + "learning_rate": 1.570024194303202e-05, + "loss": 0.6864, + "step": 4284 + }, + { + "epoch": 0.6557000765110941, + "grad_norm": 2.037261450774724, + "learning_rate": 1.5698205623662013e-05, + "loss": 0.619, + "step": 4285 + }, + { + "epoch": 0.6558530986993114, + "grad_norm": 2.1956095246913505, + "learning_rate": 1.56961689543429e-05, + "loss": 0.6983, + "step": 4286 + }, + { + "epoch": 0.6560061208875287, + "grad_norm": 2.5219192285962375, + "learning_rate": 1.569413193519977e-05, + "loss": 0.7786, + "step": 4287 + }, + { + "epoch": 0.656159143075746, + "grad_norm": 1.9680199150456783, + "learning_rate": 1.5692094566357723e-05, + "loss": 0.6138, + "step": 4288 + }, + { + "epoch": 0.6563121652639633, + "grad_norm": 2.791679205377205, + "learning_rate": 1.569005684794188e-05, + "loss": 0.7804, + "step": 4289 + }, + { + "epoch": 0.6564651874521805, + "grad_norm": 2.441683768664178, + "learning_rate": 1.5688018780077387e-05, + "loss": 0.7648, + "step": 4290 + }, + { + "epoch": 0.6566182096403979, + "grad_norm": 2.3593952030509393, + "learning_rate": 1.5685980362889407e-05, + "loss": 0.7983, + "step": 4291 + }, + { + "epoch": 0.6567712318286152, + "grad_norm": 2.44463161656858, + "learning_rate": 1.5683941596503132e-05, + "loss": 0.8427, + "step": 4292 + }, + { + "epoch": 0.6569242540168324, + "grad_norm": 2.477271920848612, + "learning_rate": 1.568190248104377e-05, + "loss": 0.709, + "step": 4293 + }, + { + "epoch": 0.6570772762050497, + "grad_norm": 2.0726834727509247, + "learning_rate": 1.567986301663654e-05, + "loss": 0.7644, + "step": 4294 + }, + { + "epoch": 0.6572302983932671, + "grad_norm": 2.4549904996769523, + "learning_rate": 1.567782320340671e-05, + "loss": 0.6868, + "step": 4295 + }, + { + "epoch": 0.6573833205814843, + "grad_norm": 
2.133291763535228, + "learning_rate": 1.5675783041479542e-05, + "loss": 0.6548, + "step": 4296 + }, + { + "epoch": 0.6575363427697016, + "grad_norm": 2.69944862833919, + "learning_rate": 1.5673742530980337e-05, + "loss": 0.7521, + "step": 4297 + }, + { + "epoch": 0.6576893649579189, + "grad_norm": 2.318466203294317, + "learning_rate": 1.5671701672034406e-05, + "loss": 0.6864, + "step": 4298 + }, + { + "epoch": 0.6578423871461362, + "grad_norm": 2.4267845713840233, + "learning_rate": 1.5669660464767087e-05, + "loss": 0.8534, + "step": 4299 + }, + { + "epoch": 0.6579954093343535, + "grad_norm": 2.382318083873871, + "learning_rate": 1.5667618909303738e-05, + "loss": 0.7832, + "step": 4300 + }, + { + "epoch": 0.6581484315225707, + "grad_norm": 2.2591848691994003, + "learning_rate": 1.566557700576974e-05, + "loss": 0.7867, + "step": 4301 + }, + { + "epoch": 0.6583014537107881, + "grad_norm": 2.1813851643328905, + "learning_rate": 1.5663534754290496e-05, + "loss": 0.8173, + "step": 4302 + }, + { + "epoch": 0.6584544758990054, + "grad_norm": 2.3371976725354635, + "learning_rate": 1.5661492154991424e-05, + "loss": 0.6822, + "step": 4303 + }, + { + "epoch": 0.6586074980872226, + "grad_norm": 2.4281104499333086, + "learning_rate": 1.5659449207997975e-05, + "loss": 0.7563, + "step": 4304 + }, + { + "epoch": 0.65876052027544, + "grad_norm": 2.332443416486321, + "learning_rate": 1.5657405913435608e-05, + "loss": 0.7429, + "step": 4305 + }, + { + "epoch": 0.6589135424636572, + "grad_norm": 2.1794383230938497, + "learning_rate": 1.565536227142981e-05, + "loss": 0.7894, + "step": 4306 + }, + { + "epoch": 0.6590665646518745, + "grad_norm": 2.2246346257704994, + "learning_rate": 1.5653318282106096e-05, + "loss": 0.7179, + "step": 4307 + }, + { + "epoch": 0.6592195868400919, + "grad_norm": 2.278677670858125, + "learning_rate": 1.5651273945589984e-05, + "loss": 0.8088, + "step": 4308 + }, + { + "epoch": 0.6593726090283091, + "grad_norm": 2.3163101708985665, + "learning_rate": 1.5649229262007035e-05, + "loss": 0.7675, + "step": 4309 + }, + { + "epoch": 0.6595256312165264, + "grad_norm": 2.1382456702220924, + "learning_rate": 1.5647184231482816e-05, + "loss": 0.6105, + "step": 4310 + }, + { + "epoch": 0.6596786534047437, + "grad_norm": 2.6339501865403645, + "learning_rate": 1.5645138854142926e-05, + "loss": 0.7263, + "step": 4311 + }, + { + "epoch": 0.659831675592961, + "grad_norm": 2.622754234753976, + "learning_rate": 1.564309313011297e-05, + "loss": 0.8526, + "step": 4312 + }, + { + "epoch": 0.6599846977811783, + "grad_norm": 2.3998143541848145, + "learning_rate": 1.564104705951859e-05, + "loss": 0.8304, + "step": 4313 + }, + { + "epoch": 0.6601377199693955, + "grad_norm": 2.197439826841551, + "learning_rate": 1.563900064248544e-05, + "loss": 0.7395, + "step": 4314 + }, + { + "epoch": 0.6602907421576129, + "grad_norm": 2.3044007948624072, + "learning_rate": 1.5636953879139204e-05, + "loss": 0.7928, + "step": 4315 + }, + { + "epoch": 0.6604437643458302, + "grad_norm": 2.2479485395298666, + "learning_rate": 1.563490676960558e-05, + "loss": 0.791, + "step": 4316 + }, + { + "epoch": 0.6605967865340474, + "grad_norm": 2.3513886944279325, + "learning_rate": 1.563285931401028e-05, + "loss": 0.7341, + "step": 4317 + }, + { + "epoch": 0.6607498087222647, + "grad_norm": 2.177828319997214, + "learning_rate": 1.563081151247906e-05, + "loss": 0.7671, + "step": 4318 + }, + { + "epoch": 0.6609028309104821, + "grad_norm": 2.192816682090379, + "learning_rate": 1.562876336513768e-05, + "loss": 0.83, + "step": 4319 + }, + { + 
"epoch": 0.6610558530986993, + "grad_norm": 2.0953415645877778, + "learning_rate": 1.5626714872111915e-05, + "loss": 0.6911, + "step": 4320 + }, + { + "epoch": 0.6612088752869166, + "grad_norm": 2.2103399446325276, + "learning_rate": 1.5624666033527577e-05, + "loss": 0.7532, + "step": 4321 + }, + { + "epoch": 0.6613618974751339, + "grad_norm": 2.4237262824412125, + "learning_rate": 1.5622616849510497e-05, + "loss": 0.7964, + "step": 4322 + }, + { + "epoch": 0.6615149196633512, + "grad_norm": 2.100080910063574, + "learning_rate": 1.5620567320186522e-05, + "loss": 0.5986, + "step": 4323 + }, + { + "epoch": 0.6616679418515685, + "grad_norm": 2.2211004196585367, + "learning_rate": 1.561851744568152e-05, + "loss": 0.7725, + "step": 4324 + }, + { + "epoch": 0.6618209640397857, + "grad_norm": 2.2274632487007358, + "learning_rate": 1.561646722612138e-05, + "loss": 0.7935, + "step": 4325 + }, + { + "epoch": 0.6619739862280031, + "grad_norm": 2.0708948012053563, + "learning_rate": 1.5614416661632015e-05, + "loss": 0.6594, + "step": 4326 + }, + { + "epoch": 0.6621270084162203, + "grad_norm": 2.2197858350008843, + "learning_rate": 1.5612365752339364e-05, + "loss": 0.766, + "step": 4327 + }, + { + "epoch": 0.6622800306044376, + "grad_norm": 2.220248640125984, + "learning_rate": 1.5610314498369372e-05, + "loss": 0.7767, + "step": 4328 + }, + { + "epoch": 0.662433052792655, + "grad_norm": 2.380043383610386, + "learning_rate": 1.560826289984802e-05, + "loss": 0.7342, + "step": 4329 + }, + { + "epoch": 0.6625860749808722, + "grad_norm": 2.6236063675363046, + "learning_rate": 1.5606210956901303e-05, + "loss": 0.7274, + "step": 4330 + }, + { + "epoch": 0.6627390971690895, + "grad_norm": 2.148371880434111, + "learning_rate": 1.560415866965524e-05, + "loss": 0.6713, + "step": 4331 + }, + { + "epoch": 0.6628921193573069, + "grad_norm": 2.0025618604021607, + "learning_rate": 1.5602106038235874e-05, + "loss": 0.6365, + "step": 4332 + }, + { + "epoch": 0.6630451415455241, + "grad_norm": 2.21401912073108, + "learning_rate": 1.5600053062769253e-05, + "loss": 0.6911, + "step": 4333 + }, + { + "epoch": 0.6631981637337414, + "grad_norm": 2.228557608606604, + "learning_rate": 1.559799974338147e-05, + "loss": 0.6889, + "step": 4334 + }, + { + "epoch": 0.6633511859219586, + "grad_norm": 2.438672814007364, + "learning_rate": 1.5595946080198624e-05, + "loss": 0.7678, + "step": 4335 + }, + { + "epoch": 0.663504208110176, + "grad_norm": 2.2428273489360913, + "learning_rate": 1.5593892073346837e-05, + "loss": 0.7243, + "step": 4336 + }, + { + "epoch": 0.6636572302983933, + "grad_norm": 2.4099325849538364, + "learning_rate": 1.5591837722952253e-05, + "loss": 0.8096, + "step": 4337 + }, + { + "epoch": 0.6638102524866105, + "grad_norm": 2.19582365303818, + "learning_rate": 1.5589783029141038e-05, + "loss": 0.7445, + "step": 4338 + }, + { + "epoch": 0.6639632746748279, + "grad_norm": 2.2901855214253173, + "learning_rate": 1.558772799203938e-05, + "loss": 0.7788, + "step": 4339 + }, + { + "epoch": 0.6641162968630452, + "grad_norm": 2.093622043015938, + "learning_rate": 1.558567261177349e-05, + "loss": 0.8354, + "step": 4340 + }, + { + "epoch": 0.6642693190512624, + "grad_norm": 2.233333883954918, + "learning_rate": 1.5583616888469594e-05, + "loss": 0.8035, + "step": 4341 + }, + { + "epoch": 0.6644223412394797, + "grad_norm": 2.027649735934898, + "learning_rate": 1.5581560822253938e-05, + "loss": 0.6271, + "step": 4342 + }, + { + "epoch": 0.664575363427697, + "grad_norm": 2.3410794486831534, + "learning_rate": 1.55795044132528e-05, 
+ "loss": 0.6748, + "step": 4343 + }, + { + "epoch": 0.6647283856159143, + "grad_norm": 2.034670891220707, + "learning_rate": 1.5577447661592467e-05, + "loss": 0.6798, + "step": 4344 + }, + { + "epoch": 0.6648814078041316, + "grad_norm": 2.267729163658624, + "learning_rate": 1.5575390567399253e-05, + "loss": 0.7299, + "step": 4345 + }, + { + "epoch": 0.6650344299923489, + "grad_norm": 2.3485035033426493, + "learning_rate": 1.5573333130799494e-05, + "loss": 0.7796, + "step": 4346 + }, + { + "epoch": 0.6651874521805662, + "grad_norm": 2.3608093796256657, + "learning_rate": 1.5571275351919543e-05, + "loss": 1.0151, + "step": 4347 + }, + { + "epoch": 0.6653404743687835, + "grad_norm": 2.258946043310113, + "learning_rate": 1.556921723088578e-05, + "loss": 0.668, + "step": 4348 + }, + { + "epoch": 0.6654934965570007, + "grad_norm": 2.4385006256619626, + "learning_rate": 1.5567158767824603e-05, + "loss": 0.7649, + "step": 4349 + }, + { + "epoch": 0.6656465187452181, + "grad_norm": 2.187731470321015, + "learning_rate": 1.556509996286242e-05, + "loss": 0.74, + "step": 4350 + }, + { + "epoch": 0.6657995409334353, + "grad_norm": 2.136213298646861, + "learning_rate": 1.5563040816125683e-05, + "loss": 0.756, + "step": 4351 + }, + { + "epoch": 0.6659525631216526, + "grad_norm": 2.464238595477538, + "learning_rate": 1.5560981327740846e-05, + "loss": 0.7432, + "step": 4352 + }, + { + "epoch": 0.66610558530987, + "grad_norm": 2.138263083825212, + "learning_rate": 1.5558921497834387e-05, + "loss": 0.7623, + "step": 4353 + }, + { + "epoch": 0.6662586074980872, + "grad_norm": 2.2159031648485157, + "learning_rate": 1.5556861326532817e-05, + "loss": 0.6481, + "step": 4354 + }, + { + "epoch": 0.6664116296863045, + "grad_norm": 2.3079101636175405, + "learning_rate": 1.5554800813962652e-05, + "loss": 0.7337, + "step": 4355 + }, + { + "epoch": 0.6665646518745219, + "grad_norm": 2.3557552648531095, + "learning_rate": 1.555273996025044e-05, + "loss": 0.7614, + "step": 4356 + }, + { + "epoch": 0.6667176740627391, + "grad_norm": 2.3975045855476234, + "learning_rate": 1.5550678765522743e-05, + "loss": 0.8563, + "step": 4357 + }, + { + "epoch": 0.6668706962509564, + "grad_norm": 2.3698126148175263, + "learning_rate": 1.554861722990615e-05, + "loss": 0.6551, + "step": 4358 + }, + { + "epoch": 0.6670237184391736, + "grad_norm": 1.9109474500910666, + "learning_rate": 1.5546555353527268e-05, + "loss": 0.5765, + "step": 4359 + }, + { + "epoch": 0.667176740627391, + "grad_norm": 2.347918132400794, + "learning_rate": 1.554449313651272e-05, + "loss": 0.8157, + "step": 4360 + }, + { + "epoch": 0.6673297628156083, + "grad_norm": 2.1667966924767517, + "learning_rate": 1.554243057898916e-05, + "loss": 0.7399, + "step": 4361 + }, + { + "epoch": 0.6674827850038255, + "grad_norm": 2.086540955838038, + "learning_rate": 1.5540367681083256e-05, + "loss": 0.6853, + "step": 4362 + }, + { + "epoch": 0.6676358071920429, + "grad_norm": 2.132291609887321, + "learning_rate": 1.5538304442921694e-05, + "loss": 0.7374, + "step": 4363 + }, + { + "epoch": 0.6677888293802602, + "grad_norm": 2.363328018543426, + "learning_rate": 1.5536240864631197e-05, + "loss": 0.824, + "step": 4364 + }, + { + "epoch": 0.6679418515684774, + "grad_norm": 2.3075046294927977, + "learning_rate": 1.553417694633849e-05, + "loss": 0.7507, + "step": 4365 + }, + { + "epoch": 0.6680948737566947, + "grad_norm": 2.5578916953080086, + "learning_rate": 1.553211268817032e-05, + "loss": 0.9281, + "step": 4366 + }, + { + "epoch": 0.668247895944912, + "grad_norm": 2.0753350483746233, + 
"learning_rate": 1.5530048090253475e-05, + "loss": 0.7547, + "step": 4367 + }, + { + "epoch": 0.6684009181331293, + "grad_norm": 2.069627851881168, + "learning_rate": 1.552798315271474e-05, + "loss": 0.7056, + "step": 4368 + }, + { + "epoch": 0.6685539403213466, + "grad_norm": 2.3010025507959573, + "learning_rate": 1.5525917875680936e-05, + "loss": 0.6966, + "step": 4369 + }, + { + "epoch": 0.6687069625095639, + "grad_norm": 2.5111567762780154, + "learning_rate": 1.55238522592789e-05, + "loss": 0.7832, + "step": 4370 + }, + { + "epoch": 0.6688599846977812, + "grad_norm": 1.9618299176729825, + "learning_rate": 1.5521786303635483e-05, + "loss": 0.6293, + "step": 4371 + }, + { + "epoch": 0.6690130068859985, + "grad_norm": 2.225950898436061, + "learning_rate": 1.5519720008877567e-05, + "loss": 0.7209, + "step": 4372 + }, + { + "epoch": 0.6691660290742157, + "grad_norm": 1.992624936802358, + "learning_rate": 1.5517653375132055e-05, + "loss": 0.7138, + "step": 4373 + }, + { + "epoch": 0.6693190512624331, + "grad_norm": 1.8752156297552323, + "learning_rate": 1.551558640252586e-05, + "loss": 0.6508, + "step": 4374 + }, + { + "epoch": 0.6694720734506503, + "grad_norm": 2.3301325690600945, + "learning_rate": 1.5513519091185934e-05, + "loss": 0.5898, + "step": 4375 + }, + { + "epoch": 0.6696250956388676, + "grad_norm": 2.3208751688365576, + "learning_rate": 1.5511451441239227e-05, + "loss": 0.6817, + "step": 4376 + }, + { + "epoch": 0.669778117827085, + "grad_norm": 2.3173083606568396, + "learning_rate": 1.5509383452812725e-05, + "loss": 0.5693, + "step": 4377 + }, + { + "epoch": 0.6699311400153022, + "grad_norm": 2.383092745146042, + "learning_rate": 1.5507315126033435e-05, + "loss": 0.6256, + "step": 4378 + }, + { + "epoch": 0.6700841622035195, + "grad_norm": 1.8645480695198386, + "learning_rate": 1.550524646102838e-05, + "loss": 0.526, + "step": 4379 + }, + { + "epoch": 0.6702371843917369, + "grad_norm": 2.4363367750243805, + "learning_rate": 1.5503177457924597e-05, + "loss": 0.7734, + "step": 4380 + }, + { + "epoch": 0.6703902065799541, + "grad_norm": 2.5537622593414415, + "learning_rate": 1.5501108116849162e-05, + "loss": 0.7684, + "step": 4381 + }, + { + "epoch": 0.6705432287681714, + "grad_norm": 2.226941029837064, + "learning_rate": 1.549903843792916e-05, + "loss": 0.7214, + "step": 4382 + }, + { + "epoch": 0.6706962509563886, + "grad_norm": 2.4219083185269015, + "learning_rate": 1.5496968421291687e-05, + "loss": 0.8276, + "step": 4383 + }, + { + "epoch": 0.670849273144606, + "grad_norm": 2.160286832500492, + "learning_rate": 1.5494898067063885e-05, + "loss": 0.7445, + "step": 4384 + }, + { + "epoch": 0.6710022953328233, + "grad_norm": 2.207589097594382, + "learning_rate": 1.5492827375372895e-05, + "loss": 0.5768, + "step": 4385 + }, + { + "epoch": 0.6711553175210405, + "grad_norm": 2.163120547285754, + "learning_rate": 1.5490756346345887e-05, + "loss": 0.7213, + "step": 4386 + }, + { + "epoch": 0.6713083397092579, + "grad_norm": 2.2884691646975153, + "learning_rate": 1.5488684980110052e-05, + "loss": 0.7249, + "step": 4387 + }, + { + "epoch": 0.6714613618974752, + "grad_norm": 2.302420265586536, + "learning_rate": 1.5486613276792604e-05, + "loss": 0.7526, + "step": 4388 + }, + { + "epoch": 0.6716143840856924, + "grad_norm": 2.2770861895161585, + "learning_rate": 1.5484541236520767e-05, + "loss": 0.6862, + "step": 4389 + }, + { + "epoch": 0.6717674062739097, + "grad_norm": 2.1419356393123854, + "learning_rate": 1.54824688594218e-05, + "loss": 0.6584, + "step": 4390 + }, + { + "epoch": 
0.671920428462127, + "grad_norm": 2.29317049827774, + "learning_rate": 1.548039614562297e-05, + "loss": 0.7182, + "step": 4391 + }, + { + "epoch": 0.6720734506503443, + "grad_norm": 2.432437397742068, + "learning_rate": 1.5478323095251572e-05, + "loss": 0.7444, + "step": 4392 + }, + { + "epoch": 0.6722264728385616, + "grad_norm": 2.129156587913408, + "learning_rate": 1.5476249708434928e-05, + "loss": 0.7134, + "step": 4393 + }, + { + "epoch": 0.6723794950267789, + "grad_norm": 2.4526432928617687, + "learning_rate": 1.547417598530036e-05, + "loss": 0.8522, + "step": 4394 + }, + { + "epoch": 0.6725325172149962, + "grad_norm": 1.9197792032530787, + "learning_rate": 1.5472101925975232e-05, + "loss": 0.6823, + "step": 4395 + }, + { + "epoch": 0.6726855394032135, + "grad_norm": 2.276953404476131, + "learning_rate": 1.5470027530586917e-05, + "loss": 0.7254, + "step": 4396 + }, + { + "epoch": 0.6728385615914307, + "grad_norm": 1.8933823182228429, + "learning_rate": 1.546795279926281e-05, + "loss": 0.5938, + "step": 4397 + }, + { + "epoch": 0.6729915837796481, + "grad_norm": 2.573327299964346, + "learning_rate": 1.5465877732130334e-05, + "loss": 0.7998, + "step": 4398 + }, + { + "epoch": 0.6731446059678653, + "grad_norm": 2.0314553890273146, + "learning_rate": 1.5463802329316925e-05, + "loss": 0.7874, + "step": 4399 + }, + { + "epoch": 0.6732976281560826, + "grad_norm": 2.5070632781376814, + "learning_rate": 1.5461726590950038e-05, + "loss": 0.8208, + "step": 4400 + }, + { + "epoch": 0.6734506503443, + "grad_norm": 2.3025849045760953, + "learning_rate": 1.5459650517157155e-05, + "loss": 0.7712, + "step": 4401 + }, + { + "epoch": 0.6736036725325172, + "grad_norm": 2.7271924988858642, + "learning_rate": 1.545757410806578e-05, + "loss": 0.8297, + "step": 4402 + }, + { + "epoch": 0.6737566947207345, + "grad_norm": 2.4592991194582554, + "learning_rate": 1.545549736380342e-05, + "loss": 0.755, + "step": 4403 + }, + { + "epoch": 0.6739097169089519, + "grad_norm": 2.2928675487770684, + "learning_rate": 1.545342028449763e-05, + "loss": 0.7825, + "step": 4404 + }, + { + "epoch": 0.6740627390971691, + "grad_norm": 2.396197246986, + "learning_rate": 1.5451342870275966e-05, + "loss": 0.7941, + "step": 4405 + }, + { + "epoch": 0.6742157612853864, + "grad_norm": 2.388805239861892, + "learning_rate": 1.5449265121266013e-05, + "loss": 0.7196, + "step": 4406 + }, + { + "epoch": 0.6743687834736036, + "grad_norm": 1.9013526614221556, + "learning_rate": 1.5447187037595365e-05, + "loss": 0.6383, + "step": 4407 + }, + { + "epoch": 0.674521805661821, + "grad_norm": 2.2240279181316263, + "learning_rate": 1.5445108619391658e-05, + "loss": 0.7109, + "step": 4408 + }, + { + "epoch": 0.6746748278500383, + "grad_norm": 2.197479041596447, + "learning_rate": 1.5443029866782527e-05, + "loss": 0.7536, + "step": 4409 + }, + { + "epoch": 0.6748278500382555, + "grad_norm": 2.2551420831296647, + "learning_rate": 1.544095077989564e-05, + "loss": 0.7972, + "step": 4410 + }, + { + "epoch": 0.6749808722264728, + "grad_norm": 2.0258059419407903, + "learning_rate": 1.5438871358858677e-05, + "loss": 0.6498, + "step": 4411 + }, + { + "epoch": 0.6751338944146902, + "grad_norm": 2.09550472938151, + "learning_rate": 1.543679160379935e-05, + "loss": 0.7254, + "step": 4412 + }, + { + "epoch": 0.6752869166029074, + "grad_norm": 2.5330437432780735, + "learning_rate": 1.543471151484538e-05, + "loss": 0.7032, + "step": 4413 + }, + { + "epoch": 0.6754399387911247, + "grad_norm": 2.348682051814609, + "learning_rate": 1.5432631092124516e-05, + "loss": 
0.8294, + "step": 4414 + }, + { + "epoch": 0.675592960979342, + "grad_norm": 2.1542786517889905, + "learning_rate": 1.5430550335764522e-05, + "loss": 0.5664, + "step": 4415 + }, + { + "epoch": 0.6757459831675593, + "grad_norm": 2.690330238546623, + "learning_rate": 1.542846924589319e-05, + "loss": 0.7578, + "step": 4416 + }, + { + "epoch": 0.6758990053557766, + "grad_norm": 2.398820373779532, + "learning_rate": 1.5426387822638323e-05, + "loss": 0.7756, + "step": 4417 + }, + { + "epoch": 0.6760520275439938, + "grad_norm": 2.583573843262456, + "learning_rate": 1.5424306066127754e-05, + "loss": 0.6986, + "step": 4418 + }, + { + "epoch": 0.6762050497322112, + "grad_norm": 2.935202140832654, + "learning_rate": 1.542222397648933e-05, + "loss": 0.8748, + "step": 4419 + }, + { + "epoch": 0.6763580719204285, + "grad_norm": 2.119216436906614, + "learning_rate": 1.5420141553850914e-05, + "loss": 0.6931, + "step": 4420 + }, + { + "epoch": 0.6765110941086457, + "grad_norm": 2.2188152996527304, + "learning_rate": 1.541805879834041e-05, + "loss": 0.7145, + "step": 4421 + }, + { + "epoch": 0.6766641162968631, + "grad_norm": 2.2188544191352104, + "learning_rate": 1.5415975710085715e-05, + "loss": 0.6896, + "step": 4422 + }, + { + "epoch": 0.6768171384850803, + "grad_norm": 2.1994570146008567, + "learning_rate": 1.5413892289214765e-05, + "loss": 0.7304, + "step": 4423 + }, + { + "epoch": 0.6769701606732976, + "grad_norm": 2.031793893241159, + "learning_rate": 1.5411808535855508e-05, + "loss": 0.7114, + "step": 4424 + }, + { + "epoch": 0.677123182861515, + "grad_norm": 2.2744688009146516, + "learning_rate": 1.540972445013592e-05, + "loss": 0.7738, + "step": 4425 + }, + { + "epoch": 0.6772762050497322, + "grad_norm": 1.9530244694556822, + "learning_rate": 1.5407640032183993e-05, + "loss": 0.5981, + "step": 4426 + }, + { + "epoch": 0.6774292272379495, + "grad_norm": 2.2978859459660987, + "learning_rate": 1.5405555282127732e-05, + "loss": 0.6649, + "step": 4427 + }, + { + "epoch": 0.6775822494261668, + "grad_norm": 2.279440938423363, + "learning_rate": 1.5403470200095176e-05, + "loss": 0.7574, + "step": 4428 + }, + { + "epoch": 0.6777352716143841, + "grad_norm": 2.40984717918197, + "learning_rate": 1.5401384786214377e-05, + "loss": 0.7694, + "step": 4429 + }, + { + "epoch": 0.6778882938026014, + "grad_norm": 2.3415709152188033, + "learning_rate": 1.5399299040613408e-05, + "loss": 0.7097, + "step": 4430 + }, + { + "epoch": 0.6780413159908186, + "grad_norm": 2.270028435261307, + "learning_rate": 1.539721296342036e-05, + "loss": 0.7187, + "step": 4431 + }, + { + "epoch": 0.678194338179036, + "grad_norm": 2.231886593812637, + "learning_rate": 1.5395126554763357e-05, + "loss": 0.7461, + "step": 4432 + }, + { + "epoch": 0.6783473603672533, + "grad_norm": 2.030871074521151, + "learning_rate": 1.539303981477052e-05, + "loss": 0.6325, + "step": 4433 + }, + { + "epoch": 0.6785003825554705, + "grad_norm": 2.3190377138045912, + "learning_rate": 1.5390952743570015e-05, + "loss": 0.6715, + "step": 4434 + }, + { + "epoch": 0.6786534047436878, + "grad_norm": 2.2574521183138123, + "learning_rate": 1.538886534129001e-05, + "loss": 0.7406, + "step": 4435 + }, + { + "epoch": 0.6788064269319051, + "grad_norm": 2.1855521286184527, + "learning_rate": 1.5386777608058702e-05, + "loss": 0.7811, + "step": 4436 + }, + { + "epoch": 0.6789594491201224, + "grad_norm": 2.4386053936890324, + "learning_rate": 1.5384689544004307e-05, + "loss": 0.8023, + "step": 4437 + }, + { + "epoch": 0.6791124713083397, + "grad_norm": 2.424761529872376, + 
"learning_rate": 1.5382601149255063e-05, + "loss": 0.7948, + "step": 4438 + }, + { + "epoch": 0.679265493496557, + "grad_norm": 2.14136896989551, + "learning_rate": 1.5380512423939227e-05, + "loss": 0.7419, + "step": 4439 + }, + { + "epoch": 0.6794185156847743, + "grad_norm": 2.1760452971611994, + "learning_rate": 1.5378423368185074e-05, + "loss": 0.7354, + "step": 4440 + }, + { + "epoch": 0.6795715378729916, + "grad_norm": 2.4588643253323603, + "learning_rate": 1.53763339821209e-05, + "loss": 0.7851, + "step": 4441 + }, + { + "epoch": 0.6797245600612088, + "grad_norm": 2.3166074058614443, + "learning_rate": 1.5374244265875026e-05, + "loss": 0.7038, + "step": 4442 + }, + { + "epoch": 0.6798775822494262, + "grad_norm": 2.1635072983925836, + "learning_rate": 1.5372154219575788e-05, + "loss": 0.8344, + "step": 4443 + }, + { + "epoch": 0.6800306044376434, + "grad_norm": 2.2860730808637797, + "learning_rate": 1.5370063843351538e-05, + "loss": 0.7131, + "step": 4444 + }, + { + "epoch": 0.6801836266258607, + "grad_norm": 2.2691362821289847, + "learning_rate": 1.5367973137330667e-05, + "loss": 0.7036, + "step": 4445 + }, + { + "epoch": 0.6803366488140781, + "grad_norm": 2.2305641983243927, + "learning_rate": 1.536588210164156e-05, + "loss": 0.6088, + "step": 4446 + }, + { + "epoch": 0.6804896710022953, + "grad_norm": 2.2145056979037303, + "learning_rate": 1.5363790736412646e-05, + "loss": 0.8274, + "step": 4447 + }, + { + "epoch": 0.6806426931905126, + "grad_norm": 2.8007677770733492, + "learning_rate": 1.5361699041772358e-05, + "loss": 0.8732, + "step": 4448 + }, + { + "epoch": 0.68079571537873, + "grad_norm": 2.3126948601147457, + "learning_rate": 1.5359607017849156e-05, + "loss": 0.8034, + "step": 4449 + }, + { + "epoch": 0.6809487375669472, + "grad_norm": 2.2147887174883616, + "learning_rate": 1.535751466477152e-05, + "loss": 0.7773, + "step": 4450 + }, + { + "epoch": 0.6811017597551645, + "grad_norm": 2.4159115687576618, + "learning_rate": 1.535542198266795e-05, + "loss": 0.7059, + "step": 4451 + }, + { + "epoch": 0.6812547819433817, + "grad_norm": 2.0860891506089665, + "learning_rate": 1.535332897166697e-05, + "loss": 0.7719, + "step": 4452 + }, + { + "epoch": 0.6814078041315991, + "grad_norm": 1.9432480744857157, + "learning_rate": 1.535123563189711e-05, + "loss": 0.681, + "step": 4453 + }, + { + "epoch": 0.6815608263198164, + "grad_norm": 2.4927618482640583, + "learning_rate": 1.5349141963486934e-05, + "loss": 0.7504, + "step": 4454 + }, + { + "epoch": 0.6817138485080336, + "grad_norm": 2.5913353045572856, + "learning_rate": 1.5347047966565026e-05, + "loss": 0.8291, + "step": 4455 + }, + { + "epoch": 0.681866870696251, + "grad_norm": 2.083645177191506, + "learning_rate": 1.534495364125999e-05, + "loss": 0.73, + "step": 4456 + }, + { + "epoch": 0.6820198928844683, + "grad_norm": 2.319715812091627, + "learning_rate": 1.5342858987700434e-05, + "loss": 0.8116, + "step": 4457 + }, + { + "epoch": 0.6821729150726855, + "grad_norm": 2.2755235463056542, + "learning_rate": 1.534076400601501e-05, + "loss": 0.7882, + "step": 4458 + }, + { + "epoch": 0.6823259372609028, + "grad_norm": 2.262029634001563, + "learning_rate": 1.5338668696332374e-05, + "loss": 0.6669, + "step": 4459 + }, + { + "epoch": 0.6824789594491201, + "grad_norm": 2.6664918202648633, + "learning_rate": 1.533657305878121e-05, + "loss": 0.7102, + "step": 4460 + }, + { + "epoch": 0.6826319816373374, + "grad_norm": 2.127031830300462, + "learning_rate": 1.5334477093490215e-05, + "loss": 0.7144, + "step": 4461 + }, + { + "epoch": 
0.6827850038255547, + "grad_norm": 2.4017809639453396, + "learning_rate": 1.5332380800588116e-05, + "loss": 0.7166, + "step": 4462 + }, + { + "epoch": 0.682938026013772, + "grad_norm": 2.5503010827973833, + "learning_rate": 1.5330284180203648e-05, + "loss": 0.8424, + "step": 4463 + }, + { + "epoch": 0.6830910482019893, + "grad_norm": 2.31586137036302, + "learning_rate": 1.532818723246558e-05, + "loss": 0.8279, + "step": 4464 + }, + { + "epoch": 0.6832440703902066, + "grad_norm": 2.1965797282146737, + "learning_rate": 1.5326089957502688e-05, + "loss": 0.7076, + "step": 4465 + }, + { + "epoch": 0.6833970925784238, + "grad_norm": 2.3924228128071867, + "learning_rate": 1.532399235544378e-05, + "loss": 0.7191, + "step": 4466 + }, + { + "epoch": 0.6835501147666412, + "grad_norm": 2.1377788155877355, + "learning_rate": 1.5321894426417667e-05, + "loss": 0.6807, + "step": 4467 + }, + { + "epoch": 0.6837031369548584, + "grad_norm": 2.2934918162416698, + "learning_rate": 1.5319796170553202e-05, + "loss": 0.7232, + "step": 4468 + }, + { + "epoch": 0.6838561591430757, + "grad_norm": 2.130277739327962, + "learning_rate": 1.5317697587979243e-05, + "loss": 0.638, + "step": 4469 + }, + { + "epoch": 0.6840091813312931, + "grad_norm": 2.312260633732955, + "learning_rate": 1.531559867882467e-05, + "loss": 0.8274, + "step": 4470 + }, + { + "epoch": 0.6841622035195103, + "grad_norm": 2.586845946207191, + "learning_rate": 1.5313499443218388e-05, + "loss": 0.7508, + "step": 4471 + }, + { + "epoch": 0.6843152257077276, + "grad_norm": 2.378424806595788, + "learning_rate": 1.5311399881289322e-05, + "loss": 0.7575, + "step": 4472 + }, + { + "epoch": 0.684468247895945, + "grad_norm": 2.1522295879865427, + "learning_rate": 1.5309299993166405e-05, + "loss": 0.6589, + "step": 4473 + }, + { + "epoch": 0.6846212700841622, + "grad_norm": 2.3097812688810784, + "learning_rate": 1.530719977897861e-05, + "loss": 0.7054, + "step": 4474 + }, + { + "epoch": 0.6847742922723795, + "grad_norm": 2.3172326880772736, + "learning_rate": 1.5305099238854913e-05, + "loss": 0.6894, + "step": 4475 + }, + { + "epoch": 0.6849273144605967, + "grad_norm": 2.1003335405866106, + "learning_rate": 1.530299837292432e-05, + "loss": 0.7624, + "step": 4476 + }, + { + "epoch": 0.6850803366488141, + "grad_norm": 2.599385998272937, + "learning_rate": 1.5300897181315852e-05, + "loss": 0.7463, + "step": 4477 + }, + { + "epoch": 0.6852333588370314, + "grad_norm": 2.3900101225352395, + "learning_rate": 1.5298795664158547e-05, + "loss": 0.8077, + "step": 4478 + }, + { + "epoch": 0.6853863810252486, + "grad_norm": 2.191784966065849, + "learning_rate": 1.5296693821581474e-05, + "loss": 0.6986, + "step": 4479 + }, + { + "epoch": 0.685539403213466, + "grad_norm": 2.6280784977891147, + "learning_rate": 1.529459165371371e-05, + "loss": 0.7687, + "step": 4480 + }, + { + "epoch": 0.6856924254016833, + "grad_norm": 2.471333458938139, + "learning_rate": 1.5292489160684362e-05, + "loss": 0.8365, + "step": 4481 + }, + { + "epoch": 0.6858454475899005, + "grad_norm": 2.3021022470545582, + "learning_rate": 1.529038634262255e-05, + "loss": 0.8477, + "step": 4482 + }, + { + "epoch": 0.6859984697781178, + "grad_norm": 2.3246002041267304, + "learning_rate": 1.528828319965742e-05, + "loss": 0.8146, + "step": 4483 + }, + { + "epoch": 0.6861514919663351, + "grad_norm": 2.1854019951696757, + "learning_rate": 1.5286179731918126e-05, + "loss": 0.6378, + "step": 4484 + }, + { + "epoch": 0.6863045141545524, + "grad_norm": 1.9833151918210372, + "learning_rate": 1.5284075939533854e-05, + 
"loss": 0.8069, + "step": 4485 + }, + { + "epoch": 0.6864575363427697, + "grad_norm": 2.324187336487013, + "learning_rate": 1.5281971822633812e-05, + "loss": 0.858, + "step": 4486 + }, + { + "epoch": 0.686610558530987, + "grad_norm": 2.151761922724606, + "learning_rate": 1.527986738134721e-05, + "loss": 0.63, + "step": 4487 + }, + { + "epoch": 0.6867635807192043, + "grad_norm": 1.9302796780045488, + "learning_rate": 1.5277762615803308e-05, + "loss": 0.7935, + "step": 4488 + }, + { + "epoch": 0.6869166029074216, + "grad_norm": 2.0743268312554854, + "learning_rate": 1.527565752613135e-05, + "loss": 0.7011, + "step": 4489 + }, + { + "epoch": 0.6870696250956388, + "grad_norm": 2.5266338890936133, + "learning_rate": 1.5273552112460626e-05, + "loss": 0.8576, + "step": 4490 + }, + { + "epoch": 0.6872226472838562, + "grad_norm": 2.0054601689178035, + "learning_rate": 1.5271446374920435e-05, + "loss": 0.6995, + "step": 4491 + }, + { + "epoch": 0.6873756694720734, + "grad_norm": 2.208190332410164, + "learning_rate": 1.5269340313640104e-05, + "loss": 0.6487, + "step": 4492 + }, + { + "epoch": 0.6875286916602907, + "grad_norm": 2.083733044038154, + "learning_rate": 1.526723392874897e-05, + "loss": 0.782, + "step": 4493 + }, + { + "epoch": 0.6876817138485081, + "grad_norm": 2.2792750232808676, + "learning_rate": 1.526512722037639e-05, + "loss": 0.726, + "step": 4494 + }, + { + "epoch": 0.6878347360367253, + "grad_norm": 2.089507560978964, + "learning_rate": 1.526302018865175e-05, + "loss": 0.7222, + "step": 4495 + }, + { + "epoch": 0.6879877582249426, + "grad_norm": 2.2579331803512193, + "learning_rate": 1.526091283370446e-05, + "loss": 0.7362, + "step": 4496 + }, + { + "epoch": 0.68814078041316, + "grad_norm": 2.2197350362031933, + "learning_rate": 1.5258805155663924e-05, + "loss": 0.7617, + "step": 4497 + }, + { + "epoch": 0.6882938026013772, + "grad_norm": 2.4317255785428267, + "learning_rate": 1.5256697154659593e-05, + "loss": 0.7737, + "step": 4498 + }, + { + "epoch": 0.6884468247895945, + "grad_norm": 2.28504107301684, + "learning_rate": 1.5254588830820925e-05, + "loss": 0.7777, + "step": 4499 + }, + { + "epoch": 0.6885998469778117, + "grad_norm": 2.272161786994709, + "learning_rate": 1.5252480184277403e-05, + "loss": 0.8356, + "step": 4500 + }, + { + "epoch": 0.6887528691660291, + "grad_norm": 2.335814379960226, + "learning_rate": 1.5250371215158523e-05, + "loss": 0.841, + "step": 4501 + }, + { + "epoch": 0.6889058913542464, + "grad_norm": 2.168986813520893, + "learning_rate": 1.5248261923593805e-05, + "loss": 0.7132, + "step": 4502 + }, + { + "epoch": 0.6890589135424636, + "grad_norm": 2.1288853465211113, + "learning_rate": 1.5246152309712795e-05, + "loss": 0.7998, + "step": 4503 + }, + { + "epoch": 0.689211935730681, + "grad_norm": 2.319619776523877, + "learning_rate": 1.5244042373645047e-05, + "loss": 0.802, + "step": 4504 + }, + { + "epoch": 0.6893649579188983, + "grad_norm": 2.2306663348214166, + "learning_rate": 1.5241932115520142e-05, + "loss": 0.6982, + "step": 4505 + }, + { + "epoch": 0.6895179801071155, + "grad_norm": 1.9740335208345237, + "learning_rate": 1.523982153546768e-05, + "loss": 0.6945, + "step": 4506 + }, + { + "epoch": 0.6896710022953328, + "grad_norm": 2.3212474725289, + "learning_rate": 1.5237710633617278e-05, + "loss": 0.7454, + "step": 4507 + }, + { + "epoch": 0.6898240244835501, + "grad_norm": 2.181356444869256, + "learning_rate": 1.5235599410098576e-05, + "loss": 0.6968, + "step": 4508 + }, + { + "epoch": 0.6899770466717674, + "grad_norm": 2.4681098588473676, + 
"learning_rate": 1.5233487865041237e-05, + "loss": 0.7217, + "step": 4509 + }, + { + "epoch": 0.6901300688599847, + "grad_norm": 2.2233164003664068, + "learning_rate": 1.5231375998574929e-05, + "loss": 0.6686, + "step": 4510 + }, + { + "epoch": 0.690283091048202, + "grad_norm": 2.1293830198002177, + "learning_rate": 1.5229263810829355e-05, + "loss": 0.7067, + "step": 4511 + }, + { + "epoch": 0.6904361132364193, + "grad_norm": 2.333776572385844, + "learning_rate": 1.5227151301934235e-05, + "loss": 0.7336, + "step": 4512 + }, + { + "epoch": 0.6905891354246366, + "grad_norm": 2.488107023266574, + "learning_rate": 1.522503847201931e-05, + "loss": 0.8592, + "step": 4513 + }, + { + "epoch": 0.6907421576128538, + "grad_norm": 2.2582773023106526, + "learning_rate": 1.5222925321214326e-05, + "loss": 0.6915, + "step": 4514 + }, + { + "epoch": 0.6908951798010712, + "grad_norm": 2.222243925719928, + "learning_rate": 1.5220811849649066e-05, + "loss": 0.7331, + "step": 4515 + }, + { + "epoch": 0.6910482019892884, + "grad_norm": 2.0850055482377776, + "learning_rate": 1.521869805745333e-05, + "loss": 0.7538, + "step": 4516 + }, + { + "epoch": 0.6912012241775057, + "grad_norm": 2.221052204851177, + "learning_rate": 1.5216583944756927e-05, + "loss": 0.6374, + "step": 4517 + }, + { + "epoch": 0.6913542463657231, + "grad_norm": 2.1796358857455043, + "learning_rate": 1.5214469511689698e-05, + "loss": 0.6842, + "step": 4518 + }, + { + "epoch": 0.6915072685539403, + "grad_norm": 2.405770425488477, + "learning_rate": 1.5212354758381498e-05, + "loss": 0.7566, + "step": 4519 + }, + { + "epoch": 0.6916602907421576, + "grad_norm": 2.5722939852043254, + "learning_rate": 1.5210239684962197e-05, + "loss": 0.7809, + "step": 4520 + }, + { + "epoch": 0.691813312930375, + "grad_norm": 2.465616131274673, + "learning_rate": 1.5208124291561702e-05, + "loss": 0.78, + "step": 4521 + }, + { + "epoch": 0.6919663351185922, + "grad_norm": 2.3138649418442356, + "learning_rate": 1.5206008578309917e-05, + "loss": 0.73, + "step": 4522 + }, + { + "epoch": 0.6921193573068095, + "grad_norm": 2.3167740039306755, + "learning_rate": 1.5203892545336776e-05, + "loss": 0.703, + "step": 4523 + }, + { + "epoch": 0.6922723794950267, + "grad_norm": 2.670744482743038, + "learning_rate": 1.5201776192772237e-05, + "loss": 0.7674, + "step": 4524 + }, + { + "epoch": 0.6924254016832441, + "grad_norm": 2.217729480121748, + "learning_rate": 1.5199659520746275e-05, + "loss": 0.7415, + "step": 4525 + }, + { + "epoch": 0.6925784238714614, + "grad_norm": 2.001448225210971, + "learning_rate": 1.5197542529388878e-05, + "loss": 0.6448, + "step": 4526 + }, + { + "epoch": 0.6927314460596786, + "grad_norm": 2.238292825597769, + "learning_rate": 1.5195425218830063e-05, + "loss": 0.7611, + "step": 4527 + }, + { + "epoch": 0.692884468247896, + "grad_norm": 2.7725948227174784, + "learning_rate": 1.5193307589199862e-05, + "loss": 0.8477, + "step": 4528 + }, + { + "epoch": 0.6930374904361133, + "grad_norm": 2.110571689962769, + "learning_rate": 1.519118964062832e-05, + "loss": 0.6644, + "step": 4529 + }, + { + "epoch": 0.6931905126243305, + "grad_norm": 2.2568105040470257, + "learning_rate": 1.5189071373245521e-05, + "loss": 0.6753, + "step": 4530 + }, + { + "epoch": 0.6933435348125478, + "grad_norm": 2.5626809993301594, + "learning_rate": 1.5186952787181546e-05, + "loss": 0.8011, + "step": 4531 + }, + { + "epoch": 0.6934965570007651, + "grad_norm": 2.2221418431380155, + "learning_rate": 1.5184833882566507e-05, + "loss": 0.764, + "step": 4532 + }, + { + "epoch": 
0.6936495791889824, + "grad_norm": 2.4274947741070005, + "learning_rate": 1.518271465953054e-05, + "loss": 0.7462, + "step": 4533 + }, + { + "epoch": 0.6938026013771997, + "grad_norm": 2.270795029988415, + "learning_rate": 1.5180595118203791e-05, + "loss": 0.6212, + "step": 4534 + }, + { + "epoch": 0.693955623565417, + "grad_norm": 2.3794007844331966, + "learning_rate": 1.5178475258716426e-05, + "loss": 0.7506, + "step": 4535 + }, + { + "epoch": 0.6941086457536343, + "grad_norm": 2.404191858909533, + "learning_rate": 1.5176355081198638e-05, + "loss": 0.7967, + "step": 4536 + }, + { + "epoch": 0.6942616679418516, + "grad_norm": 2.272161701559529, + "learning_rate": 1.5174234585780633e-05, + "loss": 0.8067, + "step": 4537 + }, + { + "epoch": 0.6944146901300688, + "grad_norm": 2.16796211094527, + "learning_rate": 1.5172113772592645e-05, + "loss": 0.8289, + "step": 4538 + }, + { + "epoch": 0.6945677123182862, + "grad_norm": 2.7916757266297894, + "learning_rate": 1.5169992641764913e-05, + "loss": 0.7694, + "step": 4539 + }, + { + "epoch": 0.6947207345065034, + "grad_norm": 2.2621760884957345, + "learning_rate": 1.5167871193427709e-05, + "loss": 0.7442, + "step": 4540 + }, + { + "epoch": 0.6948737566947207, + "grad_norm": 2.2540368894976583, + "learning_rate": 1.5165749427711318e-05, + "loss": 0.8203, + "step": 4541 + }, + { + "epoch": 0.6950267788829381, + "grad_norm": 2.2972220573536957, + "learning_rate": 1.5163627344746045e-05, + "loss": 0.818, + "step": 4542 + }, + { + "epoch": 0.6951798010711553, + "grad_norm": 2.482793009436285, + "learning_rate": 1.516150494466222e-05, + "loss": 0.8353, + "step": 4543 + }, + { + "epoch": 0.6953328232593726, + "grad_norm": 2.3515004139625897, + "learning_rate": 1.515938222759018e-05, + "loss": 0.778, + "step": 4544 + }, + { + "epoch": 0.6954858454475898, + "grad_norm": 2.2361132567474615, + "learning_rate": 1.5157259193660295e-05, + "loss": 0.7126, + "step": 4545 + }, + { + "epoch": 0.6956388676358072, + "grad_norm": 2.5681665991598286, + "learning_rate": 1.5155135843002948e-05, + "loss": 0.8872, + "step": 4546 + }, + { + "epoch": 0.6957918898240245, + "grad_norm": 2.3461733455885727, + "learning_rate": 1.5153012175748542e-05, + "loss": 0.7425, + "step": 4547 + }, + { + "epoch": 0.6959449120122417, + "grad_norm": 2.1546909441667124, + "learning_rate": 1.5150888192027502e-05, + "loss": 0.5938, + "step": 4548 + }, + { + "epoch": 0.6960979342004591, + "grad_norm": 2.1532119909978937, + "learning_rate": 1.5148763891970264e-05, + "loss": 0.6435, + "step": 4549 + }, + { + "epoch": 0.6962509563886764, + "grad_norm": 2.141592558646377, + "learning_rate": 1.5146639275707298e-05, + "loss": 0.6884, + "step": 4550 + }, + { + "epoch": 0.6964039785768936, + "grad_norm": 2.5681176533512216, + "learning_rate": 1.5144514343369075e-05, + "loss": 0.9509, + "step": 4551 + }, + { + "epoch": 0.696557000765111, + "grad_norm": 2.3532875826075554, + "learning_rate": 1.5142389095086103e-05, + "loss": 0.7296, + "step": 4552 + }, + { + "epoch": 0.6967100229533282, + "grad_norm": 2.464874640315048, + "learning_rate": 1.5140263530988898e-05, + "loss": 0.8494, + "step": 4553 + }, + { + "epoch": 0.6968630451415455, + "grad_norm": 2.3773010925552773, + "learning_rate": 1.5138137651208004e-05, + "loss": 0.6996, + "step": 4554 + }, + { + "epoch": 0.6970160673297628, + "grad_norm": 1.9930665522309956, + "learning_rate": 1.5136011455873973e-05, + "loss": 0.7253, + "step": 4555 + }, + { + "epoch": 0.6971690895179801, + "grad_norm": 2.287025093599191, + "learning_rate": 1.513388494511739e-05, 
+ "loss": 0.6763, + "step": 4556 + }, + { + "epoch": 0.6973221117061974, + "grad_norm": 2.141087425346586, + "learning_rate": 1.5131758119068844e-05, + "loss": 0.6891, + "step": 4557 + }, + { + "epoch": 0.6974751338944147, + "grad_norm": 1.9646395637835101, + "learning_rate": 1.512963097785896e-05, + "loss": 0.6165, + "step": 4558 + }, + { + "epoch": 0.697628156082632, + "grad_norm": 2.0897594617619903, + "learning_rate": 1.5127503521618371e-05, + "loss": 0.5849, + "step": 4559 + }, + { + "epoch": 0.6977811782708493, + "grad_norm": 2.305818341701876, + "learning_rate": 1.5125375750477731e-05, + "loss": 0.7084, + "step": 4560 + }, + { + "epoch": 0.6979342004590665, + "grad_norm": 2.5024862701767345, + "learning_rate": 1.512324766456771e-05, + "loss": 0.8031, + "step": 4561 + }, + { + "epoch": 0.6980872226472838, + "grad_norm": 2.4597919538294217, + "learning_rate": 1.5121119264019015e-05, + "loss": 0.8691, + "step": 4562 + }, + { + "epoch": 0.6982402448355012, + "grad_norm": 2.2910984541415385, + "learning_rate": 1.511899054896235e-05, + "loss": 0.6815, + "step": 4563 + }, + { + "epoch": 0.6983932670237184, + "grad_norm": 2.364384147976584, + "learning_rate": 1.5116861519528447e-05, + "loss": 0.7718, + "step": 4564 + }, + { + "epoch": 0.6985462892119357, + "grad_norm": 2.4982226125776226, + "learning_rate": 1.5114732175848064e-05, + "loss": 0.7911, + "step": 4565 + }, + { + "epoch": 0.6986993114001531, + "grad_norm": 2.6777365242008986, + "learning_rate": 1.5112602518051971e-05, + "loss": 0.9079, + "step": 4566 + }, + { + "epoch": 0.6988523335883703, + "grad_norm": 2.3841456314000538, + "learning_rate": 1.5110472546270952e-05, + "loss": 0.6749, + "step": 4567 + }, + { + "epoch": 0.6990053557765876, + "grad_norm": 2.150933316605576, + "learning_rate": 1.5108342260635821e-05, + "loss": 0.576, + "step": 4568 + }, + { + "epoch": 0.6991583779648048, + "grad_norm": 2.383057686481416, + "learning_rate": 1.5106211661277412e-05, + "loss": 0.7526, + "step": 4569 + }, + { + "epoch": 0.6993114001530222, + "grad_norm": 2.273681917930905, + "learning_rate": 1.5104080748326568e-05, + "loss": 0.817, + "step": 4570 + }, + { + "epoch": 0.6994644223412395, + "grad_norm": 2.223542529079718, + "learning_rate": 1.510194952191416e-05, + "loss": 0.7082, + "step": 4571 + }, + { + "epoch": 0.6996174445294567, + "grad_norm": 2.3811340188781487, + "learning_rate": 1.509981798217107e-05, + "loss": 0.7807, + "step": 4572 + }, + { + "epoch": 0.6997704667176741, + "grad_norm": 2.278119095067911, + "learning_rate": 1.5097686129228208e-05, + "loss": 0.6676, + "step": 4573 + }, + { + "epoch": 0.6999234889058914, + "grad_norm": 2.2273074977858, + "learning_rate": 1.5095553963216496e-05, + "loss": 0.6651, + "step": 4574 + }, + { + "epoch": 0.7000765110941086, + "grad_norm": 2.483082682130355, + "learning_rate": 1.5093421484266885e-05, + "loss": 0.6142, + "step": 4575 + }, + { + "epoch": 0.700229533282326, + "grad_norm": 2.1945222620117533, + "learning_rate": 1.5091288692510334e-05, + "loss": 0.6586, + "step": 4576 + }, + { + "epoch": 0.7003825554705432, + "grad_norm": 2.3719793672214244, + "learning_rate": 1.5089155588077824e-05, + "loss": 0.814, + "step": 4577 + }, + { + "epoch": 0.7005355776587605, + "grad_norm": 2.4060459274427783, + "learning_rate": 1.5087022171100362e-05, + "loss": 0.7127, + "step": 4578 + }, + { + "epoch": 0.7006885998469778, + "grad_norm": 2.309202456137207, + "learning_rate": 1.5084888441708972e-05, + "loss": 0.6896, + "step": 4579 + }, + { + "epoch": 0.7008416220351951, + "grad_norm": 
2.3124390047154764, + "learning_rate": 1.5082754400034687e-05, + "loss": 0.8279, + "step": 4580 + }, + { + "epoch": 0.7009946442234124, + "grad_norm": 2.0904361229422763, + "learning_rate": 1.508062004620857e-05, + "loss": 0.6954, + "step": 4581 + }, + { + "epoch": 0.7011476664116297, + "grad_norm": 2.094692589966736, + "learning_rate": 1.5078485380361702e-05, + "loss": 0.6429, + "step": 4582 + }, + { + "epoch": 0.701300688599847, + "grad_norm": 2.4664976414160535, + "learning_rate": 1.507635040262518e-05, + "loss": 0.734, + "step": 4583 + }, + { + "epoch": 0.7014537107880643, + "grad_norm": 2.230209940959713, + "learning_rate": 1.507421511313012e-05, + "loss": 0.7579, + "step": 4584 + }, + { + "epoch": 0.7016067329762815, + "grad_norm": 2.129917064589851, + "learning_rate": 1.5072079512007661e-05, + "loss": 0.7132, + "step": 4585 + }, + { + "epoch": 0.7017597551644988, + "grad_norm": 2.6035639966486883, + "learning_rate": 1.5069943599388952e-05, + "loss": 0.8368, + "step": 4586 + }, + { + "epoch": 0.7019127773527162, + "grad_norm": 2.0753617962083895, + "learning_rate": 1.506780737540518e-05, + "loss": 0.6343, + "step": 4587 + }, + { + "epoch": 0.7020657995409334, + "grad_norm": 2.2775042401770023, + "learning_rate": 1.5065670840187527e-05, + "loss": 0.7708, + "step": 4588 + }, + { + "epoch": 0.7022188217291507, + "grad_norm": 2.1526579899134552, + "learning_rate": 1.5063533993867209e-05, + "loss": 0.6628, + "step": 4589 + }, + { + "epoch": 0.7023718439173681, + "grad_norm": 2.176150777690883, + "learning_rate": 1.5061396836575463e-05, + "loss": 0.7387, + "step": 4590 + }, + { + "epoch": 0.7025248661055853, + "grad_norm": 2.129854293577214, + "learning_rate": 1.5059259368443541e-05, + "loss": 0.6796, + "step": 4591 + }, + { + "epoch": 0.7026778882938026, + "grad_norm": 2.207031757536129, + "learning_rate": 1.5057121589602703e-05, + "loss": 0.6714, + "step": 4592 + }, + { + "epoch": 0.7028309104820198, + "grad_norm": 2.148029900786362, + "learning_rate": 1.5054983500184245e-05, + "loss": 0.8089, + "step": 4593 + }, + { + "epoch": 0.7029839326702372, + "grad_norm": 2.138588131175154, + "learning_rate": 1.5052845100319478e-05, + "loss": 0.6533, + "step": 4594 + }, + { + "epoch": 0.7031369548584545, + "grad_norm": 2.350243331629356, + "learning_rate": 1.5050706390139724e-05, + "loss": 0.7045, + "step": 4595 + }, + { + "epoch": 0.7032899770466717, + "grad_norm": 2.4869590482805126, + "learning_rate": 1.5048567369776334e-05, + "loss": 0.8078, + "step": 4596 + }, + { + "epoch": 0.7034429992348891, + "grad_norm": 2.367666383463255, + "learning_rate": 1.5046428039360673e-05, + "loss": 0.6646, + "step": 4597 + }, + { + "epoch": 0.7035960214231064, + "grad_norm": 2.06402576755447, + "learning_rate": 1.5044288399024121e-05, + "loss": 0.8041, + "step": 4598 + }, + { + "epoch": 0.7037490436113236, + "grad_norm": 2.15944555184331, + "learning_rate": 1.5042148448898086e-05, + "loss": 0.6758, + "step": 4599 + }, + { + "epoch": 0.703902065799541, + "grad_norm": 2.5051434873442933, + "learning_rate": 1.504000818911399e-05, + "loss": 0.8164, + "step": 4600 + }, + { + "epoch": 0.7040550879877582, + "grad_norm": 2.3479880611298487, + "learning_rate": 1.5037867619803276e-05, + "loss": 0.8284, + "step": 4601 + }, + { + "epoch": 0.7042081101759755, + "grad_norm": 2.1908232486121, + "learning_rate": 1.5035726741097398e-05, + "loss": 0.7382, + "step": 4602 + }, + { + "epoch": 0.7043611323641928, + "grad_norm": 2.3218035445254226, + "learning_rate": 1.5033585553127848e-05, + "loss": 0.6095, + "step": 4603 + }, + { + 
"epoch": 0.7045141545524101, + "grad_norm": 2.2092916242716907, + "learning_rate": 1.5031444056026114e-05, + "loss": 0.8147, + "step": 4604 + }, + { + "epoch": 0.7046671767406274, + "grad_norm": 2.217187498878475, + "learning_rate": 1.5029302249923716e-05, + "loss": 0.7882, + "step": 4605 + }, + { + "epoch": 0.7048201989288447, + "grad_norm": 2.2389718404752834, + "learning_rate": 1.5027160134952197e-05, + "loss": 0.6402, + "step": 4606 + }, + { + "epoch": 0.704973221117062, + "grad_norm": 2.4569438697426893, + "learning_rate": 1.5025017711243104e-05, + "loss": 0.8244, + "step": 4607 + }, + { + "epoch": 0.7051262433052793, + "grad_norm": 2.1755220170600422, + "learning_rate": 1.5022874978928015e-05, + "loss": 0.6957, + "step": 4608 + }, + { + "epoch": 0.7052792654934965, + "grad_norm": 2.1244164995728982, + "learning_rate": 1.5020731938138523e-05, + "loss": 0.6932, + "step": 4609 + }, + { + "epoch": 0.7054322876817138, + "grad_norm": 2.2433322410932757, + "learning_rate": 1.5018588589006247e-05, + "loss": 0.7543, + "step": 4610 + }, + { + "epoch": 0.7055853098699312, + "grad_norm": 2.118332433021445, + "learning_rate": 1.501644493166281e-05, + "loss": 0.8065, + "step": 4611 + }, + { + "epoch": 0.7057383320581484, + "grad_norm": 2.1458348809316914, + "learning_rate": 1.5014300966239863e-05, + "loss": 0.741, + "step": 4612 + }, + { + "epoch": 0.7058913542463657, + "grad_norm": 2.2114252595349178, + "learning_rate": 1.501215669286908e-05, + "loss": 0.7222, + "step": 4613 + }, + { + "epoch": 0.7060443764345831, + "grad_norm": 2.114569239497734, + "learning_rate": 1.5010012111682146e-05, + "loss": 0.6805, + "step": 4614 + }, + { + "epoch": 0.7061973986228003, + "grad_norm": 2.065733198288359, + "learning_rate": 1.5007867222810769e-05, + "loss": 0.6061, + "step": 4615 + }, + { + "epoch": 0.7063504208110176, + "grad_norm": 2.214125841483104, + "learning_rate": 1.5005722026386675e-05, + "loss": 0.6689, + "step": 4616 + }, + { + "epoch": 0.7065034429992348, + "grad_norm": 2.2882545330622057, + "learning_rate": 1.5003576522541609e-05, + "loss": 0.6518, + "step": 4617 + }, + { + "epoch": 0.7066564651874522, + "grad_norm": 2.3663303914915375, + "learning_rate": 1.5001430711407335e-05, + "loss": 0.6424, + "step": 4618 + }, + { + "epoch": 0.7068094873756695, + "grad_norm": 2.431556481342764, + "learning_rate": 1.4999284593115633e-05, + "loss": 0.8046, + "step": 4619 + }, + { + "epoch": 0.7069625095638867, + "grad_norm": 2.389676632888257, + "learning_rate": 1.499713816779831e-05, + "loss": 0.7433, + "step": 4620 + }, + { + "epoch": 0.7071155317521041, + "grad_norm": 2.2577603438639184, + "learning_rate": 1.4994991435587181e-05, + "loss": 0.7294, + "step": 4621 + }, + { + "epoch": 0.7072685539403214, + "grad_norm": 2.309661103988358, + "learning_rate": 1.499284439661409e-05, + "loss": 0.769, + "step": 4622 + }, + { + "epoch": 0.7074215761285386, + "grad_norm": 2.117249156808084, + "learning_rate": 1.499069705101089e-05, + "loss": 0.7223, + "step": 4623 + }, + { + "epoch": 0.707574598316756, + "grad_norm": 2.2679542217853284, + "learning_rate": 1.4988549398909461e-05, + "loss": 0.7463, + "step": 4624 + }, + { + "epoch": 0.7077276205049732, + "grad_norm": 2.4233667275174873, + "learning_rate": 1.4986401440441698e-05, + "loss": 0.6929, + "step": 4625 + }, + { + "epoch": 0.7078806426931905, + "grad_norm": 2.4387312401102728, + "learning_rate": 1.4984253175739516e-05, + "loss": 0.7185, + "step": 4626 + }, + { + "epoch": 0.7080336648814078, + "grad_norm": 2.1516389780097813, + "learning_rate": 
1.4982104604934847e-05, + "loss": 0.6914, + "step": 4627 + }, + { + "epoch": 0.7081866870696251, + "grad_norm": 2.1911271817443976, + "learning_rate": 1.4979955728159648e-05, + "loss": 0.7076, + "step": 4628 + }, + { + "epoch": 0.7083397092578424, + "grad_norm": 2.2764653082340605, + "learning_rate": 1.4977806545545882e-05, + "loss": 0.7879, + "step": 4629 + }, + { + "epoch": 0.7084927314460597, + "grad_norm": 2.258450187406121, + "learning_rate": 1.4975657057225541e-05, + "loss": 0.8093, + "step": 4630 + }, + { + "epoch": 0.708645753634277, + "grad_norm": 2.102515060129002, + "learning_rate": 1.497350726333064e-05, + "loss": 0.6645, + "step": 4631 + }, + { + "epoch": 0.7087987758224943, + "grad_norm": 2.3893729812773725, + "learning_rate": 1.4971357163993201e-05, + "loss": 0.7671, + "step": 4632 + }, + { + "epoch": 0.7089517980107115, + "grad_norm": 1.9505505804431376, + "learning_rate": 1.4969206759345268e-05, + "loss": 0.6499, + "step": 4633 + }, + { + "epoch": 0.7091048201989288, + "grad_norm": 2.2384013451963143, + "learning_rate": 1.4967056049518909e-05, + "loss": 0.7929, + "step": 4634 + }, + { + "epoch": 0.7092578423871462, + "grad_norm": 2.4005614517986893, + "learning_rate": 1.4964905034646207e-05, + "loss": 0.7999, + "step": 4635 + }, + { + "epoch": 0.7094108645753634, + "grad_norm": 2.0678028281202065, + "learning_rate": 1.4962753714859265e-05, + "loss": 0.7242, + "step": 4636 + }, + { + "epoch": 0.7095638867635807, + "grad_norm": 2.462845353036398, + "learning_rate": 1.4960602090290201e-05, + "loss": 0.8538, + "step": 4637 + }, + { + "epoch": 0.7097169089517981, + "grad_norm": 2.3117301226127798, + "learning_rate": 1.495845016107116e-05, + "loss": 0.7825, + "step": 4638 + }, + { + "epoch": 0.7098699311400153, + "grad_norm": 2.2646133688524297, + "learning_rate": 1.4956297927334293e-05, + "loss": 0.7812, + "step": 4639 + }, + { + "epoch": 0.7100229533282326, + "grad_norm": 2.2227803644605286, + "learning_rate": 1.4954145389211783e-05, + "loss": 0.6609, + "step": 4640 + }, + { + "epoch": 0.7101759755164498, + "grad_norm": 2.282965177589414, + "learning_rate": 1.4951992546835822e-05, + "loss": 0.7945, + "step": 4641 + }, + { + "epoch": 0.7103289977046672, + "grad_norm": 2.4988127974075436, + "learning_rate": 1.494983940033863e-05, + "loss": 0.7835, + "step": 4642 + }, + { + "epoch": 0.7104820198928845, + "grad_norm": 2.0580200836857627, + "learning_rate": 1.4947685949852433e-05, + "loss": 0.7276, + "step": 4643 + }, + { + "epoch": 0.7106350420811017, + "grad_norm": 2.115513884581086, + "learning_rate": 1.4945532195509489e-05, + "loss": 0.8458, + "step": 4644 + }, + { + "epoch": 0.7107880642693191, + "grad_norm": 2.1312885230885694, + "learning_rate": 1.4943378137442066e-05, + "loss": 0.6838, + "step": 4645 + }, + { + "epoch": 0.7109410864575364, + "grad_norm": 2.0192826011804788, + "learning_rate": 1.494122377578245e-05, + "loss": 0.6525, + "step": 4646 + }, + { + "epoch": 0.7110941086457536, + "grad_norm": 2.14333559550761, + "learning_rate": 1.4939069110662953e-05, + "loss": 0.7441, + "step": 4647 + }, + { + "epoch": 0.711247130833971, + "grad_norm": 2.735799505033225, + "learning_rate": 1.4936914142215901e-05, + "loss": 0.8995, + "step": 4648 + }, + { + "epoch": 0.7114001530221882, + "grad_norm": 2.3028298562489193, + "learning_rate": 1.4934758870573638e-05, + "loss": 0.8153, + "step": 4649 + }, + { + "epoch": 0.7115531752104055, + "grad_norm": 2.0552740637304434, + "learning_rate": 1.4932603295868529e-05, + "loss": 0.7292, + "step": 4650 + }, + { + "epoch": 
0.7117061973986228, + "grad_norm": 2.0362932635684676, + "learning_rate": 1.4930447418232954e-05, + "loss": 0.7156, + "step": 4651 + }, + { + "epoch": 0.7118592195868401, + "grad_norm": 2.3297210965014323, + "learning_rate": 1.4928291237799316e-05, + "loss": 0.7321, + "step": 4652 + }, + { + "epoch": 0.7120122417750574, + "grad_norm": 2.3448837720146587, + "learning_rate": 1.4926134754700033e-05, + "loss": 0.7765, + "step": 4653 + }, + { + "epoch": 0.7121652639632746, + "grad_norm": 2.090763497741587, + "learning_rate": 1.4923977969067542e-05, + "loss": 0.7825, + "step": 4654 + }, + { + "epoch": 0.712318286151492, + "grad_norm": 2.1568103828538137, + "learning_rate": 1.4921820881034303e-05, + "loss": 0.6317, + "step": 4655 + }, + { + "epoch": 0.7124713083397093, + "grad_norm": 2.1517165425688307, + "learning_rate": 1.4919663490732788e-05, + "loss": 0.7669, + "step": 4656 + }, + { + "epoch": 0.7126243305279265, + "grad_norm": 2.348912710534796, + "learning_rate": 1.4917505798295496e-05, + "loss": 0.6615, + "step": 4657 + }, + { + "epoch": 0.7127773527161438, + "grad_norm": 2.3188303433517197, + "learning_rate": 1.491534780385493e-05, + "loss": 0.6957, + "step": 4658 + }, + { + "epoch": 0.7129303749043612, + "grad_norm": 2.31676216581471, + "learning_rate": 1.4913189507543629e-05, + "loss": 0.7658, + "step": 4659 + }, + { + "epoch": 0.7130833970925784, + "grad_norm": 2.3905665163661616, + "learning_rate": 1.491103090949414e-05, + "loss": 0.8188, + "step": 4660 + }, + { + "epoch": 0.7132364192807957, + "grad_norm": 2.331333999855849, + "learning_rate": 1.4908872009839032e-05, + "loss": 0.7859, + "step": 4661 + }, + { + "epoch": 0.713389441469013, + "grad_norm": 2.383095717161027, + "learning_rate": 1.4906712808710887e-05, + "loss": 0.7431, + "step": 4662 + }, + { + "epoch": 0.7135424636572303, + "grad_norm": 2.225256917614341, + "learning_rate": 1.4904553306242315e-05, + "loss": 0.7684, + "step": 4663 + }, + { + "epoch": 0.7136954858454476, + "grad_norm": 2.313231003189659, + "learning_rate": 1.4902393502565938e-05, + "loss": 0.7094, + "step": 4664 + }, + { + "epoch": 0.7138485080336648, + "grad_norm": 2.602060352447604, + "learning_rate": 1.49002333978144e-05, + "loss": 0.7902, + "step": 4665 + }, + { + "epoch": 0.7140015302218822, + "grad_norm": 2.279485041471543, + "learning_rate": 1.4898072992120354e-05, + "loss": 0.6993, + "step": 4666 + }, + { + "epoch": 0.7141545524100995, + "grad_norm": 2.3635218759370136, + "learning_rate": 1.4895912285616488e-05, + "loss": 0.6515, + "step": 4667 + }, + { + "epoch": 0.7143075745983167, + "grad_norm": 2.096166872095998, + "learning_rate": 1.489375127843549e-05, + "loss": 0.6639, + "step": 4668 + }, + { + "epoch": 0.7144605967865341, + "grad_norm": 2.3381009761630818, + "learning_rate": 1.489158997071009e-05, + "loss": 0.7515, + "step": 4669 + }, + { + "epoch": 0.7146136189747513, + "grad_norm": 2.195710905149429, + "learning_rate": 1.488942836257301e-05, + "loss": 0.6466, + "step": 4670 + }, + { + "epoch": 0.7147666411629686, + "grad_norm": 2.1753748778774877, + "learning_rate": 1.4887266454157005e-05, + "loss": 0.6547, + "step": 4671 + }, + { + "epoch": 0.714919663351186, + "grad_norm": 2.122937116378295, + "learning_rate": 1.488510424559485e-05, + "loss": 0.7748, + "step": 4672 + }, + { + "epoch": 0.7150726855394032, + "grad_norm": 2.406407272988174, + "learning_rate": 1.4882941737019334e-05, + "loss": 0.8305, + "step": 4673 + }, + { + "epoch": 0.7152257077276205, + "grad_norm": 2.2145862249870514, + "learning_rate": 1.488077892856326e-05, + "loss": 
0.6078, + "step": 4674 + }, + { + "epoch": 0.7153787299158378, + "grad_norm": 2.1791170205115225, + "learning_rate": 1.487861582035946e-05, + "loss": 0.6832, + "step": 4675 + }, + { + "epoch": 0.7155317521040551, + "grad_norm": 2.21390173825772, + "learning_rate": 1.4876452412540778e-05, + "loss": 0.7003, + "step": 4676 + }, + { + "epoch": 0.7156847742922724, + "grad_norm": 2.276605772799139, + "learning_rate": 1.4874288705240077e-05, + "loss": 0.739, + "step": 4677 + }, + { + "epoch": 0.7158377964804896, + "grad_norm": 2.14951981367647, + "learning_rate": 1.4872124698590239e-05, + "loss": 0.6115, + "step": 4678 + }, + { + "epoch": 0.715990818668707, + "grad_norm": 2.3320357024227785, + "learning_rate": 1.4869960392724162e-05, + "loss": 0.7127, + "step": 4679 + }, + { + "epoch": 0.7161438408569243, + "grad_norm": 2.3126591024809047, + "learning_rate": 1.4867795787774766e-05, + "loss": 0.7032, + "step": 4680 + }, + { + "epoch": 0.7162968630451415, + "grad_norm": 2.4048082026718247, + "learning_rate": 1.4865630883874983e-05, + "loss": 0.7534, + "step": 4681 + }, + { + "epoch": 0.7164498852333588, + "grad_norm": 2.451316391581056, + "learning_rate": 1.486346568115778e-05, + "loss": 0.7529, + "step": 4682 + }, + { + "epoch": 0.7166029074215762, + "grad_norm": 2.053071504108291, + "learning_rate": 1.4861300179756122e-05, + "loss": 0.6541, + "step": 4683 + }, + { + "epoch": 0.7167559296097934, + "grad_norm": 2.3417044813298715, + "learning_rate": 1.4859134379803e-05, + "loss": 0.7603, + "step": 4684 + }, + { + "epoch": 0.7169089517980107, + "grad_norm": 2.3473788742841015, + "learning_rate": 1.4856968281431428e-05, + "loss": 0.6605, + "step": 4685 + }, + { + "epoch": 0.717061973986228, + "grad_norm": 2.2490942525781707, + "learning_rate": 1.4854801884774435e-05, + "loss": 0.6712, + "step": 4686 + }, + { + "epoch": 0.7172149961744453, + "grad_norm": 2.231128453832111, + "learning_rate": 1.4852635189965063e-05, + "loss": 0.6849, + "step": 4687 + }, + { + "epoch": 0.7173680183626626, + "grad_norm": 2.0474238341958215, + "learning_rate": 1.485046819713638e-05, + "loss": 0.7608, + "step": 4688 + }, + { + "epoch": 0.7175210405508798, + "grad_norm": 2.178205755401646, + "learning_rate": 1.4848300906421473e-05, + "loss": 0.8481, + "step": 4689 + }, + { + "epoch": 0.7176740627390972, + "grad_norm": 2.205174854802361, + "learning_rate": 1.4846133317953441e-05, + "loss": 0.7059, + "step": 4690 + }, + { + "epoch": 0.7178270849273145, + "grad_norm": 2.3020851967727944, + "learning_rate": 1.4843965431865401e-05, + "loss": 0.7218, + "step": 4691 + }, + { + "epoch": 0.7179801071155317, + "grad_norm": 2.4590759700573046, + "learning_rate": 1.4841797248290494e-05, + "loss": 0.8537, + "step": 4692 + }, + { + "epoch": 0.7181331293037491, + "grad_norm": 2.1277534202106225, + "learning_rate": 1.483962876736188e-05, + "loss": 0.6644, + "step": 4693 + }, + { + "epoch": 0.7182861514919663, + "grad_norm": 2.3080061593741434, + "learning_rate": 1.4837459989212728e-05, + "loss": 0.6971, + "step": 4694 + }, + { + "epoch": 0.7184391736801836, + "grad_norm": 2.014506867651919, + "learning_rate": 1.4835290913976237e-05, + "loss": 0.7153, + "step": 4695 + }, + { + "epoch": 0.718592195868401, + "grad_norm": 2.3493421964319565, + "learning_rate": 1.4833121541785612e-05, + "loss": 0.8454, + "step": 4696 + }, + { + "epoch": 0.7187452180566182, + "grad_norm": 2.5341913901501676, + "learning_rate": 1.4830951872774084e-05, + "loss": 0.8979, + "step": 4697 + }, + { + "epoch": 0.7188982402448355, + "grad_norm": 1.9746285755318667, + 
"learning_rate": 1.4828781907074907e-05, + "loss": 0.6924, + "step": 4698 + }, + { + "epoch": 0.7190512624330528, + "grad_norm": 2.3123766427327763, + "learning_rate": 1.4826611644821342e-05, + "loss": 0.7356, + "step": 4699 + }, + { + "epoch": 0.7192042846212701, + "grad_norm": 2.079211608687565, + "learning_rate": 1.4824441086146673e-05, + "loss": 0.6949, + "step": 4700 + }, + { + "epoch": 0.7193573068094874, + "grad_norm": 2.0936906022949326, + "learning_rate": 1.4822270231184202e-05, + "loss": 0.6237, + "step": 4701 + }, + { + "epoch": 0.7195103289977046, + "grad_norm": 2.2139872126331377, + "learning_rate": 1.4820099080067256e-05, + "loss": 0.8631, + "step": 4702 + }, + { + "epoch": 0.719663351185922, + "grad_norm": 2.8509554055876634, + "learning_rate": 1.4817927632929166e-05, + "loss": 0.865, + "step": 4703 + }, + { + "epoch": 0.7198163733741393, + "grad_norm": 2.203551816183465, + "learning_rate": 1.4815755889903292e-05, + "loss": 0.8114, + "step": 4704 + }, + { + "epoch": 0.7199693955623565, + "grad_norm": 2.2383720960632463, + "learning_rate": 1.481358385112301e-05, + "loss": 0.7976, + "step": 4705 + }, + { + "epoch": 0.7201224177505738, + "grad_norm": 2.066559424344547, + "learning_rate": 1.4811411516721713e-05, + "loss": 0.7183, + "step": 4706 + }, + { + "epoch": 0.7202754399387912, + "grad_norm": 2.104497394187465, + "learning_rate": 1.480923888683281e-05, + "loss": 0.7099, + "step": 4707 + }, + { + "epoch": 0.7204284621270084, + "grad_norm": 2.111770796173033, + "learning_rate": 1.4807065961589737e-05, + "loss": 0.6569, + "step": 4708 + }, + { + "epoch": 0.7205814843152257, + "grad_norm": 2.147742730612102, + "learning_rate": 1.4804892741125934e-05, + "loss": 0.7724, + "step": 4709 + }, + { + "epoch": 0.720734506503443, + "grad_norm": 2.246631746401689, + "learning_rate": 1.4802719225574876e-05, + "loss": 0.8219, + "step": 4710 + }, + { + "epoch": 0.7208875286916603, + "grad_norm": 2.13309088287406, + "learning_rate": 1.4800545415070037e-05, + "loss": 0.6694, + "step": 4711 + }, + { + "epoch": 0.7210405508798776, + "grad_norm": 2.0313301699605866, + "learning_rate": 1.4798371309744925e-05, + "loss": 0.6445, + "step": 4712 + }, + { + "epoch": 0.7211935730680948, + "grad_norm": 2.530711557247044, + "learning_rate": 1.4796196909733063e-05, + "loss": 0.7704, + "step": 4713 + }, + { + "epoch": 0.7213465952563122, + "grad_norm": 2.47685892445677, + "learning_rate": 1.4794022215167983e-05, + "loss": 0.8267, + "step": 4714 + }, + { + "epoch": 0.7214996174445295, + "grad_norm": 2.0937083712741194, + "learning_rate": 1.4791847226183245e-05, + "loss": 0.6132, + "step": 4715 + }, + { + "epoch": 0.7216526396327467, + "grad_norm": 2.189520413249951, + "learning_rate": 1.4789671942912424e-05, + "loss": 0.6962, + "step": 4716 + }, + { + "epoch": 0.7218056618209641, + "grad_norm": 2.39097362182118, + "learning_rate": 1.4787496365489112e-05, + "loss": 0.6877, + "step": 4717 + }, + { + "epoch": 0.7219586840091813, + "grad_norm": 2.3351690545072827, + "learning_rate": 1.4785320494046917e-05, + "loss": 0.7894, + "step": 4718 + }, + { + "epoch": 0.7221117061973986, + "grad_norm": 2.061697603752519, + "learning_rate": 1.4783144328719471e-05, + "loss": 0.6861, + "step": 4719 + }, + { + "epoch": 0.722264728385616, + "grad_norm": 2.2225612629930938, + "learning_rate": 1.4780967869640424e-05, + "loss": 0.6203, + "step": 4720 + }, + { + "epoch": 0.7224177505738332, + "grad_norm": 2.573723966712967, + "learning_rate": 1.4778791116943432e-05, + "loss": 0.7715, + "step": 4721 + }, + { + "epoch": 
0.7225707727620505, + "grad_norm": 2.594210095762241, + "learning_rate": 1.4776614070762183e-05, + "loss": 0.6993, + "step": 4722 + }, + { + "epoch": 0.7227237949502678, + "grad_norm": 2.371055805718302, + "learning_rate": 1.477443673123038e-05, + "loss": 0.8131, + "step": 4723 + }, + { + "epoch": 0.7228768171384851, + "grad_norm": 2.096628347839013, + "learning_rate": 1.4772259098481741e-05, + "loss": 0.7364, + "step": 4724 + }, + { + "epoch": 0.7230298393267024, + "grad_norm": 2.3398331654499103, + "learning_rate": 1.4770081172649995e-05, + "loss": 0.8091, + "step": 4725 + }, + { + "epoch": 0.7231828615149196, + "grad_norm": 2.1132524686931795, + "learning_rate": 1.4767902953868908e-05, + "loss": 0.8064, + "step": 4726 + }, + { + "epoch": 0.723335883703137, + "grad_norm": 2.0520566030415286, + "learning_rate": 1.4765724442272252e-05, + "loss": 0.6772, + "step": 4727 + }, + { + "epoch": 0.7234889058913543, + "grad_norm": 2.30000431071158, + "learning_rate": 1.4763545637993808e-05, + "loss": 0.7558, + "step": 4728 + }, + { + "epoch": 0.7236419280795715, + "grad_norm": 2.632293513300626, + "learning_rate": 1.4761366541167394e-05, + "loss": 0.746, + "step": 4729 + }, + { + "epoch": 0.7237949502677888, + "grad_norm": 2.379979644133256, + "learning_rate": 1.4759187151926833e-05, + "loss": 0.8553, + "step": 4730 + }, + { + "epoch": 0.7239479724560062, + "grad_norm": 2.2346609604101344, + "learning_rate": 1.4757007470405973e-05, + "loss": 0.6299, + "step": 4731 + }, + { + "epoch": 0.7241009946442234, + "grad_norm": 2.27430307026169, + "learning_rate": 1.4754827496738672e-05, + "loss": 0.6518, + "step": 4732 + }, + { + "epoch": 0.7242540168324407, + "grad_norm": 2.1413328730472427, + "learning_rate": 1.4752647231058816e-05, + "loss": 0.6566, + "step": 4733 + }, + { + "epoch": 0.724407039020658, + "grad_norm": 2.3859874983347202, + "learning_rate": 1.47504666735003e-05, + "loss": 0.7186, + "step": 4734 + }, + { + "epoch": 0.7245600612088753, + "grad_norm": 2.3047056874773912, + "learning_rate": 1.4748285824197042e-05, + "loss": 0.866, + "step": 4735 + }, + { + "epoch": 0.7247130833970926, + "grad_norm": 2.1601134195962137, + "learning_rate": 1.4746104683282978e-05, + "loss": 0.6606, + "step": 4736 + }, + { + "epoch": 0.7248661055853098, + "grad_norm": 2.416663587369718, + "learning_rate": 1.4743923250892056e-05, + "loss": 0.8479, + "step": 4737 + }, + { + "epoch": 0.7250191277735272, + "grad_norm": 2.4091001521362276, + "learning_rate": 1.4741741527158246e-05, + "loss": 0.7452, + "step": 4738 + }, + { + "epoch": 0.7251721499617445, + "grad_norm": 2.1203901701257464, + "learning_rate": 1.4739559512215546e-05, + "loss": 0.6542, + "step": 4739 + }, + { + "epoch": 0.7253251721499617, + "grad_norm": 2.132147928958173, + "learning_rate": 1.4737377206197951e-05, + "loss": 0.7092, + "step": 4740 + }, + { + "epoch": 0.7254781943381791, + "grad_norm": 2.208144698192683, + "learning_rate": 1.4735194609239487e-05, + "loss": 0.7039, + "step": 4741 + }, + { + "epoch": 0.7256312165263963, + "grad_norm": 2.505747937898946, + "learning_rate": 1.4733011721474202e-05, + "loss": 0.8391, + "step": 4742 + }, + { + "epoch": 0.7257842387146136, + "grad_norm": 2.04294489654917, + "learning_rate": 1.4730828543036151e-05, + "loss": 0.6893, + "step": 4743 + }, + { + "epoch": 0.725937260902831, + "grad_norm": 2.2859908932549153, + "learning_rate": 1.4728645074059409e-05, + "loss": 0.7034, + "step": 4744 + }, + { + "epoch": 0.7260902830910482, + "grad_norm": 2.0581272816435408, + "learning_rate": 1.4726461314678075e-05, + 
"loss": 0.6576, + "step": 4745 + }, + { + "epoch": 0.7262433052792655, + "grad_norm": 2.1795007417010406, + "learning_rate": 1.4724277265026263e-05, + "loss": 0.7988, + "step": 4746 + }, + { + "epoch": 0.7263963274674828, + "grad_norm": 2.2960157182927365, + "learning_rate": 1.4722092925238106e-05, + "loss": 0.7517, + "step": 4747 + }, + { + "epoch": 0.7265493496557001, + "grad_norm": 2.623740200196195, + "learning_rate": 1.4719908295447745e-05, + "loss": 0.8784, + "step": 4748 + }, + { + "epoch": 0.7267023718439174, + "grad_norm": 2.360442706550242, + "learning_rate": 1.4717723375789353e-05, + "loss": 0.8329, + "step": 4749 + }, + { + "epoch": 0.7268553940321346, + "grad_norm": 2.4744425533825245, + "learning_rate": 1.4715538166397109e-05, + "loss": 0.6913, + "step": 4750 + }, + { + "epoch": 0.727008416220352, + "grad_norm": 2.2088002606216492, + "learning_rate": 1.4713352667405222e-05, + "loss": 0.6413, + "step": 4751 + }, + { + "epoch": 0.7271614384085693, + "grad_norm": 2.1576929521842128, + "learning_rate": 1.4711166878947911e-05, + "loss": 0.7331, + "step": 4752 + }, + { + "epoch": 0.7273144605967865, + "grad_norm": 2.1404435433709343, + "learning_rate": 1.470898080115941e-05, + "loss": 0.7478, + "step": 4753 + }, + { + "epoch": 0.7274674827850038, + "grad_norm": 2.0884135256005627, + "learning_rate": 1.4706794434173974e-05, + "loss": 0.6151, + "step": 4754 + }, + { + "epoch": 0.7276205049732212, + "grad_norm": 2.296005837056293, + "learning_rate": 1.470460777812588e-05, + "loss": 0.7234, + "step": 4755 + }, + { + "epoch": 0.7277735271614384, + "grad_norm": 2.2279102958195995, + "learning_rate": 1.470242083314942e-05, + "loss": 0.7359, + "step": 4756 + }, + { + "epoch": 0.7279265493496557, + "grad_norm": 2.4351625984189536, + "learning_rate": 1.47002335993789e-05, + "loss": 0.7718, + "step": 4757 + }, + { + "epoch": 0.728079571537873, + "grad_norm": 2.4439542483712335, + "learning_rate": 1.4698046076948647e-05, + "loss": 0.7665, + "step": 4758 + }, + { + "epoch": 0.7282325937260903, + "grad_norm": 2.157585434584162, + "learning_rate": 1.4695858265993005e-05, + "loss": 0.7102, + "step": 4759 + }, + { + "epoch": 0.7283856159143076, + "grad_norm": 2.38408004828983, + "learning_rate": 1.4693670166646337e-05, + "loss": 0.7396, + "step": 4760 + }, + { + "epoch": 0.7285386381025248, + "grad_norm": 2.2861670459333525, + "learning_rate": 1.4691481779043022e-05, + "loss": 0.7067, + "step": 4761 + }, + { + "epoch": 0.7286916602907422, + "grad_norm": 2.4361904893888395, + "learning_rate": 1.4689293103317456e-05, + "loss": 0.7469, + "step": 4762 + }, + { + "epoch": 0.7288446824789594, + "grad_norm": 2.031659778568357, + "learning_rate": 1.4687104139604058e-05, + "loss": 0.6313, + "step": 4763 + }, + { + "epoch": 0.7289977046671767, + "grad_norm": 2.0556376938872942, + "learning_rate": 1.468491488803726e-05, + "loss": 0.8087, + "step": 4764 + }, + { + "epoch": 0.7291507268553941, + "grad_norm": 2.565523844865347, + "learning_rate": 1.468272534875151e-05, + "loss": 0.8579, + "step": 4765 + }, + { + "epoch": 0.7293037490436113, + "grad_norm": 2.363770120524518, + "learning_rate": 1.4680535521881277e-05, + "loss": 0.7409, + "step": 4766 + }, + { + "epoch": 0.7294567712318286, + "grad_norm": 2.318947990020625, + "learning_rate": 1.4678345407561046e-05, + "loss": 0.7089, + "step": 4767 + }, + { + "epoch": 0.729609793420046, + "grad_norm": 2.1665273836897954, + "learning_rate": 1.4676155005925323e-05, + "loss": 0.6989, + "step": 4768 + }, + { + "epoch": 0.7297628156082632, + "grad_norm": 
2.282864821091944, + "learning_rate": 1.4673964317108627e-05, + "loss": 0.6375, + "step": 4769 + }, + { + "epoch": 0.7299158377964805, + "grad_norm": 2.1550424071289473, + "learning_rate": 1.4671773341245499e-05, + "loss": 0.7207, + "step": 4770 + }, + { + "epoch": 0.7300688599846977, + "grad_norm": 2.1784007795790505, + "learning_rate": 1.4669582078470494e-05, + "loss": 0.6942, + "step": 4771 + }, + { + "epoch": 0.7302218821729151, + "grad_norm": 2.2050045587066025, + "learning_rate": 1.4667390528918186e-05, + "loss": 0.7414, + "step": 4772 + }, + { + "epoch": 0.7303749043611324, + "grad_norm": 2.107735164826548, + "learning_rate": 1.4665198692723166e-05, + "loss": 0.7075, + "step": 4773 + }, + { + "epoch": 0.7305279265493496, + "grad_norm": 2.145877315751559, + "learning_rate": 1.4663006570020044e-05, + "loss": 0.7179, + "step": 4774 + }, + { + "epoch": 0.730680948737567, + "grad_norm": 2.282915199457753, + "learning_rate": 1.4660814160943448e-05, + "loss": 0.7671, + "step": 4775 + }, + { + "epoch": 0.7308339709257843, + "grad_norm": 2.0724299524788115, + "learning_rate": 1.465862146562802e-05, + "loss": 0.6496, + "step": 4776 + }, + { + "epoch": 0.7309869931140015, + "grad_norm": 2.36617080210269, + "learning_rate": 1.4656428484208423e-05, + "loss": 0.7133, + "step": 4777 + }, + { + "epoch": 0.7311400153022188, + "grad_norm": 2.3154604337397346, + "learning_rate": 1.4654235216819337e-05, + "loss": 0.6548, + "step": 4778 + }, + { + "epoch": 0.7312930374904361, + "grad_norm": 2.284140303142969, + "learning_rate": 1.4652041663595457e-05, + "loss": 0.6584, + "step": 4779 + }, + { + "epoch": 0.7314460596786534, + "grad_norm": 2.549272830669625, + "learning_rate": 1.4649847824671503e-05, + "loss": 0.7738, + "step": 4780 + }, + { + "epoch": 0.7315990818668707, + "grad_norm": 2.3716627589861474, + "learning_rate": 1.4647653700182203e-05, + "loss": 0.6948, + "step": 4781 + }, + { + "epoch": 0.731752104055088, + "grad_norm": 2.254070886426423, + "learning_rate": 1.4645459290262304e-05, + "loss": 0.7778, + "step": 4782 + }, + { + "epoch": 0.7319051262433053, + "grad_norm": 2.4077618806749204, + "learning_rate": 1.464326459504658e-05, + "loss": 0.7467, + "step": 4783 + }, + { + "epoch": 0.7320581484315226, + "grad_norm": 2.4854494866694794, + "learning_rate": 1.4641069614669814e-05, + "loss": 0.799, + "step": 4784 + }, + { + "epoch": 0.7322111706197398, + "grad_norm": 2.097663474137104, + "learning_rate": 1.4638874349266806e-05, + "loss": 0.7088, + "step": 4785 + }, + { + "epoch": 0.7323641928079572, + "grad_norm": 2.2978940941367743, + "learning_rate": 1.4636678798972374e-05, + "loss": 0.6974, + "step": 4786 + }, + { + "epoch": 0.7325172149961744, + "grad_norm": 2.2559176492956676, + "learning_rate": 1.4634482963921362e-05, + "loss": 0.7634, + "step": 4787 + }, + { + "epoch": 0.7326702371843917, + "grad_norm": 2.29409352342819, + "learning_rate": 1.4632286844248618e-05, + "loss": 0.677, + "step": 4788 + }, + { + "epoch": 0.7328232593726091, + "grad_norm": 2.186287939404946, + "learning_rate": 1.463009044008902e-05, + "loss": 0.7605, + "step": 4789 + }, + { + "epoch": 0.7329762815608263, + "grad_norm": 2.487897472414095, + "learning_rate": 1.4627893751577454e-05, + "loss": 0.7555, + "step": 4790 + }, + { + "epoch": 0.7331293037490436, + "grad_norm": 2.1811604114846115, + "learning_rate": 1.4625696778848826e-05, + "loss": 0.7005, + "step": 4791 + }, + { + "epoch": 0.733282325937261, + "grad_norm": 2.344426014732196, + "learning_rate": 1.4623499522038064e-05, + "loss": 0.7748, + "step": 4792 + }, + { 
+ "epoch": 0.7334353481254782, + "grad_norm": 2.344890853831977, + "learning_rate": 1.4621301981280112e-05, + "loss": 0.8042, + "step": 4793 + }, + { + "epoch": 0.7335883703136955, + "grad_norm": 1.9922549418646183, + "learning_rate": 1.4619104156709924e-05, + "loss": 0.6517, + "step": 4794 + }, + { + "epoch": 0.7337413925019127, + "grad_norm": 2.0746510135183365, + "learning_rate": 1.461690604846248e-05, + "loss": 0.5896, + "step": 4795 + }, + { + "epoch": 0.7338944146901301, + "grad_norm": 2.168429526773284, + "learning_rate": 1.4614707656672775e-05, + "loss": 0.7213, + "step": 4796 + }, + { + "epoch": 0.7340474368783474, + "grad_norm": 2.317615047962928, + "learning_rate": 1.4612508981475819e-05, + "loss": 0.7598, + "step": 4797 + }, + { + "epoch": 0.7342004590665646, + "grad_norm": 2.000689298049806, + "learning_rate": 1.4610310023006643e-05, + "loss": 0.67, + "step": 4798 + }, + { + "epoch": 0.734353481254782, + "grad_norm": 2.340228400641039, + "learning_rate": 1.4608110781400293e-05, + "loss": 0.7711, + "step": 4799 + }, + { + "epoch": 0.7345065034429993, + "grad_norm": 2.044964571461997, + "learning_rate": 1.4605911256791831e-05, + "loss": 0.6814, + "step": 4800 + }, + { + "epoch": 0.7346595256312165, + "grad_norm": 2.3241346893015478, + "learning_rate": 1.4603711449316342e-05, + "loss": 0.7522, + "step": 4801 + }, + { + "epoch": 0.7348125478194338, + "grad_norm": 2.3624868394174325, + "learning_rate": 1.4601511359108924e-05, + "loss": 0.7272, + "step": 4802 + }, + { + "epoch": 0.7349655700076511, + "grad_norm": 2.475081517037485, + "learning_rate": 1.4599310986304691e-05, + "loss": 0.735, + "step": 4803 + }, + { + "epoch": 0.7351185921958684, + "grad_norm": 2.2359477222481865, + "learning_rate": 1.4597110331038775e-05, + "loss": 0.6041, + "step": 4804 + }, + { + "epoch": 0.7352716143840857, + "grad_norm": 2.0780513139500374, + "learning_rate": 1.4594909393446334e-05, + "loss": 0.7102, + "step": 4805 + }, + { + "epoch": 0.735424636572303, + "grad_norm": 2.0920809638081943, + "learning_rate": 1.459270817366253e-05, + "loss": 0.7028, + "step": 4806 + }, + { + "epoch": 0.7355776587605203, + "grad_norm": 2.222222916454537, + "learning_rate": 1.459050667182255e-05, + "loss": 0.5553, + "step": 4807 + }, + { + "epoch": 0.7357306809487376, + "grad_norm": 2.2204854583000886, + "learning_rate": 1.4588304888061597e-05, + "loss": 0.6874, + "step": 4808 + }, + { + "epoch": 0.7358837031369548, + "grad_norm": 2.2720313280348208, + "learning_rate": 1.4586102822514896e-05, + "loss": 0.7076, + "step": 4809 + }, + { + "epoch": 0.7360367253251722, + "grad_norm": 2.387822079584415, + "learning_rate": 1.458390047531767e-05, + "loss": 0.7585, + "step": 4810 + }, + { + "epoch": 0.7361897475133894, + "grad_norm": 2.428520327068487, + "learning_rate": 1.4581697846605192e-05, + "loss": 0.7851, + "step": 4811 + }, + { + "epoch": 0.7363427697016067, + "grad_norm": 2.1643869130717475, + "learning_rate": 1.4579494936512722e-05, + "loss": 0.7187, + "step": 4812 + }, + { + "epoch": 0.7364957918898241, + "grad_norm": 2.2809620466631193, + "learning_rate": 1.4577291745175555e-05, + "loss": 0.6697, + "step": 4813 + }, + { + "epoch": 0.7366488140780413, + "grad_norm": 2.015931630069814, + "learning_rate": 1.4575088272728992e-05, + "loss": 0.7241, + "step": 4814 + }, + { + "epoch": 0.7368018362662586, + "grad_norm": 2.48264062580387, + "learning_rate": 1.4572884519308363e-05, + "loss": 0.7079, + "step": 4815 + }, + { + "epoch": 0.736954858454476, + "grad_norm": 2.5497171898501088, + "learning_rate": 
1.4570680485049007e-05, + "loss": 0.7401, + "step": 4816 + }, + { + "epoch": 0.7371078806426932, + "grad_norm": 2.2285617237271182, + "learning_rate": 1.4568476170086281e-05, + "loss": 0.6802, + "step": 4817 + }, + { + "epoch": 0.7372609028309105, + "grad_norm": 2.4025875558068925, + "learning_rate": 1.4566271574555559e-05, + "loss": 0.6301, + "step": 4818 + }, + { + "epoch": 0.7374139250191277, + "grad_norm": 2.4112017394461644, + "learning_rate": 1.4564066698592238e-05, + "loss": 0.6759, + "step": 4819 + }, + { + "epoch": 0.7375669472073451, + "grad_norm": 2.195257542503888, + "learning_rate": 1.4561861542331725e-05, + "loss": 0.6785, + "step": 4820 + }, + { + "epoch": 0.7377199693955624, + "grad_norm": 2.134046236215962, + "learning_rate": 1.4559656105909449e-05, + "loss": 0.66, + "step": 4821 + }, + { + "epoch": 0.7378729915837796, + "grad_norm": 1.7848717620286476, + "learning_rate": 1.4557450389460859e-05, + "loss": 0.592, + "step": 4822 + }, + { + "epoch": 0.738026013771997, + "grad_norm": 2.1999375705173656, + "learning_rate": 1.4555244393121406e-05, + "loss": 0.6376, + "step": 4823 + }, + { + "epoch": 0.7381790359602143, + "grad_norm": 2.192595297089319, + "learning_rate": 1.4553038117026577e-05, + "loss": 0.6949, + "step": 4824 + }, + { + "epoch": 0.7383320581484315, + "grad_norm": 2.5214156134793764, + "learning_rate": 1.4550831561311864e-05, + "loss": 0.8018, + "step": 4825 + }, + { + "epoch": 0.7384850803366488, + "grad_norm": 2.8632598195698655, + "learning_rate": 1.4548624726112782e-05, + "loss": 0.7681, + "step": 4826 + }, + { + "epoch": 0.7386381025248661, + "grad_norm": 2.310980789411813, + "learning_rate": 1.4546417611564864e-05, + "loss": 0.7135, + "step": 4827 + }, + { + "epoch": 0.7387911247130834, + "grad_norm": 2.324914918749278, + "learning_rate": 1.4544210217803651e-05, + "loss": 0.6451, + "step": 4828 + }, + { + "epoch": 0.7389441469013007, + "grad_norm": 2.2455011773034212, + "learning_rate": 1.4542002544964713e-05, + "loss": 0.7251, + "step": 4829 + }, + { + "epoch": 0.739097169089518, + "grad_norm": 2.3997081873057353, + "learning_rate": 1.4539794593183634e-05, + "loss": 0.6291, + "step": 4830 + }, + { + "epoch": 0.7392501912777353, + "grad_norm": 2.4566939192312356, + "learning_rate": 1.4537586362596005e-05, + "loss": 0.7435, + "step": 4831 + }, + { + "epoch": 0.7394032134659526, + "grad_norm": 2.4455663976068145, + "learning_rate": 1.4535377853337448e-05, + "loss": 0.6491, + "step": 4832 + }, + { + "epoch": 0.7395562356541698, + "grad_norm": 2.1715569659281044, + "learning_rate": 1.4533169065543593e-05, + "loss": 0.6749, + "step": 4833 + }, + { + "epoch": 0.7397092578423872, + "grad_norm": 2.291412184131578, + "learning_rate": 1.4530959999350095e-05, + "loss": 0.643, + "step": 4834 + }, + { + "epoch": 0.7398622800306044, + "grad_norm": 2.204427629847068, + "learning_rate": 1.4528750654892614e-05, + "loss": 0.8016, + "step": 4835 + }, + { + "epoch": 0.7400153022188217, + "grad_norm": 1.9802730994342306, + "learning_rate": 1.452654103230684e-05, + "loss": 0.7306, + "step": 4836 + }, + { + "epoch": 0.7401683244070391, + "grad_norm": 2.4833975120925977, + "learning_rate": 1.4524331131728472e-05, + "loss": 0.7693, + "step": 4837 + }, + { + "epoch": 0.7403213465952563, + "grad_norm": 2.2973589798555634, + "learning_rate": 1.4522120953293233e-05, + "loss": 0.6689, + "step": 4838 + }, + { + "epoch": 0.7404743687834736, + "grad_norm": 2.025223238613172, + "learning_rate": 1.4519910497136856e-05, + "loss": 0.6473, + "step": 4839 + }, + { + "epoch": 0.740627390971691, 
+ "grad_norm": 2.348915095079573, + "learning_rate": 1.451769976339509e-05, + "loss": 0.8239, + "step": 4840 + }, + { + "epoch": 0.7407804131599082, + "grad_norm": 2.309855719910271, + "learning_rate": 1.4515488752203713e-05, + "loss": 0.7992, + "step": 4841 + }, + { + "epoch": 0.7409334353481255, + "grad_norm": 2.206226950757133, + "learning_rate": 1.4513277463698503e-05, + "loss": 0.6737, + "step": 4842 + }, + { + "epoch": 0.7410864575363427, + "grad_norm": 2.2352741297777756, + "learning_rate": 1.4511065898015269e-05, + "loss": 0.7212, + "step": 4843 + }, + { + "epoch": 0.7412394797245601, + "grad_norm": 2.38240113752834, + "learning_rate": 1.450885405528983e-05, + "loss": 0.8639, + "step": 4844 + }, + { + "epoch": 0.7413925019127774, + "grad_norm": 2.2151203809554496, + "learning_rate": 1.4506641935658023e-05, + "loss": 0.7709, + "step": 4845 + }, + { + "epoch": 0.7415455241009946, + "grad_norm": 2.156453587248878, + "learning_rate": 1.4504429539255708e-05, + "loss": 0.7863, + "step": 4846 + }, + { + "epoch": 0.741698546289212, + "grad_norm": 2.139602715597007, + "learning_rate": 1.4502216866218755e-05, + "loss": 0.7202, + "step": 4847 + }, + { + "epoch": 0.7418515684774293, + "grad_norm": 2.2784471326319156, + "learning_rate": 1.4500003916683047e-05, + "loss": 0.8374, + "step": 4848 + }, + { + "epoch": 0.7420045906656465, + "grad_norm": 2.3563875101559217, + "learning_rate": 1.4497790690784498e-05, + "loss": 0.6719, + "step": 4849 + }, + { + "epoch": 0.7421576128538638, + "grad_norm": 2.2202100939612825, + "learning_rate": 1.4495577188659028e-05, + "loss": 0.6286, + "step": 4850 + }, + { + "epoch": 0.7423106350420811, + "grad_norm": 2.5782599669858586, + "learning_rate": 1.449336341044257e-05, + "loss": 0.7125, + "step": 4851 + }, + { + "epoch": 0.7424636572302984, + "grad_norm": 2.374270686758012, + "learning_rate": 1.4491149356271092e-05, + "loss": 0.6643, + "step": 4852 + }, + { + "epoch": 0.7426166794185157, + "grad_norm": 2.5637372901568094, + "learning_rate": 1.4488935026280561e-05, + "loss": 0.6725, + "step": 4853 + }, + { + "epoch": 0.742769701606733, + "grad_norm": 2.1071033399459136, + "learning_rate": 1.448672042060697e-05, + "loss": 0.676, + "step": 4854 + }, + { + "epoch": 0.7429227237949503, + "grad_norm": 2.4993248270173747, + "learning_rate": 1.4484505539386324e-05, + "loss": 0.7052, + "step": 4855 + }, + { + "epoch": 0.7430757459831676, + "grad_norm": 2.076983925023967, + "learning_rate": 1.448229038275465e-05, + "loss": 0.6189, + "step": 4856 + }, + { + "epoch": 0.7432287681713848, + "grad_norm": 2.433934235940275, + "learning_rate": 1.4480074950847991e-05, + "loss": 0.8599, + "step": 4857 + }, + { + "epoch": 0.7433817903596022, + "grad_norm": 2.2124457746802637, + "learning_rate": 1.4477859243802401e-05, + "loss": 0.7823, + "step": 4858 + }, + { + "epoch": 0.7435348125478194, + "grad_norm": 2.570932508736398, + "learning_rate": 1.447564326175396e-05, + "loss": 0.8181, + "step": 4859 + }, + { + "epoch": 0.7436878347360367, + "grad_norm": 2.4117017713360904, + "learning_rate": 1.4473427004838754e-05, + "loss": 0.7226, + "step": 4860 + }, + { + "epoch": 0.7438408569242541, + "grad_norm": 2.176505200952037, + "learning_rate": 1.4471210473192896e-05, + "loss": 0.7055, + "step": 4861 + }, + { + "epoch": 0.7439938791124713, + "grad_norm": 2.1857445379147755, + "learning_rate": 1.4468993666952512e-05, + "loss": 0.6834, + "step": 4862 + }, + { + "epoch": 0.7441469013006886, + "grad_norm": 2.635693368966689, + "learning_rate": 1.4466776586253745e-05, + "loss": 0.8412, + 
"step": 4863 + }, + { + "epoch": 0.744299923488906, + "grad_norm": 2.0451933975323193, + "learning_rate": 1.446455923123275e-05, + "loss": 0.6854, + "step": 4864 + }, + { + "epoch": 0.7444529456771232, + "grad_norm": 2.275358040160873, + "learning_rate": 1.4462341602025714e-05, + "loss": 0.7035, + "step": 4865 + }, + { + "epoch": 0.7446059678653405, + "grad_norm": 2.4710038242202956, + "learning_rate": 1.4460123698768817e-05, + "loss": 0.8401, + "step": 4866 + }, + { + "epoch": 0.7447589900535577, + "grad_norm": 2.4289044690905897, + "learning_rate": 1.4457905521598279e-05, + "loss": 0.7351, + "step": 4867 + }, + { + "epoch": 0.7449120122417751, + "grad_norm": 2.31025768005515, + "learning_rate": 1.4455687070650324e-05, + "loss": 0.782, + "step": 4868 + }, + { + "epoch": 0.7450650344299924, + "grad_norm": 2.1544459445175232, + "learning_rate": 1.4453468346061193e-05, + "loss": 0.7421, + "step": 4869 + }, + { + "epoch": 0.7452180566182096, + "grad_norm": 2.3447105605966807, + "learning_rate": 1.445124934796715e-05, + "loss": 0.7559, + "step": 4870 + }, + { + "epoch": 0.745371078806427, + "grad_norm": 2.239958777557601, + "learning_rate": 1.444903007650447e-05, + "loss": 0.6981, + "step": 4871 + }, + { + "epoch": 0.7455241009946443, + "grad_norm": 2.249734548223549, + "learning_rate": 1.444681053180945e-05, + "loss": 0.751, + "step": 4872 + }, + { + "epoch": 0.7456771231828615, + "grad_norm": 2.340819325628896, + "learning_rate": 1.4444590714018398e-05, + "loss": 0.7303, + "step": 4873 + }, + { + "epoch": 0.7458301453710788, + "grad_norm": 2.34414004865152, + "learning_rate": 1.4442370623267643e-05, + "loss": 0.7181, + "step": 4874 + }, + { + "epoch": 0.7459831675592961, + "grad_norm": 3.0286066038526562, + "learning_rate": 1.444015025969353e-05, + "loss": 0.831, + "step": 4875 + }, + { + "epoch": 0.7461361897475134, + "grad_norm": 2.425379330178627, + "learning_rate": 1.4437929623432418e-05, + "loss": 0.7453, + "step": 4876 + }, + { + "epoch": 0.7462892119357307, + "grad_norm": 2.228493718435368, + "learning_rate": 1.443570871462069e-05, + "loss": 0.6826, + "step": 4877 + }, + { + "epoch": 0.746442234123948, + "grad_norm": 2.1177422595743685, + "learning_rate": 1.4433487533394734e-05, + "loss": 0.6157, + "step": 4878 + }, + { + "epoch": 0.7465952563121653, + "grad_norm": 2.0833369615704513, + "learning_rate": 1.4431266079890964e-05, + "loss": 0.5789, + "step": 4879 + }, + { + "epoch": 0.7467482785003825, + "grad_norm": 2.8278316899269873, + "learning_rate": 1.4429044354245812e-05, + "loss": 0.7478, + "step": 4880 + }, + { + "epoch": 0.7469013006885998, + "grad_norm": 2.478404645618318, + "learning_rate": 1.4426822356595718e-05, + "loss": 0.6872, + "step": 4881 + }, + { + "epoch": 0.7470543228768172, + "grad_norm": 2.3329379350497508, + "learning_rate": 1.4424600087077148e-05, + "loss": 0.7983, + "step": 4882 + }, + { + "epoch": 0.7472073450650344, + "grad_norm": 2.2267621885651003, + "learning_rate": 1.4422377545826574e-05, + "loss": 0.7222, + "step": 4883 + }, + { + "epoch": 0.7473603672532517, + "grad_norm": 2.4536454346196424, + "learning_rate": 1.4420154732980493e-05, + "loss": 0.7625, + "step": 4884 + }, + { + "epoch": 0.7475133894414691, + "grad_norm": 2.022708265593278, + "learning_rate": 1.4417931648675423e-05, + "loss": 0.7153, + "step": 4885 + }, + { + "epoch": 0.7476664116296863, + "grad_norm": 2.256460123168098, + "learning_rate": 1.4415708293047884e-05, + "loss": 0.7022, + "step": 4886 + }, + { + "epoch": 0.7478194338179036, + "grad_norm": 2.136078642899449, + "learning_rate": 
1.4413484666234427e-05, + "loss": 0.6409, + "step": 4887 + }, + { + "epoch": 0.7479724560061208, + "grad_norm": 2.2325055404407994, + "learning_rate": 1.4411260768371609e-05, + "loss": 0.729, + "step": 4888 + }, + { + "epoch": 0.7481254781943382, + "grad_norm": 2.3585569399038695, + "learning_rate": 1.440903659959601e-05, + "loss": 0.7476, + "step": 4889 + }, + { + "epoch": 0.7482785003825555, + "grad_norm": 2.3183366182918888, + "learning_rate": 1.4406812160044226e-05, + "loss": 0.7876, + "step": 4890 + }, + { + "epoch": 0.7484315225707727, + "grad_norm": 2.4632969026460687, + "learning_rate": 1.4404587449852865e-05, + "loss": 0.7183, + "step": 4891 + }, + { + "epoch": 0.7485845447589901, + "grad_norm": 2.265010236192547, + "learning_rate": 1.4402362469158562e-05, + "loss": 0.7735, + "step": 4892 + }, + { + "epoch": 0.7487375669472074, + "grad_norm": 2.2999201065686266, + "learning_rate": 1.4400137218097956e-05, + "loss": 0.7304, + "step": 4893 + }, + { + "epoch": 0.7488905891354246, + "grad_norm": 2.2799436177017216, + "learning_rate": 1.4397911696807708e-05, + "loss": 0.7187, + "step": 4894 + }, + { + "epoch": 0.749043611323642, + "grad_norm": 2.133201532746618, + "learning_rate": 1.43956859054245e-05, + "loss": 0.735, + "step": 4895 + }, + { + "epoch": 0.7491966335118592, + "grad_norm": 2.3822318079150535, + "learning_rate": 1.4393459844085027e-05, + "loss": 0.751, + "step": 4896 + }, + { + "epoch": 0.7493496557000765, + "grad_norm": 2.276622654679582, + "learning_rate": 1.4391233512925994e-05, + "loss": 0.6863, + "step": 4897 + }, + { + "epoch": 0.7495026778882938, + "grad_norm": 2.4161787604889193, + "learning_rate": 1.4389006912084136e-05, + "loss": 0.8374, + "step": 4898 + }, + { + "epoch": 0.7496557000765111, + "grad_norm": 2.27888042381393, + "learning_rate": 1.4386780041696189e-05, + "loss": 0.6775, + "step": 4899 + }, + { + "epoch": 0.7498087222647284, + "grad_norm": 2.520476013025121, + "learning_rate": 1.438455290189892e-05, + "loss": 0.7428, + "step": 4900 + }, + { + "epoch": 0.7499617444529457, + "grad_norm": 2.232440986785114, + "learning_rate": 1.4382325492829108e-05, + "loss": 0.6929, + "step": 4901 + }, + { + "epoch": 0.7501147666411629, + "grad_norm": 2.2194515852033816, + "learning_rate": 1.4380097814623539e-05, + "loss": 0.7759, + "step": 4902 + }, + { + "epoch": 0.7502677888293803, + "grad_norm": 2.493029827031643, + "learning_rate": 1.437786986741903e-05, + "loss": 0.7942, + "step": 4903 + }, + { + "epoch": 0.7504208110175975, + "grad_norm": 2.532507217587678, + "learning_rate": 1.4375641651352408e-05, + "loss": 0.7536, + "step": 4904 + }, + { + "epoch": 0.7505738332058148, + "grad_norm": 2.1324616550429343, + "learning_rate": 1.4373413166560512e-05, + "loss": 0.6899, + "step": 4905 + }, + { + "epoch": 0.7507268553940322, + "grad_norm": 2.404454010344963, + "learning_rate": 1.4371184413180205e-05, + "loss": 0.7995, + "step": 4906 + }, + { + "epoch": 0.7508798775822494, + "grad_norm": 2.212175857528018, + "learning_rate": 1.4368955391348366e-05, + "loss": 0.5926, + "step": 4907 + }, + { + "epoch": 0.7510328997704667, + "grad_norm": 2.155592680667354, + "learning_rate": 1.436672610120188e-05, + "loss": 0.6284, + "step": 4908 + }, + { + "epoch": 0.751185921958684, + "grad_norm": 2.3741727185360832, + "learning_rate": 1.4364496542877663e-05, + "loss": 0.7462, + "step": 4909 + }, + { + "epoch": 0.7513389441469013, + "grad_norm": 2.1469671874999325, + "learning_rate": 1.4362266716512643e-05, + "loss": 0.7229, + "step": 4910 + }, + { + "epoch": 0.7514919663351186, + 
"grad_norm": 2.121527275206317, + "learning_rate": 1.4360036622243754e-05, + "loss": 0.7239, + "step": 4911 + }, + { + "epoch": 0.7516449885233358, + "grad_norm": 2.2376730417194812, + "learning_rate": 1.4357806260207962e-05, + "loss": 0.696, + "step": 4912 + }, + { + "epoch": 0.7517980107115532, + "grad_norm": 2.2821150028836574, + "learning_rate": 1.435557563054224e-05, + "loss": 0.785, + "step": 4913 + }, + { + "epoch": 0.7519510328997705, + "grad_norm": 2.0970063152782066, + "learning_rate": 1.4353344733383576e-05, + "loss": 0.7399, + "step": 4914 + }, + { + "epoch": 0.7521040550879877, + "grad_norm": 2.083207329954238, + "learning_rate": 1.4351113568868987e-05, + "loss": 0.5428, + "step": 4915 + }, + { + "epoch": 0.752257077276205, + "grad_norm": 2.245970612654284, + "learning_rate": 1.434888213713549e-05, + "loss": 0.7392, + "step": 4916 + }, + { + "epoch": 0.7524100994644224, + "grad_norm": 2.3609964289459113, + "learning_rate": 1.4346650438320124e-05, + "loss": 0.6814, + "step": 4917 + }, + { + "epoch": 0.7525631216526396, + "grad_norm": 2.1525419459253556, + "learning_rate": 1.4344418472559956e-05, + "loss": 0.6854, + "step": 4918 + }, + { + "epoch": 0.7527161438408569, + "grad_norm": 2.330687789914893, + "learning_rate": 1.434218623999205e-05, + "loss": 0.6747, + "step": 4919 + }, + { + "epoch": 0.7528691660290742, + "grad_norm": 2.337433181523013, + "learning_rate": 1.4339953740753502e-05, + "loss": 0.6729, + "step": 4920 + }, + { + "epoch": 0.7530221882172915, + "grad_norm": 2.165023230727938, + "learning_rate": 1.4337720974981417e-05, + "loss": 0.6102, + "step": 4921 + }, + { + "epoch": 0.7531752104055088, + "grad_norm": 2.095474532830923, + "learning_rate": 1.4335487942812919e-05, + "loss": 0.7267, + "step": 4922 + }, + { + "epoch": 0.753328232593726, + "grad_norm": 2.1802082090993777, + "learning_rate": 1.4333254644385144e-05, + "loss": 0.6848, + "step": 4923 + }, + { + "epoch": 0.7534812547819434, + "grad_norm": 2.6591708663481715, + "learning_rate": 1.4331021079835249e-05, + "loss": 0.8153, + "step": 4924 + }, + { + "epoch": 0.7536342769701607, + "grad_norm": 2.3136679846972665, + "learning_rate": 1.432878724930041e-05, + "loss": 0.69, + "step": 4925 + }, + { + "epoch": 0.7537872991583779, + "grad_norm": 2.2807435166362113, + "learning_rate": 1.4326553152917808e-05, + "loss": 0.6869, + "step": 4926 + }, + { + "epoch": 0.7539403213465953, + "grad_norm": 2.1228252902705402, + "learning_rate": 1.432431879082465e-05, + "loss": 0.7477, + "step": 4927 + }, + { + "epoch": 0.7540933435348125, + "grad_norm": 2.4278219077579473, + "learning_rate": 1.432208416315816e-05, + "loss": 0.7974, + "step": 4928 + }, + { + "epoch": 0.7542463657230298, + "grad_norm": 2.5232419497439396, + "learning_rate": 1.4319849270055576e-05, + "loss": 0.732, + "step": 4929 + }, + { + "epoch": 0.7543993879112472, + "grad_norm": 2.0991061394057406, + "learning_rate": 1.4317614111654145e-05, + "loss": 0.7423, + "step": 4930 + }, + { + "epoch": 0.7545524100994644, + "grad_norm": 2.3815083789734164, + "learning_rate": 1.4315378688091143e-05, + "loss": 0.9114, + "step": 4931 + }, + { + "epoch": 0.7547054322876817, + "grad_norm": 2.185730306642332, + "learning_rate": 1.4313142999503855e-05, + "loss": 0.7058, + "step": 4932 + }, + { + "epoch": 0.754858454475899, + "grad_norm": 2.1376989838870113, + "learning_rate": 1.4310907046029581e-05, + "loss": 0.702, + "step": 4933 + }, + { + "epoch": 0.7550114766641163, + "grad_norm": 2.185315720970579, + "learning_rate": 1.4308670827805642e-05, + "loss": 0.6436, + "step": 
4934 + }, + { + "epoch": 0.7551644988523336, + "grad_norm": 2.325819141198931, + "learning_rate": 1.430643434496937e-05, + "loss": 0.7432, + "step": 4935 + }, + { + "epoch": 0.7553175210405508, + "grad_norm": 2.022900910011311, + "learning_rate": 1.4304197597658119e-05, + "loss": 0.7569, + "step": 4936 + }, + { + "epoch": 0.7554705432287682, + "grad_norm": 2.2978717235749917, + "learning_rate": 1.4301960586009255e-05, + "loss": 0.8398, + "step": 4937 + }, + { + "epoch": 0.7556235654169855, + "grad_norm": 2.0323787781258114, + "learning_rate": 1.4299723310160163e-05, + "loss": 0.8232, + "step": 4938 + }, + { + "epoch": 0.7557765876052027, + "grad_norm": 2.4687308360656344, + "learning_rate": 1.429748577024824e-05, + "loss": 0.7811, + "step": 4939 + }, + { + "epoch": 0.75592960979342, + "grad_norm": 2.416728876586808, + "learning_rate": 1.4295247966410903e-05, + "loss": 0.7862, + "step": 4940 + }, + { + "epoch": 0.7560826319816374, + "grad_norm": 2.430436233578812, + "learning_rate": 1.429300989878559e-05, + "loss": 0.7025, + "step": 4941 + }, + { + "epoch": 0.7562356541698546, + "grad_norm": 2.2106978733410556, + "learning_rate": 1.4290771567509745e-05, + "loss": 0.6891, + "step": 4942 + }, + { + "epoch": 0.7563886763580719, + "grad_norm": 2.4306496637568484, + "learning_rate": 1.4288532972720825e-05, + "loss": 0.8306, + "step": 4943 + }, + { + "epoch": 0.7565416985462892, + "grad_norm": 2.469829207837791, + "learning_rate": 1.4286294114556325e-05, + "loss": 0.692, + "step": 4944 + }, + { + "epoch": 0.7566947207345065, + "grad_norm": 2.4309218792187273, + "learning_rate": 1.4284054993153735e-05, + "loss": 0.7137, + "step": 4945 + }, + { + "epoch": 0.7568477429227238, + "grad_norm": 2.579777669684564, + "learning_rate": 1.4281815608650565e-05, + "loss": 0.7477, + "step": 4946 + }, + { + "epoch": 0.757000765110941, + "grad_norm": 2.0971978445223227, + "learning_rate": 1.4279575961184348e-05, + "loss": 0.6919, + "step": 4947 + }, + { + "epoch": 0.7571537872991584, + "grad_norm": 2.2553951720139422, + "learning_rate": 1.4277336050892631e-05, + "loss": 0.7135, + "step": 4948 + }, + { + "epoch": 0.7573068094873757, + "grad_norm": 2.149299088970806, + "learning_rate": 1.4275095877912976e-05, + "loss": 0.6381, + "step": 4949 + }, + { + "epoch": 0.7574598316755929, + "grad_norm": 2.1629746985410523, + "learning_rate": 1.4272855442382957e-05, + "loss": 0.6806, + "step": 4950 + }, + { + "epoch": 0.7576128538638103, + "grad_norm": 2.3261663770030157, + "learning_rate": 1.427061474444017e-05, + "loss": 0.6916, + "step": 4951 + }, + { + "epoch": 0.7577658760520275, + "grad_norm": 2.1499157181080566, + "learning_rate": 1.4268373784222225e-05, + "loss": 0.676, + "step": 4952 + }, + { + "epoch": 0.7579188982402448, + "grad_norm": 2.4744089486881293, + "learning_rate": 1.4266132561866747e-05, + "loss": 0.8978, + "step": 4953 + }, + { + "epoch": 0.7580719204284622, + "grad_norm": 2.168743668047201, + "learning_rate": 1.4263891077511383e-05, + "loss": 0.6623, + "step": 4954 + }, + { + "epoch": 0.7582249426166794, + "grad_norm": 2.3449514837783063, + "learning_rate": 1.4261649331293781e-05, + "loss": 0.6634, + "step": 4955 + }, + { + "epoch": 0.7583779648048967, + "grad_norm": 2.5034072035723023, + "learning_rate": 1.4259407323351626e-05, + "loss": 0.7541, + "step": 4956 + }, + { + "epoch": 0.758530986993114, + "grad_norm": 2.3696350755990485, + "learning_rate": 1.4257165053822605e-05, + "loss": 0.6219, + "step": 4957 + }, + { + "epoch": 0.7586840091813313, + "grad_norm": 2.3382755188679742, + "learning_rate": 
1.425492252284442e-05, + "loss": 0.7696, + "step": 4958 + }, + { + "epoch": 0.7588370313695486, + "grad_norm": 2.135467705544331, + "learning_rate": 1.4252679730554801e-05, + "loss": 0.7307, + "step": 4959 + }, + { + "epoch": 0.7589900535577658, + "grad_norm": 2.408284072064693, + "learning_rate": 1.4250436677091482e-05, + "loss": 0.7607, + "step": 4960 + }, + { + "epoch": 0.7591430757459832, + "grad_norm": 2.2487141060628337, + "learning_rate": 1.424819336259222e-05, + "loss": 0.7047, + "step": 4961 + }, + { + "epoch": 0.7592960979342005, + "grad_norm": 2.174625864768563, + "learning_rate": 1.4245949787194783e-05, + "loss": 0.7431, + "step": 4962 + }, + { + "epoch": 0.7594491201224177, + "grad_norm": 2.3367292934749084, + "learning_rate": 1.4243705951036961e-05, + "loss": 0.7039, + "step": 4963 + }, + { + "epoch": 0.759602142310635, + "grad_norm": 2.3698750474828425, + "learning_rate": 1.4241461854256553e-05, + "loss": 0.692, + "step": 4964 + }, + { + "epoch": 0.7597551644988524, + "grad_norm": 2.3837948494422525, + "learning_rate": 1.423921749699138e-05, + "loss": 0.682, + "step": 4965 + }, + { + "epoch": 0.7599081866870696, + "grad_norm": 1.9568026488800232, + "learning_rate": 1.423697287937928e-05, + "loss": 0.5177, + "step": 4966 + }, + { + "epoch": 0.7600612088752869, + "grad_norm": 2.011750007271983, + "learning_rate": 1.4234728001558098e-05, + "loss": 0.7043, + "step": 4967 + }, + { + "epoch": 0.7602142310635042, + "grad_norm": 2.523991868179148, + "learning_rate": 1.42324828636657e-05, + "loss": 0.8005, + "step": 4968 + }, + { + "epoch": 0.7603672532517215, + "grad_norm": 1.9438899634689, + "learning_rate": 1.4230237465839975e-05, + "loss": 0.6676, + "step": 4969 + }, + { + "epoch": 0.7605202754399388, + "grad_norm": 2.224899265987228, + "learning_rate": 1.422799180821882e-05, + "loss": 0.6246, + "step": 4970 + }, + { + "epoch": 0.760673297628156, + "grad_norm": 2.2936026183785567, + "learning_rate": 1.4225745890940145e-05, + "loss": 0.7018, + "step": 4971 + }, + { + "epoch": 0.7608263198163734, + "grad_norm": 2.283389788103775, + "learning_rate": 1.4223499714141885e-05, + "loss": 0.7678, + "step": 4972 + }, + { + "epoch": 0.7609793420045907, + "grad_norm": 2.1905560444534546, + "learning_rate": 1.4221253277961987e-05, + "loss": 0.696, + "step": 4973 + }, + { + "epoch": 0.7611323641928079, + "grad_norm": 2.2357422103556504, + "learning_rate": 1.4219006582538409e-05, + "loss": 0.7681, + "step": 4974 + }, + { + "epoch": 0.7612853863810253, + "grad_norm": 2.104374866205293, + "learning_rate": 1.4216759628009132e-05, + "loss": 0.6724, + "step": 4975 + }, + { + "epoch": 0.7614384085692425, + "grad_norm": 2.376649516428986, + "learning_rate": 1.421451241451215e-05, + "loss": 0.7457, + "step": 4976 + }, + { + "epoch": 0.7615914307574598, + "grad_norm": 2.5026682947159773, + "learning_rate": 1.4212264942185473e-05, + "loss": 0.7707, + "step": 4977 + }, + { + "epoch": 0.7617444529456772, + "grad_norm": 2.339951422144216, + "learning_rate": 1.421001721116713e-05, + "loss": 0.7215, + "step": 4978 + }, + { + "epoch": 0.7618974751338944, + "grad_norm": 2.5119998784613524, + "learning_rate": 1.4207769221595157e-05, + "loss": 0.8083, + "step": 4979 + }, + { + "epoch": 0.7620504973221117, + "grad_norm": 2.503174353934065, + "learning_rate": 1.4205520973607618e-05, + "loss": 0.6887, + "step": 4980 + }, + { + "epoch": 0.762203519510329, + "grad_norm": 2.4742332404941387, + "learning_rate": 1.4203272467342582e-05, + "loss": 0.7633, + "step": 4981 + }, + { + "epoch": 0.7623565416985463, + 
"grad_norm": 2.4392406082909655, + "learning_rate": 1.4201023702938143e-05, + "loss": 0.6757, + "step": 4982 + }, + { + "epoch": 0.7625095638867636, + "grad_norm": 2.3098987205477064, + "learning_rate": 1.4198774680532403e-05, + "loss": 0.7513, + "step": 4983 + }, + { + "epoch": 0.7626625860749808, + "grad_norm": 2.258614987166009, + "learning_rate": 1.4196525400263482e-05, + "loss": 0.7191, + "step": 4984 + }, + { + "epoch": 0.7628156082631982, + "grad_norm": 2.3404733220525196, + "learning_rate": 1.419427586226952e-05, + "loss": 0.799, + "step": 4985 + }, + { + "epoch": 0.7629686304514155, + "grad_norm": 2.2779284684505354, + "learning_rate": 1.4192026066688673e-05, + "loss": 0.8147, + "step": 4986 + }, + { + "epoch": 0.7631216526396327, + "grad_norm": 2.31643846075932, + "learning_rate": 1.41897760136591e-05, + "loss": 0.7568, + "step": 4987 + }, + { + "epoch": 0.76327467482785, + "grad_norm": 2.2663948410577337, + "learning_rate": 1.4187525703318996e-05, + "loss": 0.6588, + "step": 4988 + }, + { + "epoch": 0.7634276970160673, + "grad_norm": 2.295491923173907, + "learning_rate": 1.4185275135806555e-05, + "loss": 0.8117, + "step": 4989 + }, + { + "epoch": 0.7635807192042846, + "grad_norm": 2.353699212264283, + "learning_rate": 1.4183024311259997e-05, + "loss": 0.6288, + "step": 4990 + }, + { + "epoch": 0.7637337413925019, + "grad_norm": 2.1939178575915124, + "learning_rate": 1.4180773229817548e-05, + "loss": 0.6682, + "step": 4991 + }, + { + "epoch": 0.7638867635807192, + "grad_norm": 2.2979720867197377, + "learning_rate": 1.4178521891617462e-05, + "loss": 0.7026, + "step": 4992 + }, + { + "epoch": 0.7640397857689365, + "grad_norm": 2.4221273394850487, + "learning_rate": 1.4176270296797998e-05, + "loss": 0.7267, + "step": 4993 + }, + { + "epoch": 0.7641928079571538, + "grad_norm": 2.3662257487773797, + "learning_rate": 1.4174018445497439e-05, + "loss": 0.7745, + "step": 4994 + }, + { + "epoch": 0.764345830145371, + "grad_norm": 2.2759670548636133, + "learning_rate": 1.4171766337854083e-05, + "loss": 0.6579, + "step": 4995 + }, + { + "epoch": 0.7644988523335884, + "grad_norm": 1.9781887539285967, + "learning_rate": 1.416951397400623e-05, + "loss": 0.6649, + "step": 4996 + }, + { + "epoch": 0.7646518745218056, + "grad_norm": 2.2189360667232414, + "learning_rate": 1.4167261354092214e-05, + "loss": 0.7571, + "step": 4997 + }, + { + "epoch": 0.7648048967100229, + "grad_norm": 2.280749037561788, + "learning_rate": 1.4165008478250377e-05, + "loss": 0.717, + "step": 4998 + }, + { + "epoch": 0.7649579188982403, + "grad_norm": 2.362059249926859, + "learning_rate": 1.4162755346619075e-05, + "loss": 0.6807, + "step": 4999 + }, + { + "epoch": 0.7651109410864575, + "grad_norm": 2.2248228523984084, + "learning_rate": 1.4160501959336684e-05, + "loss": 0.6791, + "step": 5000 + }, + { + "epoch": 0.7652639632746748, + "grad_norm": 2.2767134208389717, + "learning_rate": 1.415824831654159e-05, + "loss": 0.7279, + "step": 5001 + }, + { + "epoch": 0.7654169854628922, + "grad_norm": 2.259109443503308, + "learning_rate": 1.41559944183722e-05, + "loss": 0.6783, + "step": 5002 + }, + { + "epoch": 0.7655700076511094, + "grad_norm": 2.119358016956952, + "learning_rate": 1.4153740264966935e-05, + "loss": 0.7382, + "step": 5003 + }, + { + "epoch": 0.7657230298393267, + "grad_norm": 2.2868498026059507, + "learning_rate": 1.4151485856464231e-05, + "loss": 0.728, + "step": 5004 + }, + { + "epoch": 0.7658760520275439, + "grad_norm": 2.3259370090069074, + "learning_rate": 1.414923119300254e-05, + "loss": 0.7322, + "step": 
5005 + }, + { + "epoch": 0.7660290742157613, + "grad_norm": 2.0952658010872325, + "learning_rate": 1.4146976274720325e-05, + "loss": 0.6611, + "step": 5006 + }, + { + "epoch": 0.7661820964039786, + "grad_norm": 2.2507490230036615, + "learning_rate": 1.414472110175608e-05, + "loss": 0.6851, + "step": 5007 + }, + { + "epoch": 0.7663351185921958, + "grad_norm": 2.3596475176429577, + "learning_rate": 1.4142465674248295e-05, + "loss": 0.7103, + "step": 5008 + }, + { + "epoch": 0.7664881407804132, + "grad_norm": 2.2428654694936165, + "learning_rate": 1.4140209992335488e-05, + "loss": 0.7419, + "step": 5009 + }, + { + "epoch": 0.7666411629686305, + "grad_norm": 2.412219562600667, + "learning_rate": 1.4137954056156189e-05, + "loss": 0.822, + "step": 5010 + }, + { + "epoch": 0.7667941851568477, + "grad_norm": 2.469683847966789, + "learning_rate": 1.4135697865848945e-05, + "loss": 0.821, + "step": 5011 + }, + { + "epoch": 0.766947207345065, + "grad_norm": 2.380874074809561, + "learning_rate": 1.4133441421552312e-05, + "loss": 0.7444, + "step": 5012 + }, + { + "epoch": 0.7671002295332823, + "grad_norm": 2.54268340852517, + "learning_rate": 1.4131184723404876e-05, + "loss": 0.8583, + "step": 5013 + }, + { + "epoch": 0.7672532517214996, + "grad_norm": 2.1625190284078863, + "learning_rate": 1.4128927771545222e-05, + "loss": 0.6829, + "step": 5014 + }, + { + "epoch": 0.7674062739097169, + "grad_norm": 2.2366367904796176, + "learning_rate": 1.4126670566111962e-05, + "loss": 0.7066, + "step": 5015 + }, + { + "epoch": 0.7675592960979342, + "grad_norm": 2.2869932661750005, + "learning_rate": 1.4124413107243718e-05, + "loss": 0.7291, + "step": 5016 + }, + { + "epoch": 0.7677123182861515, + "grad_norm": 2.461913309036648, + "learning_rate": 1.4122155395079132e-05, + "loss": 0.7526, + "step": 5017 + }, + { + "epoch": 0.7678653404743688, + "grad_norm": 2.4062027749990755, + "learning_rate": 1.4119897429756855e-05, + "loss": 0.742, + "step": 5018 + }, + { + "epoch": 0.768018362662586, + "grad_norm": 2.1281105958816373, + "learning_rate": 1.4117639211415561e-05, + "loss": 0.6535, + "step": 5019 + }, + { + "epoch": 0.7681713848508034, + "grad_norm": 2.442149676668343, + "learning_rate": 1.4115380740193936e-05, + "loss": 0.6923, + "step": 5020 + }, + { + "epoch": 0.7683244070390206, + "grad_norm": 2.3490210282430426, + "learning_rate": 1.4113122016230678e-05, + "loss": 0.7034, + "step": 5021 + }, + { + "epoch": 0.7684774292272379, + "grad_norm": 2.18584937837184, + "learning_rate": 1.4110863039664506e-05, + "loss": 0.654, + "step": 5022 + }, + { + "epoch": 0.7686304514154553, + "grad_norm": 2.132444677822337, + "learning_rate": 1.4108603810634157e-05, + "loss": 0.6469, + "step": 5023 + }, + { + "epoch": 0.7687834736036725, + "grad_norm": 2.2211029054390634, + "learning_rate": 1.4106344329278372e-05, + "loss": 0.7746, + "step": 5024 + }, + { + "epoch": 0.7689364957918898, + "grad_norm": 2.5642196111069344, + "learning_rate": 1.4104084595735916e-05, + "loss": 0.7419, + "step": 5025 + }, + { + "epoch": 0.7690895179801072, + "grad_norm": 2.2008072174788715, + "learning_rate": 1.410182461014557e-05, + "loss": 0.7026, + "step": 5026 + }, + { + "epoch": 0.7692425401683244, + "grad_norm": 2.1514935715003087, + "learning_rate": 1.4099564372646132e-05, + "loss": 0.7478, + "step": 5027 + }, + { + "epoch": 0.7693955623565417, + "grad_norm": 2.1472496992000836, + "learning_rate": 1.4097303883376405e-05, + "loss": 0.7736, + "step": 5028 + }, + { + "epoch": 0.7695485845447589, + "grad_norm": 2.1083158311876997, + "learning_rate": 
1.409504314247522e-05, + "loss": 0.7575, + "step": 5029 + }, + { + "epoch": 0.7697016067329763, + "grad_norm": 2.3859948402117506, + "learning_rate": 1.4092782150081415e-05, + "loss": 0.8054, + "step": 5030 + }, + { + "epoch": 0.7698546289211936, + "grad_norm": 2.2696789787921308, + "learning_rate": 1.4090520906333844e-05, + "loss": 0.6825, + "step": 5031 + }, + { + "epoch": 0.7700076511094108, + "grad_norm": 2.5043142859591954, + "learning_rate": 1.4088259411371388e-05, + "loss": 0.7305, + "step": 5032 + }, + { + "epoch": 0.7701606732976282, + "grad_norm": 2.8269162934283716, + "learning_rate": 1.4085997665332925e-05, + "loss": 0.6334, + "step": 5033 + }, + { + "epoch": 0.7703136954858455, + "grad_norm": 2.363998579657459, + "learning_rate": 1.4083735668357359e-05, + "loss": 0.717, + "step": 5034 + }, + { + "epoch": 0.7704667176740627, + "grad_norm": 2.308357524429582, + "learning_rate": 1.4081473420583612e-05, + "loss": 0.7569, + "step": 5035 + }, + { + "epoch": 0.77061973986228, + "grad_norm": 2.1405563499565625, + "learning_rate": 1.4079210922150615e-05, + "loss": 0.7978, + "step": 5036 + }, + { + "epoch": 0.7707727620504973, + "grad_norm": 2.2014674249133113, + "learning_rate": 1.4076948173197316e-05, + "loss": 0.7363, + "step": 5037 + }, + { + "epoch": 0.7709257842387146, + "grad_norm": 2.5632111129243795, + "learning_rate": 1.4074685173862684e-05, + "loss": 0.7573, + "step": 5038 + }, + { + "epoch": 0.7710788064269319, + "grad_norm": 2.440105505427757, + "learning_rate": 1.4072421924285693e-05, + "loss": 0.7406, + "step": 5039 + }, + { + "epoch": 0.7712318286151492, + "grad_norm": 2.328805243013259, + "learning_rate": 1.4070158424605338e-05, + "loss": 0.7432, + "step": 5040 + }, + { + "epoch": 0.7713848508033665, + "grad_norm": 2.265426601466516, + "learning_rate": 1.4067894674960637e-05, + "loss": 0.6968, + "step": 5041 + }, + { + "epoch": 0.7715378729915838, + "grad_norm": 2.268614568748264, + "learning_rate": 1.4065630675490605e-05, + "loss": 0.6453, + "step": 5042 + }, + { + "epoch": 0.771690895179801, + "grad_norm": 2.2010842207373305, + "learning_rate": 1.4063366426334293e-05, + "loss": 0.7092, + "step": 5043 + }, + { + "epoch": 0.7718439173680184, + "grad_norm": 2.0805941230225424, + "learning_rate": 1.4061101927630749e-05, + "loss": 0.5924, + "step": 5044 + }, + { + "epoch": 0.7719969395562356, + "grad_norm": 2.1821898010478353, + "learning_rate": 1.405883717951905e-05, + "loss": 0.6982, + "step": 5045 + }, + { + "epoch": 0.7721499617444529, + "grad_norm": 2.093645143185147, + "learning_rate": 1.4056572182138281e-05, + "loss": 0.6132, + "step": 5046 + }, + { + "epoch": 0.7723029839326703, + "grad_norm": 2.253233064275212, + "learning_rate": 1.4054306935627544e-05, + "loss": 0.7959, + "step": 5047 + }, + { + "epoch": 0.7724560061208875, + "grad_norm": 2.2821754594763073, + "learning_rate": 1.405204144012596e-05, + "loss": 0.7634, + "step": 5048 + }, + { + "epoch": 0.7726090283091048, + "grad_norm": 2.5505280501375576, + "learning_rate": 1.404977569577266e-05, + "loss": 0.775, + "step": 5049 + }, + { + "epoch": 0.7727620504973222, + "grad_norm": 2.1972396897268847, + "learning_rate": 1.404750970270679e-05, + "loss": 0.6405, + "step": 5050 + }, + { + "epoch": 0.7729150726855394, + "grad_norm": 2.0449200641167264, + "learning_rate": 1.4045243461067514e-05, + "loss": 0.6206, + "step": 5051 + }, + { + "epoch": 0.7730680948737567, + "grad_norm": 2.386168396663282, + "learning_rate": 1.4042976970994015e-05, + "loss": 0.6859, + "step": 5052 + }, + { + "epoch": 0.7732211170619739, + 
"grad_norm": 2.514122014683401, + "learning_rate": 1.4040710232625481e-05, + "loss": 0.7822, + "step": 5053 + }, + { + "epoch": 0.7733741392501913, + "grad_norm": 2.5342070484797476, + "learning_rate": 1.4038443246101125e-05, + "loss": 0.6788, + "step": 5054 + }, + { + "epoch": 0.7735271614384086, + "grad_norm": 2.345617758130968, + "learning_rate": 1.4036176011560172e-05, + "loss": 0.8571, + "step": 5055 + }, + { + "epoch": 0.7736801836266258, + "grad_norm": 2.18626517984038, + "learning_rate": 1.403390852914186e-05, + "loss": 0.5995, + "step": 5056 + }, + { + "epoch": 0.7738332058148432, + "grad_norm": 2.23239353461061, + "learning_rate": 1.4031640798985446e-05, + "loss": 0.7064, + "step": 5057 + }, + { + "epoch": 0.7739862280030605, + "grad_norm": 2.1588061241982417, + "learning_rate": 1.4029372821230196e-05, + "loss": 0.622, + "step": 5058 + }, + { + "epoch": 0.7741392501912777, + "grad_norm": 2.321759106687769, + "learning_rate": 1.40271045960154e-05, + "loss": 0.746, + "step": 5059 + }, + { + "epoch": 0.774292272379495, + "grad_norm": 2.3401183762900266, + "learning_rate": 1.4024836123480356e-05, + "loss": 0.7694, + "step": 5060 + }, + { + "epoch": 0.7744452945677123, + "grad_norm": 2.353276037005396, + "learning_rate": 1.402256740376438e-05, + "loss": 0.7548, + "step": 5061 + }, + { + "epoch": 0.7745983167559296, + "grad_norm": 2.2155907355110984, + "learning_rate": 1.4020298437006803e-05, + "loss": 0.6635, + "step": 5062 + }, + { + "epoch": 0.7747513389441469, + "grad_norm": 2.254233811440101, + "learning_rate": 1.4018029223346972e-05, + "loss": 0.5666, + "step": 5063 + }, + { + "epoch": 0.7749043611323642, + "grad_norm": 2.2481526640750764, + "learning_rate": 1.4015759762924246e-05, + "loss": 0.6752, + "step": 5064 + }, + { + "epoch": 0.7750573833205815, + "grad_norm": 2.148794153992223, + "learning_rate": 1.4013490055878008e-05, + "loss": 0.7136, + "step": 5065 + }, + { + "epoch": 0.7752104055087988, + "grad_norm": 2.3975560671750022, + "learning_rate": 1.401122010234764e-05, + "loss": 0.7725, + "step": 5066 + }, + { + "epoch": 0.775363427697016, + "grad_norm": 2.2347633781944407, + "learning_rate": 1.4008949902472554e-05, + "loss": 0.637, + "step": 5067 + }, + { + "epoch": 0.7755164498852334, + "grad_norm": 2.285143070661817, + "learning_rate": 1.4006679456392174e-05, + "loss": 0.7154, + "step": 5068 + }, + { + "epoch": 0.7756694720734506, + "grad_norm": 2.2031275325618553, + "learning_rate": 1.4004408764245934e-05, + "loss": 0.7259, + "step": 5069 + }, + { + "epoch": 0.7758224942616679, + "grad_norm": 2.315668163634487, + "learning_rate": 1.4002137826173286e-05, + "loss": 0.7905, + "step": 5070 + }, + { + "epoch": 0.7759755164498853, + "grad_norm": 2.541307986854028, + "learning_rate": 1.3999866642313698e-05, + "loss": 0.6911, + "step": 5071 + }, + { + "epoch": 0.7761285386381025, + "grad_norm": 2.27496047507574, + "learning_rate": 1.3997595212806648e-05, + "loss": 0.6393, + "step": 5072 + }, + { + "epoch": 0.7762815608263198, + "grad_norm": 2.247769499454639, + "learning_rate": 1.3995323537791643e-05, + "loss": 0.76, + "step": 5073 + }, + { + "epoch": 0.7764345830145372, + "grad_norm": 2.2888853081384974, + "learning_rate": 1.3993051617408186e-05, + "loss": 0.7552, + "step": 5074 + }, + { + "epoch": 0.7765876052027544, + "grad_norm": 2.2001010956721037, + "learning_rate": 1.3990779451795808e-05, + "loss": 0.6797, + "step": 5075 + }, + { + "epoch": 0.7767406273909717, + "grad_norm": 2.4115578977098164, + "learning_rate": 1.3988507041094055e-05, + "loss": 0.7154, + "step": 5076 
+ }, + { + "epoch": 0.7768936495791889, + "grad_norm": 2.1463792083802273, + "learning_rate": 1.3986234385442481e-05, + "loss": 0.6909, + "step": 5077 + }, + { + "epoch": 0.7770466717674063, + "grad_norm": 2.0897406819197863, + "learning_rate": 1.3983961484980656e-05, + "loss": 0.751, + "step": 5078 + }, + { + "epoch": 0.7771996939556236, + "grad_norm": 2.1608247723805687, + "learning_rate": 1.3981688339848174e-05, + "loss": 0.6893, + "step": 5079 + }, + { + "epoch": 0.7773527161438408, + "grad_norm": 2.882363155314608, + "learning_rate": 1.3979414950184632e-05, + "loss": 0.6289, + "step": 5080 + }, + { + "epoch": 0.7775057383320582, + "grad_norm": 2.5337887564427883, + "learning_rate": 1.3977141316129653e-05, + "loss": 0.7907, + "step": 5081 + }, + { + "epoch": 0.7776587605202755, + "grad_norm": 2.368738369887408, + "learning_rate": 1.3974867437822866e-05, + "loss": 0.777, + "step": 5082 + }, + { + "epoch": 0.7778117827084927, + "grad_norm": 2.3466429855351505, + "learning_rate": 1.3972593315403919e-05, + "loss": 0.748, + "step": 5083 + }, + { + "epoch": 0.77796480489671, + "grad_norm": 2.269874457550218, + "learning_rate": 1.3970318949012475e-05, + "loss": 0.7797, + "step": 5084 + }, + { + "epoch": 0.7781178270849273, + "grad_norm": 2.284119463233079, + "learning_rate": 1.3968044338788216e-05, + "loss": 0.7871, + "step": 5085 + }, + { + "epoch": 0.7782708492731446, + "grad_norm": 2.2696960329330813, + "learning_rate": 1.3965769484870829e-05, + "loss": 0.6889, + "step": 5086 + }, + { + "epoch": 0.7784238714613619, + "grad_norm": 2.420011775174571, + "learning_rate": 1.3963494387400023e-05, + "loss": 0.6882, + "step": 5087 + }, + { + "epoch": 0.7785768936495792, + "grad_norm": 2.1957556052360916, + "learning_rate": 1.3961219046515519e-05, + "loss": 0.7425, + "step": 5088 + }, + { + "epoch": 0.7787299158377965, + "grad_norm": 2.119582758763422, + "learning_rate": 1.3958943462357065e-05, + "loss": 0.6109, + "step": 5089 + }, + { + "epoch": 0.7788829380260138, + "grad_norm": 2.6124106631663984, + "learning_rate": 1.3956667635064398e-05, + "loss": 0.7887, + "step": 5090 + }, + { + "epoch": 0.779035960214231, + "grad_norm": 2.2893517601019107, + "learning_rate": 1.3954391564777295e-05, + "loss": 0.6306, + "step": 5091 + }, + { + "epoch": 0.7791889824024484, + "grad_norm": 2.1777151683208444, + "learning_rate": 1.3952115251635536e-05, + "loss": 0.7121, + "step": 5092 + }, + { + "epoch": 0.7793420045906656, + "grad_norm": 2.2899237702858746, + "learning_rate": 1.3949838695778921e-05, + "loss": 0.7505, + "step": 5093 + }, + { + "epoch": 0.7794950267788829, + "grad_norm": 2.42586559436103, + "learning_rate": 1.3947561897347257e-05, + "loss": 0.7946, + "step": 5094 + }, + { + "epoch": 0.7796480489671003, + "grad_norm": 1.9766272067621482, + "learning_rate": 1.3945284856480376e-05, + "loss": 0.6901, + "step": 5095 + }, + { + "epoch": 0.7798010711553175, + "grad_norm": 2.156148901403677, + "learning_rate": 1.3943007573318117e-05, + "loss": 0.7859, + "step": 5096 + }, + { + "epoch": 0.7799540933435348, + "grad_norm": 2.5202800450554865, + "learning_rate": 1.3940730048000338e-05, + "loss": 0.7896, + "step": 5097 + }, + { + "epoch": 0.780107115531752, + "grad_norm": 2.4171048421326207, + "learning_rate": 1.3938452280666909e-05, + "loss": 0.7879, + "step": 5098 + }, + { + "epoch": 0.7802601377199694, + "grad_norm": 2.4109847004996174, + "learning_rate": 1.3936174271457721e-05, + "loss": 0.8669, + "step": 5099 + }, + { + "epoch": 0.7804131599081867, + "grad_norm": 2.0993258641086228, + "learning_rate": 
1.3933896020512671e-05, + "loss": 0.727, + "step": 5100 + }, + { + "epoch": 0.7805661820964039, + "grad_norm": 2.027143937230042, + "learning_rate": 1.3931617527971674e-05, + "loss": 0.6448, + "step": 5101 + }, + { + "epoch": 0.7807192042846213, + "grad_norm": 1.984107315848014, + "learning_rate": 1.3929338793974671e-05, + "loss": 0.6419, + "step": 5102 + }, + { + "epoch": 0.7808722264728386, + "grad_norm": 1.9833282224258104, + "learning_rate": 1.3927059818661596e-05, + "loss": 0.5596, + "step": 5103 + }, + { + "epoch": 0.7810252486610558, + "grad_norm": 2.1670221414876756, + "learning_rate": 1.3924780602172413e-05, + "loss": 0.717, + "step": 5104 + }, + { + "epoch": 0.7811782708492732, + "grad_norm": 2.2583166303336237, + "learning_rate": 1.3922501144647105e-05, + "loss": 0.7974, + "step": 5105 + }, + { + "epoch": 0.7813312930374904, + "grad_norm": 2.380217191342555, + "learning_rate": 1.3920221446225654e-05, + "loss": 0.7113, + "step": 5106 + }, + { + "epoch": 0.7814843152257077, + "grad_norm": 2.220484377761435, + "learning_rate": 1.3917941507048068e-05, + "loss": 0.7526, + "step": 5107 + }, + { + "epoch": 0.781637337413925, + "grad_norm": 2.298676239650065, + "learning_rate": 1.3915661327254367e-05, + "loss": 0.7043, + "step": 5108 + }, + { + "epoch": 0.7817903596021423, + "grad_norm": 2.1637013431269447, + "learning_rate": 1.3913380906984586e-05, + "loss": 0.7548, + "step": 5109 + }, + { + "epoch": 0.7819433817903596, + "grad_norm": 2.1322701567201006, + "learning_rate": 1.3911100246378775e-05, + "loss": 0.758, + "step": 5110 + }, + { + "epoch": 0.7820964039785769, + "grad_norm": 2.192596038504427, + "learning_rate": 1.3908819345576996e-05, + "loss": 0.6943, + "step": 5111 + }, + { + "epoch": 0.7822494261667942, + "grad_norm": 2.2990794845292317, + "learning_rate": 1.3906538204719329e-05, + "loss": 0.6867, + "step": 5112 + }, + { + "epoch": 0.7824024483550115, + "grad_norm": 2.3025056442381735, + "learning_rate": 1.3904256823945868e-05, + "loss": 0.7531, + "step": 5113 + }, + { + "epoch": 0.7825554705432287, + "grad_norm": 2.318696680589184, + "learning_rate": 1.3901975203396724e-05, + "loss": 0.6702, + "step": 5114 + }, + { + "epoch": 0.782708492731446, + "grad_norm": 2.0791679798970897, + "learning_rate": 1.389969334321202e-05, + "loss": 0.7031, + "step": 5115 + }, + { + "epoch": 0.7828615149196634, + "grad_norm": 2.2745500029402694, + "learning_rate": 1.3897411243531886e-05, + "loss": 0.6795, + "step": 5116 + }, + { + "epoch": 0.7830145371078806, + "grad_norm": 2.4518559288103554, + "learning_rate": 1.3895128904496486e-05, + "loss": 0.7113, + "step": 5117 + }, + { + "epoch": 0.7831675592960979, + "grad_norm": 2.2388389113594553, + "learning_rate": 1.3892846326245984e-05, + "loss": 0.7826, + "step": 5118 + }, + { + "epoch": 0.7833205814843153, + "grad_norm": 2.0864895233637775, + "learning_rate": 1.3890563508920554e-05, + "loss": 0.7918, + "step": 5119 + }, + { + "epoch": 0.7834736036725325, + "grad_norm": 2.250652586337949, + "learning_rate": 1.3888280452660401e-05, + "loss": 0.8105, + "step": 5120 + }, + { + "epoch": 0.7836266258607498, + "grad_norm": 2.5348100337657438, + "learning_rate": 1.3885997157605737e-05, + "loss": 0.7097, + "step": 5121 + }, + { + "epoch": 0.783779648048967, + "grad_norm": 2.2521060385487277, + "learning_rate": 1.3883713623896782e-05, + "loss": 0.8175, + "step": 5122 + }, + { + "epoch": 0.7839326702371844, + "grad_norm": 2.4476407046502446, + "learning_rate": 1.3881429851673781e-05, + "loss": 0.7518, + "step": 5123 + }, + { + "epoch": 0.7840856924254017, 
+ "grad_norm": 2.512045611078853, + "learning_rate": 1.3879145841076991e-05, + "loss": 0.7752, + "step": 5124 + }, + { + "epoch": 0.7842387146136189, + "grad_norm": 2.278880822195872, + "learning_rate": 1.3876861592246678e-05, + "loss": 0.7291, + "step": 5125 + }, + { + "epoch": 0.7843917368018363, + "grad_norm": 1.9592367955208152, + "learning_rate": 1.3874577105323127e-05, + "loss": 0.5866, + "step": 5126 + }, + { + "epoch": 0.7845447589900536, + "grad_norm": 2.1204944930182466, + "learning_rate": 1.3872292380446641e-05, + "loss": 0.6941, + "step": 5127 + }, + { + "epoch": 0.7846977811782708, + "grad_norm": 2.436918071377942, + "learning_rate": 1.3870007417757529e-05, + "loss": 0.7549, + "step": 5128 + }, + { + "epoch": 0.7848508033664882, + "grad_norm": 2.049946768306998, + "learning_rate": 1.3867722217396122e-05, + "loss": 0.7674, + "step": 5129 + }, + { + "epoch": 0.7850038255547054, + "grad_norm": 2.5063329923729554, + "learning_rate": 1.3865436779502767e-05, + "loss": 0.7467, + "step": 5130 + }, + { + "epoch": 0.7851568477429227, + "grad_norm": 1.9886281059847166, + "learning_rate": 1.3863151104217816e-05, + "loss": 0.6673, + "step": 5131 + }, + { + "epoch": 0.78530986993114, + "grad_norm": 2.1184066490210727, + "learning_rate": 1.3860865191681639e-05, + "loss": 0.6644, + "step": 5132 + }, + { + "epoch": 0.7854628921193573, + "grad_norm": 2.2524087014750336, + "learning_rate": 1.385857904203463e-05, + "loss": 0.8056, + "step": 5133 + }, + { + "epoch": 0.7856159143075746, + "grad_norm": 2.1941572669081326, + "learning_rate": 1.3856292655417187e-05, + "loss": 0.7362, + "step": 5134 + }, + { + "epoch": 0.7857689364957919, + "grad_norm": 2.1730871002177836, + "learning_rate": 1.3854006031969727e-05, + "loss": 0.6781, + "step": 5135 + }, + { + "epoch": 0.7859219586840092, + "grad_norm": 2.4598351761040256, + "learning_rate": 1.3851719171832678e-05, + "loss": 0.8042, + "step": 5136 + }, + { + "epoch": 0.7860749808722265, + "grad_norm": 2.38153534428331, + "learning_rate": 1.3849432075146485e-05, + "loss": 0.7745, + "step": 5137 + }, + { + "epoch": 0.7862280030604437, + "grad_norm": 2.1360826368406354, + "learning_rate": 1.3847144742051613e-05, + "loss": 0.8381, + "step": 5138 + }, + { + "epoch": 0.786381025248661, + "grad_norm": 2.85778086002553, + "learning_rate": 1.3844857172688531e-05, + "loss": 0.8439, + "step": 5139 + }, + { + "epoch": 0.7865340474368784, + "grad_norm": 2.2874345985423896, + "learning_rate": 1.3842569367197726e-05, + "loss": 0.7498, + "step": 5140 + }, + { + "epoch": 0.7866870696250956, + "grad_norm": 2.132202408900261, + "learning_rate": 1.3840281325719708e-05, + "loss": 0.6593, + "step": 5141 + }, + { + "epoch": 0.7868400918133129, + "grad_norm": 2.224486844047528, + "learning_rate": 1.3837993048394988e-05, + "loss": 0.7379, + "step": 5142 + }, + { + "epoch": 0.7869931140015303, + "grad_norm": 2.2894622734424734, + "learning_rate": 1.3835704535364103e-05, + "loss": 0.7231, + "step": 5143 + }, + { + "epoch": 0.7871461361897475, + "grad_norm": 2.100328372449939, + "learning_rate": 1.3833415786767596e-05, + "loss": 0.6338, + "step": 5144 + }, + { + "epoch": 0.7872991583779648, + "grad_norm": 2.4355056805050217, + "learning_rate": 1.3831126802746026e-05, + "loss": 0.6756, + "step": 5145 + }, + { + "epoch": 0.787452180566182, + "grad_norm": 2.2133365219845618, + "learning_rate": 1.3828837583439975e-05, + "loss": 0.7254, + "step": 5146 + }, + { + "epoch": 0.7876052027543994, + "grad_norm": 2.286610066393195, + "learning_rate": 1.3826548128990031e-05, + "loss": 0.7471, + 
"step": 5147 + }, + { + "epoch": 0.7877582249426167, + "grad_norm": 2.545738125909192, + "learning_rate": 1.3824258439536793e-05, + "loss": 0.7792, + "step": 5148 + }, + { + "epoch": 0.7879112471308339, + "grad_norm": 2.1321283437860665, + "learning_rate": 1.3821968515220885e-05, + "loss": 0.6472, + "step": 5149 + }, + { + "epoch": 0.7880642693190513, + "grad_norm": 2.7061865458512995, + "learning_rate": 1.381967835618294e-05, + "loss": 0.791, + "step": 5150 + }, + { + "epoch": 0.7882172915072686, + "grad_norm": 2.5000426098798427, + "learning_rate": 1.3817387962563605e-05, + "loss": 0.7419, + "step": 5151 + }, + { + "epoch": 0.7883703136954858, + "grad_norm": 2.1413176672534, + "learning_rate": 1.381509733450354e-05, + "loss": 0.6886, + "step": 5152 + }, + { + "epoch": 0.7885233358837032, + "grad_norm": 2.0937281811946438, + "learning_rate": 1.3812806472143423e-05, + "loss": 0.6495, + "step": 5153 + }, + { + "epoch": 0.7886763580719204, + "grad_norm": 2.350309442778104, + "learning_rate": 1.3810515375623944e-05, + "loss": 0.6501, + "step": 5154 + }, + { + "epoch": 0.7888293802601377, + "grad_norm": 2.152716288815514, + "learning_rate": 1.3808224045085812e-05, + "loss": 0.7354, + "step": 5155 + }, + { + "epoch": 0.788982402448355, + "grad_norm": 2.15154503852824, + "learning_rate": 1.3805932480669739e-05, + "loss": 0.7494, + "step": 5156 + }, + { + "epoch": 0.7891354246365723, + "grad_norm": 2.0973581972487745, + "learning_rate": 1.3803640682516466e-05, + "loss": 0.7602, + "step": 5157 + }, + { + "epoch": 0.7892884468247896, + "grad_norm": 2.711881599173585, + "learning_rate": 1.3801348650766739e-05, + "loss": 0.6601, + "step": 5158 + }, + { + "epoch": 0.7894414690130069, + "grad_norm": 2.184034530375545, + "learning_rate": 1.379905638556132e-05, + "loss": 0.6898, + "step": 5159 + }, + { + "epoch": 0.7895944912012242, + "grad_norm": 2.095943948257656, + "learning_rate": 1.3796763887040987e-05, + "loss": 0.7355, + "step": 5160 + }, + { + "epoch": 0.7897475133894415, + "grad_norm": 2.2534986734403595, + "learning_rate": 1.3794471155346529e-05, + "loss": 0.7118, + "step": 5161 + }, + { + "epoch": 0.7899005355776587, + "grad_norm": 2.1206432371445545, + "learning_rate": 1.3792178190618754e-05, + "loss": 0.6405, + "step": 5162 + }, + { + "epoch": 0.790053557765876, + "grad_norm": 1.9695127305630176, + "learning_rate": 1.3789884992998484e-05, + "loss": 0.6778, + "step": 5163 + }, + { + "epoch": 0.7902065799540934, + "grad_norm": 2.3350051931180227, + "learning_rate": 1.3787591562626545e-05, + "loss": 0.8381, + "step": 5164 + }, + { + "epoch": 0.7903596021423106, + "grad_norm": 2.52231424297307, + "learning_rate": 1.3785297899643797e-05, + "loss": 0.8362, + "step": 5165 + }, + { + "epoch": 0.7905126243305279, + "grad_norm": 2.1652917778433753, + "learning_rate": 1.3783004004191095e-05, + "loss": 0.825, + "step": 5166 + }, + { + "epoch": 0.7906656465187453, + "grad_norm": 2.156387877386113, + "learning_rate": 1.3780709876409315e-05, + "loss": 0.5602, + "step": 5167 + }, + { + "epoch": 0.7908186687069625, + "grad_norm": 2.42839264979406, + "learning_rate": 1.3778415516439352e-05, + "loss": 0.7292, + "step": 5168 + }, + { + "epoch": 0.7909716908951798, + "grad_norm": 2.4513243449235134, + "learning_rate": 1.3776120924422114e-05, + "loss": 0.8545, + "step": 5169 + }, + { + "epoch": 0.791124713083397, + "grad_norm": 2.2296488916634756, + "learning_rate": 1.3773826100498512e-05, + "loss": 0.641, + "step": 5170 + }, + { + "epoch": 0.7912777352716144, + "grad_norm": 2.2384319518642677, + "learning_rate": 
1.377153104480949e-05, + "loss": 0.7482, + "step": 5171 + }, + { + "epoch": 0.7914307574598317, + "grad_norm": 2.2378257415185385, + "learning_rate": 1.3769235757495994e-05, + "loss": 0.7268, + "step": 5172 + }, + { + "epoch": 0.7915837796480489, + "grad_norm": 2.1791575768084375, + "learning_rate": 1.3766940238698983e-05, + "loss": 0.6738, + "step": 5173 + }, + { + "epoch": 0.7917368018362663, + "grad_norm": 2.214967735911992, + "learning_rate": 1.3764644488559433e-05, + "loss": 0.6761, + "step": 5174 + }, + { + "epoch": 0.7918898240244836, + "grad_norm": 2.0880221650456083, + "learning_rate": 1.3762348507218342e-05, + "loss": 0.701, + "step": 5175 + }, + { + "epoch": 0.7920428462127008, + "grad_norm": 1.9249158885011985, + "learning_rate": 1.3760052294816708e-05, + "loss": 0.6566, + "step": 5176 + }, + { + "epoch": 0.7921958684009182, + "grad_norm": 2.304539470343536, + "learning_rate": 1.3757755851495553e-05, + "loss": 0.7316, + "step": 5177 + }, + { + "epoch": 0.7923488905891354, + "grad_norm": 2.040611815803622, + "learning_rate": 1.3755459177395911e-05, + "loss": 0.7166, + "step": 5178 + }, + { + "epoch": 0.7925019127773527, + "grad_norm": 2.3706391834263942, + "learning_rate": 1.3753162272658832e-05, + "loss": 0.6723, + "step": 5179 + }, + { + "epoch": 0.79265493496557, + "grad_norm": 2.1589457526111877, + "learning_rate": 1.3750865137425371e-05, + "loss": 0.7957, + "step": 5180 + }, + { + "epoch": 0.7928079571537873, + "grad_norm": 2.357150090202171, + "learning_rate": 1.3748567771836612e-05, + "loss": 0.7333, + "step": 5181 + }, + { + "epoch": 0.7929609793420046, + "grad_norm": 2.035270416220051, + "learning_rate": 1.374627017603364e-05, + "loss": 0.6303, + "step": 5182 + }, + { + "epoch": 0.7931140015302219, + "grad_norm": 2.310334406515395, + "learning_rate": 1.374397235015756e-05, + "loss": 0.6799, + "step": 5183 + }, + { + "epoch": 0.7932670237184392, + "grad_norm": 2.5329925277281378, + "learning_rate": 1.3741674294349494e-05, + "loss": 0.7163, + "step": 5184 + }, + { + "epoch": 0.7934200459066565, + "grad_norm": 2.3209415935722078, + "learning_rate": 1.373937600875057e-05, + "loss": 0.6513, + "step": 5185 + }, + { + "epoch": 0.7935730680948737, + "grad_norm": 2.2392121898655044, + "learning_rate": 1.3737077493501939e-05, + "loss": 0.6232, + "step": 5186 + }, + { + "epoch": 0.793726090283091, + "grad_norm": 2.053324943153721, + "learning_rate": 1.373477874874476e-05, + "loss": 0.6207, + "step": 5187 + }, + { + "epoch": 0.7938791124713084, + "grad_norm": 2.3153062662700026, + "learning_rate": 1.3732479774620206e-05, + "loss": 0.7312, + "step": 5188 + }, + { + "epoch": 0.7940321346595256, + "grad_norm": 2.202811664872602, + "learning_rate": 1.3730180571269465e-05, + "loss": 0.736, + "step": 5189 + }, + { + "epoch": 0.7941851568477429, + "grad_norm": 2.095998770625852, + "learning_rate": 1.3727881138833746e-05, + "loss": 0.6731, + "step": 5190 + }, + { + "epoch": 0.7943381790359603, + "grad_norm": 1.841345240252431, + "learning_rate": 1.3725581477454262e-05, + "loss": 0.5946, + "step": 5191 + }, + { + "epoch": 0.7944912012241775, + "grad_norm": 2.390734382027951, + "learning_rate": 1.3723281587272243e-05, + "loss": 0.6396, + "step": 5192 + }, + { + "epoch": 0.7946442234123948, + "grad_norm": 2.2841584319362758, + "learning_rate": 1.3720981468428938e-05, + "loss": 0.7108, + "step": 5193 + }, + { + "epoch": 0.794797245600612, + "grad_norm": 2.3818717367728075, + "learning_rate": 1.3718681121065605e-05, + "loss": 0.7718, + "step": 5194 + }, + { + "epoch": 0.7949502677888294, + 
"grad_norm": 2.5859476893129165, + "learning_rate": 1.3716380545323516e-05, + "loss": 0.7855, + "step": 5195 + }, + { + "epoch": 0.7951032899770467, + "grad_norm": 2.3940293661363654, + "learning_rate": 1.371407974134396e-05, + "loss": 0.7146, + "step": 5196 + }, + { + "epoch": 0.7952563121652639, + "grad_norm": 2.2903896079503316, + "learning_rate": 1.3711778709268235e-05, + "loss": 0.7184, + "step": 5197 + }, + { + "epoch": 0.7954093343534813, + "grad_norm": 2.01581293118149, + "learning_rate": 1.3709477449237661e-05, + "loss": 0.6431, + "step": 5198 + }, + { + "epoch": 0.7955623565416986, + "grad_norm": 1.8836934333964197, + "learning_rate": 1.3707175961393564e-05, + "loss": 0.616, + "step": 5199 + }, + { + "epoch": 0.7957153787299158, + "grad_norm": 2.2744019198062815, + "learning_rate": 1.370487424587729e-05, + "loss": 0.6745, + "step": 5200 + }, + { + "epoch": 0.7958684009181332, + "grad_norm": 2.117675246910853, + "learning_rate": 1.3702572302830194e-05, + "loss": 0.6382, + "step": 5201 + }, + { + "epoch": 0.7960214231063504, + "grad_norm": 2.125438352072235, + "learning_rate": 1.370027013239365e-05, + "loss": 0.6448, + "step": 5202 + }, + { + "epoch": 0.7961744452945677, + "grad_norm": 2.174518702017577, + "learning_rate": 1.369796773470904e-05, + "loss": 0.5696, + "step": 5203 + }, + { + "epoch": 0.796327467482785, + "grad_norm": 2.6649237891451216, + "learning_rate": 1.3695665109917764e-05, + "loss": 0.7357, + "step": 5204 + }, + { + "epoch": 0.7964804896710023, + "grad_norm": 2.130917757031721, + "learning_rate": 1.3693362258161239e-05, + "loss": 0.7103, + "step": 5205 + }, + { + "epoch": 0.7966335118592196, + "grad_norm": 2.308278946221109, + "learning_rate": 1.3691059179580888e-05, + "loss": 0.7996, + "step": 5206 + }, + { + "epoch": 0.7967865340474368, + "grad_norm": 2.2292739591762567, + "learning_rate": 1.3688755874318154e-05, + "loss": 0.7414, + "step": 5207 + }, + { + "epoch": 0.7969395562356542, + "grad_norm": 2.3250780563176474, + "learning_rate": 1.3686452342514486e-05, + "loss": 0.6129, + "step": 5208 + }, + { + "epoch": 0.7970925784238715, + "grad_norm": 2.0736440654210644, + "learning_rate": 1.3684148584311365e-05, + "loss": 0.6746, + "step": 5209 + }, + { + "epoch": 0.7972456006120887, + "grad_norm": 2.3603589942528678, + "learning_rate": 1.3681844599850265e-05, + "loss": 0.7034, + "step": 5210 + }, + { + "epoch": 0.797398622800306, + "grad_norm": 2.349702397627265, + "learning_rate": 1.3679540389272683e-05, + "loss": 0.6011, + "step": 5211 + }, + { + "epoch": 0.7975516449885234, + "grad_norm": 2.366788617289836, + "learning_rate": 1.3677235952720132e-05, + "loss": 0.6788, + "step": 5212 + }, + { + "epoch": 0.7977046671767406, + "grad_norm": 2.0913280489885233, + "learning_rate": 1.3674931290334137e-05, + "loss": 0.6981, + "step": 5213 + }, + { + "epoch": 0.7978576893649579, + "grad_norm": 2.5304173735852045, + "learning_rate": 1.3672626402256233e-05, + "loss": 0.7291, + "step": 5214 + }, + { + "epoch": 0.7980107115531752, + "grad_norm": 2.3922657289236673, + "learning_rate": 1.3670321288627975e-05, + "loss": 0.7516, + "step": 5215 + }, + { + "epoch": 0.7981637337413925, + "grad_norm": 2.198486199079782, + "learning_rate": 1.3668015949590929e-05, + "loss": 0.6851, + "step": 5216 + }, + { + "epoch": 0.7983167559296098, + "grad_norm": 2.3466456024492053, + "learning_rate": 1.3665710385286674e-05, + "loss": 0.6804, + "step": 5217 + }, + { + "epoch": 0.798469778117827, + "grad_norm": 2.166891644272971, + "learning_rate": 1.3663404595856804e-05, + "loss": 0.7362, + 
"step": 5218 + }, + { + "epoch": 0.7986228003060444, + "grad_norm": 2.3442111181100946, + "learning_rate": 1.3661098581442924e-05, + "loss": 0.7576, + "step": 5219 + }, + { + "epoch": 0.7987758224942617, + "grad_norm": 2.2049727285082357, + "learning_rate": 1.3658792342186662e-05, + "loss": 0.632, + "step": 5220 + }, + { + "epoch": 0.7989288446824789, + "grad_norm": 2.4747037218029666, + "learning_rate": 1.3656485878229646e-05, + "loss": 0.8662, + "step": 5221 + }, + { + "epoch": 0.7990818668706963, + "grad_norm": 2.154683342466119, + "learning_rate": 1.365417918971353e-05, + "loss": 0.7382, + "step": 5222 + }, + { + "epoch": 0.7992348890589135, + "grad_norm": 2.3297008480161994, + "learning_rate": 1.3651872276779975e-05, + "loss": 0.7879, + "step": 5223 + }, + { + "epoch": 0.7993879112471308, + "grad_norm": 2.678458704148208, + "learning_rate": 1.3649565139570653e-05, + "loss": 0.8128, + "step": 5224 + }, + { + "epoch": 0.7995409334353482, + "grad_norm": 2.4406095933985035, + "learning_rate": 1.3647257778227263e-05, + "loss": 0.6893, + "step": 5225 + }, + { + "epoch": 0.7996939556235654, + "grad_norm": 2.533590774082434, + "learning_rate": 1.3644950192891502e-05, + "loss": 0.8111, + "step": 5226 + }, + { + "epoch": 0.7998469778117827, + "grad_norm": 2.1737368302955056, + "learning_rate": 1.3642642383705092e-05, + "loss": 0.7153, + "step": 5227 + }, + { + "epoch": 0.8, + "grad_norm": 2.033682378494723, + "learning_rate": 1.3640334350809763e-05, + "loss": 0.7538, + "step": 5228 + }, + { + "epoch": 0.8001530221882173, + "grad_norm": 2.2980150041237635, + "learning_rate": 1.3638026094347261e-05, + "loss": 0.7585, + "step": 5229 + }, + { + "epoch": 0.8003060443764346, + "grad_norm": 2.139295198153623, + "learning_rate": 1.3635717614459342e-05, + "loss": 0.6941, + "step": 5230 + }, + { + "epoch": 0.8004590665646518, + "grad_norm": 2.331590755207613, + "learning_rate": 1.3633408911287785e-05, + "loss": 0.7219, + "step": 5231 + }, + { + "epoch": 0.8006120887528692, + "grad_norm": 2.132296410347176, + "learning_rate": 1.363109998497437e-05, + "loss": 0.742, + "step": 5232 + }, + { + "epoch": 0.8007651109410865, + "grad_norm": 2.2753739670012743, + "learning_rate": 1.3628790835660901e-05, + "loss": 0.6547, + "step": 5233 + }, + { + "epoch": 0.8009181331293037, + "grad_norm": 2.213855353645808, + "learning_rate": 1.3626481463489189e-05, + "loss": 0.6624, + "step": 5234 + }, + { + "epoch": 0.801071155317521, + "grad_norm": 2.3897025233653797, + "learning_rate": 1.3624171868601066e-05, + "loss": 0.7068, + "step": 5235 + }, + { + "epoch": 0.8012241775057384, + "grad_norm": 2.161798259768198, + "learning_rate": 1.3621862051138368e-05, + "loss": 0.6786, + "step": 5236 + }, + { + "epoch": 0.8013771996939556, + "grad_norm": 2.063352788797973, + "learning_rate": 1.3619552011242956e-05, + "loss": 0.6605, + "step": 5237 + }, + { + "epoch": 0.8015302218821729, + "grad_norm": 2.191978633874912, + "learning_rate": 1.3617241749056693e-05, + "loss": 0.6603, + "step": 5238 + }, + { + "epoch": 0.8016832440703902, + "grad_norm": 2.236976470780728, + "learning_rate": 1.3614931264721464e-05, + "loss": 0.7457, + "step": 5239 + }, + { + "epoch": 0.8018362662586075, + "grad_norm": 2.348713734882274, + "learning_rate": 1.3612620558379164e-05, + "loss": 0.788, + "step": 5240 + }, + { + "epoch": 0.8019892884468248, + "grad_norm": 2.1540503355147136, + "learning_rate": 1.3610309630171703e-05, + "loss": 0.6961, + "step": 5241 + }, + { + "epoch": 0.802142310635042, + "grad_norm": 2.2481983626353297, + "learning_rate": 
1.3607998480241005e-05, + "loss": 0.7852, + "step": 5242 + }, + { + "epoch": 0.8022953328232594, + "grad_norm": 2.223866399302239, + "learning_rate": 1.3605687108729005e-05, + "loss": 0.7054, + "step": 5243 + }, + { + "epoch": 0.8024483550114767, + "grad_norm": 2.2526593082756587, + "learning_rate": 1.3603375515777654e-05, + "loss": 0.7195, + "step": 5244 + }, + { + "epoch": 0.8026013771996939, + "grad_norm": 2.1762612602974545, + "learning_rate": 1.3601063701528916e-05, + "loss": 0.7121, + "step": 5245 + }, + { + "epoch": 0.8027543993879113, + "grad_norm": 2.19828314276474, + "learning_rate": 1.359875166612477e-05, + "loss": 0.6465, + "step": 5246 + }, + { + "epoch": 0.8029074215761285, + "grad_norm": 2.1003393586466186, + "learning_rate": 1.3596439409707205e-05, + "loss": 0.6349, + "step": 5247 + }, + { + "epoch": 0.8030604437643458, + "grad_norm": 2.481715046553168, + "learning_rate": 1.3594126932418226e-05, + "loss": 0.7672, + "step": 5248 + }, + { + "epoch": 0.8032134659525632, + "grad_norm": 2.055351929826298, + "learning_rate": 1.359181423439985e-05, + "loss": 0.6777, + "step": 5249 + }, + { + "epoch": 0.8033664881407804, + "grad_norm": 1.9408465558535808, + "learning_rate": 1.3589501315794115e-05, + "loss": 0.5616, + "step": 5250 + }, + { + "epoch": 0.8035195103289977, + "grad_norm": 2.168856933193797, + "learning_rate": 1.358718817674306e-05, + "loss": 0.7286, + "step": 5251 + }, + { + "epoch": 0.803672532517215, + "grad_norm": 2.2388441038555706, + "learning_rate": 1.3584874817388744e-05, + "loss": 0.6583, + "step": 5252 + }, + { + "epoch": 0.8038255547054323, + "grad_norm": 2.638753746731687, + "learning_rate": 1.3582561237873244e-05, + "loss": 0.7871, + "step": 5253 + }, + { + "epoch": 0.8039785768936496, + "grad_norm": 2.173253012550754, + "learning_rate": 1.3580247438338643e-05, + "loss": 0.6701, + "step": 5254 + }, + { + "epoch": 0.8041315990818668, + "grad_norm": 2.373102282978098, + "learning_rate": 1.3577933418927039e-05, + "loss": 0.8085, + "step": 5255 + }, + { + "epoch": 0.8042846212700842, + "grad_norm": 2.1330407791676613, + "learning_rate": 1.3575619179780549e-05, + "loss": 0.747, + "step": 5256 + }, + { + "epoch": 0.8044376434583015, + "grad_norm": 2.1281332913564484, + "learning_rate": 1.3573304721041294e-05, + "loss": 0.7252, + "step": 5257 + }, + { + "epoch": 0.8045906656465187, + "grad_norm": 2.1458569602813062, + "learning_rate": 1.3570990042851419e-05, + "loss": 0.7456, + "step": 5258 + }, + { + "epoch": 0.804743687834736, + "grad_norm": 2.3701157521990894, + "learning_rate": 1.3568675145353076e-05, + "loss": 0.8018, + "step": 5259 + }, + { + "epoch": 0.8048967100229534, + "grad_norm": 2.478264328190266, + "learning_rate": 1.3566360028688432e-05, + "loss": 0.7114, + "step": 5260 + }, + { + "epoch": 0.8050497322111706, + "grad_norm": 2.2845510559676074, + "learning_rate": 1.3564044692999667e-05, + "loss": 0.7534, + "step": 5261 + }, + { + "epoch": 0.8052027543993879, + "grad_norm": 2.0041111447825783, + "learning_rate": 1.3561729138428974e-05, + "loss": 0.6483, + "step": 5262 + }, + { + "epoch": 0.8053557765876052, + "grad_norm": 2.419128035536536, + "learning_rate": 1.3559413365118563e-05, + "loss": 0.6902, + "step": 5263 + }, + { + "epoch": 0.8055087987758225, + "grad_norm": 2.154837398243689, + "learning_rate": 1.3557097373210651e-05, + "loss": 0.7418, + "step": 5264 + }, + { + "epoch": 0.8056618209640398, + "grad_norm": 2.6319673770565983, + "learning_rate": 1.3554781162847472e-05, + "loss": 0.7431, + "step": 5265 + }, + { + "epoch": 0.805814843152257, + 
"grad_norm": 2.1323745296359387, + "learning_rate": 1.355246473417128e-05, + "loss": 0.6952, + "step": 5266 + }, + { + "epoch": 0.8059678653404744, + "grad_norm": 2.1242915789939865, + "learning_rate": 1.3550148087324329e-05, + "loss": 0.7627, + "step": 5267 + }, + { + "epoch": 0.8061208875286917, + "grad_norm": 2.72482323781392, + "learning_rate": 1.3547831222448893e-05, + "loss": 0.789, + "step": 5268 + }, + { + "epoch": 0.8062739097169089, + "grad_norm": 2.357414573913534, + "learning_rate": 1.3545514139687264e-05, + "loss": 0.7448, + "step": 5269 + }, + { + "epoch": 0.8064269319051263, + "grad_norm": 2.169457925241073, + "learning_rate": 1.3543196839181745e-05, + "loss": 0.6584, + "step": 5270 + }, + { + "epoch": 0.8065799540933435, + "grad_norm": 2.036547827305924, + "learning_rate": 1.3540879321074642e-05, + "loss": 0.5961, + "step": 5271 + }, + { + "epoch": 0.8067329762815608, + "grad_norm": 2.3964724268378363, + "learning_rate": 1.3538561585508292e-05, + "loss": 0.8957, + "step": 5272 + }, + { + "epoch": 0.8068859984697782, + "grad_norm": 2.3452218195555137, + "learning_rate": 1.353624363262503e-05, + "loss": 0.6594, + "step": 5273 + }, + { + "epoch": 0.8070390206579954, + "grad_norm": 2.1038872850157735, + "learning_rate": 1.3533925462567213e-05, + "loss": 0.6078, + "step": 5274 + }, + { + "epoch": 0.8071920428462127, + "grad_norm": 2.356954054341592, + "learning_rate": 1.3531607075477209e-05, + "loss": 0.7797, + "step": 5275 + }, + { + "epoch": 0.80734506503443, + "grad_norm": 2.0115247230251816, + "learning_rate": 1.3529288471497399e-05, + "loss": 0.5769, + "step": 5276 + }, + { + "epoch": 0.8074980872226473, + "grad_norm": 2.276904775969954, + "learning_rate": 1.3526969650770175e-05, + "loss": 0.7503, + "step": 5277 + }, + { + "epoch": 0.8076511094108646, + "grad_norm": 2.167203694480653, + "learning_rate": 1.3524650613437948e-05, + "loss": 0.6904, + "step": 5278 + }, + { + "epoch": 0.8078041315990818, + "grad_norm": 1.9616634696078104, + "learning_rate": 1.3522331359643141e-05, + "loss": 0.6396, + "step": 5279 + }, + { + "epoch": 0.8079571537872992, + "grad_norm": 2.4533852708261406, + "learning_rate": 1.3520011889528185e-05, + "loss": 0.7826, + "step": 5280 + }, + { + "epoch": 0.8081101759755165, + "grad_norm": 2.5398426188480645, + "learning_rate": 1.3517692203235526e-05, + "loss": 0.7062, + "step": 5281 + }, + { + "epoch": 0.8082631981637337, + "grad_norm": 2.0377247734089594, + "learning_rate": 1.351537230090763e-05, + "loss": 0.6129, + "step": 5282 + }, + { + "epoch": 0.808416220351951, + "grad_norm": 2.5926738747286517, + "learning_rate": 1.3513052182686968e-05, + "loss": 0.7073, + "step": 5283 + }, + { + "epoch": 0.8085692425401684, + "grad_norm": 2.104975770665454, + "learning_rate": 1.3510731848716028e-05, + "loss": 0.8069, + "step": 5284 + }, + { + "epoch": 0.8087222647283856, + "grad_norm": 2.1377050518059746, + "learning_rate": 1.350841129913731e-05, + "loss": 0.717, + "step": 5285 + }, + { + "epoch": 0.8088752869166029, + "grad_norm": 2.2039599414867874, + "learning_rate": 1.350609053409333e-05, + "loss": 0.6571, + "step": 5286 + }, + { + "epoch": 0.8090283091048202, + "grad_norm": 2.1174602800748397, + "learning_rate": 1.3503769553726616e-05, + "loss": 0.6212, + "step": 5287 + }, + { + "epoch": 0.8091813312930375, + "grad_norm": 2.547319193448088, + "learning_rate": 1.3501448358179705e-05, + "loss": 0.8006, + "step": 5288 + }, + { + "epoch": 0.8093343534812548, + "grad_norm": 2.3880553798277693, + "learning_rate": 1.3499126947595154e-05, + "loss": 0.6515, + 
"step": 5289 + }, + { + "epoch": 0.809487375669472, + "grad_norm": 2.220555933056926, + "learning_rate": 1.3496805322115525e-05, + "loss": 0.7402, + "step": 5290 + }, + { + "epoch": 0.8096403978576894, + "grad_norm": 2.1418695856415404, + "learning_rate": 1.3494483481883408e-05, + "loss": 0.6952, + "step": 5291 + }, + { + "epoch": 0.8097934200459067, + "grad_norm": 2.280869004587261, + "learning_rate": 1.3492161427041385e-05, + "loss": 0.6293, + "step": 5292 + }, + { + "epoch": 0.8099464422341239, + "grad_norm": 2.1942676446078466, + "learning_rate": 1.3489839157732067e-05, + "loss": 0.6146, + "step": 5293 + }, + { + "epoch": 0.8100994644223413, + "grad_norm": 1.964131045316613, + "learning_rate": 1.3487516674098076e-05, + "loss": 0.6257, + "step": 5294 + }, + { + "epoch": 0.8102524866105585, + "grad_norm": 2.192523042159634, + "learning_rate": 1.3485193976282045e-05, + "loss": 0.7152, + "step": 5295 + }, + { + "epoch": 0.8104055087987758, + "grad_norm": 2.2224555260794325, + "learning_rate": 1.3482871064426612e-05, + "loss": 0.7834, + "step": 5296 + }, + { + "epoch": 0.8105585309869932, + "grad_norm": 2.3486851929486092, + "learning_rate": 1.3480547938674446e-05, + "loss": 0.7778, + "step": 5297 + }, + { + "epoch": 0.8107115531752104, + "grad_norm": 2.066083356496167, + "learning_rate": 1.3478224599168215e-05, + "loss": 0.6438, + "step": 5298 + }, + { + "epoch": 0.8108645753634277, + "grad_norm": 2.172729196692573, + "learning_rate": 1.3475901046050603e-05, + "loss": 0.7957, + "step": 5299 + }, + { + "epoch": 0.811017597551645, + "grad_norm": 2.0121964473406675, + "learning_rate": 1.3473577279464309e-05, + "loss": 0.7376, + "step": 5300 + }, + { + "epoch": 0.8111706197398623, + "grad_norm": 2.5865207720030368, + "learning_rate": 1.3471253299552049e-05, + "loss": 0.7715, + "step": 5301 + }, + { + "epoch": 0.8113236419280796, + "grad_norm": 2.2695859310630047, + "learning_rate": 1.3468929106456543e-05, + "loss": 0.7316, + "step": 5302 + }, + { + "epoch": 0.8114766641162968, + "grad_norm": 2.212371993004032, + "learning_rate": 1.3466604700320529e-05, + "loss": 0.8111, + "step": 5303 + }, + { + "epoch": 0.8116296863045142, + "grad_norm": 2.7667086132112297, + "learning_rate": 1.346428008128676e-05, + "loss": 0.7738, + "step": 5304 + }, + { + "epoch": 0.8117827084927315, + "grad_norm": 2.242386698700782, + "learning_rate": 1.3461955249497998e-05, + "loss": 0.6502, + "step": 5305 + }, + { + "epoch": 0.8119357306809487, + "grad_norm": 2.3940374260343606, + "learning_rate": 1.3459630205097018e-05, + "loss": 0.6801, + "step": 5306 + }, + { + "epoch": 0.812088752869166, + "grad_norm": 2.4788937521650114, + "learning_rate": 1.345730494822662e-05, + "loss": 0.6994, + "step": 5307 + }, + { + "epoch": 0.8122417750573834, + "grad_norm": 2.135678351797468, + "learning_rate": 1.3454979479029595e-05, + "loss": 0.7326, + "step": 5308 + }, + { + "epoch": 0.8123947972456006, + "grad_norm": 2.380508428475471, + "learning_rate": 1.3452653797648765e-05, + "loss": 0.7547, + "step": 5309 + }, + { + "epoch": 0.8125478194338179, + "grad_norm": 2.292228512939073, + "learning_rate": 1.3450327904226956e-05, + "loss": 0.7265, + "step": 5310 + }, + { + "epoch": 0.8127008416220352, + "grad_norm": 2.6999034724742144, + "learning_rate": 1.3448001798907017e-05, + "loss": 0.6915, + "step": 5311 + }, + { + "epoch": 0.8128538638102525, + "grad_norm": 2.277333877537579, + "learning_rate": 1.3445675481831798e-05, + "loss": 0.61, + "step": 5312 + }, + { + "epoch": 0.8130068859984698, + "grad_norm": 2.179731789967011, + 
"learning_rate": 1.3443348953144166e-05, + "loss": 0.6815, + "step": 5313 + }, + { + "epoch": 0.813159908186687, + "grad_norm": 2.1453547823586185, + "learning_rate": 1.3441022212987008e-05, + "loss": 0.6212, + "step": 5314 + }, + { + "epoch": 0.8133129303749044, + "grad_norm": 2.0990096504871953, + "learning_rate": 1.343869526150321e-05, + "loss": 0.621, + "step": 5315 + }, + { + "epoch": 0.8134659525631217, + "grad_norm": 2.181671391425356, + "learning_rate": 1.3436368098835689e-05, + "loss": 0.612, + "step": 5316 + }, + { + "epoch": 0.8136189747513389, + "grad_norm": 2.16318928001326, + "learning_rate": 1.3434040725127355e-05, + "loss": 0.7547, + "step": 5317 + }, + { + "epoch": 0.8137719969395563, + "grad_norm": 2.5676933895963674, + "learning_rate": 1.3431713140521146e-05, + "loss": 0.9013, + "step": 5318 + }, + { + "epoch": 0.8139250191277735, + "grad_norm": 2.342905564848808, + "learning_rate": 1.3429385345160012e-05, + "loss": 0.7104, + "step": 5319 + }, + { + "epoch": 0.8140780413159908, + "grad_norm": 2.142110911327244, + "learning_rate": 1.3427057339186906e-05, + "loss": 0.7011, + "step": 5320 + }, + { + "epoch": 0.8142310635042082, + "grad_norm": 2.2067029947839423, + "learning_rate": 1.34247291227448e-05, + "loss": 0.6377, + "step": 5321 + }, + { + "epoch": 0.8143840856924254, + "grad_norm": 2.2040511888427496, + "learning_rate": 1.3422400695976685e-05, + "loss": 0.5819, + "step": 5322 + }, + { + "epoch": 0.8145371078806427, + "grad_norm": 2.1632943457971745, + "learning_rate": 1.3420072059025552e-05, + "loss": 0.6302, + "step": 5323 + }, + { + "epoch": 0.8146901300688599, + "grad_norm": 2.2258275335537303, + "learning_rate": 1.3417743212034415e-05, + "loss": 0.8637, + "step": 5324 + }, + { + "epoch": 0.8148431522570773, + "grad_norm": 2.3153241322316, + "learning_rate": 1.3415414155146299e-05, + "loss": 0.8324, + "step": 5325 + }, + { + "epoch": 0.8149961744452946, + "grad_norm": 2.3974769360130863, + "learning_rate": 1.3413084888504236e-05, + "loss": 0.755, + "step": 5326 + }, + { + "epoch": 0.8151491966335118, + "grad_norm": 2.2510661996362904, + "learning_rate": 1.341075541225128e-05, + "loss": 0.6549, + "step": 5327 + }, + { + "epoch": 0.8153022188217292, + "grad_norm": 2.279833694007899, + "learning_rate": 1.340842572653049e-05, + "loss": 0.7515, + "step": 5328 + }, + { + "epoch": 0.8154552410099465, + "grad_norm": 2.2168201833394465, + "learning_rate": 1.3406095831484943e-05, + "loss": 0.748, + "step": 5329 + }, + { + "epoch": 0.8156082631981637, + "grad_norm": 2.302319672441321, + "learning_rate": 1.3403765727257724e-05, + "loss": 0.7128, + "step": 5330 + }, + { + "epoch": 0.815761285386381, + "grad_norm": 2.3865943289140295, + "learning_rate": 1.3401435413991936e-05, + "loss": 0.7773, + "step": 5331 + }, + { + "epoch": 0.8159143075745983, + "grad_norm": 2.2644122607612918, + "learning_rate": 1.3399104891830698e-05, + "loss": 0.6749, + "step": 5332 + }, + { + "epoch": 0.8160673297628156, + "grad_norm": 2.220283646827807, + "learning_rate": 1.3396774160917128e-05, + "loss": 0.6206, + "step": 5333 + }, + { + "epoch": 0.8162203519510329, + "grad_norm": 2.2493947518081145, + "learning_rate": 1.3394443221394366e-05, + "loss": 0.743, + "step": 5334 + }, + { + "epoch": 0.8163733741392502, + "grad_norm": 2.4348049499278828, + "learning_rate": 1.339211207340557e-05, + "loss": 0.8518, + "step": 5335 + }, + { + "epoch": 0.8165263963274675, + "grad_norm": 2.2335021156921036, + "learning_rate": 1.33897807170939e-05, + "loss": 0.8099, + "step": 5336 + }, + { + "epoch": 
0.8166794185156848, + "grad_norm": 2.2602615444390515, + "learning_rate": 1.3387449152602536e-05, + "loss": 0.744, + "step": 5337 + }, + { + "epoch": 0.816832440703902, + "grad_norm": 2.483487434991995, + "learning_rate": 1.3385117380074668e-05, + "loss": 0.6676, + "step": 5338 + }, + { + "epoch": 0.8169854628921194, + "grad_norm": 2.252465139032877, + "learning_rate": 1.3382785399653498e-05, + "loss": 0.6504, + "step": 5339 + }, + { + "epoch": 0.8171384850803366, + "grad_norm": 2.1653703088602487, + "learning_rate": 1.3380453211482243e-05, + "loss": 0.6743, + "step": 5340 + }, + { + "epoch": 0.8172915072685539, + "grad_norm": 2.36517463966179, + "learning_rate": 1.3378120815704132e-05, + "loss": 0.7109, + "step": 5341 + }, + { + "epoch": 0.8174445294567713, + "grad_norm": 2.15790281403125, + "learning_rate": 1.3375788212462408e-05, + "loss": 0.6949, + "step": 5342 + }, + { + "epoch": 0.8175975516449885, + "grad_norm": 2.0850769500930286, + "learning_rate": 1.3373455401900323e-05, + "loss": 0.6332, + "step": 5343 + }, + { + "epoch": 0.8177505738332058, + "grad_norm": 2.1616905772965547, + "learning_rate": 1.3371122384161144e-05, + "loss": 0.7172, + "step": 5344 + }, + { + "epoch": 0.8179035960214232, + "grad_norm": 2.4995256490816633, + "learning_rate": 1.3368789159388151e-05, + "loss": 0.8572, + "step": 5345 + }, + { + "epoch": 0.8180566182096404, + "grad_norm": 2.178755972070649, + "learning_rate": 1.3366455727724638e-05, + "loss": 0.6471, + "step": 5346 + }, + { + "epoch": 0.8182096403978577, + "grad_norm": 2.061965745724041, + "learning_rate": 1.3364122089313907e-05, + "loss": 0.6209, + "step": 5347 + }, + { + "epoch": 0.8183626625860749, + "grad_norm": 2.0978327032670046, + "learning_rate": 1.3361788244299281e-05, + "loss": 0.6965, + "step": 5348 + }, + { + "epoch": 0.8185156847742923, + "grad_norm": 2.1655873920099435, + "learning_rate": 1.3359454192824088e-05, + "loss": 0.7474, + "step": 5349 + }, + { + "epoch": 0.8186687069625096, + "grad_norm": 2.1707296467790003, + "learning_rate": 1.3357119935031668e-05, + "loss": 0.6475, + "step": 5350 + }, + { + "epoch": 0.8188217291507268, + "grad_norm": 2.4511219553404624, + "learning_rate": 1.3354785471065382e-05, + "loss": 0.7043, + "step": 5351 + }, + { + "epoch": 0.8189747513389442, + "grad_norm": 2.5976694677220986, + "learning_rate": 1.3352450801068595e-05, + "loss": 0.86, + "step": 5352 + }, + { + "epoch": 0.8191277735271615, + "grad_norm": 2.395094174010329, + "learning_rate": 1.3350115925184688e-05, + "loss": 0.7383, + "step": 5353 + }, + { + "epoch": 0.8192807957153787, + "grad_norm": 2.3064301975807098, + "learning_rate": 1.334778084355706e-05, + "loss": 0.7484, + "step": 5354 + }, + { + "epoch": 0.819433817903596, + "grad_norm": 2.2933955948779174, + "learning_rate": 1.3345445556329111e-05, + "loss": 0.6784, + "step": 5355 + }, + { + "epoch": 0.8195868400918133, + "grad_norm": 2.2922480591575276, + "learning_rate": 1.334311006364426e-05, + "loss": 0.793, + "step": 5356 + }, + { + "epoch": 0.8197398622800306, + "grad_norm": 2.157361043385925, + "learning_rate": 1.3340774365645948e-05, + "loss": 0.649, + "step": 5357 + }, + { + "epoch": 0.8198928844682479, + "grad_norm": 2.4028534857255877, + "learning_rate": 1.333843846247761e-05, + "loss": 0.7454, + "step": 5358 + }, + { + "epoch": 0.8200459066564652, + "grad_norm": 2.2779711254618897, + "learning_rate": 1.3336102354282706e-05, + "loss": 0.6664, + "step": 5359 + }, + { + "epoch": 0.8201989288446825, + "grad_norm": 2.1582367759018704, + "learning_rate": 1.3333766041204705e-05, + 
"loss": 0.6144, + "step": 5360 + }, + { + "epoch": 0.8203519510328998, + "grad_norm": 2.302800374343068, + "learning_rate": 1.3331429523387091e-05, + "loss": 0.5859, + "step": 5361 + }, + { + "epoch": 0.820504973221117, + "grad_norm": 2.1501206310043766, + "learning_rate": 1.3329092800973356e-05, + "loss": 0.6121, + "step": 5362 + }, + { + "epoch": 0.8206579954093344, + "grad_norm": 2.3631056301619995, + "learning_rate": 1.332675587410701e-05, + "loss": 0.7051, + "step": 5363 + }, + { + "epoch": 0.8208110175975516, + "grad_norm": 2.3421798133262715, + "learning_rate": 1.3324418742931568e-05, + "loss": 0.6834, + "step": 5364 + }, + { + "epoch": 0.8209640397857689, + "grad_norm": 2.438015095671974, + "learning_rate": 1.332208140759057e-05, + "loss": 0.686, + "step": 5365 + }, + { + "epoch": 0.8211170619739863, + "grad_norm": 2.3411328591375584, + "learning_rate": 1.3319743868227555e-05, + "loss": 0.7772, + "step": 5366 + }, + { + "epoch": 0.8212700841622035, + "grad_norm": 2.4243773349249, + "learning_rate": 1.3317406124986083e-05, + "loss": 0.7733, + "step": 5367 + }, + { + "epoch": 0.8214231063504208, + "grad_norm": 2.6127991053589907, + "learning_rate": 1.3315068178009722e-05, + "loss": 0.611, + "step": 5368 + }, + { + "epoch": 0.8215761285386382, + "grad_norm": 2.3139864560744186, + "learning_rate": 1.3312730027442057e-05, + "loss": 0.7795, + "step": 5369 + }, + { + "epoch": 0.8217291507268554, + "grad_norm": 2.3885664294242295, + "learning_rate": 1.3310391673426681e-05, + "loss": 0.7107, + "step": 5370 + }, + { + "epoch": 0.8218821729150727, + "grad_norm": 2.4219894732303304, + "learning_rate": 1.3308053116107202e-05, + "loss": 0.6879, + "step": 5371 + }, + { + "epoch": 0.8220351951032899, + "grad_norm": 2.5090665576091546, + "learning_rate": 1.3305714355627237e-05, + "loss": 0.7902, + "step": 5372 + }, + { + "epoch": 0.8221882172915073, + "grad_norm": 2.470907543441836, + "learning_rate": 1.3303375392130427e-05, + "loss": 0.6743, + "step": 5373 + }, + { + "epoch": 0.8223412394797246, + "grad_norm": 1.9891713032992344, + "learning_rate": 1.3301036225760408e-05, + "loss": 0.5842, + "step": 5374 + }, + { + "epoch": 0.8224942616679418, + "grad_norm": 2.1719841885826554, + "learning_rate": 1.329869685666084e-05, + "loss": 0.7156, + "step": 5375 + }, + { + "epoch": 0.8226472838561592, + "grad_norm": 2.6093916724723134, + "learning_rate": 1.3296357284975395e-05, + "loss": 0.8111, + "step": 5376 + }, + { + "epoch": 0.8228003060443765, + "grad_norm": 2.348866234796436, + "learning_rate": 1.3294017510847754e-05, + "loss": 0.7498, + "step": 5377 + }, + { + "epoch": 0.8229533282325937, + "grad_norm": 2.4779202227153503, + "learning_rate": 1.3291677534421608e-05, + "loss": 0.7308, + "step": 5378 + }, + { + "epoch": 0.823106350420811, + "grad_norm": 2.439470281437543, + "learning_rate": 1.328933735584067e-05, + "loss": 0.8282, + "step": 5379 + }, + { + "epoch": 0.8232593726090283, + "grad_norm": 1.9490028637280303, + "learning_rate": 1.3286996975248659e-05, + "loss": 0.5773, + "step": 5380 + }, + { + "epoch": 0.8234123947972456, + "grad_norm": 2.17675951263559, + "learning_rate": 1.3284656392789301e-05, + "loss": 0.6001, + "step": 5381 + }, + { + "epoch": 0.8235654169854629, + "grad_norm": 2.5997970873418765, + "learning_rate": 1.3282315608606347e-05, + "loss": 0.6896, + "step": 5382 + }, + { + "epoch": 0.8237184391736802, + "grad_norm": 2.1650822698710694, + "learning_rate": 1.327997462284355e-05, + "loss": 0.624, + "step": 5383 + }, + { + "epoch": 0.8238714613618975, + "grad_norm": 
2.1112801781705475, + "learning_rate": 1.3277633435644681e-05, + "loss": 0.6039, + "step": 5384 + }, + { + "epoch": 0.8240244835501148, + "grad_norm": 2.3871543648388194, + "learning_rate": 1.3275292047153516e-05, + "loss": 0.7194, + "step": 5385 + }, + { + "epoch": 0.824177505738332, + "grad_norm": 2.2010526920746276, + "learning_rate": 1.327295045751386e-05, + "loss": 0.7322, + "step": 5386 + }, + { + "epoch": 0.8243305279265494, + "grad_norm": 2.775859697782073, + "learning_rate": 1.3270608666869512e-05, + "loss": 0.7952, + "step": 5387 + }, + { + "epoch": 0.8244835501147666, + "grad_norm": 2.276716590063767, + "learning_rate": 1.3268266675364285e-05, + "loss": 0.6301, + "step": 5388 + }, + { + "epoch": 0.8246365723029839, + "grad_norm": 2.2539606958094787, + "learning_rate": 1.3265924483142021e-05, + "loss": 0.726, + "step": 5389 + }, + { + "epoch": 0.8247895944912013, + "grad_norm": 2.127389941524374, + "learning_rate": 1.3263582090346559e-05, + "loss": 0.607, + "step": 5390 + }, + { + "epoch": 0.8249426166794185, + "grad_norm": 2.2618517630927593, + "learning_rate": 1.3261239497121747e-05, + "loss": 0.7103, + "step": 5391 + }, + { + "epoch": 0.8250956388676358, + "grad_norm": 2.301198924576191, + "learning_rate": 1.3258896703611465e-05, + "loss": 0.674, + "step": 5392 + }, + { + "epoch": 0.8252486610558531, + "grad_norm": 2.2519800323299823, + "learning_rate": 1.3256553709959588e-05, + "loss": 0.7118, + "step": 5393 + }, + { + "epoch": 0.8254016832440704, + "grad_norm": 2.1158962008831104, + "learning_rate": 1.3254210516310004e-05, + "loss": 0.6324, + "step": 5394 + }, + { + "epoch": 0.8255547054322877, + "grad_norm": 2.1458773057151816, + "learning_rate": 1.3251867122806625e-05, + "loss": 0.6327, + "step": 5395 + }, + { + "epoch": 0.8257077276205049, + "grad_norm": 2.6875604486528704, + "learning_rate": 1.3249523529593364e-05, + "loss": 0.7069, + "step": 5396 + }, + { + "epoch": 0.8258607498087223, + "grad_norm": 2.4063745136874592, + "learning_rate": 1.3247179736814149e-05, + "loss": 0.6958, + "step": 5397 + }, + { + "epoch": 0.8260137719969396, + "grad_norm": 2.2108085408792446, + "learning_rate": 1.3244835744612926e-05, + "loss": 0.7354, + "step": 5398 + }, + { + "epoch": 0.8261667941851568, + "grad_norm": 2.1647786138967207, + "learning_rate": 1.3242491553133646e-05, + "loss": 0.7654, + "step": 5399 + }, + { + "epoch": 0.8263198163733741, + "grad_norm": 2.2781294249844355, + "learning_rate": 1.3240147162520272e-05, + "loss": 0.69, + "step": 5400 + }, + { + "epoch": 0.8264728385615915, + "grad_norm": 2.205081890912215, + "learning_rate": 1.3237802572916786e-05, + "loss": 0.7011, + "step": 5401 + }, + { + "epoch": 0.8266258607498087, + "grad_norm": 2.156765525326034, + "learning_rate": 1.3235457784467183e-05, + "loss": 0.7145, + "step": 5402 + }, + { + "epoch": 0.826778882938026, + "grad_norm": 2.0555102520992077, + "learning_rate": 1.3233112797315456e-05, + "loss": 0.5552, + "step": 5403 + }, + { + "epoch": 0.8269319051262433, + "grad_norm": 2.178522116336929, + "learning_rate": 1.3230767611605622e-05, + "loss": 0.7269, + "step": 5404 + }, + { + "epoch": 0.8270849273144606, + "grad_norm": 2.0747364623531306, + "learning_rate": 1.3228422227481712e-05, + "loss": 0.6244, + "step": 5405 + }, + { + "epoch": 0.8272379495026779, + "grad_norm": 2.2732975783479596, + "learning_rate": 1.3226076645087765e-05, + "loss": 0.6528, + "step": 5406 + }, + { + "epoch": 0.8273909716908951, + "grad_norm": 2.032788279412093, + "learning_rate": 1.3223730864567829e-05, + "loss": 0.6503, + "step": 5407 + 
}, + { + "epoch": 0.8275439938791125, + "grad_norm": 2.186247863682011, + "learning_rate": 1.3221384886065968e-05, + "loss": 0.7322, + "step": 5408 + }, + { + "epoch": 0.8276970160673298, + "grad_norm": 2.048505319281304, + "learning_rate": 1.3219038709726261e-05, + "loss": 0.6557, + "step": 5409 + }, + { + "epoch": 0.827850038255547, + "grad_norm": 2.1651304187113487, + "learning_rate": 1.3216692335692796e-05, + "loss": 0.6042, + "step": 5410 + }, + { + "epoch": 0.8280030604437644, + "grad_norm": 2.3706714888571576, + "learning_rate": 1.3214345764109668e-05, + "loss": 0.7171, + "step": 5411 + }, + { + "epoch": 0.8281560826319816, + "grad_norm": 2.144590330064656, + "learning_rate": 1.321199899512099e-05, + "loss": 0.5572, + "step": 5412 + }, + { + "epoch": 0.8283091048201989, + "grad_norm": 2.3176192449312727, + "learning_rate": 1.3209652028870891e-05, + "loss": 0.6542, + "step": 5413 + }, + { + "epoch": 0.8284621270084163, + "grad_norm": 2.4363610612490225, + "learning_rate": 1.3207304865503508e-05, + "loss": 0.6725, + "step": 5414 + }, + { + "epoch": 0.8286151491966335, + "grad_norm": 2.239550593859622, + "learning_rate": 1.3204957505162983e-05, + "loss": 0.6656, + "step": 5415 + }, + { + "epoch": 0.8287681713848508, + "grad_norm": 2.2066515473547743, + "learning_rate": 1.3202609947993477e-05, + "loss": 0.7237, + "step": 5416 + }, + { + "epoch": 0.8289211935730681, + "grad_norm": 2.4805510088889613, + "learning_rate": 1.3200262194139168e-05, + "loss": 0.7478, + "step": 5417 + }, + { + "epoch": 0.8290742157612854, + "grad_norm": 2.1960028509697733, + "learning_rate": 1.3197914243744237e-05, + "loss": 0.6824, + "step": 5418 + }, + { + "epoch": 0.8292272379495027, + "grad_norm": 2.3511494818838456, + "learning_rate": 1.3195566096952883e-05, + "loss": 0.7456, + "step": 5419 + }, + { + "epoch": 0.8293802601377199, + "grad_norm": 2.249239523564754, + "learning_rate": 1.3193217753909316e-05, + "loss": 0.7507, + "step": 5420 + }, + { + "epoch": 0.8295332823259373, + "grad_norm": 1.9415800244217254, + "learning_rate": 1.3190869214757751e-05, + "loss": 0.684, + "step": 5421 + }, + { + "epoch": 0.8296863045141546, + "grad_norm": 2.3205062630225353, + "learning_rate": 1.3188520479642426e-05, + "loss": 0.7321, + "step": 5422 + }, + { + "epoch": 0.8298393267023718, + "grad_norm": 2.400336439716699, + "learning_rate": 1.3186171548707587e-05, + "loss": 0.6574, + "step": 5423 + }, + { + "epoch": 0.8299923488905891, + "grad_norm": 2.0677198849039273, + "learning_rate": 1.3183822422097486e-05, + "loss": 0.634, + "step": 5424 + }, + { + "epoch": 0.8301453710788065, + "grad_norm": 2.2584330809992315, + "learning_rate": 1.3181473099956397e-05, + "loss": 0.6519, + "step": 5425 + }, + { + "epoch": 0.8302983932670237, + "grad_norm": 2.124240068152974, + "learning_rate": 1.3179123582428594e-05, + "loss": 0.6199, + "step": 5426 + }, + { + "epoch": 0.830451415455241, + "grad_norm": 2.3273164439976473, + "learning_rate": 1.3176773869658383e-05, + "loss": 0.6734, + "step": 5427 + }, + { + "epoch": 0.8306044376434583, + "grad_norm": 2.149535999528806, + "learning_rate": 1.3174423961790057e-05, + "loss": 0.6557, + "step": 5428 + }, + { + "epoch": 0.8307574598316756, + "grad_norm": 2.481319214602762, + "learning_rate": 1.3172073858967935e-05, + "loss": 0.7535, + "step": 5429 + }, + { + "epoch": 0.8309104820198929, + "grad_norm": 1.9994495591491446, + "learning_rate": 1.3169723561336349e-05, + "loss": 0.6636, + "step": 5430 + }, + { + "epoch": 0.8310635042081101, + "grad_norm": 2.023320842609098, + "learning_rate": 
1.3167373069039644e-05, + "loss": 0.6146, + "step": 5431 + }, + { + "epoch": 0.8312165263963275, + "grad_norm": 2.3556441047611125, + "learning_rate": 1.3165022382222161e-05, + "loss": 0.6654, + "step": 5432 + }, + { + "epoch": 0.8313695485845447, + "grad_norm": 2.325532575404804, + "learning_rate": 1.3162671501028275e-05, + "loss": 0.7858, + "step": 5433 + }, + { + "epoch": 0.831522570772762, + "grad_norm": 2.230886816421429, + "learning_rate": 1.316032042560236e-05, + "loss": 0.7242, + "step": 5434 + }, + { + "epoch": 0.8316755929609794, + "grad_norm": 2.1374285377576334, + "learning_rate": 1.3157969156088806e-05, + "loss": 0.6857, + "step": 5435 + }, + { + "epoch": 0.8318286151491966, + "grad_norm": 2.2024531130944087, + "learning_rate": 1.315561769263201e-05, + "loss": 0.7603, + "step": 5436 + }, + { + "epoch": 0.8319816373374139, + "grad_norm": 2.4725523502828186, + "learning_rate": 1.3153266035376387e-05, + "loss": 0.7892, + "step": 5437 + }, + { + "epoch": 0.8321346595256313, + "grad_norm": 1.942259678346756, + "learning_rate": 1.3150914184466359e-05, + "loss": 0.6643, + "step": 5438 + }, + { + "epoch": 0.8322876817138485, + "grad_norm": 1.9805552537307942, + "learning_rate": 1.3148562140046368e-05, + "loss": 0.6216, + "step": 5439 + }, + { + "epoch": 0.8324407039020658, + "grad_norm": 2.0651368907886094, + "learning_rate": 1.3146209902260858e-05, + "loss": 0.6408, + "step": 5440 + }, + { + "epoch": 0.832593726090283, + "grad_norm": 1.8716904461752568, + "learning_rate": 1.3143857471254284e-05, + "loss": 0.6518, + "step": 5441 + }, + { + "epoch": 0.8327467482785004, + "grad_norm": 1.9940867590996474, + "learning_rate": 1.3141504847171131e-05, + "loss": 0.6708, + "step": 5442 + }, + { + "epoch": 0.8328997704667177, + "grad_norm": 2.321143832807135, + "learning_rate": 1.3139152030155875e-05, + "loss": 0.6505, + "step": 5443 + }, + { + "epoch": 0.8330527926549349, + "grad_norm": 2.3643297951532474, + "learning_rate": 1.3136799020353009e-05, + "loss": 0.7394, + "step": 5444 + }, + { + "epoch": 0.8332058148431523, + "grad_norm": 2.4184612753665466, + "learning_rate": 1.3134445817907047e-05, + "loss": 0.6883, + "step": 5445 + }, + { + "epoch": 0.8333588370313696, + "grad_norm": 2.3561896414391144, + "learning_rate": 1.3132092422962503e-05, + "loss": 0.7043, + "step": 5446 + }, + { + "epoch": 0.8335118592195868, + "grad_norm": 2.362497581248584, + "learning_rate": 1.3129738835663911e-05, + "loss": 0.7542, + "step": 5447 + }, + { + "epoch": 0.8336648814078041, + "grad_norm": 2.4107911094529677, + "learning_rate": 1.3127385056155818e-05, + "loss": 0.7097, + "step": 5448 + }, + { + "epoch": 0.8338179035960214, + "grad_norm": 2.151079347885521, + "learning_rate": 1.3125031084582769e-05, + "loss": 0.677, + "step": 5449 + }, + { + "epoch": 0.8339709257842387, + "grad_norm": 2.113665557525543, + "learning_rate": 1.3122676921089338e-05, + "loss": 0.6911, + "step": 5450 + }, + { + "epoch": 0.834123947972456, + "grad_norm": 2.089972690894945, + "learning_rate": 1.3120322565820098e-05, + "loss": 0.6565, + "step": 5451 + }, + { + "epoch": 0.8342769701606733, + "grad_norm": 2.3308970486598404, + "learning_rate": 1.3117968018919646e-05, + "loss": 0.8009, + "step": 5452 + }, + { + "epoch": 0.8344299923488906, + "grad_norm": 2.233752104710811, + "learning_rate": 1.3115613280532581e-05, + "loss": 0.6564, + "step": 5453 + }, + { + "epoch": 0.8345830145371079, + "grad_norm": 2.341965814021548, + "learning_rate": 1.3113258350803513e-05, + "loss": 0.7691, + "step": 5454 + }, + { + "epoch": 0.8347360367253251, 
+ "grad_norm": 2.495347947770681, + "learning_rate": 1.3110903229877073e-05, + "loss": 0.7702, + "step": 5455 + }, + { + "epoch": 0.8348890589135425, + "grad_norm": 2.229124461211568, + "learning_rate": 1.3108547917897896e-05, + "loss": 0.779, + "step": 5456 + }, + { + "epoch": 0.8350420811017597, + "grad_norm": 2.404664452229642, + "learning_rate": 1.3106192415010628e-05, + "loss": 0.7439, + "step": 5457 + }, + { + "epoch": 0.835195103289977, + "grad_norm": 2.144379029046011, + "learning_rate": 1.3103836721359934e-05, + "loss": 0.7058, + "step": 5458 + }, + { + "epoch": 0.8353481254781944, + "grad_norm": 2.2023641817367934, + "learning_rate": 1.3101480837090485e-05, + "loss": 0.6623, + "step": 5459 + }, + { + "epoch": 0.8355011476664116, + "grad_norm": 2.0170490475868936, + "learning_rate": 1.3099124762346965e-05, + "loss": 0.6674, + "step": 5460 + }, + { + "epoch": 0.8356541698546289, + "grad_norm": 1.9195685937200395, + "learning_rate": 1.3096768497274069e-05, + "loss": 0.6357, + "step": 5461 + }, + { + "epoch": 0.8358071920428463, + "grad_norm": 2.024436830598696, + "learning_rate": 1.3094412042016504e-05, + "loss": 0.5846, + "step": 5462 + }, + { + "epoch": 0.8359602142310635, + "grad_norm": 2.198755705976839, + "learning_rate": 1.3092055396718992e-05, + "loss": 0.7809, + "step": 5463 + }, + { + "epoch": 0.8361132364192808, + "grad_norm": 2.2113261586270485, + "learning_rate": 1.3089698561526261e-05, + "loss": 0.7037, + "step": 5464 + }, + { + "epoch": 0.836266258607498, + "grad_norm": 2.354039463335677, + "learning_rate": 1.3087341536583054e-05, + "loss": 0.8019, + "step": 5465 + }, + { + "epoch": 0.8364192807957154, + "grad_norm": 2.2564806325771194, + "learning_rate": 1.3084984322034124e-05, + "loss": 0.9169, + "step": 5466 + }, + { + "epoch": 0.8365723029839327, + "grad_norm": 3.4017957008245157, + "learning_rate": 1.3082626918024239e-05, + "loss": 0.6874, + "step": 5467 + }, + { + "epoch": 0.8367253251721499, + "grad_norm": 2.2833142256607086, + "learning_rate": 1.3080269324698179e-05, + "loss": 0.6265, + "step": 5468 + }, + { + "epoch": 0.8368783473603673, + "grad_norm": 2.057805293848059, + "learning_rate": 1.3077911542200727e-05, + "loss": 0.6179, + "step": 5469 + }, + { + "epoch": 0.8370313695485846, + "grad_norm": 2.3137603844967924, + "learning_rate": 1.3075553570676685e-05, + "loss": 0.7012, + "step": 5470 + }, + { + "epoch": 0.8371843917368018, + "grad_norm": 2.348687565853293, + "learning_rate": 1.3073195410270869e-05, + "loss": 0.8127, + "step": 5471 + }, + { + "epoch": 0.8373374139250191, + "grad_norm": 2.1805314170172223, + "learning_rate": 1.3070837061128101e-05, + "loss": 0.7328, + "step": 5472 + }, + { + "epoch": 0.8374904361132364, + "grad_norm": 2.298133329589823, + "learning_rate": 1.3068478523393213e-05, + "loss": 0.7008, + "step": 5473 + }, + { + "epoch": 0.8376434583014537, + "grad_norm": 2.130736044585862, + "learning_rate": 1.3066119797211056e-05, + "loss": 0.5503, + "step": 5474 + }, + { + "epoch": 0.837796480489671, + "grad_norm": 2.348461660163417, + "learning_rate": 1.3063760882726488e-05, + "loss": 0.6586, + "step": 5475 + }, + { + "epoch": 0.8379495026778883, + "grad_norm": 2.457109691397983, + "learning_rate": 1.3061401780084379e-05, + "loss": 0.8545, + "step": 5476 + }, + { + "epoch": 0.8381025248661056, + "grad_norm": 2.081413285430723, + "learning_rate": 1.305904248942961e-05, + "loss": 0.7044, + "step": 5477 + }, + { + "epoch": 0.8382555470543229, + "grad_norm": 2.2761005336360443, + "learning_rate": 1.3056683010907074e-05, + "loss": 0.7146, + 
"step": 5478 + }, + { + "epoch": 0.8384085692425401, + "grad_norm": 2.4427202264353096, + "learning_rate": 1.3054323344661675e-05, + "loss": 0.7789, + "step": 5479 + }, + { + "epoch": 0.8385615914307575, + "grad_norm": 2.5737808915465648, + "learning_rate": 1.3051963490838336e-05, + "loss": 0.747, + "step": 5480 + }, + { + "epoch": 0.8387146136189747, + "grad_norm": 2.3321630387102474, + "learning_rate": 1.3049603449581976e-05, + "loss": 0.7329, + "step": 5481 + }, + { + "epoch": 0.838867635807192, + "grad_norm": 2.2436821204503996, + "learning_rate": 1.3047243221037537e-05, + "loss": 0.8297, + "step": 5482 + }, + { + "epoch": 0.8390206579954094, + "grad_norm": 2.1236816351700916, + "learning_rate": 1.3044882805349972e-05, + "loss": 0.6615, + "step": 5483 + }, + { + "epoch": 0.8391736801836266, + "grad_norm": 2.1357048998496286, + "learning_rate": 1.3042522202664247e-05, + "loss": 0.7149, + "step": 5484 + }, + { + "epoch": 0.8393267023718439, + "grad_norm": 2.276524858878027, + "learning_rate": 1.3040161413125325e-05, + "loss": 0.7097, + "step": 5485 + }, + { + "epoch": 0.8394797245600613, + "grad_norm": 2.20618942502793, + "learning_rate": 1.3037800436878199e-05, + "loss": 0.6871, + "step": 5486 + }, + { + "epoch": 0.8396327467482785, + "grad_norm": 2.2991077527046513, + "learning_rate": 1.3035439274067865e-05, + "loss": 0.6966, + "step": 5487 + }, + { + "epoch": 0.8397857689364958, + "grad_norm": 2.3943691068471344, + "learning_rate": 1.303307792483933e-05, + "loss": 0.774, + "step": 5488 + }, + { + "epoch": 0.839938791124713, + "grad_norm": 2.4172819833948886, + "learning_rate": 1.3030716389337614e-05, + "loss": 0.7621, + "step": 5489 + }, + { + "epoch": 0.8400918133129304, + "grad_norm": 2.0557590442430276, + "learning_rate": 1.302835466770775e-05, + "loss": 0.6875, + "step": 5490 + }, + { + "epoch": 0.8402448355011477, + "grad_norm": 2.1172952300702517, + "learning_rate": 1.3025992760094778e-05, + "loss": 0.7656, + "step": 5491 + }, + { + "epoch": 0.8403978576893649, + "grad_norm": 2.1069628676461667, + "learning_rate": 1.302363066664375e-05, + "loss": 0.6424, + "step": 5492 + }, + { + "epoch": 0.8405508798775823, + "grad_norm": 2.0851667419856033, + "learning_rate": 1.302126838749974e-05, + "loss": 0.6485, + "step": 5493 + }, + { + "epoch": 0.8407039020657996, + "grad_norm": 2.345547762432767, + "learning_rate": 1.3018905922807814e-05, + "loss": 0.7328, + "step": 5494 + }, + { + "epoch": 0.8408569242540168, + "grad_norm": 2.418693674830027, + "learning_rate": 1.3016543272713066e-05, + "loss": 0.7418, + "step": 5495 + }, + { + "epoch": 0.8410099464422341, + "grad_norm": 2.371023973660028, + "learning_rate": 1.3014180437360595e-05, + "loss": 0.5673, + "step": 5496 + }, + { + "epoch": 0.8411629686304514, + "grad_norm": 2.265839060637262, + "learning_rate": 1.3011817416895516e-05, + "loss": 0.6141, + "step": 5497 + }, + { + "epoch": 0.8413159908186687, + "grad_norm": 2.561937095874154, + "learning_rate": 1.3009454211462941e-05, + "loss": 0.7998, + "step": 5498 + }, + { + "epoch": 0.841469013006886, + "grad_norm": 2.264911983777347, + "learning_rate": 1.3007090821208013e-05, + "loss": 0.6413, + "step": 5499 + }, + { + "epoch": 0.8416220351951033, + "grad_norm": 2.38515727129013, + "learning_rate": 1.3004727246275875e-05, + "loss": 0.6904, + "step": 5500 + }, + { + "epoch": 0.8417750573833206, + "grad_norm": 2.2272191122576475, + "learning_rate": 1.300236348681168e-05, + "loss": 0.67, + "step": 5501 + }, + { + "epoch": 0.8419280795715379, + "grad_norm": 2.328983070075472, + 
"learning_rate": 1.2999999542960597e-05, + "loss": 0.7334, + "step": 5502 + }, + { + "epoch": 0.8420811017597551, + "grad_norm": 2.2285007466959206, + "learning_rate": 1.2997635414867808e-05, + "loss": 0.7039, + "step": 5503 + }, + { + "epoch": 0.8422341239479725, + "grad_norm": 2.475196568822425, + "learning_rate": 1.2995271102678503e-05, + "loss": 0.6824, + "step": 5504 + }, + { + "epoch": 0.8423871461361897, + "grad_norm": 2.145354123730406, + "learning_rate": 1.2992906606537878e-05, + "loss": 0.6814, + "step": 5505 + }, + { + "epoch": 0.842540168324407, + "grad_norm": 2.648650178635418, + "learning_rate": 1.2990541926591153e-05, + "loss": 0.7099, + "step": 5506 + }, + { + "epoch": 0.8426931905126244, + "grad_norm": 2.2924356285560554, + "learning_rate": 1.298817706298355e-05, + "loss": 0.7504, + "step": 5507 + }, + { + "epoch": 0.8428462127008416, + "grad_norm": 2.260371927743499, + "learning_rate": 1.2985812015860297e-05, + "loss": 0.8017, + "step": 5508 + }, + { + "epoch": 0.8429992348890589, + "grad_norm": 2.1438114603897898, + "learning_rate": 1.2983446785366656e-05, + "loss": 0.6886, + "step": 5509 + }, + { + "epoch": 0.8431522570772763, + "grad_norm": 2.328522698350745, + "learning_rate": 1.2981081371647872e-05, + "loss": 0.6653, + "step": 5510 + }, + { + "epoch": 0.8433052792654935, + "grad_norm": 2.2418473468868627, + "learning_rate": 1.297871577484922e-05, + "loss": 0.7969, + "step": 5511 + }, + { + "epoch": 0.8434583014537108, + "grad_norm": 2.079868183628365, + "learning_rate": 1.2976349995115979e-05, + "loss": 0.6576, + "step": 5512 + }, + { + "epoch": 0.843611323641928, + "grad_norm": 2.2551328175381657, + "learning_rate": 1.2973984032593446e-05, + "loss": 0.7583, + "step": 5513 + }, + { + "epoch": 0.8437643458301454, + "grad_norm": 1.9291727183519736, + "learning_rate": 1.2971617887426911e-05, + "loss": 0.6354, + "step": 5514 + }, + { + "epoch": 0.8439173680183627, + "grad_norm": 2.0798484589827226, + "learning_rate": 1.2969251559761702e-05, + "loss": 0.653, + "step": 5515 + }, + { + "epoch": 0.8440703902065799, + "grad_norm": 2.418587228020786, + "learning_rate": 1.2966885049743136e-05, + "loss": 0.6181, + "step": 5516 + }, + { + "epoch": 0.8442234123947973, + "grad_norm": 2.19342314918509, + "learning_rate": 1.2964518357516553e-05, + "loss": 0.6955, + "step": 5517 + }, + { + "epoch": 0.8443764345830146, + "grad_norm": 2.338148065171502, + "learning_rate": 1.2962151483227303e-05, + "loss": 0.7878, + "step": 5518 + }, + { + "epoch": 0.8445294567712318, + "grad_norm": 1.9721749570907623, + "learning_rate": 1.295978442702074e-05, + "loss": 0.6078, + "step": 5519 + }, + { + "epoch": 0.8446824789594491, + "grad_norm": 2.0817212942936845, + "learning_rate": 1.2957417189042237e-05, + "loss": 0.6564, + "step": 5520 + }, + { + "epoch": 0.8448355011476664, + "grad_norm": 2.289786601333639, + "learning_rate": 1.2955049769437173e-05, + "loss": 0.6076, + "step": 5521 + }, + { + "epoch": 0.8449885233358837, + "grad_norm": 2.400216562893098, + "learning_rate": 1.2952682168350949e-05, + "loss": 0.8167, + "step": 5522 + }, + { + "epoch": 0.845141545524101, + "grad_norm": 2.423170432612747, + "learning_rate": 1.2950314385928957e-05, + "loss": 0.7366, + "step": 5523 + }, + { + "epoch": 0.8452945677123183, + "grad_norm": 2.079069856562734, + "learning_rate": 1.2947946422316617e-05, + "loss": 0.6885, + "step": 5524 + }, + { + "epoch": 0.8454475899005356, + "grad_norm": 2.314286472410869, + "learning_rate": 1.2945578277659357e-05, + "loss": 0.7336, + "step": 5525 + }, + { + "epoch": 
0.8456006120887529, + "grad_norm": 2.261984315977617, + "learning_rate": 1.2943209952102608e-05, + "loss": 0.6953, + "step": 5526 + }, + { + "epoch": 0.8457536342769701, + "grad_norm": 2.0233508566664793, + "learning_rate": 1.2940841445791826e-05, + "loss": 0.6739, + "step": 5527 + }, + { + "epoch": 0.8459066564651875, + "grad_norm": 2.288650722875104, + "learning_rate": 1.2938472758872464e-05, + "loss": 0.7251, + "step": 5528 + }, + { + "epoch": 0.8460596786534047, + "grad_norm": 2.5941529869810522, + "learning_rate": 1.2936103891489995e-05, + "loss": 0.6892, + "step": 5529 + }, + { + "epoch": 0.846212700841622, + "grad_norm": 2.488575302432984, + "learning_rate": 1.29337348437899e-05, + "loss": 0.6759, + "step": 5530 + }, + { + "epoch": 0.8463657230298394, + "grad_norm": 2.1952089497466396, + "learning_rate": 1.293136561591767e-05, + "loss": 0.748, + "step": 5531 + }, + { + "epoch": 0.8465187452180566, + "grad_norm": 2.2910733629818756, + "learning_rate": 1.2928996208018813e-05, + "loss": 0.6614, + "step": 5532 + }, + { + "epoch": 0.8466717674062739, + "grad_norm": 2.2937953646424787, + "learning_rate": 1.2926626620238836e-05, + "loss": 0.7134, + "step": 5533 + }, + { + "epoch": 0.8468247895944913, + "grad_norm": 2.260542552534389, + "learning_rate": 1.2924256852723276e-05, + "loss": 0.6862, + "step": 5534 + }, + { + "epoch": 0.8469778117827085, + "grad_norm": 2.2815687375045615, + "learning_rate": 1.2921886905617658e-05, + "loss": 0.6566, + "step": 5535 + }, + { + "epoch": 0.8471308339709258, + "grad_norm": 2.355462711241084, + "learning_rate": 1.2919516779067533e-05, + "loss": 0.7204, + "step": 5536 + }, + { + "epoch": 0.847283856159143, + "grad_norm": 2.3383308189087186, + "learning_rate": 1.2917146473218464e-05, + "loss": 0.76, + "step": 5537 + }, + { + "epoch": 0.8474368783473604, + "grad_norm": 2.294371471880201, + "learning_rate": 1.2914775988216021e-05, + "loss": 0.6685, + "step": 5538 + }, + { + "epoch": 0.8475899005355777, + "grad_norm": 2.237221973534265, + "learning_rate": 1.2912405324205776e-05, + "loss": 0.7479, + "step": 5539 + }, + { + "epoch": 0.8477429227237949, + "grad_norm": 2.4085148292399468, + "learning_rate": 1.2910034481333331e-05, + "loss": 0.6868, + "step": 5540 + }, + { + "epoch": 0.8478959449120123, + "grad_norm": 2.1331468654542154, + "learning_rate": 1.2907663459744283e-05, + "loss": 0.6263, + "step": 5541 + }, + { + "epoch": 0.8480489671002295, + "grad_norm": 2.416430462046604, + "learning_rate": 1.2905292259584248e-05, + "loss": 0.5775, + "step": 5542 + }, + { + "epoch": 0.8482019892884468, + "grad_norm": 2.154148517983704, + "learning_rate": 1.2902920880998848e-05, + "loss": 0.704, + "step": 5543 + }, + { + "epoch": 0.8483550114766641, + "grad_norm": 2.221789554064452, + "learning_rate": 1.2900549324133722e-05, + "loss": 0.6048, + "step": 5544 + }, + { + "epoch": 0.8485080336648814, + "grad_norm": 2.48005927838095, + "learning_rate": 1.2898177589134515e-05, + "loss": 0.715, + "step": 5545 + }, + { + "epoch": 0.8486610558530987, + "grad_norm": 2.144770996988103, + "learning_rate": 1.2895805676146882e-05, + "loss": 0.6509, + "step": 5546 + }, + { + "epoch": 0.848814078041316, + "grad_norm": 1.9436185628607763, + "learning_rate": 1.2893433585316497e-05, + "loss": 0.5823, + "step": 5547 + }, + { + "epoch": 0.8489671002295333, + "grad_norm": 2.3484164597566304, + "learning_rate": 1.2891061316789036e-05, + "loss": 0.6889, + "step": 5548 + }, + { + "epoch": 0.8491201224177506, + "grad_norm": 2.4994443211846296, + "learning_rate": 1.2888688870710186e-05, + 
"loss": 0.7833, + "step": 5549 + }, + { + "epoch": 0.8492731446059678, + "grad_norm": 2.156946521054402, + "learning_rate": 1.2886316247225657e-05, + "loss": 0.76, + "step": 5550 + }, + { + "epoch": 0.8494261667941851, + "grad_norm": 2.1799637821710482, + "learning_rate": 1.2883943446481153e-05, + "loss": 0.6765, + "step": 5551 + }, + { + "epoch": 0.8495791889824025, + "grad_norm": 2.0568551184076886, + "learning_rate": 1.28815704686224e-05, + "loss": 0.6281, + "step": 5552 + }, + { + "epoch": 0.8497322111706197, + "grad_norm": 2.2126970022565726, + "learning_rate": 1.2879197313795131e-05, + "loss": 0.6563, + "step": 5553 + }, + { + "epoch": 0.849885233358837, + "grad_norm": 2.522319909037303, + "learning_rate": 1.2876823982145095e-05, + "loss": 0.8035, + "step": 5554 + }, + { + "epoch": 0.8500382555470544, + "grad_norm": 2.498837348760292, + "learning_rate": 1.287445047381804e-05, + "loss": 0.7085, + "step": 5555 + }, + { + "epoch": 0.8501912777352716, + "grad_norm": 2.363108474252094, + "learning_rate": 1.2872076788959738e-05, + "loss": 0.706, + "step": 5556 + }, + { + "epoch": 0.8503442999234889, + "grad_norm": 2.1018992973643207, + "learning_rate": 1.2869702927715966e-05, + "loss": 0.6308, + "step": 5557 + }, + { + "epoch": 0.8504973221117061, + "grad_norm": 2.5754528738479117, + "learning_rate": 1.2867328890232506e-05, + "loss": 0.7338, + "step": 5558 + }, + { + "epoch": 0.8506503442999235, + "grad_norm": 2.2465891179344237, + "learning_rate": 1.286495467665517e-05, + "loss": 0.7204, + "step": 5559 + }, + { + "epoch": 0.8508033664881408, + "grad_norm": 2.282158755636477, + "learning_rate": 1.2862580287129754e-05, + "loss": 0.709, + "step": 5560 + }, + { + "epoch": 0.850956388676358, + "grad_norm": 2.253648961989343, + "learning_rate": 1.2860205721802083e-05, + "loss": 0.6625, + "step": 5561 + }, + { + "epoch": 0.8511094108645754, + "grad_norm": 2.224500447042734, + "learning_rate": 1.2857830980817994e-05, + "loss": 0.6685, + "step": 5562 + }, + { + "epoch": 0.8512624330527927, + "grad_norm": 2.25169449960483, + "learning_rate": 1.2855456064323323e-05, + "loss": 0.6976, + "step": 5563 + }, + { + "epoch": 0.8514154552410099, + "grad_norm": 2.254082007689388, + "learning_rate": 1.2853080972463923e-05, + "loss": 0.7009, + "step": 5564 + }, + { + "epoch": 0.8515684774292273, + "grad_norm": 2.111325957451995, + "learning_rate": 1.285070570538566e-05, + "loss": 0.5968, + "step": 5565 + }, + { + "epoch": 0.8517214996174445, + "grad_norm": 2.212773489566472, + "learning_rate": 1.2848330263234408e-05, + "loss": 0.6656, + "step": 5566 + }, + { + "epoch": 0.8518745218056618, + "grad_norm": 2.354877615111615, + "learning_rate": 1.2845954646156051e-05, + "loss": 0.7605, + "step": 5567 + }, + { + "epoch": 0.8520275439938791, + "grad_norm": 2.166128846465111, + "learning_rate": 1.2843578854296485e-05, + "loss": 0.7061, + "step": 5568 + }, + { + "epoch": 0.8521805661820964, + "grad_norm": 2.3172994145565164, + "learning_rate": 1.2841202887801618e-05, + "loss": 0.7018, + "step": 5569 + }, + { + "epoch": 0.8523335883703137, + "grad_norm": 2.4927482090533193, + "learning_rate": 1.283882674681737e-05, + "loss": 0.7816, + "step": 5570 + }, + { + "epoch": 0.852486610558531, + "grad_norm": 2.327043385481717, + "learning_rate": 1.2836450431489662e-05, + "loss": 0.7731, + "step": 5571 + }, + { + "epoch": 0.8526396327467483, + "grad_norm": 2.309999254781278, + "learning_rate": 1.2834073941964438e-05, + "loss": 0.756, + "step": 5572 + }, + { + "epoch": 0.8527926549349656, + "grad_norm": 2.4280514724840363, + 
"learning_rate": 1.2831697278387647e-05, + "loss": 0.7067, + "step": 5573 + }, + { + "epoch": 0.8529456771231828, + "grad_norm": 2.0447441695223914, + "learning_rate": 1.2829320440905244e-05, + "loss": 0.6752, + "step": 5574 + }, + { + "epoch": 0.8530986993114001, + "grad_norm": 2.0921672371070223, + "learning_rate": 1.2826943429663212e-05, + "loss": 0.5819, + "step": 5575 + }, + { + "epoch": 0.8532517214996175, + "grad_norm": 2.0878343760248512, + "learning_rate": 1.2824566244807522e-05, + "loss": 0.7016, + "step": 5576 + }, + { + "epoch": 0.8534047436878347, + "grad_norm": 2.306674208187633, + "learning_rate": 1.2822188886484165e-05, + "loss": 0.754, + "step": 5577 + }, + { + "epoch": 0.853557765876052, + "grad_norm": 2.4509282856897, + "learning_rate": 1.2819811354839151e-05, + "loss": 0.7932, + "step": 5578 + }, + { + "epoch": 0.8537107880642694, + "grad_norm": 2.20326195730903, + "learning_rate": 1.2817433650018493e-05, + "loss": 0.6907, + "step": 5579 + }, + { + "epoch": 0.8538638102524866, + "grad_norm": 2.267707095823843, + "learning_rate": 1.2815055772168208e-05, + "loss": 0.7413, + "step": 5580 + }, + { + "epoch": 0.8540168324407039, + "grad_norm": 2.246442637791065, + "learning_rate": 1.2812677721434338e-05, + "loss": 0.6501, + "step": 5581 + }, + { + "epoch": 0.8541698546289211, + "grad_norm": 2.344967709598487, + "learning_rate": 1.2810299497962925e-05, + "loss": 0.7843, + "step": 5582 + }, + { + "epoch": 0.8543228768171385, + "grad_norm": 2.4466452469658604, + "learning_rate": 1.280792110190003e-05, + "loss": 0.6965, + "step": 5583 + }, + { + "epoch": 0.8544758990053558, + "grad_norm": 2.405752794561989, + "learning_rate": 1.280554253339171e-05, + "loss": 0.7889, + "step": 5584 + }, + { + "epoch": 0.854628921193573, + "grad_norm": 2.349194728354978, + "learning_rate": 1.280316379258405e-05, + "loss": 0.6264, + "step": 5585 + }, + { + "epoch": 0.8547819433817904, + "grad_norm": 2.3163214380738117, + "learning_rate": 1.2800784879623135e-05, + "loss": 0.7097, + "step": 5586 + }, + { + "epoch": 0.8549349655700077, + "grad_norm": 2.205955621353382, + "learning_rate": 1.2798405794655064e-05, + "loss": 0.7304, + "step": 5587 + }, + { + "epoch": 0.8550879877582249, + "grad_norm": 2.2160837810993526, + "learning_rate": 1.2796026537825947e-05, + "loss": 0.5901, + "step": 5588 + }, + { + "epoch": 0.8552410099464423, + "grad_norm": 2.076799915175384, + "learning_rate": 1.2793647109281901e-05, + "loss": 0.6659, + "step": 5589 + }, + { + "epoch": 0.8553940321346595, + "grad_norm": 2.169267298600903, + "learning_rate": 1.2791267509169054e-05, + "loss": 0.707, + "step": 5590 + }, + { + "epoch": 0.8555470543228768, + "grad_norm": 2.0806303549381364, + "learning_rate": 1.2788887737633555e-05, + "loss": 0.648, + "step": 5591 + }, + { + "epoch": 0.8557000765110941, + "grad_norm": 2.1414594681949635, + "learning_rate": 1.278650779482155e-05, + "loss": 0.6572, + "step": 5592 + }, + { + "epoch": 0.8558530986993114, + "grad_norm": 2.3353430053689435, + "learning_rate": 1.2784127680879195e-05, + "loss": 0.6605, + "step": 5593 + }, + { + "epoch": 0.8560061208875287, + "grad_norm": 2.4855997648787653, + "learning_rate": 1.278174739595267e-05, + "loss": 0.6597, + "step": 5594 + }, + { + "epoch": 0.856159143075746, + "grad_norm": 2.139860057474892, + "learning_rate": 1.2779366940188159e-05, + "loss": 0.6893, + "step": 5595 + }, + { + "epoch": 0.8563121652639633, + "grad_norm": 2.229662131253198, + "learning_rate": 1.2776986313731847e-05, + "loss": 0.6621, + "step": 5596 + }, + { + "epoch": 
0.8564651874521806, + "grad_norm": 2.482093964955999, + "learning_rate": 1.2774605516729944e-05, + "loss": 0.8149, + "step": 5597 + }, + { + "epoch": 0.8566182096403978, + "grad_norm": 2.3260474144446115, + "learning_rate": 1.277222454932866e-05, + "loss": 0.6988, + "step": 5598 + }, + { + "epoch": 0.8567712318286151, + "grad_norm": 2.238022446025169, + "learning_rate": 1.2769843411674221e-05, + "loss": 0.7234, + "step": 5599 + }, + { + "epoch": 0.8569242540168325, + "grad_norm": 1.9938349722793565, + "learning_rate": 1.2767462103912864e-05, + "loss": 0.597, + "step": 5600 + }, + { + "epoch": 0.8570772762050497, + "grad_norm": 2.4546219970379726, + "learning_rate": 1.2765080626190834e-05, + "loss": 0.7452, + "step": 5601 + }, + { + "epoch": 0.857230298393267, + "grad_norm": 2.3423061899227244, + "learning_rate": 1.2762698978654381e-05, + "loss": 0.7442, + "step": 5602 + }, + { + "epoch": 0.8573833205814844, + "grad_norm": 2.5320781222404176, + "learning_rate": 1.276031716144978e-05, + "loss": 0.772, + "step": 5603 + }, + { + "epoch": 0.8575363427697016, + "grad_norm": 2.3118002124605623, + "learning_rate": 1.2757935174723306e-05, + "loss": 0.6711, + "step": 5604 + }, + { + "epoch": 0.8576893649579189, + "grad_norm": 2.400968927044133, + "learning_rate": 1.2755553018621238e-05, + "loss": 0.6895, + "step": 5605 + }, + { + "epoch": 0.8578423871461361, + "grad_norm": 1.9936304706669457, + "learning_rate": 1.2753170693289882e-05, + "loss": 0.5834, + "step": 5606 + }, + { + "epoch": 0.8579954093343535, + "grad_norm": 2.5147260168293384, + "learning_rate": 1.2750788198875545e-05, + "loss": 0.6648, + "step": 5607 + }, + { + "epoch": 0.8581484315225708, + "grad_norm": 2.202869381008538, + "learning_rate": 1.2748405535524541e-05, + "loss": 0.6608, + "step": 5608 + }, + { + "epoch": 0.858301453710788, + "grad_norm": 2.2267686720968394, + "learning_rate": 1.2746022703383203e-05, + "loss": 0.7458, + "step": 5609 + }, + { + "epoch": 0.8584544758990054, + "grad_norm": 2.1783234217642153, + "learning_rate": 1.2743639702597868e-05, + "loss": 0.7113, + "step": 5610 + }, + { + "epoch": 0.8586074980872227, + "grad_norm": 2.2367728821011785, + "learning_rate": 1.2741256533314885e-05, + "loss": 0.6847, + "step": 5611 + }, + { + "epoch": 0.8587605202754399, + "grad_norm": 2.4930263847632457, + "learning_rate": 1.2738873195680615e-05, + "loss": 0.7769, + "step": 5612 + }, + { + "epoch": 0.8589135424636573, + "grad_norm": 2.3443625908959858, + "learning_rate": 1.2736489689841427e-05, + "loss": 0.7359, + "step": 5613 + }, + { + "epoch": 0.8590665646518745, + "grad_norm": 2.305054821326658, + "learning_rate": 1.2734106015943702e-05, + "loss": 0.6783, + "step": 5614 + }, + { + "epoch": 0.8592195868400918, + "grad_norm": 1.9663379003968546, + "learning_rate": 1.2731722174133827e-05, + "loss": 0.6796, + "step": 5615 + }, + { + "epoch": 0.8593726090283091, + "grad_norm": 2.092756435546732, + "learning_rate": 1.2729338164558214e-05, + "loss": 0.713, + "step": 5616 + }, + { + "epoch": 0.8595256312165264, + "grad_norm": 2.24491708283434, + "learning_rate": 1.2726953987363264e-05, + "loss": 0.6345, + "step": 5617 + }, + { + "epoch": 0.8596786534047437, + "grad_norm": 2.1594981539794276, + "learning_rate": 1.2724569642695399e-05, + "loss": 0.6694, + "step": 5618 + }, + { + "epoch": 0.859831675592961, + "grad_norm": 2.2943585124865384, + "learning_rate": 1.2722185130701057e-05, + "loss": 0.7367, + "step": 5619 + }, + { + "epoch": 0.8599846977811783, + "grad_norm": 2.3976683676703767, + "learning_rate": 1.2719800451526676e-05, 
+ "loss": 0.7081, + "step": 5620 + }, + { + "epoch": 0.8601377199693956, + "grad_norm": 2.2946408098661064, + "learning_rate": 1.2717415605318708e-05, + "loss": 0.7871, + "step": 5621 + }, + { + "epoch": 0.8602907421576128, + "grad_norm": 2.3980471919531876, + "learning_rate": 1.2715030592223619e-05, + "loss": 0.758, + "step": 5622 + }, + { + "epoch": 0.8604437643458301, + "grad_norm": 2.2339319776161037, + "learning_rate": 1.2712645412387878e-05, + "loss": 0.6481, + "step": 5623 + }, + { + "epoch": 0.8605967865340475, + "grad_norm": 2.370078313360965, + "learning_rate": 1.2710260065957973e-05, + "loss": 0.8031, + "step": 5624 + }, + { + "epoch": 0.8607498087222647, + "grad_norm": 2.361702795265424, + "learning_rate": 1.2707874553080393e-05, + "loss": 0.6998, + "step": 5625 + }, + { + "epoch": 0.860902830910482, + "grad_norm": 2.185328558542397, + "learning_rate": 1.2705488873901643e-05, + "loss": 0.6882, + "step": 5626 + }, + { + "epoch": 0.8610558530986994, + "grad_norm": 2.432654884382053, + "learning_rate": 1.270310302856824e-05, + "loss": 0.662, + "step": 5627 + }, + { + "epoch": 0.8612088752869166, + "grad_norm": 2.4952824090910624, + "learning_rate": 1.27007170172267e-05, + "loss": 0.8529, + "step": 5628 + }, + { + "epoch": 0.8613618974751339, + "grad_norm": 2.236267328529113, + "learning_rate": 1.2698330840023569e-05, + "loss": 0.7145, + "step": 5629 + }, + { + "epoch": 0.8615149196633511, + "grad_norm": 2.2009751715707746, + "learning_rate": 1.2695944497105383e-05, + "loss": 0.7445, + "step": 5630 + }, + { + "epoch": 0.8616679418515685, + "grad_norm": 2.671644999870953, + "learning_rate": 1.2693557988618696e-05, + "loss": 0.6987, + "step": 5631 + }, + { + "epoch": 0.8618209640397858, + "grad_norm": 2.1200922727768194, + "learning_rate": 1.2691171314710078e-05, + "loss": 0.6588, + "step": 5632 + }, + { + "epoch": 0.861973986228003, + "grad_norm": 2.20322726646221, + "learning_rate": 1.2688784475526103e-05, + "loss": 0.5955, + "step": 5633 + }, + { + "epoch": 0.8621270084162204, + "grad_norm": 2.0969035364933095, + "learning_rate": 1.2686397471213352e-05, + "loss": 0.6224, + "step": 5634 + }, + { + "epoch": 0.8622800306044377, + "grad_norm": 2.128200156708807, + "learning_rate": 1.2684010301918424e-05, + "loss": 0.6266, + "step": 5635 + }, + { + "epoch": 0.8624330527926549, + "grad_norm": 2.1941487004202114, + "learning_rate": 1.2681622967787925e-05, + "loss": 0.6647, + "step": 5636 + }, + { + "epoch": 0.8625860749808723, + "grad_norm": 1.9520577900142773, + "learning_rate": 1.2679235468968468e-05, + "loss": 0.6282, + "step": 5637 + }, + { + "epoch": 0.8627390971690895, + "grad_norm": 2.36577545218808, + "learning_rate": 1.267684780560668e-05, + "loss": 0.6948, + "step": 5638 + }, + { + "epoch": 0.8628921193573068, + "grad_norm": 2.030714318416365, + "learning_rate": 1.2674459977849197e-05, + "loss": 0.6446, + "step": 5639 + }, + { + "epoch": 0.8630451415455241, + "grad_norm": 2.3029069648527463, + "learning_rate": 1.267207198584266e-05, + "loss": 0.7449, + "step": 5640 + }, + { + "epoch": 0.8631981637337414, + "grad_norm": 2.240610425205807, + "learning_rate": 1.2669683829733734e-05, + "loss": 0.6766, + "step": 5641 + }, + { + "epoch": 0.8633511859219587, + "grad_norm": 2.3727769689851743, + "learning_rate": 1.2667295509669077e-05, + "loss": 0.681, + "step": 5642 + }, + { + "epoch": 0.863504208110176, + "grad_norm": 2.2735964610799932, + "learning_rate": 1.2664907025795369e-05, + "loss": 0.689, + "step": 5643 + }, + { + "epoch": 0.8636572302983933, + "grad_norm": 
2.3451657930943632, + "learning_rate": 1.2662518378259297e-05, + "loss": 0.7561, + "step": 5644 + }, + { + "epoch": 0.8638102524866106, + "grad_norm": 2.3808378476510583, + "learning_rate": 1.2660129567207555e-05, + "loss": 0.7952, + "step": 5645 + }, + { + "epoch": 0.8639632746748278, + "grad_norm": 2.264345473222062, + "learning_rate": 1.2657740592786847e-05, + "loss": 0.7002, + "step": 5646 + }, + { + "epoch": 0.8641162968630451, + "grad_norm": 2.079793665356641, + "learning_rate": 1.2655351455143896e-05, + "loss": 0.6575, + "step": 5647 + }, + { + "epoch": 0.8642693190512625, + "grad_norm": 2.4275830013363175, + "learning_rate": 1.2652962154425423e-05, + "loss": 0.7579, + "step": 5648 + }, + { + "epoch": 0.8644223412394797, + "grad_norm": 2.2748583552247994, + "learning_rate": 1.2650572690778164e-05, + "loss": 0.6597, + "step": 5649 + }, + { + "epoch": 0.864575363427697, + "grad_norm": 2.1601232807271673, + "learning_rate": 1.2648183064348868e-05, + "loss": 0.5999, + "step": 5650 + }, + { + "epoch": 0.8647283856159143, + "grad_norm": 2.2760902133080467, + "learning_rate": 1.2645793275284293e-05, + "loss": 0.6744, + "step": 5651 + }, + { + "epoch": 0.8648814078041316, + "grad_norm": 2.5328811384211405, + "learning_rate": 1.2643403323731198e-05, + "loss": 0.7486, + "step": 5652 + }, + { + "epoch": 0.8650344299923489, + "grad_norm": 1.9191237464359976, + "learning_rate": 1.2641013209836369e-05, + "loss": 0.5882, + "step": 5653 + }, + { + "epoch": 0.8651874521805661, + "grad_norm": 2.1442818040053564, + "learning_rate": 1.2638622933746583e-05, + "loss": 0.6574, + "step": 5654 + }, + { + "epoch": 0.8653404743687835, + "grad_norm": 2.313223313915067, + "learning_rate": 1.263623249560864e-05, + "loss": 0.705, + "step": 5655 + }, + { + "epoch": 0.8654934965570008, + "grad_norm": 2.1582983437738363, + "learning_rate": 1.2633841895569346e-05, + "loss": 0.6208, + "step": 5656 + }, + { + "epoch": 0.865646518745218, + "grad_norm": 2.140699620669154, + "learning_rate": 1.263145113377552e-05, + "loss": 0.6216, + "step": 5657 + }, + { + "epoch": 0.8657995409334354, + "grad_norm": 2.51001916251359, + "learning_rate": 1.2629060210373986e-05, + "loss": 0.8365, + "step": 5658 + }, + { + "epoch": 0.8659525631216526, + "grad_norm": 2.1467206121687714, + "learning_rate": 1.2626669125511578e-05, + "loss": 0.6975, + "step": 5659 + }, + { + "epoch": 0.8661055853098699, + "grad_norm": 2.1179526840995955, + "learning_rate": 1.2624277879335142e-05, + "loss": 0.6771, + "step": 5660 + }, + { + "epoch": 0.8662586074980873, + "grad_norm": 2.242352909508502, + "learning_rate": 1.2621886471991538e-05, + "loss": 0.7168, + "step": 5661 + }, + { + "epoch": 0.8664116296863045, + "grad_norm": 2.4601217594406193, + "learning_rate": 1.261949490362763e-05, + "loss": 0.6644, + "step": 5662 + }, + { + "epoch": 0.8665646518745218, + "grad_norm": 2.1506121299772327, + "learning_rate": 1.2617103174390289e-05, + "loss": 0.6518, + "step": 5663 + }, + { + "epoch": 0.8667176740627391, + "grad_norm": 2.535854595188026, + "learning_rate": 1.2614711284426406e-05, + "loss": 0.7968, + "step": 5664 + }, + { + "epoch": 0.8668706962509564, + "grad_norm": 2.3806302754076936, + "learning_rate": 1.2612319233882873e-05, + "loss": 0.6961, + "step": 5665 + }, + { + "epoch": 0.8670237184391737, + "grad_norm": 2.0825689145311705, + "learning_rate": 1.2609927022906598e-05, + "loss": 0.4917, + "step": 5666 + }, + { + "epoch": 0.8671767406273909, + "grad_norm": 2.296561591485694, + "learning_rate": 1.2607534651644493e-05, + "loss": 0.6687, + "step": 5667 
+ }, + { + "epoch": 0.8673297628156083, + "grad_norm": 2.0016062923376703, + "learning_rate": 1.2605142120243485e-05, + "loss": 0.5966, + "step": 5668 + }, + { + "epoch": 0.8674827850038256, + "grad_norm": 2.2328943779206676, + "learning_rate": 1.2602749428850506e-05, + "loss": 0.6952, + "step": 5669 + }, + { + "epoch": 0.8676358071920428, + "grad_norm": 2.1730370472915164, + "learning_rate": 1.2600356577612507e-05, + "loss": 0.6091, + "step": 5670 + }, + { + "epoch": 0.8677888293802601, + "grad_norm": 2.30791377621154, + "learning_rate": 1.2597963566676435e-05, + "loss": 0.7486, + "step": 5671 + }, + { + "epoch": 0.8679418515684775, + "grad_norm": 2.276237653997365, + "learning_rate": 1.2595570396189254e-05, + "loss": 0.7926, + "step": 5672 + }, + { + "epoch": 0.8680948737566947, + "grad_norm": 2.3754804501301514, + "learning_rate": 1.2593177066297945e-05, + "loss": 0.6826, + "step": 5673 + }, + { + "epoch": 0.868247895944912, + "grad_norm": 2.2435259533338314, + "learning_rate": 1.2590783577149488e-05, + "loss": 0.6384, + "step": 5674 + }, + { + "epoch": 0.8684009181331293, + "grad_norm": 2.087817049910248, + "learning_rate": 1.2588389928890873e-05, + "loss": 0.6228, + "step": 5675 + }, + { + "epoch": 0.8685539403213466, + "grad_norm": 2.3075610370217516, + "learning_rate": 1.258599612166911e-05, + "loss": 0.6741, + "step": 5676 + }, + { + "epoch": 0.8687069625095639, + "grad_norm": 2.105883253531245, + "learning_rate": 1.258360215563121e-05, + "loss": 0.6242, + "step": 5677 + }, + { + "epoch": 0.8688599846977811, + "grad_norm": 2.2765438530308564, + "learning_rate": 1.258120803092419e-05, + "loss": 0.7679, + "step": 5678 + }, + { + "epoch": 0.8690130068859985, + "grad_norm": 2.4505312365492844, + "learning_rate": 1.2578813747695091e-05, + "loss": 0.6735, + "step": 5679 + }, + { + "epoch": 0.8691660290742158, + "grad_norm": 2.3676073507608604, + "learning_rate": 1.2576419306090948e-05, + "loss": 0.6729, + "step": 5680 + }, + { + "epoch": 0.869319051262433, + "grad_norm": 2.44965356160525, + "learning_rate": 1.2574024706258817e-05, + "loss": 0.7476, + "step": 5681 + }, + { + "epoch": 0.8694720734506504, + "grad_norm": 2.1538693697394486, + "learning_rate": 1.2571629948345763e-05, + "loss": 0.6866, + "step": 5682 + }, + { + "epoch": 0.8696250956388676, + "grad_norm": 2.0656645571127887, + "learning_rate": 1.256923503249885e-05, + "loss": 0.7174, + "step": 5683 + }, + { + "epoch": 0.8697781178270849, + "grad_norm": 2.21545987856496, + "learning_rate": 1.2566839958865162e-05, + "loss": 0.6976, + "step": 5684 + }, + { + "epoch": 0.8699311400153023, + "grad_norm": 2.2435429627080268, + "learning_rate": 1.2564444727591791e-05, + "loss": 0.7416, + "step": 5685 + }, + { + "epoch": 0.8700841622035195, + "grad_norm": 2.288122679843375, + "learning_rate": 1.2562049338825842e-05, + "loss": 0.6695, + "step": 5686 + }, + { + "epoch": 0.8702371843917368, + "grad_norm": 2.4078595060538452, + "learning_rate": 1.2559653792714414e-05, + "loss": 0.7819, + "step": 5687 + }, + { + "epoch": 0.8703902065799541, + "grad_norm": 2.4138389650474097, + "learning_rate": 1.2557258089404635e-05, + "loss": 0.6232, + "step": 5688 + }, + { + "epoch": 0.8705432287681714, + "grad_norm": 2.1081792762846945, + "learning_rate": 1.2554862229043633e-05, + "loss": 0.6831, + "step": 5689 + }, + { + "epoch": 0.8706962509563887, + "grad_norm": 2.0366851001301596, + "learning_rate": 1.255246621177855e-05, + "loss": 0.7232, + "step": 5690 + }, + { + "epoch": 0.8708492731446059, + "grad_norm": 2.0327500614427536, + "learning_rate": 
1.2550070037756527e-05, + "loss": 0.6628, + "step": 5691 + }, + { + "epoch": 0.8710022953328233, + "grad_norm": 2.361742724788852, + "learning_rate": 1.254767370712473e-05, + "loss": 0.8272, + "step": 5692 + }, + { + "epoch": 0.8711553175210406, + "grad_norm": 2.18870554680478, + "learning_rate": 1.2545277220030324e-05, + "loss": 0.6787, + "step": 5693 + }, + { + "epoch": 0.8713083397092578, + "grad_norm": 2.3066595929772613, + "learning_rate": 1.2542880576620484e-05, + "loss": 0.7596, + "step": 5694 + }, + { + "epoch": 0.8714613618974751, + "grad_norm": 2.3582637357450023, + "learning_rate": 1.2540483777042403e-05, + "loss": 0.6326, + "step": 5695 + }, + { + "epoch": 0.8716143840856925, + "grad_norm": 2.1035213548968312, + "learning_rate": 1.2538086821443273e-05, + "loss": 0.6634, + "step": 5696 + }, + { + "epoch": 0.8717674062739097, + "grad_norm": 2.2947630934368837, + "learning_rate": 1.25356897099703e-05, + "loss": 0.7888, + "step": 5697 + }, + { + "epoch": 0.871920428462127, + "grad_norm": 2.3707709917608204, + "learning_rate": 1.2533292442770705e-05, + "loss": 0.78, + "step": 5698 + }, + { + "epoch": 0.8720734506503443, + "grad_norm": 2.1437011373550003, + "learning_rate": 1.2530895019991714e-05, + "loss": 0.7513, + "step": 5699 + }, + { + "epoch": 0.8722264728385616, + "grad_norm": 2.132187427093857, + "learning_rate": 1.2528497441780554e-05, + "loss": 0.6085, + "step": 5700 + }, + { + "epoch": 0.8723794950267789, + "grad_norm": 2.25455470161129, + "learning_rate": 1.2526099708284476e-05, + "loss": 0.7394, + "step": 5701 + }, + { + "epoch": 0.8725325172149961, + "grad_norm": 2.5745994986985106, + "learning_rate": 1.2523701819650733e-05, + "loss": 0.7067, + "step": 5702 + }, + { + "epoch": 0.8726855394032135, + "grad_norm": 2.139572994127622, + "learning_rate": 1.252130377602659e-05, + "loss": 0.6948, + "step": 5703 + }, + { + "epoch": 0.8728385615914308, + "grad_norm": 2.336370835834088, + "learning_rate": 1.2518905577559317e-05, + "loss": 0.6016, + "step": 5704 + }, + { + "epoch": 0.872991583779648, + "grad_norm": 2.212567829784752, + "learning_rate": 1.2516507224396199e-05, + "loss": 0.7168, + "step": 5705 + }, + { + "epoch": 0.8731446059678654, + "grad_norm": 2.1173283124728144, + "learning_rate": 1.2514108716684527e-05, + "loss": 0.7108, + "step": 5706 + }, + { + "epoch": 0.8732976281560826, + "grad_norm": 2.286564364188799, + "learning_rate": 1.2511710054571603e-05, + "loss": 0.6567, + "step": 5707 + }, + { + "epoch": 0.8734506503442999, + "grad_norm": 2.4319669016285155, + "learning_rate": 1.2509311238204742e-05, + "loss": 0.7369, + "step": 5708 + }, + { + "epoch": 0.8736036725325173, + "grad_norm": 2.3105859541550804, + "learning_rate": 1.250691226773126e-05, + "loss": 0.6155, + "step": 5709 + }, + { + "epoch": 0.8737566947207345, + "grad_norm": 2.299770180808811, + "learning_rate": 1.2504513143298485e-05, + "loss": 0.7094, + "step": 5710 + }, + { + "epoch": 0.8739097169089518, + "grad_norm": 2.0337893937536595, + "learning_rate": 1.2502113865053764e-05, + "loss": 0.6748, + "step": 5711 + }, + { + "epoch": 0.8740627390971691, + "grad_norm": 2.385295588774363, + "learning_rate": 1.2499714433144441e-05, + "loss": 0.7138, + "step": 5712 + }, + { + "epoch": 0.8742157612853864, + "grad_norm": 2.284489191336518, + "learning_rate": 1.2497314847717875e-05, + "loss": 0.743, + "step": 5713 + }, + { + "epoch": 0.8743687834736037, + "grad_norm": 2.3990280322523367, + "learning_rate": 1.2494915108921434e-05, + "loss": 0.6998, + "step": 5714 + }, + { + "epoch": 0.8745218056618209, + 
"grad_norm": 2.2891843264651905, + "learning_rate": 1.2492515216902499e-05, + "loss": 0.6992, + "step": 5715 + }, + { + "epoch": 0.8746748278500383, + "grad_norm": 2.3026551519865683, + "learning_rate": 1.249011517180845e-05, + "loss": 0.7515, + "step": 5716 + }, + { + "epoch": 0.8748278500382556, + "grad_norm": 2.337520385134906, + "learning_rate": 1.248771497378669e-05, + "loss": 0.7088, + "step": 5717 + }, + { + "epoch": 0.8749808722264728, + "grad_norm": 2.1926541321569006, + "learning_rate": 1.248531462298462e-05, + "loss": 0.7873, + "step": 5718 + }, + { + "epoch": 0.8751338944146901, + "grad_norm": 2.4455148838618244, + "learning_rate": 1.2482914119549658e-05, + "loss": 0.6451, + "step": 5719 + }, + { + "epoch": 0.8752869166029075, + "grad_norm": 2.5339680205683743, + "learning_rate": 1.2480513463629224e-05, + "loss": 0.7793, + "step": 5720 + }, + { + "epoch": 0.8754399387911247, + "grad_norm": 2.3813626401774775, + "learning_rate": 1.2478112655370758e-05, + "loss": 0.5941, + "step": 5721 + }, + { + "epoch": 0.875592960979342, + "grad_norm": 2.4186647306912046, + "learning_rate": 1.2475711694921695e-05, + "loss": 0.723, + "step": 5722 + }, + { + "epoch": 0.8757459831675592, + "grad_norm": 2.033642018028739, + "learning_rate": 1.2473310582429496e-05, + "loss": 0.6569, + "step": 5723 + }, + { + "epoch": 0.8758990053557766, + "grad_norm": 2.050104551897003, + "learning_rate": 1.2470909318041618e-05, + "loss": 0.7118, + "step": 5724 + }, + { + "epoch": 0.8760520275439939, + "grad_norm": 2.098302900404374, + "learning_rate": 1.2468507901905527e-05, + "loss": 0.5665, + "step": 5725 + }, + { + "epoch": 0.8762050497322111, + "grad_norm": 2.238378059736463, + "learning_rate": 1.2466106334168713e-05, + "loss": 0.6099, + "step": 5726 + }, + { + "epoch": 0.8763580719204285, + "grad_norm": 2.264652562759667, + "learning_rate": 1.2463704614978664e-05, + "loss": 0.7528, + "step": 5727 + }, + { + "epoch": 0.8765110941086458, + "grad_norm": 2.315603460621223, + "learning_rate": 1.2461302744482873e-05, + "loss": 0.7416, + "step": 5728 + }, + { + "epoch": 0.876664116296863, + "grad_norm": 1.9928379922302875, + "learning_rate": 1.2458900722828852e-05, + "loss": 0.6208, + "step": 5729 + }, + { + "epoch": 0.8768171384850804, + "grad_norm": 2.196223804754041, + "learning_rate": 1.245649855016412e-05, + "loss": 0.789, + "step": 5730 + }, + { + "epoch": 0.8769701606732976, + "grad_norm": 1.9968093015394444, + "learning_rate": 1.24540962266362e-05, + "loss": 0.6108, + "step": 5731 + }, + { + "epoch": 0.8771231828615149, + "grad_norm": 2.1538731630302754, + "learning_rate": 1.2451693752392634e-05, + "loss": 0.6491, + "step": 5732 + }, + { + "epoch": 0.8772762050497322, + "grad_norm": 2.336782860051827, + "learning_rate": 1.244929112758096e-05, + "loss": 0.6887, + "step": 5733 + }, + { + "epoch": 0.8774292272379495, + "grad_norm": 2.0968379693636265, + "learning_rate": 1.2446888352348738e-05, + "loss": 0.6052, + "step": 5734 + }, + { + "epoch": 0.8775822494261668, + "grad_norm": 2.8413364803514463, + "learning_rate": 1.2444485426843527e-05, + "loss": 0.7297, + "step": 5735 + }, + { + "epoch": 0.8777352716143841, + "grad_norm": 2.414859416385035, + "learning_rate": 1.2442082351212908e-05, + "loss": 0.6594, + "step": 5736 + }, + { + "epoch": 0.8778882938026014, + "grad_norm": 2.1397332273037857, + "learning_rate": 1.2439679125604455e-05, + "loss": 0.6262, + "step": 5737 + }, + { + "epoch": 0.8780413159908187, + "grad_norm": 2.302960630598352, + "learning_rate": 1.2437275750165763e-05, + "loss": 0.6868, + "step": 
5738 + }, + { + "epoch": 0.8781943381790359, + "grad_norm": 2.159627088120593, + "learning_rate": 1.2434872225044432e-05, + "loss": 0.6593, + "step": 5739 + }, + { + "epoch": 0.8783473603672532, + "grad_norm": 2.303777890381595, + "learning_rate": 1.2432468550388078e-05, + "loss": 0.7509, + "step": 5740 + }, + { + "epoch": 0.8785003825554706, + "grad_norm": 2.082765533333303, + "learning_rate": 1.2430064726344306e-05, + "loss": 0.6463, + "step": 5741 + }, + { + "epoch": 0.8786534047436878, + "grad_norm": 2.300303736175458, + "learning_rate": 1.2427660753060758e-05, + "loss": 0.7133, + "step": 5742 + }, + { + "epoch": 0.8788064269319051, + "grad_norm": 2.2364378062050254, + "learning_rate": 1.2425256630685063e-05, + "loss": 0.6881, + "step": 5743 + }, + { + "epoch": 0.8789594491201225, + "grad_norm": 2.0318322741252146, + "learning_rate": 1.2422852359364873e-05, + "loss": 0.6728, + "step": 5744 + }, + { + "epoch": 0.8791124713083397, + "grad_norm": 2.2990306189375223, + "learning_rate": 1.2420447939247841e-05, + "loss": 0.623, + "step": 5745 + }, + { + "epoch": 0.879265493496557, + "grad_norm": 2.4502465737162504, + "learning_rate": 1.2418043370481631e-05, + "loss": 0.6824, + "step": 5746 + }, + { + "epoch": 0.8794185156847742, + "grad_norm": 2.2065393096891803, + "learning_rate": 1.2415638653213919e-05, + "loss": 0.631, + "step": 5747 + }, + { + "epoch": 0.8795715378729916, + "grad_norm": 2.3645950652061565, + "learning_rate": 1.2413233787592388e-05, + "loss": 0.7174, + "step": 5748 + }, + { + "epoch": 0.8797245600612089, + "grad_norm": 2.184415402062973, + "learning_rate": 1.241082877376473e-05, + "loss": 0.6933, + "step": 5749 + }, + { + "epoch": 0.8798775822494261, + "grad_norm": 2.2344101746435574, + "learning_rate": 1.2408423611878643e-05, + "loss": 0.7379, + "step": 5750 + }, + { + "epoch": 0.8800306044376435, + "grad_norm": 2.2362564806831107, + "learning_rate": 1.2406018302081841e-05, + "loss": 0.6887, + "step": 5751 + }, + { + "epoch": 0.8801836266258608, + "grad_norm": 2.1149926363107814, + "learning_rate": 1.2403612844522046e-05, + "loss": 0.6424, + "step": 5752 + }, + { + "epoch": 0.880336648814078, + "grad_norm": 2.310775641769557, + "learning_rate": 1.2401207239346982e-05, + "loss": 0.731, + "step": 5753 + }, + { + "epoch": 0.8804896710022954, + "grad_norm": 2.254344287576404, + "learning_rate": 1.2398801486704384e-05, + "loss": 0.7344, + "step": 5754 + }, + { + "epoch": 0.8806426931905126, + "grad_norm": 2.159264355383588, + "learning_rate": 1.2396395586742005e-05, + "loss": 0.7299, + "step": 5755 + }, + { + "epoch": 0.8807957153787299, + "grad_norm": 2.4997945440793528, + "learning_rate": 1.2393989539607601e-05, + "loss": 0.6929, + "step": 5756 + }, + { + "epoch": 0.8809487375669472, + "grad_norm": 2.019321161103983, + "learning_rate": 1.239158334544893e-05, + "loss": 0.5885, + "step": 5757 + }, + { + "epoch": 0.8811017597551645, + "grad_norm": 2.4218191192077176, + "learning_rate": 1.238917700441377e-05, + "loss": 0.7094, + "step": 5758 + }, + { + "epoch": 0.8812547819433818, + "grad_norm": 2.0924771769119808, + "learning_rate": 1.2386770516649904e-05, + "loss": 0.6709, + "step": 5759 + }, + { + "epoch": 0.881407804131599, + "grad_norm": 2.48339718936483, + "learning_rate": 1.2384363882305125e-05, + "loss": 0.7068, + "step": 5760 + }, + { + "epoch": 0.8815608263198164, + "grad_norm": 2.565722271968007, + "learning_rate": 1.2381957101527233e-05, + "loss": 0.6858, + "step": 5761 + }, + { + "epoch": 0.8817138485080337, + "grad_norm": 2.2301004887844513, + "learning_rate": 
1.2379550174464034e-05, + "loss": 0.5637, + "step": 5762 + }, + { + "epoch": 0.8818668706962509, + "grad_norm": 2.1331296366939356, + "learning_rate": 1.2377143101263351e-05, + "loss": 0.6533, + "step": 5763 + }, + { + "epoch": 0.8820198928844682, + "grad_norm": 2.0132594420000984, + "learning_rate": 1.2374735882073014e-05, + "loss": 0.6407, + "step": 5764 + }, + { + "epoch": 0.8821729150726856, + "grad_norm": 2.11184387213977, + "learning_rate": 1.2372328517040854e-05, + "loss": 0.7467, + "step": 5765 + }, + { + "epoch": 0.8823259372609028, + "grad_norm": 2.1063721445520156, + "learning_rate": 1.2369921006314716e-05, + "loss": 0.6223, + "step": 5766 + }, + { + "epoch": 0.8824789594491201, + "grad_norm": 2.135649260419935, + "learning_rate": 1.2367513350042461e-05, + "loss": 0.7001, + "step": 5767 + }, + { + "epoch": 0.8826319816373374, + "grad_norm": 2.4573013608444167, + "learning_rate": 1.2365105548371949e-05, + "loss": 0.7085, + "step": 5768 + }, + { + "epoch": 0.8827850038255547, + "grad_norm": 2.4521500514536734, + "learning_rate": 1.2362697601451055e-05, + "loss": 0.6949, + "step": 5769 + }, + { + "epoch": 0.882938026013772, + "grad_norm": 2.3315675840027112, + "learning_rate": 1.2360289509427657e-05, + "loss": 0.6517, + "step": 5770 + }, + { + "epoch": 0.8830910482019892, + "grad_norm": 2.4804572529648365, + "learning_rate": 1.2357881272449645e-05, + "loss": 0.7527, + "step": 5771 + }, + { + "epoch": 0.8832440703902066, + "grad_norm": 2.3055905996779162, + "learning_rate": 1.2355472890664921e-05, + "loss": 0.6958, + "step": 5772 + }, + { + "epoch": 0.8833970925784239, + "grad_norm": 2.2077589919529896, + "learning_rate": 1.2353064364221394e-05, + "loss": 0.7385, + "step": 5773 + }, + { + "epoch": 0.8835501147666411, + "grad_norm": 2.5673498996652393, + "learning_rate": 1.2350655693266977e-05, + "loss": 0.7746, + "step": 5774 + }, + { + "epoch": 0.8837031369548585, + "grad_norm": 2.327867947254067, + "learning_rate": 1.23482468779496e-05, + "loss": 0.7871, + "step": 5775 + }, + { + "epoch": 0.8838561591430757, + "grad_norm": 1.994862966333433, + "learning_rate": 1.2345837918417192e-05, + "loss": 0.5689, + "step": 5776 + }, + { + "epoch": 0.884009181331293, + "grad_norm": 2.1628077023188577, + "learning_rate": 1.2343428814817704e-05, + "loss": 0.6696, + "step": 5777 + }, + { + "epoch": 0.8841622035195104, + "grad_norm": 2.4281221978724177, + "learning_rate": 1.2341019567299084e-05, + "loss": 0.7309, + "step": 5778 + }, + { + "epoch": 0.8843152257077276, + "grad_norm": 2.415771519746245, + "learning_rate": 1.2338610176009294e-05, + "loss": 0.7888, + "step": 5779 + }, + { + "epoch": 0.8844682478959449, + "grad_norm": 2.0146648291440132, + "learning_rate": 1.2336200641096302e-05, + "loss": 0.7183, + "step": 5780 + }, + { + "epoch": 0.8846212700841622, + "grad_norm": 2.101105174830713, + "learning_rate": 1.2333790962708096e-05, + "loss": 0.6244, + "step": 5781 + }, + { + "epoch": 0.8847742922723795, + "grad_norm": 2.747095291584874, + "learning_rate": 1.233138114099265e-05, + "loss": 0.7718, + "step": 5782 + }, + { + "epoch": 0.8849273144605968, + "grad_norm": 2.1914791612843842, + "learning_rate": 1.2328971176097973e-05, + "loss": 0.7515, + "step": 5783 + }, + { + "epoch": 0.885080336648814, + "grad_norm": 2.2870028208167317, + "learning_rate": 1.2326561068172063e-05, + "loss": 0.6468, + "step": 5784 + }, + { + "epoch": 0.8852333588370314, + "grad_norm": 2.27622510477052, + "learning_rate": 1.2324150817362934e-05, + "loss": 0.6705, + "step": 5785 + }, + { + "epoch": 0.8853863810252487, 
+ "grad_norm": 2.3932459194155125, + "learning_rate": 1.2321740423818614e-05, + "loss": 0.72, + "step": 5786 + }, + { + "epoch": 0.8855394032134659, + "grad_norm": 2.0056690093491496, + "learning_rate": 1.2319329887687132e-05, + "loss": 0.6506, + "step": 5787 + }, + { + "epoch": 0.8856924254016832, + "grad_norm": 2.3515431421733064, + "learning_rate": 1.2316919209116527e-05, + "loss": 0.718, + "step": 5788 + }, + { + "epoch": 0.8858454475899006, + "grad_norm": 2.564976261897152, + "learning_rate": 1.2314508388254848e-05, + "loss": 0.7486, + "step": 5789 + }, + { + "epoch": 0.8859984697781178, + "grad_norm": 2.286392560336342, + "learning_rate": 1.2312097425250157e-05, + "loss": 0.5844, + "step": 5790 + }, + { + "epoch": 0.8861514919663351, + "grad_norm": 2.40196016256603, + "learning_rate": 1.230968632025052e-05, + "loss": 0.6324, + "step": 5791 + }, + { + "epoch": 0.8863045141545524, + "grad_norm": 2.1219767397301377, + "learning_rate": 1.2307275073404005e-05, + "loss": 0.6549, + "step": 5792 + }, + { + "epoch": 0.8864575363427697, + "grad_norm": 2.1545723623205353, + "learning_rate": 1.2304863684858708e-05, + "loss": 0.6666, + "step": 5793 + }, + { + "epoch": 0.886610558530987, + "grad_norm": 2.1982785238169624, + "learning_rate": 1.2302452154762711e-05, + "loss": 0.7322, + "step": 5794 + }, + { + "epoch": 0.8867635807192042, + "grad_norm": 1.9445798805354053, + "learning_rate": 1.230004048326412e-05, + "loss": 0.5734, + "step": 5795 + }, + { + "epoch": 0.8869166029074216, + "grad_norm": 2.097172733466339, + "learning_rate": 1.2297628670511046e-05, + "loss": 0.5911, + "step": 5796 + }, + { + "epoch": 0.8870696250956389, + "grad_norm": 2.2876608565036736, + "learning_rate": 1.229521671665161e-05, + "loss": 0.823, + "step": 5797 + }, + { + "epoch": 0.8872226472838561, + "grad_norm": 2.130982677926774, + "learning_rate": 1.2292804621833932e-05, + "loss": 0.6003, + "step": 5798 + }, + { + "epoch": 0.8873756694720735, + "grad_norm": 2.3899589697261012, + "learning_rate": 1.2290392386206153e-05, + "loss": 0.7014, + "step": 5799 + }, + { + "epoch": 0.8875286916602907, + "grad_norm": 2.463939146997431, + "learning_rate": 1.228798000991642e-05, + "loss": 0.6992, + "step": 5800 + }, + { + "epoch": 0.887681713848508, + "grad_norm": 2.405366156872562, + "learning_rate": 1.228556749311288e-05, + "loss": 0.7654, + "step": 5801 + }, + { + "epoch": 0.8878347360367254, + "grad_norm": 1.8746932186360155, + "learning_rate": 1.2283154835943704e-05, + "loss": 0.5216, + "step": 5802 + }, + { + "epoch": 0.8879877582249426, + "grad_norm": 2.1661546901574704, + "learning_rate": 1.2280742038557056e-05, + "loss": 0.7372, + "step": 5803 + }, + { + "epoch": 0.8881407804131599, + "grad_norm": 2.493655678620059, + "learning_rate": 1.2278329101101116e-05, + "loss": 0.7774, + "step": 5804 + }, + { + "epoch": 0.8882938026013772, + "grad_norm": 2.0972516435366106, + "learning_rate": 1.2275916023724072e-05, + "loss": 0.6212, + "step": 5805 + }, + { + "epoch": 0.8884468247895945, + "grad_norm": 2.050606086919499, + "learning_rate": 1.2273502806574126e-05, + "loss": 0.7213, + "step": 5806 + }, + { + "epoch": 0.8885998469778118, + "grad_norm": 2.2270183694432175, + "learning_rate": 1.2271089449799476e-05, + "loss": 0.7031, + "step": 5807 + }, + { + "epoch": 0.888752869166029, + "grad_norm": 2.2753592897527097, + "learning_rate": 1.2268675953548336e-05, + "loss": 0.6464, + "step": 5808 + }, + { + "epoch": 0.8889058913542464, + "grad_norm": 2.2711193654457777, + "learning_rate": 1.2266262317968934e-05, + "loss": 0.7809, + 
"step": 5809 + }, + { + "epoch": 0.8890589135424637, + "grad_norm": 2.1135815709154793, + "learning_rate": 1.2263848543209496e-05, + "loss": 0.6955, + "step": 5810 + }, + { + "epoch": 0.8892119357306809, + "grad_norm": 2.4245861015340573, + "learning_rate": 1.2261434629418263e-05, + "loss": 0.7699, + "step": 5811 + }, + { + "epoch": 0.8893649579188982, + "grad_norm": 2.102041027083013, + "learning_rate": 1.2259020576743484e-05, + "loss": 0.6489, + "step": 5812 + }, + { + "epoch": 0.8895179801071156, + "grad_norm": 2.0571284723371948, + "learning_rate": 1.2256606385333411e-05, + "loss": 0.7392, + "step": 5813 + }, + { + "epoch": 0.8896710022953328, + "grad_norm": 2.232074088650882, + "learning_rate": 1.2254192055336315e-05, + "loss": 0.7079, + "step": 5814 + }, + { + "epoch": 0.8898240244835501, + "grad_norm": 2.3912417483625164, + "learning_rate": 1.2251777586900466e-05, + "loss": 0.7092, + "step": 5815 + }, + { + "epoch": 0.8899770466717674, + "grad_norm": 2.055983923638346, + "learning_rate": 1.2249362980174144e-05, + "loss": 0.6946, + "step": 5816 + }, + { + "epoch": 0.8901300688599847, + "grad_norm": 2.4091786231449146, + "learning_rate": 1.2246948235305642e-05, + "loss": 0.7588, + "step": 5817 + }, + { + "epoch": 0.890283091048202, + "grad_norm": 2.4550247748254446, + "learning_rate": 1.2244533352443262e-05, + "loss": 0.8143, + "step": 5818 + }, + { + "epoch": 0.8904361132364192, + "grad_norm": 2.44563168398848, + "learning_rate": 1.2242118331735306e-05, + "loss": 0.7611, + "step": 5819 + }, + { + "epoch": 0.8905891354246366, + "grad_norm": 2.0083209726168167, + "learning_rate": 1.223970317333009e-05, + "loss": 0.6456, + "step": 5820 + }, + { + "epoch": 0.8907421576128539, + "grad_norm": 2.261824846327099, + "learning_rate": 1.2237287877375943e-05, + "loss": 0.659, + "step": 5821 + }, + { + "epoch": 0.8908951798010711, + "grad_norm": 2.415318778732395, + "learning_rate": 1.2234872444021197e-05, + "loss": 0.6324, + "step": 5822 + }, + { + "epoch": 0.8910482019892885, + "grad_norm": 2.491753442524876, + "learning_rate": 1.223245687341419e-05, + "loss": 0.8115, + "step": 5823 + }, + { + "epoch": 0.8912012241775057, + "grad_norm": 2.4285888845880415, + "learning_rate": 1.223004116570327e-05, + "loss": 0.7954, + "step": 5824 + }, + { + "epoch": 0.891354246365723, + "grad_norm": 2.238293963593015, + "learning_rate": 1.22276253210368e-05, + "loss": 0.7294, + "step": 5825 + }, + { + "epoch": 0.8915072685539404, + "grad_norm": 2.326987572185359, + "learning_rate": 1.2225209339563144e-05, + "loss": 0.5903, + "step": 5826 + }, + { + "epoch": 0.8916602907421576, + "grad_norm": 2.1780223917114836, + "learning_rate": 1.222279322143068e-05, + "loss": 0.6964, + "step": 5827 + }, + { + "epoch": 0.8918133129303749, + "grad_norm": 2.048347926322804, + "learning_rate": 1.2220376966787785e-05, + "loss": 0.6111, + "step": 5828 + }, + { + "epoch": 0.8919663351185922, + "grad_norm": 2.003970443987533, + "learning_rate": 1.2217960575782856e-05, + "loss": 0.6416, + "step": 5829 + }, + { + "epoch": 0.8921193573068095, + "grad_norm": 2.55460997045148, + "learning_rate": 1.2215544048564294e-05, + "loss": 0.7348, + "step": 5830 + }, + { + "epoch": 0.8922723794950268, + "grad_norm": 2.3815038136371496, + "learning_rate": 1.22131273852805e-05, + "loss": 0.7828, + "step": 5831 + }, + { + "epoch": 0.892425401683244, + "grad_norm": 2.185261352231627, + "learning_rate": 1.22107105860799e-05, + "loss": 0.6699, + "step": 5832 + }, + { + "epoch": 0.8925784238714614, + "grad_norm": 1.9530762615679256, + "learning_rate": 
1.220829365111091e-05, + "loss": 0.5621, + "step": 5833 + }, + { + "epoch": 0.8927314460596787, + "grad_norm": 2.5679065405768693, + "learning_rate": 1.2205876580521971e-05, + "loss": 0.7618, + "step": 5834 + }, + { + "epoch": 0.8928844682478959, + "grad_norm": 1.9323644230313388, + "learning_rate": 1.2203459374461522e-05, + "loss": 0.7202, + "step": 5835 + }, + { + "epoch": 0.8930374904361132, + "grad_norm": 2.1426859186740352, + "learning_rate": 1.220104203307801e-05, + "loss": 0.672, + "step": 5836 + }, + { + "epoch": 0.8931905126243306, + "grad_norm": 2.4556090557348873, + "learning_rate": 1.2198624556519899e-05, + "loss": 0.6719, + "step": 5837 + }, + { + "epoch": 0.8933435348125478, + "grad_norm": 2.411343782082261, + "learning_rate": 1.219620694493565e-05, + "loss": 0.8695, + "step": 5838 + }, + { + "epoch": 0.8934965570007651, + "grad_norm": 1.9229137910731127, + "learning_rate": 1.2193789198473743e-05, + "loss": 0.5691, + "step": 5839 + }, + { + "epoch": 0.8936495791889824, + "grad_norm": 2.7917967312533083, + "learning_rate": 1.2191371317282659e-05, + "loss": 0.9142, + "step": 5840 + }, + { + "epoch": 0.8938026013771997, + "grad_norm": 2.7345209669423927, + "learning_rate": 1.2188953301510891e-05, + "loss": 0.7448, + "step": 5841 + }, + { + "epoch": 0.893955623565417, + "grad_norm": 2.2727678554130857, + "learning_rate": 1.2186535151306934e-05, + "loss": 0.6421, + "step": 5842 + }, + { + "epoch": 0.8941086457536342, + "grad_norm": 2.258694735574518, + "learning_rate": 1.2184116866819303e-05, + "loss": 0.7776, + "step": 5843 + }, + { + "epoch": 0.8942616679418516, + "grad_norm": 2.098915575747473, + "learning_rate": 1.2181698448196508e-05, + "loss": 0.6931, + "step": 5844 + }, + { + "epoch": 0.8944146901300689, + "grad_norm": 2.3098715398217706, + "learning_rate": 1.2179279895587078e-05, + "loss": 0.7966, + "step": 5845 + }, + { + "epoch": 0.8945677123182861, + "grad_norm": 2.1839510482142255, + "learning_rate": 1.2176861209139543e-05, + "loss": 0.601, + "step": 5846 + }, + { + "epoch": 0.8947207345065035, + "grad_norm": 2.186745048637725, + "learning_rate": 1.217444238900245e-05, + "loss": 0.6878, + "step": 5847 + }, + { + "epoch": 0.8948737566947207, + "grad_norm": 2.3464306103822428, + "learning_rate": 1.2172023435324336e-05, + "loss": 0.6573, + "step": 5848 + }, + { + "epoch": 0.895026778882938, + "grad_norm": 2.1034600448438043, + "learning_rate": 1.2169604348253772e-05, + "loss": 0.7129, + "step": 5849 + }, + { + "epoch": 0.8951798010711554, + "grad_norm": 2.4272691681079164, + "learning_rate": 1.2167185127939314e-05, + "loss": 0.8234, + "step": 5850 + }, + { + "epoch": 0.8953328232593726, + "grad_norm": 2.3142269494809873, + "learning_rate": 1.2164765774529541e-05, + "loss": 0.8082, + "step": 5851 + }, + { + "epoch": 0.8954858454475899, + "grad_norm": 2.2118916809347557, + "learning_rate": 1.2162346288173033e-05, + "loss": 0.7194, + "step": 5852 + }, + { + "epoch": 0.8956388676358072, + "grad_norm": 2.654478274331671, + "learning_rate": 1.215992666901838e-05, + "loss": 0.7258, + "step": 5853 + }, + { + "epoch": 0.8957918898240245, + "grad_norm": 2.1917989705330174, + "learning_rate": 1.2157506917214179e-05, + "loss": 0.7053, + "step": 5854 + }, + { + "epoch": 0.8959449120122418, + "grad_norm": 2.295237595115881, + "learning_rate": 1.215508703290904e-05, + "loss": 0.8176, + "step": 5855 + }, + { + "epoch": 0.896097934200459, + "grad_norm": 2.0125203316125724, + "learning_rate": 1.2152667016251575e-05, + "loss": 0.6165, + "step": 5856 + }, + { + "epoch": 0.8962509563886764, + 
"grad_norm": 2.137048919743919, + "learning_rate": 1.2150246867390407e-05, + "loss": 0.6266, + "step": 5857 + }, + { + "epoch": 0.8964039785768937, + "grad_norm": 2.3382790666248825, + "learning_rate": 1.2147826586474168e-05, + "loss": 0.7635, + "step": 5858 + }, + { + "epoch": 0.8965570007651109, + "grad_norm": 2.1318790415160183, + "learning_rate": 1.2145406173651497e-05, + "loss": 0.6476, + "step": 5859 + }, + { + "epoch": 0.8967100229533282, + "grad_norm": 2.169522268623894, + "learning_rate": 1.2142985629071037e-05, + "loss": 0.6822, + "step": 5860 + }, + { + "epoch": 0.8968630451415456, + "grad_norm": 2.0656815045352417, + "learning_rate": 1.2140564952881446e-05, + "loss": 0.5719, + "step": 5861 + }, + { + "epoch": 0.8970160673297628, + "grad_norm": 2.0940298380060023, + "learning_rate": 1.2138144145231387e-05, + "loss": 0.6189, + "step": 5862 + }, + { + "epoch": 0.8971690895179801, + "grad_norm": 2.1912680322446243, + "learning_rate": 1.2135723206269535e-05, + "loss": 0.7313, + "step": 5863 + }, + { + "epoch": 0.8973221117061974, + "grad_norm": 2.042693256307619, + "learning_rate": 1.2133302136144564e-05, + "loss": 0.5883, + "step": 5864 + }, + { + "epoch": 0.8974751338944147, + "grad_norm": 2.162572034162192, + "learning_rate": 1.2130880935005165e-05, + "loss": 0.781, + "step": 5865 + }, + { + "epoch": 0.897628156082632, + "grad_norm": 2.0853415311565113, + "learning_rate": 1.212845960300003e-05, + "loss": 0.6414, + "step": 5866 + }, + { + "epoch": 0.8977811782708492, + "grad_norm": 2.306821724711259, + "learning_rate": 1.2126038140277863e-05, + "loss": 0.6421, + "step": 5867 + }, + { + "epoch": 0.8979342004590666, + "grad_norm": 2.3155477340665698, + "learning_rate": 1.212361654698738e-05, + "loss": 0.6216, + "step": 5868 + }, + { + "epoch": 0.8980872226472839, + "grad_norm": 2.2402002639166936, + "learning_rate": 1.2121194823277294e-05, + "loss": 0.6306, + "step": 5869 + }, + { + "epoch": 0.8982402448355011, + "grad_norm": 2.2497974374833793, + "learning_rate": 1.2118772969296337e-05, + "loss": 0.6934, + "step": 5870 + }, + { + "epoch": 0.8983932670237185, + "grad_norm": 2.2281448098479264, + "learning_rate": 1.2116350985193243e-05, + "loss": 0.6245, + "step": 5871 + }, + { + "epoch": 0.8985462892119357, + "grad_norm": 2.199538303015666, + "learning_rate": 1.2113928871116758e-05, + "loss": 0.7062, + "step": 5872 + }, + { + "epoch": 0.898699311400153, + "grad_norm": 2.0709420388921207, + "learning_rate": 1.211150662721563e-05, + "loss": 0.6909, + "step": 5873 + }, + { + "epoch": 0.8988523335883704, + "grad_norm": 2.5142622500922154, + "learning_rate": 1.2109084253638617e-05, + "loss": 0.8169, + "step": 5874 + }, + { + "epoch": 0.8990053557765876, + "grad_norm": 2.140104777112769, + "learning_rate": 1.210666175053449e-05, + "loss": 0.6359, + "step": 5875 + }, + { + "epoch": 0.8991583779648049, + "grad_norm": 2.164375390414777, + "learning_rate": 1.2104239118052027e-05, + "loss": 0.6977, + "step": 5876 + }, + { + "epoch": 0.8993114001530221, + "grad_norm": 2.1929526697694, + "learning_rate": 1.2101816356340004e-05, + "loss": 0.6797, + "step": 5877 + }, + { + "epoch": 0.8994644223412395, + "grad_norm": 2.3775107486140894, + "learning_rate": 1.2099393465547218e-05, + "loss": 0.8075, + "step": 5878 + }, + { + "epoch": 0.8996174445294568, + "grad_norm": 2.19374146725586, + "learning_rate": 1.2096970445822467e-05, + "loss": 0.6913, + "step": 5879 + }, + { + "epoch": 0.899770466717674, + "grad_norm": 2.356353536060419, + "learning_rate": 1.209454729731456e-05, + "loss": 0.6564, + "step": 
5880 + }, + { + "epoch": 0.8999234889058914, + "grad_norm": 2.2970947492536826, + "learning_rate": 1.2092124020172304e-05, + "loss": 0.7165, + "step": 5881 + }, + { + "epoch": 0.9000765110941087, + "grad_norm": 2.2264397828108335, + "learning_rate": 1.208970061454453e-05, + "loss": 0.7145, + "step": 5882 + }, + { + "epoch": 0.9002295332823259, + "grad_norm": 2.2895364047526754, + "learning_rate": 1.2087277080580064e-05, + "loss": 0.7901, + "step": 5883 + }, + { + "epoch": 0.9003825554705432, + "grad_norm": 2.6582101773237494, + "learning_rate": 1.2084853418427754e-05, + "loss": 0.729, + "step": 5884 + }, + { + "epoch": 0.9005355776587605, + "grad_norm": 2.4029001974784734, + "learning_rate": 1.2082429628236433e-05, + "loss": 0.663, + "step": 5885 + }, + { + "epoch": 0.9006885998469778, + "grad_norm": 2.2788746677911753, + "learning_rate": 1.2080005710154962e-05, + "loss": 0.6018, + "step": 5886 + }, + { + "epoch": 0.9008416220351951, + "grad_norm": 2.2037783198820864, + "learning_rate": 1.2077581664332205e-05, + "loss": 0.6541, + "step": 5887 + }, + { + "epoch": 0.9009946442234124, + "grad_norm": 2.5472288046633587, + "learning_rate": 1.2075157490917033e-05, + "loss": 0.7121, + "step": 5888 + }, + { + "epoch": 0.9011476664116297, + "grad_norm": 2.263169854265418, + "learning_rate": 1.2072733190058319e-05, + "loss": 0.723, + "step": 5889 + }, + { + "epoch": 0.901300688599847, + "grad_norm": 2.565421276878311, + "learning_rate": 1.2070308761904949e-05, + "loss": 0.7627, + "step": 5890 + }, + { + "epoch": 0.9014537107880642, + "grad_norm": 2.33516891773541, + "learning_rate": 1.2067884206605821e-05, + "loss": 0.6602, + "step": 5891 + }, + { + "epoch": 0.9016067329762816, + "grad_norm": 2.369632550813917, + "learning_rate": 1.2065459524309838e-05, + "loss": 0.7074, + "step": 5892 + }, + { + "epoch": 0.9017597551644988, + "grad_norm": 2.245894801979679, + "learning_rate": 1.20630347151659e-05, + "loss": 0.7945, + "step": 5893 + }, + { + "epoch": 0.9019127773527161, + "grad_norm": 2.133776733386824, + "learning_rate": 1.2060609779322932e-05, + "loss": 0.6325, + "step": 5894 + }, + { + "epoch": 0.9020657995409335, + "grad_norm": 2.42818076004384, + "learning_rate": 1.2058184716929858e-05, + "loss": 0.8039, + "step": 5895 + }, + { + "epoch": 0.9022188217291507, + "grad_norm": 2.1864609949528937, + "learning_rate": 1.2055759528135604e-05, + "loss": 0.6626, + "step": 5896 + }, + { + "epoch": 0.902371843917368, + "grad_norm": 2.3056942365670166, + "learning_rate": 1.2053334213089119e-05, + "loss": 0.5935, + "step": 5897 + }, + { + "epoch": 0.9025248661055854, + "grad_norm": 2.4474426076241613, + "learning_rate": 1.2050908771939347e-05, + "loss": 0.6779, + "step": 5898 + }, + { + "epoch": 0.9026778882938026, + "grad_norm": 2.245242913146272, + "learning_rate": 1.204848320483524e-05, + "loss": 0.672, + "step": 5899 + }, + { + "epoch": 0.9028309104820199, + "grad_norm": 2.4002007892851376, + "learning_rate": 1.2046057511925773e-05, + "loss": 0.6885, + "step": 5900 + }, + { + "epoch": 0.9029839326702371, + "grad_norm": 2.0513556880468804, + "learning_rate": 1.2043631693359906e-05, + "loss": 0.6375, + "step": 5901 + }, + { + "epoch": 0.9031369548584545, + "grad_norm": 2.185620019488159, + "learning_rate": 1.2041205749286617e-05, + "loss": 0.6056, + "step": 5902 + }, + { + "epoch": 0.9032899770466718, + "grad_norm": 2.5696711510264727, + "learning_rate": 1.2038779679854905e-05, + "loss": 0.8615, + "step": 5903 + }, + { + "epoch": 0.903442999234889, + "grad_norm": 2.427270547334622, + "learning_rate": 
1.2036353485213756e-05, + "loss": 0.7441, + "step": 5904 + }, + { + "epoch": 0.9035960214231064, + "grad_norm": 2.6216911775274254, + "learning_rate": 1.2033927165512171e-05, + "loss": 0.6633, + "step": 5905 + }, + { + "epoch": 0.9037490436113237, + "grad_norm": 2.3969872880103655, + "learning_rate": 1.2031500720899164e-05, + "loss": 0.7292, + "step": 5906 + }, + { + "epoch": 0.9039020657995409, + "grad_norm": 2.3206810022930435, + "learning_rate": 1.2029074151523747e-05, + "loss": 0.6184, + "step": 5907 + }, + { + "epoch": 0.9040550879877582, + "grad_norm": 2.1248876858842998, + "learning_rate": 1.2026647457534953e-05, + "loss": 0.6063, + "step": 5908 + }, + { + "epoch": 0.9042081101759755, + "grad_norm": 2.105010540329758, + "learning_rate": 1.2024220639081807e-05, + "loss": 0.6701, + "step": 5909 + }, + { + "epoch": 0.9043611323641928, + "grad_norm": 2.390961362332351, + "learning_rate": 1.2021793696313355e-05, + "loss": 0.8372, + "step": 5910 + }, + { + "epoch": 0.9045141545524101, + "grad_norm": 2.3906835802026634, + "learning_rate": 1.2019366629378642e-05, + "loss": 0.6914, + "step": 5911 + }, + { + "epoch": 0.9046671767406274, + "grad_norm": 2.331666879806483, + "learning_rate": 1.2016939438426722e-05, + "loss": 0.7861, + "step": 5912 + }, + { + "epoch": 0.9048201989288447, + "grad_norm": 2.2358574061590173, + "learning_rate": 1.2014512123606665e-05, + "loss": 0.6219, + "step": 5913 + }, + { + "epoch": 0.904973221117062, + "grad_norm": 2.106600490640814, + "learning_rate": 1.2012084685067535e-05, + "loss": 0.7043, + "step": 5914 + }, + { + "epoch": 0.9051262433052792, + "grad_norm": 2.2539604314687325, + "learning_rate": 1.2009657122958413e-05, + "loss": 0.6648, + "step": 5915 + }, + { + "epoch": 0.9052792654934966, + "grad_norm": 2.0671088263249464, + "learning_rate": 1.2007229437428387e-05, + "loss": 0.5475, + "step": 5916 + }, + { + "epoch": 0.9054322876817138, + "grad_norm": 2.1790247947962507, + "learning_rate": 1.200480162862655e-05, + "loss": 0.735, + "step": 5917 + }, + { + "epoch": 0.9055853098699311, + "grad_norm": 2.160285319705886, + "learning_rate": 1.2002373696701999e-05, + "loss": 0.6605, + "step": 5918 + }, + { + "epoch": 0.9057383320581485, + "grad_norm": 2.669653172417057, + "learning_rate": 1.1999945641803845e-05, + "loss": 0.7651, + "step": 5919 + }, + { + "epoch": 0.9058913542463657, + "grad_norm": 2.332008350846348, + "learning_rate": 1.199751746408121e-05, + "loss": 0.7304, + "step": 5920 + }, + { + "epoch": 0.906044376434583, + "grad_norm": 2.3630854316639627, + "learning_rate": 1.199508916368321e-05, + "loss": 0.7975, + "step": 5921 + }, + { + "epoch": 0.9061973986228004, + "grad_norm": 2.1925117093837962, + "learning_rate": 1.1992660740758981e-05, + "loss": 0.6704, + "step": 5922 + }, + { + "epoch": 0.9063504208110176, + "grad_norm": 2.2723588771999528, + "learning_rate": 1.1990232195457659e-05, + "loss": 0.7037, + "step": 5923 + }, + { + "epoch": 0.9065034429992349, + "grad_norm": 2.3270779882957937, + "learning_rate": 1.1987803527928393e-05, + "loss": 0.6958, + "step": 5924 + }, + { + "epoch": 0.9066564651874521, + "grad_norm": 2.2060500688080227, + "learning_rate": 1.1985374738320337e-05, + "loss": 0.5766, + "step": 5925 + }, + { + "epoch": 0.9068094873756695, + "grad_norm": 2.2317586164719, + "learning_rate": 1.1982945826782651e-05, + "loss": 0.6561, + "step": 5926 + }, + { + "epoch": 0.9069625095638868, + "grad_norm": 2.322289561129461, + "learning_rate": 1.19805167934645e-05, + "loss": 0.6501, + "step": 5927 + }, + { + "epoch": 0.907115531752104, + 
"grad_norm": 2.515443301034443, + "learning_rate": 1.197808763851507e-05, + "loss": 0.7345, + "step": 5928 + }, + { + "epoch": 0.9072685539403214, + "grad_norm": 2.2527717426764338, + "learning_rate": 1.1975658362083542e-05, + "loss": 0.7293, + "step": 5929 + }, + { + "epoch": 0.9074215761285387, + "grad_norm": 2.444160848812312, + "learning_rate": 1.19732289643191e-05, + "loss": 0.7744, + "step": 5930 + }, + { + "epoch": 0.9075745983167559, + "grad_norm": 2.0008299026167546, + "learning_rate": 1.1970799445370948e-05, + "loss": 0.5623, + "step": 5931 + }, + { + "epoch": 0.9077276205049732, + "grad_norm": 2.2752478064076023, + "learning_rate": 1.1968369805388295e-05, + "loss": 0.6978, + "step": 5932 + }, + { + "epoch": 0.9078806426931905, + "grad_norm": 2.583491640352064, + "learning_rate": 1.196594004452035e-05, + "loss": 0.7084, + "step": 5933 + }, + { + "epoch": 0.9080336648814078, + "grad_norm": 2.6861883035754706, + "learning_rate": 1.1963510162916339e-05, + "loss": 0.754, + "step": 5934 + }, + { + "epoch": 0.9081866870696251, + "grad_norm": 2.094923336464147, + "learning_rate": 1.1961080160725484e-05, + "loss": 0.6658, + "step": 5935 + }, + { + "epoch": 0.9083397092578424, + "grad_norm": 2.3754655574983934, + "learning_rate": 1.1958650038097029e-05, + "loss": 0.7497, + "step": 5936 + }, + { + "epoch": 0.9084927314460597, + "grad_norm": 2.247236567509479, + "learning_rate": 1.1956219795180209e-05, + "loss": 0.6799, + "step": 5937 + }, + { + "epoch": 0.908645753634277, + "grad_norm": 2.176455805806765, + "learning_rate": 1.1953789432124279e-05, + "loss": 0.7143, + "step": 5938 + }, + { + "epoch": 0.9087987758224942, + "grad_norm": 2.1382985224906443, + "learning_rate": 1.1951358949078497e-05, + "loss": 0.6692, + "step": 5939 + }, + { + "epoch": 0.9089517980107116, + "grad_norm": 2.2784821848914487, + "learning_rate": 1.1948928346192128e-05, + "loss": 0.7507, + "step": 5940 + }, + { + "epoch": 0.9091048201989288, + "grad_norm": 2.094814546778022, + "learning_rate": 1.1946497623614448e-05, + "loss": 0.5985, + "step": 5941 + }, + { + "epoch": 0.9092578423871461, + "grad_norm": 2.0659105778735283, + "learning_rate": 1.194406678149473e-05, + "loss": 0.5888, + "step": 5942 + }, + { + "epoch": 0.9094108645753635, + "grad_norm": 2.529484983416761, + "learning_rate": 1.1941635819982267e-05, + "loss": 0.7306, + "step": 5943 + }, + { + "epoch": 0.9095638867635807, + "grad_norm": 2.316500778173214, + "learning_rate": 1.1939204739226353e-05, + "loss": 0.6232, + "step": 5944 + }, + { + "epoch": 0.909716908951798, + "grad_norm": 2.2416933756176545, + "learning_rate": 1.193677353937629e-05, + "loss": 0.6686, + "step": 5945 + }, + { + "epoch": 0.9098699311400154, + "grad_norm": 2.296889261532724, + "learning_rate": 1.1934342220581384e-05, + "loss": 0.5793, + "step": 5946 + }, + { + "epoch": 0.9100229533282326, + "grad_norm": 2.4069543424400677, + "learning_rate": 1.193191078299096e-05, + "loss": 0.8436, + "step": 5947 + }, + { + "epoch": 0.9101759755164499, + "grad_norm": 2.162792615318407, + "learning_rate": 1.1929479226754338e-05, + "loss": 0.7032, + "step": 5948 + }, + { + "epoch": 0.9103289977046671, + "grad_norm": 2.2406697992043254, + "learning_rate": 1.1927047552020845e-05, + "loss": 0.6408, + "step": 5949 + }, + { + "epoch": 0.9104820198928845, + "grad_norm": 2.4229962303297103, + "learning_rate": 1.1924615758939824e-05, + "loss": 0.6942, + "step": 5950 + }, + { + "epoch": 0.9106350420811018, + "grad_norm": 2.467996134227574, + "learning_rate": 1.192218384766062e-05, + "loss": 0.782, + "step": 
5951 + }, + { + "epoch": 0.910788064269319, + "grad_norm": 2.4342339180377603, + "learning_rate": 1.1919751818332586e-05, + "loss": 0.688, + "step": 5952 + }, + { + "epoch": 0.9109410864575364, + "grad_norm": 2.1932437749285167, + "learning_rate": 1.191731967110508e-05, + "loss": 0.6378, + "step": 5953 + }, + { + "epoch": 0.9110941086457537, + "grad_norm": 2.055801571139704, + "learning_rate": 1.1914887406127478e-05, + "loss": 0.6698, + "step": 5954 + }, + { + "epoch": 0.9112471308339709, + "grad_norm": 2.2684464246899, + "learning_rate": 1.1912455023549147e-05, + "loss": 0.6851, + "step": 5955 + }, + { + "epoch": 0.9114001530221882, + "grad_norm": 2.5037197092461922, + "learning_rate": 1.1910022523519468e-05, + "loss": 0.7673, + "step": 5956 + }, + { + "epoch": 0.9115531752104055, + "grad_norm": 1.9131788391694138, + "learning_rate": 1.1907589906187837e-05, + "loss": 0.5243, + "step": 5957 + }, + { + "epoch": 0.9117061973986228, + "grad_norm": 2.234134469025724, + "learning_rate": 1.1905157171703651e-05, + "loss": 0.6361, + "step": 5958 + }, + { + "epoch": 0.9118592195868401, + "grad_norm": 2.210064411480152, + "learning_rate": 1.1902724320216304e-05, + "loss": 0.6753, + "step": 5959 + }, + { + "epoch": 0.9120122417750574, + "grad_norm": 2.3378206504402086, + "learning_rate": 1.1900291351875215e-05, + "loss": 0.6271, + "step": 5960 + }, + { + "epoch": 0.9121652639632747, + "grad_norm": 2.2103620527103556, + "learning_rate": 1.18978582668298e-05, + "loss": 0.6063, + "step": 5961 + }, + { + "epoch": 0.912318286151492, + "grad_norm": 2.265094089348472, + "learning_rate": 1.1895425065229487e-05, + "loss": 0.6786, + "step": 5962 + }, + { + "epoch": 0.9124713083397092, + "grad_norm": 2.1542940090981553, + "learning_rate": 1.1892991747223704e-05, + "loss": 0.6719, + "step": 5963 + }, + { + "epoch": 0.9126243305279266, + "grad_norm": 2.1440181946848904, + "learning_rate": 1.189055831296189e-05, + "loss": 0.5932, + "step": 5964 + }, + { + "epoch": 0.9127773527161438, + "grad_norm": 2.0963045032970906, + "learning_rate": 1.1888124762593496e-05, + "loss": 0.6104, + "step": 5965 + }, + { + "epoch": 0.9129303749043611, + "grad_norm": 2.5262671669907633, + "learning_rate": 1.1885691096267975e-05, + "loss": 0.717, + "step": 5966 + }, + { + "epoch": 0.9130833970925785, + "grad_norm": 2.2639544983140274, + "learning_rate": 1.1883257314134787e-05, + "loss": 0.6589, + "step": 5967 + }, + { + "epoch": 0.9132364192807957, + "grad_norm": 2.020210103595384, + "learning_rate": 1.1880823416343397e-05, + "loss": 0.6291, + "step": 5968 + }, + { + "epoch": 0.913389441469013, + "grad_norm": 2.1045346358957007, + "learning_rate": 1.1878389403043284e-05, + "loss": 0.6635, + "step": 5969 + }, + { + "epoch": 0.9135424636572304, + "grad_norm": 2.0845494979682653, + "learning_rate": 1.1875955274383934e-05, + "loss": 0.6641, + "step": 5970 + }, + { + "epoch": 0.9136954858454476, + "grad_norm": 2.449931136792063, + "learning_rate": 1.1873521030514826e-05, + "loss": 0.7608, + "step": 5971 + }, + { + "epoch": 0.9138485080336649, + "grad_norm": 2.4427533981065226, + "learning_rate": 1.1871086671585465e-05, + "loss": 0.804, + "step": 5972 + }, + { + "epoch": 0.9140015302218821, + "grad_norm": 2.196649531828342, + "learning_rate": 1.1868652197745351e-05, + "loss": 0.5977, + "step": 5973 + }, + { + "epoch": 0.9141545524100995, + "grad_norm": 2.0401281352602343, + "learning_rate": 1.1866217609143998e-05, + "loss": 0.7304, + "step": 5974 + }, + { + "epoch": 0.9143075745983168, + "grad_norm": 2.2225569238419274, + "learning_rate": 
1.1863782905930918e-05, + "loss": 0.6879, + "step": 5975 + }, + { + "epoch": 0.914460596786534, + "grad_norm": 2.415433043337009, + "learning_rate": 1.186134808825564e-05, + "loss": 0.7098, + "step": 5976 + }, + { + "epoch": 0.9146136189747514, + "grad_norm": 2.1208798515694767, + "learning_rate": 1.1858913156267694e-05, + "loss": 0.638, + "step": 5977 + }, + { + "epoch": 0.9147666411629687, + "grad_norm": 2.186872985982911, + "learning_rate": 1.1856478110116613e-05, + "loss": 0.6302, + "step": 5978 + }, + { + "epoch": 0.9149196633511859, + "grad_norm": 2.153919248477717, + "learning_rate": 1.1854042949951958e-05, + "loss": 0.7054, + "step": 5979 + }, + { + "epoch": 0.9150726855394032, + "grad_norm": 2.397388197874348, + "learning_rate": 1.1851607675923269e-05, + "loss": 0.7276, + "step": 5980 + }, + { + "epoch": 0.9152257077276205, + "grad_norm": 2.0648750992816165, + "learning_rate": 1.1849172288180106e-05, + "loss": 0.6745, + "step": 5981 + }, + { + "epoch": 0.9153787299158378, + "grad_norm": 2.067192519982237, + "learning_rate": 1.1846736786872042e-05, + "loss": 0.6398, + "step": 5982 + }, + { + "epoch": 0.9155317521040551, + "grad_norm": 2.045318240934085, + "learning_rate": 1.1844301172148649e-05, + "loss": 0.6044, + "step": 5983 + }, + { + "epoch": 0.9156847742922724, + "grad_norm": 2.1862827497217547, + "learning_rate": 1.1841865444159502e-05, + "loss": 0.69, + "step": 5984 + }, + { + "epoch": 0.9158377964804897, + "grad_norm": 2.411569778709733, + "learning_rate": 1.1839429603054195e-05, + "loss": 0.6993, + "step": 5985 + }, + { + "epoch": 0.9159908186687069, + "grad_norm": 2.163870964236157, + "learning_rate": 1.183699364898232e-05, + "loss": 0.7071, + "step": 5986 + }, + { + "epoch": 0.9161438408569242, + "grad_norm": 2.2216540837646543, + "learning_rate": 1.1834557582093478e-05, + "loss": 0.7181, + "step": 5987 + }, + { + "epoch": 0.9162968630451416, + "grad_norm": 2.4990352433843683, + "learning_rate": 1.183212140253728e-05, + "loss": 0.76, + "step": 5988 + }, + { + "epoch": 0.9164498852333588, + "grad_norm": 2.4982223206565495, + "learning_rate": 1.1829685110463339e-05, + "loss": 0.7631, + "step": 5989 + }, + { + "epoch": 0.9166029074215761, + "grad_norm": 2.5748308659023333, + "learning_rate": 1.1827248706021274e-05, + "loss": 0.6416, + "step": 5990 + }, + { + "epoch": 0.9167559296097935, + "grad_norm": 2.3410439692766816, + "learning_rate": 1.182481218936072e-05, + "loss": 0.6934, + "step": 5991 + }, + { + "epoch": 0.9169089517980107, + "grad_norm": 2.55217646983151, + "learning_rate": 1.1822375560631311e-05, + "loss": 0.6742, + "step": 5992 + }, + { + "epoch": 0.917061973986228, + "grad_norm": 2.460463784312502, + "learning_rate": 1.181993881998269e-05, + "loss": 0.7407, + "step": 5993 + }, + { + "epoch": 0.9172149961744452, + "grad_norm": 2.094408911904513, + "learning_rate": 1.1817501967564503e-05, + "loss": 0.5756, + "step": 5994 + }, + { + "epoch": 0.9173680183626626, + "grad_norm": 2.3667932565329397, + "learning_rate": 1.1815065003526417e-05, + "loss": 0.6282, + "step": 5995 + }, + { + "epoch": 0.9175210405508799, + "grad_norm": 2.259307626828805, + "learning_rate": 1.1812627928018086e-05, + "loss": 0.6107, + "step": 5996 + }, + { + "epoch": 0.9176740627390971, + "grad_norm": 2.6719058170793133, + "learning_rate": 1.1810190741189177e-05, + "loss": 0.7363, + "step": 5997 + }, + { + "epoch": 0.9178270849273145, + "grad_norm": 2.253953083863052, + "learning_rate": 1.180775344318938e-05, + "loss": 0.6705, + "step": 5998 + }, + { + "epoch": 0.9179801071155318, + 
"grad_norm": 2.2485300966189326, + "learning_rate": 1.1805316034168373e-05, + "loss": 0.6667, + "step": 5999 + }, + { + "epoch": 0.918133129303749, + "grad_norm": 2.1631800211031407, + "learning_rate": 1.1802878514275839e-05, + "loss": 0.6333, + "step": 6000 + }, + { + "epoch": 0.9182861514919664, + "grad_norm": 2.3348928082724445, + "learning_rate": 1.1800440883661485e-05, + "loss": 0.7369, + "step": 6001 + }, + { + "epoch": 0.9184391736801836, + "grad_norm": 2.592997532053126, + "learning_rate": 1.1798003142475017e-05, + "loss": 0.6496, + "step": 6002 + }, + { + "epoch": 0.9185921958684009, + "grad_norm": 2.2719886310328548, + "learning_rate": 1.1795565290866137e-05, + "loss": 0.659, + "step": 6003 + }, + { + "epoch": 0.9187452180566182, + "grad_norm": 2.1908026163259957, + "learning_rate": 1.1793127328984572e-05, + "loss": 0.6543, + "step": 6004 + }, + { + "epoch": 0.9188982402448355, + "grad_norm": 2.2403925541148357, + "learning_rate": 1.1790689256980042e-05, + "loss": 0.7169, + "step": 6005 + }, + { + "epoch": 0.9190512624330528, + "grad_norm": 2.1158419300219986, + "learning_rate": 1.1788251075002277e-05, + "loss": 0.6643, + "step": 6006 + }, + { + "epoch": 0.9192042846212701, + "grad_norm": 2.256627339715977, + "learning_rate": 1.178581278320102e-05, + "loss": 0.6332, + "step": 6007 + }, + { + "epoch": 0.9193573068094874, + "grad_norm": 2.2967103658681163, + "learning_rate": 1.1783374381726016e-05, + "loss": 0.6572, + "step": 6008 + }, + { + "epoch": 0.9195103289977047, + "grad_norm": 2.3176786154668316, + "learning_rate": 1.178093587072701e-05, + "loss": 0.6278, + "step": 6009 + }, + { + "epoch": 0.9196633511859219, + "grad_norm": 2.276335975164193, + "learning_rate": 1.1778497250353767e-05, + "loss": 0.7025, + "step": 6010 + }, + { + "epoch": 0.9198163733741392, + "grad_norm": 2.198538269825206, + "learning_rate": 1.1776058520756053e-05, + "loss": 0.708, + "step": 6011 + }, + { + "epoch": 0.9199693955623566, + "grad_norm": 2.0093992303204287, + "learning_rate": 1.1773619682083632e-05, + "loss": 0.5572, + "step": 6012 + }, + { + "epoch": 0.9201224177505738, + "grad_norm": 2.147072844426607, + "learning_rate": 1.1771180734486292e-05, + "loss": 0.6686, + "step": 6013 + }, + { + "epoch": 0.9202754399387911, + "grad_norm": 2.120096783175212, + "learning_rate": 1.1768741678113814e-05, + "loss": 0.692, + "step": 6014 + }, + { + "epoch": 0.9204284621270085, + "grad_norm": 2.18494526069704, + "learning_rate": 1.1766302513115988e-05, + "loss": 0.6957, + "step": 6015 + }, + { + "epoch": 0.9205814843152257, + "grad_norm": 2.183623529461029, + "learning_rate": 1.1763863239642617e-05, + "loss": 0.6199, + "step": 6016 + }, + { + "epoch": 0.920734506503443, + "grad_norm": 2.2086171453443346, + "learning_rate": 1.1761423857843504e-05, + "loss": 0.6994, + "step": 6017 + }, + { + "epoch": 0.9208875286916602, + "grad_norm": 2.175576432459934, + "learning_rate": 1.175898436786846e-05, + "loss": 0.6868, + "step": 6018 + }, + { + "epoch": 0.9210405508798776, + "grad_norm": 2.18905913642669, + "learning_rate": 1.1756544769867304e-05, + "loss": 0.6596, + "step": 6019 + }, + { + "epoch": 0.9211935730680949, + "grad_norm": 2.1552159728330467, + "learning_rate": 1.1754105063989865e-05, + "loss": 0.628, + "step": 6020 + }, + { + "epoch": 0.9213465952563121, + "grad_norm": 2.352853585156651, + "learning_rate": 1.175166525038597e-05, + "loss": 0.7225, + "step": 6021 + }, + { + "epoch": 0.9214996174445295, + "grad_norm": 2.2186163270481645, + "learning_rate": 1.1749225329205458e-05, + "loss": 0.5838, + "step": 
6022 + }, + { + "epoch": 0.9216526396327468, + "grad_norm": 2.316156419640685, + "learning_rate": 1.1746785300598178e-05, + "loss": 0.6525, + "step": 6023 + }, + { + "epoch": 0.921805661820964, + "grad_norm": 2.5308854754596917, + "learning_rate": 1.174434516471398e-05, + "loss": 0.6507, + "step": 6024 + }, + { + "epoch": 0.9219586840091814, + "grad_norm": 2.038668645225144, + "learning_rate": 1.1741904921702718e-05, + "loss": 0.7139, + "step": 6025 + }, + { + "epoch": 0.9221117061973986, + "grad_norm": 2.118866474655756, + "learning_rate": 1.1739464571714263e-05, + "loss": 0.6408, + "step": 6026 + }, + { + "epoch": 0.9222647283856159, + "grad_norm": 2.4219096622082366, + "learning_rate": 1.1737024114898483e-05, + "loss": 0.9817, + "step": 6027 + }, + { + "epoch": 0.9224177505738332, + "grad_norm": 2.3920546815691632, + "learning_rate": 1.1734583551405257e-05, + "loss": 0.731, + "step": 6028 + }, + { + "epoch": 0.9225707727620505, + "grad_norm": 2.2898608411847734, + "learning_rate": 1.1732142881384469e-05, + "loss": 0.7989, + "step": 6029 + }, + { + "epoch": 0.9227237949502678, + "grad_norm": 2.275071139556073, + "learning_rate": 1.1729702104986011e-05, + "loss": 0.6128, + "step": 6030 + }, + { + "epoch": 0.9228768171384851, + "grad_norm": 2.2660964406470048, + "learning_rate": 1.1727261222359781e-05, + "loss": 0.7475, + "step": 6031 + }, + { + "epoch": 0.9230298393267024, + "grad_norm": 1.9638194324248897, + "learning_rate": 1.172482023365568e-05, + "loss": 0.6469, + "step": 6032 + }, + { + "epoch": 0.9231828615149197, + "grad_norm": 2.061803171551883, + "learning_rate": 1.1722379139023623e-05, + "loss": 0.7059, + "step": 6033 + }, + { + "epoch": 0.9233358837031369, + "grad_norm": 2.2447062635254897, + "learning_rate": 1.1719937938613523e-05, + "loss": 0.6911, + "step": 6034 + }, + { + "epoch": 0.9234889058913542, + "grad_norm": 2.1651544199285295, + "learning_rate": 1.1717496632575304e-05, + "loss": 0.5678, + "step": 6035 + }, + { + "epoch": 0.9236419280795716, + "grad_norm": 2.4604106984706773, + "learning_rate": 1.1715055221058903e-05, + "loss": 0.7333, + "step": 6036 + }, + { + "epoch": 0.9237949502677888, + "grad_norm": 2.0659823412706104, + "learning_rate": 1.171261370421425e-05, + "loss": 0.5865, + "step": 6037 + }, + { + "epoch": 0.9239479724560061, + "grad_norm": 2.2467918440005628, + "learning_rate": 1.1710172082191288e-05, + "loss": 0.6591, + "step": 6038 + }, + { + "epoch": 0.9241009946442235, + "grad_norm": 2.1449493890857787, + "learning_rate": 1.170773035513997e-05, + "loss": 0.7061, + "step": 6039 + }, + { + "epoch": 0.9242540168324407, + "grad_norm": 2.2560181924361755, + "learning_rate": 1.1705288523210253e-05, + "loss": 0.6814, + "step": 6040 + }, + { + "epoch": 0.924407039020658, + "grad_norm": 2.141905549714062, + "learning_rate": 1.1702846586552088e-05, + "loss": 0.6385, + "step": 6041 + }, + { + "epoch": 0.9245600612088752, + "grad_norm": 2.2548252680890135, + "learning_rate": 1.1700404545315458e-05, + "loss": 0.6793, + "step": 6042 + }, + { + "epoch": 0.9247130833970926, + "grad_norm": 2.482889662318577, + "learning_rate": 1.1697962399650333e-05, + "loss": 0.7582, + "step": 6043 + }, + { + "epoch": 0.9248661055853099, + "grad_norm": 2.5506460166887264, + "learning_rate": 1.1695520149706693e-05, + "loss": 0.7141, + "step": 6044 + }, + { + "epoch": 0.9250191277735271, + "grad_norm": 2.145699244583287, + "learning_rate": 1.1693077795634531e-05, + "loss": 0.6328, + "step": 6045 + }, + { + "epoch": 0.9251721499617445, + "grad_norm": 2.4752175224389577, + 
"learning_rate": 1.1690635337583836e-05, + "loss": 0.6299, + "step": 6046 + }, + { + "epoch": 0.9253251721499618, + "grad_norm": 2.2683649200809675, + "learning_rate": 1.168819277570461e-05, + "loss": 0.6869, + "step": 6047 + }, + { + "epoch": 0.925478194338179, + "grad_norm": 2.2004469435637457, + "learning_rate": 1.168575011014686e-05, + "loss": 0.6129, + "step": 6048 + }, + { + "epoch": 0.9256312165263963, + "grad_norm": 2.4371132806076994, + "learning_rate": 1.1683307341060608e-05, + "loss": 0.7319, + "step": 6049 + }, + { + "epoch": 0.9257842387146136, + "grad_norm": 2.2224151080045402, + "learning_rate": 1.168086446859586e-05, + "loss": 0.6627, + "step": 6050 + }, + { + "epoch": 0.9259372609028309, + "grad_norm": 2.0430737085760287, + "learning_rate": 1.1678421492902652e-05, + "loss": 0.6371, + "step": 6051 + }, + { + "epoch": 0.9260902830910482, + "grad_norm": 2.253282435206617, + "learning_rate": 1.1675978414131013e-05, + "loss": 0.6545, + "step": 6052 + }, + { + "epoch": 0.9262433052792655, + "grad_norm": 2.2800025754564257, + "learning_rate": 1.1673535232430986e-05, + "loss": 0.6834, + "step": 6053 + }, + { + "epoch": 0.9263963274674828, + "grad_norm": 2.107954815478626, + "learning_rate": 1.1671091947952611e-05, + "loss": 0.5648, + "step": 6054 + }, + { + "epoch": 0.9265493496557001, + "grad_norm": 2.4385308339899248, + "learning_rate": 1.1668648560845944e-05, + "loss": 0.6888, + "step": 6055 + }, + { + "epoch": 0.9267023718439173, + "grad_norm": 2.2467214330301633, + "learning_rate": 1.1666205071261038e-05, + "loss": 0.7371, + "step": 6056 + }, + { + "epoch": 0.9268553940321347, + "grad_norm": 2.507942422969182, + "learning_rate": 1.1663761479347964e-05, + "loss": 0.7353, + "step": 6057 + }, + { + "epoch": 0.9270084162203519, + "grad_norm": 2.196286306638163, + "learning_rate": 1.1661317785256787e-05, + "loss": 0.717, + "step": 6058 + }, + { + "epoch": 0.9271614384085692, + "grad_norm": 2.2098159183285606, + "learning_rate": 1.1658873989137585e-05, + "loss": 0.7388, + "step": 6059 + }, + { + "epoch": 0.9273144605967866, + "grad_norm": 2.539805991550103, + "learning_rate": 1.165643009114044e-05, + "loss": 0.7721, + "step": 6060 + }, + { + "epoch": 0.9274674827850038, + "grad_norm": 2.6044090069624524, + "learning_rate": 1.165398609141545e-05, + "loss": 0.6935, + "step": 6061 + }, + { + "epoch": 0.9276205049732211, + "grad_norm": 2.384150928114248, + "learning_rate": 1.1651541990112698e-05, + "loss": 0.7076, + "step": 6062 + }, + { + "epoch": 0.9277735271614385, + "grad_norm": 2.4976738478057343, + "learning_rate": 1.164909778738229e-05, + "loss": 0.7126, + "step": 6063 + }, + { + "epoch": 0.9279265493496557, + "grad_norm": 2.2947249137280923, + "learning_rate": 1.164665348337434e-05, + "loss": 0.6781, + "step": 6064 + }, + { + "epoch": 0.928079571537873, + "grad_norm": 2.3117604725656293, + "learning_rate": 1.164420907823896e-05, + "loss": 0.6601, + "step": 6065 + }, + { + "epoch": 0.9282325937260902, + "grad_norm": 2.1388291258950605, + "learning_rate": 1.1641764572126262e-05, + "loss": 0.66, + "step": 6066 + }, + { + "epoch": 0.9283856159143076, + "grad_norm": 2.2545193918675506, + "learning_rate": 1.1639319965186382e-05, + "loss": 0.6792, + "step": 6067 + }, + { + "epoch": 0.9285386381025249, + "grad_norm": 2.2752674938211506, + "learning_rate": 1.1636875257569448e-05, + "loss": 0.6095, + "step": 6068 + }, + { + "epoch": 0.9286916602907421, + "grad_norm": 2.2575997598142723, + "learning_rate": 1.1634430449425604e-05, + "loss": 0.6722, + "step": 6069 + }, + { + "epoch": 
0.9288446824789595, + "grad_norm": 2.2098103933749904, + "learning_rate": 1.1631985540904991e-05, + "loss": 0.6803, + "step": 6070 + }, + { + "epoch": 0.9289977046671768, + "grad_norm": 2.294225796155611, + "learning_rate": 1.1629540532157758e-05, + "loss": 0.7256, + "step": 6071 + }, + { + "epoch": 0.929150726855394, + "grad_norm": 2.458554414220298, + "learning_rate": 1.1627095423334072e-05, + "loss": 0.802, + "step": 6072 + }, + { + "epoch": 0.9293037490436113, + "grad_norm": 2.3884126063112157, + "learning_rate": 1.1624650214584088e-05, + "loss": 0.7631, + "step": 6073 + }, + { + "epoch": 0.9294567712318286, + "grad_norm": 2.1963002613069618, + "learning_rate": 1.1622204906057979e-05, + "loss": 0.6559, + "step": 6074 + }, + { + "epoch": 0.9296097934200459, + "grad_norm": 2.3884546744501933, + "learning_rate": 1.161975949790592e-05, + "loss": 0.6752, + "step": 6075 + }, + { + "epoch": 0.9297628156082632, + "grad_norm": 2.2694496861180737, + "learning_rate": 1.1617313990278093e-05, + "loss": 0.6918, + "step": 6076 + }, + { + "epoch": 0.9299158377964805, + "grad_norm": 2.3172171693412937, + "learning_rate": 1.161486838332469e-05, + "loss": 0.6784, + "step": 6077 + }, + { + "epoch": 0.9300688599846978, + "grad_norm": 2.1329383279977665, + "learning_rate": 1.16124226771959e-05, + "loss": 0.5508, + "step": 6078 + }, + { + "epoch": 0.9302218821729151, + "grad_norm": 2.293796679557291, + "learning_rate": 1.1609976872041924e-05, + "loss": 0.658, + "step": 6079 + }, + { + "epoch": 0.9303749043611323, + "grad_norm": 2.210358770465247, + "learning_rate": 1.1607530968012971e-05, + "loss": 0.7028, + "step": 6080 + }, + { + "epoch": 0.9305279265493497, + "grad_norm": 2.213691145452231, + "learning_rate": 1.1605084965259256e-05, + "loss": 0.615, + "step": 6081 + }, + { + "epoch": 0.9306809487375669, + "grad_norm": 2.4649032540344344, + "learning_rate": 1.160263886393099e-05, + "loss": 0.7268, + "step": 6082 + }, + { + "epoch": 0.9308339709257842, + "grad_norm": 2.017207465174122, + "learning_rate": 1.1600192664178405e-05, + "loss": 0.6671, + "step": 6083 + }, + { + "epoch": 0.9309869931140016, + "grad_norm": 1.9259330149007499, + "learning_rate": 1.1597746366151725e-05, + "loss": 0.5948, + "step": 6084 + }, + { + "epoch": 0.9311400153022188, + "grad_norm": 2.167872374879758, + "learning_rate": 1.159529997000119e-05, + "loss": 0.7672, + "step": 6085 + }, + { + "epoch": 0.9312930374904361, + "grad_norm": 2.551354194085087, + "learning_rate": 1.1592853475877049e-05, + "loss": 0.7649, + "step": 6086 + }, + { + "epoch": 0.9314460596786535, + "grad_norm": 2.3450320571655348, + "learning_rate": 1.159040688392954e-05, + "loss": 0.6592, + "step": 6087 + }, + { + "epoch": 0.9315990818668707, + "grad_norm": 2.413844869725996, + "learning_rate": 1.158796019430892e-05, + "loss": 0.7055, + "step": 6088 + }, + { + "epoch": 0.931752104055088, + "grad_norm": 2.0778539002805014, + "learning_rate": 1.1585513407165456e-05, + "loss": 0.578, + "step": 6089 + }, + { + "epoch": 0.9319051262433052, + "grad_norm": 2.286133113069323, + "learning_rate": 1.1583066522649413e-05, + "loss": 0.7993, + "step": 6090 + }, + { + "epoch": 0.9320581484315226, + "grad_norm": 2.1857984709529985, + "learning_rate": 1.1580619540911058e-05, + "loss": 0.7388, + "step": 6091 + }, + { + "epoch": 0.9322111706197399, + "grad_norm": 2.4184829856903547, + "learning_rate": 1.1578172462100676e-05, + "loss": 0.6769, + "step": 6092 + }, + { + "epoch": 0.9323641928079571, + "grad_norm": 2.2057962215002744, + "learning_rate": 1.157572528636855e-05, + 
"loss": 0.6771, + "step": 6093 + }, + { + "epoch": 0.9325172149961745, + "grad_norm": 1.9624533908826989, + "learning_rate": 1.1573278013864968e-05, + "loss": 0.5544, + "step": 6094 + }, + { + "epoch": 0.9326702371843917, + "grad_norm": 2.12121986199544, + "learning_rate": 1.1570830644740227e-05, + "loss": 0.6088, + "step": 6095 + }, + { + "epoch": 0.932823259372609, + "grad_norm": 2.2944618881122474, + "learning_rate": 1.1568383179144634e-05, + "loss": 0.7788, + "step": 6096 + }, + { + "epoch": 0.9329762815608263, + "grad_norm": 2.2593200428155256, + "learning_rate": 1.1565935617228492e-05, + "loss": 0.5957, + "step": 6097 + }, + { + "epoch": 0.9331293037490436, + "grad_norm": 2.252447232896904, + "learning_rate": 1.1563487959142121e-05, + "loss": 0.6521, + "step": 6098 + }, + { + "epoch": 0.9332823259372609, + "grad_norm": 2.5056513509402656, + "learning_rate": 1.1561040205035835e-05, + "loss": 0.8213, + "step": 6099 + }, + { + "epoch": 0.9334353481254782, + "grad_norm": 2.247790538197072, + "learning_rate": 1.1558592355059965e-05, + "loss": 0.6064, + "step": 6100 + }, + { + "epoch": 0.9335883703136955, + "grad_norm": 2.1827896418062407, + "learning_rate": 1.1556144409364838e-05, + "loss": 0.6262, + "step": 6101 + }, + { + "epoch": 0.9337413925019128, + "grad_norm": 2.5213596631860202, + "learning_rate": 1.15536963681008e-05, + "loss": 0.714, + "step": 6102 + }, + { + "epoch": 0.93389441469013, + "grad_norm": 2.139941754037212, + "learning_rate": 1.1551248231418188e-05, + "loss": 0.6674, + "step": 6103 + }, + { + "epoch": 0.9340474368783473, + "grad_norm": 2.2803009133909513, + "learning_rate": 1.154879999946735e-05, + "loss": 0.6406, + "step": 6104 + }, + { + "epoch": 0.9342004590665647, + "grad_norm": 2.317103896037347, + "learning_rate": 1.154635167239865e-05, + "loss": 0.7685, + "step": 6105 + }, + { + "epoch": 0.9343534812547819, + "grad_norm": 2.1500102693630345, + "learning_rate": 1.1543903250362446e-05, + "loss": 0.6668, + "step": 6106 + }, + { + "epoch": 0.9345065034429992, + "grad_norm": 2.132728448753486, + "learning_rate": 1.1541454733509096e-05, + "loss": 0.6863, + "step": 6107 + }, + { + "epoch": 0.9346595256312166, + "grad_norm": 2.208420311449703, + "learning_rate": 1.1539006121988984e-05, + "loss": 0.751, + "step": 6108 + }, + { + "epoch": 0.9348125478194338, + "grad_norm": 2.24257468362804, + "learning_rate": 1.1536557415952488e-05, + "loss": 0.6207, + "step": 6109 + }, + { + "epoch": 0.9349655700076511, + "grad_norm": 2.504464488973117, + "learning_rate": 1.1534108615549988e-05, + "loss": 0.7274, + "step": 6110 + }, + { + "epoch": 0.9351185921958683, + "grad_norm": 2.004035147085841, + "learning_rate": 1.1531659720931877e-05, + "loss": 0.6282, + "step": 6111 + }, + { + "epoch": 0.9352716143840857, + "grad_norm": 2.095751079721034, + "learning_rate": 1.152921073224855e-05, + "loss": 0.587, + "step": 6112 + }, + { + "epoch": 0.935424636572303, + "grad_norm": 2.0794742962572843, + "learning_rate": 1.152676164965041e-05, + "loss": 0.6929, + "step": 6113 + }, + { + "epoch": 0.9355776587605202, + "grad_norm": 2.2462164854122606, + "learning_rate": 1.1524312473287866e-05, + "loss": 0.6546, + "step": 6114 + }, + { + "epoch": 0.9357306809487376, + "grad_norm": 2.2331729878802467, + "learning_rate": 1.1521863203311327e-05, + "loss": 0.7246, + "step": 6115 + }, + { + "epoch": 0.9358837031369549, + "grad_norm": 2.2690525367182928, + "learning_rate": 1.1519413839871218e-05, + "loss": 0.7247, + "step": 6116 + }, + { + "epoch": 0.9360367253251721, + "grad_norm": 2.009458991477944, + 
"learning_rate": 1.1516964383117957e-05, + "loss": 0.6288, + "step": 6117 + }, + { + "epoch": 0.9361897475133895, + "grad_norm": 2.456698580139736, + "learning_rate": 1.1514514833201981e-05, + "loss": 0.7207, + "step": 6118 + }, + { + "epoch": 0.9363427697016067, + "grad_norm": 1.9635176193390491, + "learning_rate": 1.1512065190273728e-05, + "loss": 0.6222, + "step": 6119 + }, + { + "epoch": 0.936495791889824, + "grad_norm": 2.2822880914918287, + "learning_rate": 1.150961545448363e-05, + "loss": 0.697, + "step": 6120 + }, + { + "epoch": 0.9366488140780413, + "grad_norm": 2.1295174618645745, + "learning_rate": 1.1507165625982144e-05, + "loss": 0.6507, + "step": 6121 + }, + { + "epoch": 0.9368018362662586, + "grad_norm": 2.1926894823041376, + "learning_rate": 1.1504715704919722e-05, + "loss": 0.6582, + "step": 6122 + }, + { + "epoch": 0.9369548584544759, + "grad_norm": 2.5892530861553373, + "learning_rate": 1.1502265691446821e-05, + "loss": 0.6738, + "step": 6123 + }, + { + "epoch": 0.9371078806426932, + "grad_norm": 2.068185855054658, + "learning_rate": 1.1499815585713909e-05, + "loss": 0.6418, + "step": 6124 + }, + { + "epoch": 0.9372609028309105, + "grad_norm": 2.2803267411959394, + "learning_rate": 1.1497365387871454e-05, + "loss": 0.5819, + "step": 6125 + }, + { + "epoch": 0.9374139250191278, + "grad_norm": 2.1349957197936713, + "learning_rate": 1.1494915098069927e-05, + "loss": 0.5879, + "step": 6126 + }, + { + "epoch": 0.937566947207345, + "grad_norm": 2.5188656753058507, + "learning_rate": 1.1492464716459824e-05, + "loss": 0.6856, + "step": 6127 + }, + { + "epoch": 0.9377199693955623, + "grad_norm": 2.0389553034383265, + "learning_rate": 1.1490014243191621e-05, + "loss": 0.5911, + "step": 6128 + }, + { + "epoch": 0.9378729915837797, + "grad_norm": 2.356383652380313, + "learning_rate": 1.1487563678415812e-05, + "loss": 0.7431, + "step": 6129 + }, + { + "epoch": 0.9380260137719969, + "grad_norm": 2.1593952450928033, + "learning_rate": 1.14851130222829e-05, + "loss": 0.6465, + "step": 6130 + }, + { + "epoch": 0.9381790359602142, + "grad_norm": 2.4550090735416306, + "learning_rate": 1.148266227494339e-05, + "loss": 0.7766, + "step": 6131 + }, + { + "epoch": 0.9383320581484316, + "grad_norm": 2.4549971534102175, + "learning_rate": 1.1480211436547783e-05, + "loss": 0.68, + "step": 6132 + }, + { + "epoch": 0.9384850803366488, + "grad_norm": 2.3471098658149447, + "learning_rate": 1.1477760507246606e-05, + "loss": 0.6494, + "step": 6133 + }, + { + "epoch": 0.9386381025248661, + "grad_norm": 2.173375519361347, + "learning_rate": 1.1475309487190376e-05, + "loss": 0.6001, + "step": 6134 + }, + { + "epoch": 0.9387911247130833, + "grad_norm": 2.5263042796935973, + "learning_rate": 1.1472858376529616e-05, + "loss": 0.6864, + "step": 6135 + }, + { + "epoch": 0.9389441469013007, + "grad_norm": 2.177731124432879, + "learning_rate": 1.1470407175414864e-05, + "loss": 0.6324, + "step": 6136 + }, + { + "epoch": 0.939097169089518, + "grad_norm": 2.2483120867164, + "learning_rate": 1.1467955883996653e-05, + "loss": 0.7105, + "step": 6137 + }, + { + "epoch": 0.9392501912777352, + "grad_norm": 2.3912382526507656, + "learning_rate": 1.146550450242553e-05, + "loss": 0.6568, + "step": 6138 + }, + { + "epoch": 0.9394032134659526, + "grad_norm": 2.411089906961605, + "learning_rate": 1.146305303085204e-05, + "loss": 0.6886, + "step": 6139 + }, + { + "epoch": 0.9395562356541699, + "grad_norm": 2.2962550676687505, + "learning_rate": 1.1460601469426741e-05, + "loss": 0.7108, + "step": 6140 + }, + { + "epoch": 
0.9397092578423871, + "grad_norm": 2.2516999284059, + "learning_rate": 1.1458149818300192e-05, + "loss": 0.6853, + "step": 6141 + }, + { + "epoch": 0.9398622800306045, + "grad_norm": 2.3487910495210094, + "learning_rate": 1.1455698077622959e-05, + "loss": 0.6304, + "step": 6142 + }, + { + "epoch": 0.9400153022188217, + "grad_norm": 2.3253614074896682, + "learning_rate": 1.1453246247545612e-05, + "loss": 0.7826, + "step": 6143 + }, + { + "epoch": 0.940168324407039, + "grad_norm": 2.035455850686615, + "learning_rate": 1.145079432821873e-05, + "loss": 0.6026, + "step": 6144 + }, + { + "epoch": 0.9403213465952563, + "grad_norm": 2.0392874454271714, + "learning_rate": 1.1448342319792886e-05, + "loss": 0.7886, + "step": 6145 + }, + { + "epoch": 0.9404743687834736, + "grad_norm": 2.301481523837578, + "learning_rate": 1.1445890222418681e-05, + "loss": 0.6503, + "step": 6146 + }, + { + "epoch": 0.9406273909716909, + "grad_norm": 2.307110238176336, + "learning_rate": 1.1443438036246702e-05, + "loss": 0.7039, + "step": 6147 + }, + { + "epoch": 0.9407804131599082, + "grad_norm": 2.355842387023983, + "learning_rate": 1.144098576142754e-05, + "loss": 0.7409, + "step": 6148 + }, + { + "epoch": 0.9409334353481255, + "grad_norm": 2.5326182737200673, + "learning_rate": 1.1438533398111808e-05, + "loss": 0.8251, + "step": 6149 + }, + { + "epoch": 0.9410864575363428, + "grad_norm": 2.227199426196083, + "learning_rate": 1.1436080946450115e-05, + "loss": 0.6085, + "step": 6150 + }, + { + "epoch": 0.94123947972456, + "grad_norm": 2.2345241158679165, + "learning_rate": 1.1433628406593069e-05, + "loss": 0.6675, + "step": 6151 + }, + { + "epoch": 0.9413925019127773, + "grad_norm": 1.8715859371306767, + "learning_rate": 1.1431175778691297e-05, + "loss": 0.5941, + "step": 6152 + }, + { + "epoch": 0.9415455241009947, + "grad_norm": 2.2216163917317675, + "learning_rate": 1.1428723062895421e-05, + "loss": 0.6047, + "step": 6153 + }, + { + "epoch": 0.9416985462892119, + "grad_norm": 1.9294546956579186, + "learning_rate": 1.1426270259356071e-05, + "loss": 0.5409, + "step": 6154 + }, + { + "epoch": 0.9418515684774292, + "grad_norm": 2.1209551146155405, + "learning_rate": 1.1423817368223886e-05, + "loss": 0.6277, + "step": 6155 + }, + { + "epoch": 0.9420045906656466, + "grad_norm": 2.192111802183402, + "learning_rate": 1.1421364389649508e-05, + "loss": 0.5492, + "step": 6156 + }, + { + "epoch": 0.9421576128538638, + "grad_norm": 2.19448860446009, + "learning_rate": 1.141891132378358e-05, + "loss": 0.6536, + "step": 6157 + }, + { + "epoch": 0.9423106350420811, + "grad_norm": 2.071398629519424, + "learning_rate": 1.1416458170776757e-05, + "loss": 0.5741, + "step": 6158 + }, + { + "epoch": 0.9424636572302983, + "grad_norm": 2.138162941124051, + "learning_rate": 1.1414004930779697e-05, + "loss": 0.642, + "step": 6159 + }, + { + "epoch": 0.9426166794185157, + "grad_norm": 2.0968607838531805, + "learning_rate": 1.1411551603943065e-05, + "loss": 0.6482, + "step": 6160 + }, + { + "epoch": 0.942769701606733, + "grad_norm": 2.2631243499184577, + "learning_rate": 1.1409098190417523e-05, + "loss": 0.5649, + "step": 6161 + }, + { + "epoch": 0.9429227237949502, + "grad_norm": 2.2003646678049993, + "learning_rate": 1.1406644690353752e-05, + "loss": 0.6781, + "step": 6162 + }, + { + "epoch": 0.9430757459831676, + "grad_norm": 2.2561519198823246, + "learning_rate": 1.1404191103902425e-05, + "loss": 0.57, + "step": 6163 + }, + { + "epoch": 0.9432287681713849, + "grad_norm": 2.362404452628459, + "learning_rate": 1.140173743121423e-05, + 
"loss": 0.6441, + "step": 6164 + }, + { + "epoch": 0.9433817903596021, + "grad_norm": 2.2230712076613153, + "learning_rate": 1.1399283672439856e-05, + "loss": 0.6328, + "step": 6165 + }, + { + "epoch": 0.9435348125478195, + "grad_norm": 2.156629373945652, + "learning_rate": 1.1396829827729998e-05, + "loss": 0.5572, + "step": 6166 + }, + { + "epoch": 0.9436878347360367, + "grad_norm": 2.184268144546899, + "learning_rate": 1.1394375897235354e-05, + "loss": 0.6756, + "step": 6167 + }, + { + "epoch": 0.943840856924254, + "grad_norm": 2.2888378781567225, + "learning_rate": 1.1391921881106636e-05, + "loss": 0.7744, + "step": 6168 + }, + { + "epoch": 0.9439938791124713, + "grad_norm": 2.2626375112314685, + "learning_rate": 1.1389467779494545e-05, + "loss": 0.672, + "step": 6169 + }, + { + "epoch": 0.9441469013006886, + "grad_norm": 2.1640029277804635, + "learning_rate": 1.1387013592549803e-05, + "loss": 0.625, + "step": 6170 + }, + { + "epoch": 0.9442999234889059, + "grad_norm": 2.37356580152094, + "learning_rate": 1.1384559320423132e-05, + "loss": 0.7834, + "step": 6171 + }, + { + "epoch": 0.9444529456771232, + "grad_norm": 2.1480740870003228, + "learning_rate": 1.1382104963265256e-05, + "loss": 0.6343, + "step": 6172 + }, + { + "epoch": 0.9446059678653405, + "grad_norm": 2.189878831615825, + "learning_rate": 1.1379650521226907e-05, + "loss": 0.7639, + "step": 6173 + }, + { + "epoch": 0.9447589900535578, + "grad_norm": 2.2193507647046324, + "learning_rate": 1.1377195994458823e-05, + "loss": 0.6586, + "step": 6174 + }, + { + "epoch": 0.944912012241775, + "grad_norm": 2.169829742772359, + "learning_rate": 1.1374741383111745e-05, + "loss": 0.6261, + "step": 6175 + }, + { + "epoch": 0.9450650344299923, + "grad_norm": 2.5015537311138467, + "learning_rate": 1.137228668733642e-05, + "loss": 0.6167, + "step": 6176 + }, + { + "epoch": 0.9452180566182097, + "grad_norm": 2.225124391565102, + "learning_rate": 1.13698319072836e-05, + "loss": 0.6276, + "step": 6177 + }, + { + "epoch": 0.9453710788064269, + "grad_norm": 2.218346428912096, + "learning_rate": 1.136737704310405e-05, + "loss": 0.6192, + "step": 6178 + }, + { + "epoch": 0.9455241009946442, + "grad_norm": 2.1116689466249743, + "learning_rate": 1.1364922094948521e-05, + "loss": 0.6749, + "step": 6179 + }, + { + "epoch": 0.9456771231828616, + "grad_norm": 2.5467465934570375, + "learning_rate": 1.1362467062967785e-05, + "loss": 0.733, + "step": 6180 + }, + { + "epoch": 0.9458301453710788, + "grad_norm": 2.159692074442581, + "learning_rate": 1.1360011947312622e-05, + "loss": 0.5801, + "step": 6181 + }, + { + "epoch": 0.9459831675592961, + "grad_norm": 2.2593107197857982, + "learning_rate": 1.13575567481338e-05, + "loss": 0.7481, + "step": 6182 + }, + { + "epoch": 0.9461361897475133, + "grad_norm": 2.7468321108243154, + "learning_rate": 1.1355101465582109e-05, + "loss": 0.6763, + "step": 6183 + }, + { + "epoch": 0.9462892119357307, + "grad_norm": 2.4697235771987374, + "learning_rate": 1.1352646099808338e-05, + "loss": 0.7542, + "step": 6184 + }, + { + "epoch": 0.946442234123948, + "grad_norm": 2.6351990269079355, + "learning_rate": 1.1350190650963278e-05, + "loss": 0.6509, + "step": 6185 + }, + { + "epoch": 0.9465952563121652, + "grad_norm": 2.0847983344763823, + "learning_rate": 1.1347735119197724e-05, + "loss": 0.6195, + "step": 6186 + }, + { + "epoch": 0.9467482785003826, + "grad_norm": 2.2628289082127613, + "learning_rate": 1.1345279504662488e-05, + "loss": 0.6789, + "step": 6187 + }, + { + "epoch": 0.9469013006885999, + "grad_norm": 
2.3024156670048446, + "learning_rate": 1.1342823807508371e-05, + "loss": 0.6697, + "step": 6188 + }, + { + "epoch": 0.9470543228768171, + "grad_norm": 2.2869700353035065, + "learning_rate": 1.1340368027886195e-05, + "loss": 0.6546, + "step": 6189 + }, + { + "epoch": 0.9472073450650345, + "grad_norm": 2.1321757720571153, + "learning_rate": 1.1337912165946773e-05, + "loss": 0.607, + "step": 6190 + }, + { + "epoch": 0.9473603672532517, + "grad_norm": 2.0630142586951647, + "learning_rate": 1.1335456221840932e-05, + "loss": 0.6203, + "step": 6191 + }, + { + "epoch": 0.947513389441469, + "grad_norm": 2.1348173216555995, + "learning_rate": 1.1333000195719498e-05, + "loss": 0.6734, + "step": 6192 + }, + { + "epoch": 0.9476664116296863, + "grad_norm": 2.5299071976646723, + "learning_rate": 1.1330544087733311e-05, + "loss": 0.7018, + "step": 6193 + }, + { + "epoch": 0.9478194338179036, + "grad_norm": 2.1658288076024137, + "learning_rate": 1.1328087898033204e-05, + "loss": 0.7224, + "step": 6194 + }, + { + "epoch": 0.9479724560061209, + "grad_norm": 2.03617237782173, + "learning_rate": 1.1325631626770024e-05, + "loss": 0.6653, + "step": 6195 + }, + { + "epoch": 0.9481254781943382, + "grad_norm": 2.28947007165544, + "learning_rate": 1.1323175274094615e-05, + "loss": 0.7274, + "step": 6196 + }, + { + "epoch": 0.9482785003825555, + "grad_norm": 2.2920800270128603, + "learning_rate": 1.1320718840157844e-05, + "loss": 0.6127, + "step": 6197 + }, + { + "epoch": 0.9484315225707728, + "grad_norm": 2.2807534856633374, + "learning_rate": 1.131826232511056e-05, + "loss": 0.7514, + "step": 6198 + }, + { + "epoch": 0.94858454475899, + "grad_norm": 2.277274760847133, + "learning_rate": 1.1315805729103626e-05, + "loss": 0.7422, + "step": 6199 + }, + { + "epoch": 0.9487375669472073, + "grad_norm": 2.3026912902212056, + "learning_rate": 1.1313349052287918e-05, + "loss": 0.7148, + "step": 6200 + }, + { + "epoch": 0.9488905891354247, + "grad_norm": 2.27866912590338, + "learning_rate": 1.1310892294814308e-05, + "loss": 0.6766, + "step": 6201 + }, + { + "epoch": 0.9490436113236419, + "grad_norm": 2.199628352082259, + "learning_rate": 1.130843545683367e-05, + "loss": 0.6551, + "step": 6202 + }, + { + "epoch": 0.9491966335118592, + "grad_norm": 2.3123931122807866, + "learning_rate": 1.1305978538496896e-05, + "loss": 0.7475, + "step": 6203 + }, + { + "epoch": 0.9493496557000765, + "grad_norm": 2.2427418927664564, + "learning_rate": 1.130352153995487e-05, + "loss": 0.7086, + "step": 6204 + }, + { + "epoch": 0.9495026778882938, + "grad_norm": 2.131171988564861, + "learning_rate": 1.1301064461358484e-05, + "loss": 0.6465, + "step": 6205 + }, + { + "epoch": 0.9496557000765111, + "grad_norm": 2.029476964393686, + "learning_rate": 1.1298607302858643e-05, + "loss": 0.5969, + "step": 6206 + }, + { + "epoch": 0.9498087222647283, + "grad_norm": 2.352972110165139, + "learning_rate": 1.1296150064606244e-05, + "loss": 0.7222, + "step": 6207 + }, + { + "epoch": 0.9499617444529457, + "grad_norm": 2.0387321968883874, + "learning_rate": 1.1293692746752201e-05, + "loss": 0.5474, + "step": 6208 + }, + { + "epoch": 0.950114766641163, + "grad_norm": 2.1671336258881344, + "learning_rate": 1.1291235349447427e-05, + "loss": 0.6044, + "step": 6209 + }, + { + "epoch": 0.9502677888293802, + "grad_norm": 2.11725262790491, + "learning_rate": 1.1288777872842837e-05, + "loss": 0.5895, + "step": 6210 + }, + { + "epoch": 0.9504208110175976, + "grad_norm": 2.175716482784178, + "learning_rate": 1.1286320317089354e-05, + "loss": 0.673, + "step": 6211 + }, + { 
+ "epoch": 0.9505738332058148, + "grad_norm": 2.4334045623113507, + "learning_rate": 1.1283862682337909e-05, + "loss": 0.742, + "step": 6212 + }, + { + "epoch": 0.9507268553940321, + "grad_norm": 2.2530770860240295, + "learning_rate": 1.128140496873944e-05, + "loss": 0.6588, + "step": 6213 + }, + { + "epoch": 0.9508798775822495, + "grad_norm": 2.2343257508068772, + "learning_rate": 1.1278947176444872e-05, + "loss": 0.7206, + "step": 6214 + }, + { + "epoch": 0.9510328997704667, + "grad_norm": 2.3492826056594915, + "learning_rate": 1.1276489305605157e-05, + "loss": 0.681, + "step": 6215 + }, + { + "epoch": 0.951185921958684, + "grad_norm": 2.249954446637355, + "learning_rate": 1.127403135637124e-05, + "loss": 0.7103, + "step": 6216 + }, + { + "epoch": 0.9513389441469013, + "grad_norm": 2.106521792271321, + "learning_rate": 1.1271573328894074e-05, + "loss": 0.64, + "step": 6217 + }, + { + "epoch": 0.9514919663351186, + "grad_norm": 1.899570026742498, + "learning_rate": 1.1269115223324615e-05, + "loss": 0.6032, + "step": 6218 + }, + { + "epoch": 0.9516449885233359, + "grad_norm": 2.279531297155367, + "learning_rate": 1.1266657039813826e-05, + "loss": 0.6868, + "step": 6219 + }, + { + "epoch": 0.9517980107115531, + "grad_norm": 2.2086467285134965, + "learning_rate": 1.1264198778512674e-05, + "loss": 0.697, + "step": 6220 + }, + { + "epoch": 0.9519510328997705, + "grad_norm": 2.0313857844134904, + "learning_rate": 1.1261740439572126e-05, + "loss": 0.6574, + "step": 6221 + }, + { + "epoch": 0.9521040550879878, + "grad_norm": 1.930961524376918, + "learning_rate": 1.1259282023143166e-05, + "loss": 0.6359, + "step": 6222 + }, + { + "epoch": 0.952257077276205, + "grad_norm": 2.2039196753961376, + "learning_rate": 1.1256823529376768e-05, + "loss": 0.6681, + "step": 6223 + }, + { + "epoch": 0.9524100994644223, + "grad_norm": 2.455580268575347, + "learning_rate": 1.1254364958423917e-05, + "loss": 0.787, + "step": 6224 + }, + { + "epoch": 0.9525631216526397, + "grad_norm": 2.2229067809145566, + "learning_rate": 1.1251906310435611e-05, + "loss": 0.6966, + "step": 6225 + }, + { + "epoch": 0.9527161438408569, + "grad_norm": 2.233353532493785, + "learning_rate": 1.1249447585562843e-05, + "loss": 0.6934, + "step": 6226 + }, + { + "epoch": 0.9528691660290742, + "grad_norm": 2.2308301822804255, + "learning_rate": 1.1246988783956606e-05, + "loss": 0.6645, + "step": 6227 + }, + { + "epoch": 0.9530221882172915, + "grad_norm": 2.6020370006509546, + "learning_rate": 1.124452990576791e-05, + "loss": 0.7857, + "step": 6228 + }, + { + "epoch": 0.9531752104055088, + "grad_norm": 2.408349890753668, + "learning_rate": 1.1242070951147767e-05, + "loss": 0.6858, + "step": 6229 + }, + { + "epoch": 0.9533282325937261, + "grad_norm": 2.2188435241964544, + "learning_rate": 1.1239611920247187e-05, + "loss": 0.7378, + "step": 6230 + }, + { + "epoch": 0.9534812547819433, + "grad_norm": 2.3730115015136746, + "learning_rate": 1.123715281321719e-05, + "loss": 0.6563, + "step": 6231 + }, + { + "epoch": 0.9536342769701607, + "grad_norm": 2.2447044972291827, + "learning_rate": 1.1234693630208798e-05, + "loss": 0.7419, + "step": 6232 + }, + { + "epoch": 0.953787299158378, + "grad_norm": 2.1414125465799687, + "learning_rate": 1.1232234371373041e-05, + "loss": 0.6364, + "step": 6233 + }, + { + "epoch": 0.9539403213465952, + "grad_norm": 2.3749105034101388, + "learning_rate": 1.1229775036860948e-05, + "loss": 0.7654, + "step": 6234 + }, + { + "epoch": 0.9540933435348126, + "grad_norm": 2.0829661403047433, + "learning_rate": 
1.1227315626823562e-05, + "loss": 0.6018, + "step": 6235 + }, + { + "epoch": 0.9542463657230298, + "grad_norm": 2.621822847733121, + "learning_rate": 1.1224856141411921e-05, + "loss": 0.652, + "step": 6236 + }, + { + "epoch": 0.9543993879112471, + "grad_norm": 1.8273526527697626, + "learning_rate": 1.122239658077707e-05, + "loss": 0.6322, + "step": 6237 + }, + { + "epoch": 0.9545524100994645, + "grad_norm": 2.0097539648147817, + "learning_rate": 1.121993694507007e-05, + "loss": 0.5612, + "step": 6238 + }, + { + "epoch": 0.9547054322876817, + "grad_norm": 2.5389843690982983, + "learning_rate": 1.1217477234441965e-05, + "loss": 0.7735, + "step": 6239 + }, + { + "epoch": 0.954858454475899, + "grad_norm": 2.1787684248203347, + "learning_rate": 1.1215017449043817e-05, + "loss": 0.7613, + "step": 6240 + }, + { + "epoch": 0.9550114766641163, + "grad_norm": 2.2525442589068376, + "learning_rate": 1.1212557589026699e-05, + "loss": 0.6428, + "step": 6241 + }, + { + "epoch": 0.9551644988523336, + "grad_norm": 2.3005374778193457, + "learning_rate": 1.1210097654541676e-05, + "loss": 0.7743, + "step": 6242 + }, + { + "epoch": 0.9553175210405509, + "grad_norm": 2.2746574255072023, + "learning_rate": 1.120763764573982e-05, + "loss": 0.6967, + "step": 6243 + }, + { + "epoch": 0.9554705432287681, + "grad_norm": 2.378397266502046, + "learning_rate": 1.1205177562772212e-05, + "loss": 0.7118, + "step": 6244 + }, + { + "epoch": 0.9556235654169855, + "grad_norm": 1.9416013991409358, + "learning_rate": 1.1202717405789936e-05, + "loss": 0.4577, + "step": 6245 + }, + { + "epoch": 0.9557765876052028, + "grad_norm": 2.3703928068791456, + "learning_rate": 1.120025717494408e-05, + "loss": 0.669, + "step": 6246 + }, + { + "epoch": 0.95592960979342, + "grad_norm": 2.1911621815779734, + "learning_rate": 1.1197796870385732e-05, + "loss": 0.6874, + "step": 6247 + }, + { + "epoch": 0.9560826319816373, + "grad_norm": 2.23570911339143, + "learning_rate": 1.1195336492265997e-05, + "loss": 0.5486, + "step": 6248 + }, + { + "epoch": 0.9562356541698547, + "grad_norm": 2.1892871871968955, + "learning_rate": 1.1192876040735968e-05, + "loss": 0.6156, + "step": 6249 + }, + { + "epoch": 0.9563886763580719, + "grad_norm": 2.5372633282546535, + "learning_rate": 1.1190415515946757e-05, + "loss": 0.7427, + "step": 6250 + }, + { + "epoch": 0.9565416985462892, + "grad_norm": 2.1746459071347517, + "learning_rate": 1.1187954918049473e-05, + "loss": 0.6861, + "step": 6251 + }, + { + "epoch": 0.9566947207345065, + "grad_norm": 2.3915477100098266, + "learning_rate": 1.1185494247195227e-05, + "loss": 0.6661, + "step": 6252 + }, + { + "epoch": 0.9568477429227238, + "grad_norm": 2.301774197826807, + "learning_rate": 1.1183033503535144e-05, + "loss": 0.7023, + "step": 6253 + }, + { + "epoch": 0.9570007651109411, + "grad_norm": 2.0690146112814616, + "learning_rate": 1.1180572687220349e-05, + "loss": 0.6382, + "step": 6254 + }, + { + "epoch": 0.9571537872991583, + "grad_norm": 2.0792422582969805, + "learning_rate": 1.1178111798401959e-05, + "loss": 0.6192, + "step": 6255 + }, + { + "epoch": 0.9573068094873757, + "grad_norm": 2.332152072423957, + "learning_rate": 1.1175650837231118e-05, + "loss": 0.7097, + "step": 6256 + }, + { + "epoch": 0.957459831675593, + "grad_norm": 2.252289252328118, + "learning_rate": 1.1173189803858961e-05, + "loss": 0.69, + "step": 6257 + }, + { + "epoch": 0.9576128538638102, + "grad_norm": 2.149486560835919, + "learning_rate": 1.1170728698436629e-05, + "loss": 0.6495, + "step": 6258 + }, + { + "epoch": 0.9577658760520276, + 
"grad_norm": 2.4865796283824535, + "learning_rate": 1.1168267521115266e-05, + "loss": 0.6406, + "step": 6259 + }, + { + "epoch": 0.9579188982402448, + "grad_norm": 2.562998136403146, + "learning_rate": 1.1165806272046024e-05, + "loss": 0.7459, + "step": 6260 + }, + { + "epoch": 0.9580719204284621, + "grad_norm": 2.244474523121574, + "learning_rate": 1.116334495138006e-05, + "loss": 0.7253, + "step": 6261 + }, + { + "epoch": 0.9582249426166795, + "grad_norm": 2.3237989302201343, + "learning_rate": 1.116088355926853e-05, + "loss": 0.7225, + "step": 6262 + }, + { + "epoch": 0.9583779648048967, + "grad_norm": 2.296054403182734, + "learning_rate": 1.11584220958626e-05, + "loss": 0.6612, + "step": 6263 + }, + { + "epoch": 0.958530986993114, + "grad_norm": 2.1853141461542034, + "learning_rate": 1.1155960561313437e-05, + "loss": 0.7537, + "step": 6264 + }, + { + "epoch": 0.9586840091813313, + "grad_norm": 2.2041375129714615, + "learning_rate": 1.1153498955772213e-05, + "loss": 0.5933, + "step": 6265 + }, + { + "epoch": 0.9588370313695486, + "grad_norm": 2.3673848542353024, + "learning_rate": 1.1151037279390106e-05, + "loss": 0.6677, + "step": 6266 + }, + { + "epoch": 0.9589900535577659, + "grad_norm": 2.1283912180610756, + "learning_rate": 1.11485755323183e-05, + "loss": 0.6213, + "step": 6267 + }, + { + "epoch": 0.9591430757459831, + "grad_norm": 2.104333165924923, + "learning_rate": 1.1146113714707973e-05, + "loss": 0.6682, + "step": 6268 + }, + { + "epoch": 0.9592960979342005, + "grad_norm": 2.0811203535808054, + "learning_rate": 1.114365182671032e-05, + "loss": 0.6221, + "step": 6269 + }, + { + "epoch": 0.9594491201224178, + "grad_norm": 2.1932532045760778, + "learning_rate": 1.1141189868476536e-05, + "loss": 0.7587, + "step": 6270 + }, + { + "epoch": 0.959602142310635, + "grad_norm": 2.489560695641433, + "learning_rate": 1.1138727840157817e-05, + "loss": 0.7461, + "step": 6271 + }, + { + "epoch": 0.9597551644988523, + "grad_norm": 2.2827480402995888, + "learning_rate": 1.1136265741905367e-05, + "loss": 0.6466, + "step": 6272 + }, + { + "epoch": 0.9599081866870697, + "grad_norm": 2.5641799903735167, + "learning_rate": 1.1133803573870392e-05, + "loss": 0.7205, + "step": 6273 + }, + { + "epoch": 0.9600612088752869, + "grad_norm": 2.242478983384006, + "learning_rate": 1.1131341336204104e-05, + "loss": 0.6067, + "step": 6274 + }, + { + "epoch": 0.9602142310635042, + "grad_norm": 2.3288654475119297, + "learning_rate": 1.112887902905772e-05, + "loss": 0.6887, + "step": 6275 + }, + { + "epoch": 0.9603672532517215, + "grad_norm": 2.304115755073521, + "learning_rate": 1.1126416652582456e-05, + "loss": 0.7624, + "step": 6276 + }, + { + "epoch": 0.9605202754399388, + "grad_norm": 2.279433405641376, + "learning_rate": 1.1123954206929542e-05, + "loss": 0.6743, + "step": 6277 + }, + { + "epoch": 0.9606732976281561, + "grad_norm": 2.331560845281564, + "learning_rate": 1.11214916922502e-05, + "loss": 0.6911, + "step": 6278 + }, + { + "epoch": 0.9608263198163733, + "grad_norm": 2.273458434140446, + "learning_rate": 1.111902910869567e-05, + "loss": 0.6622, + "step": 6279 + }, + { + "epoch": 0.9609793420045907, + "grad_norm": 2.154670876055646, + "learning_rate": 1.1116566456417181e-05, + "loss": 0.6577, + "step": 6280 + }, + { + "epoch": 0.961132364192808, + "grad_norm": 2.0581058974508832, + "learning_rate": 1.1114103735565977e-05, + "loss": 0.5898, + "step": 6281 + }, + { + "epoch": 0.9612853863810252, + "grad_norm": 2.2992654047403263, + "learning_rate": 1.1111640946293308e-05, + "loss": 0.7091, + "step": 
6282 + }, + { + "epoch": 0.9614384085692426, + "grad_norm": 2.315602918362659, + "learning_rate": 1.1109178088750422e-05, + "loss": 0.5043, + "step": 6283 + }, + { + "epoch": 0.9615914307574598, + "grad_norm": 2.35266513608379, + "learning_rate": 1.1106715163088561e-05, + "loss": 0.6637, + "step": 6284 + }, + { + "epoch": 0.9617444529456771, + "grad_norm": 2.1833480375393672, + "learning_rate": 1.1104252169459e-05, + "loss": 0.6923, + "step": 6285 + }, + { + "epoch": 0.9618974751338945, + "grad_norm": 2.551652413206869, + "learning_rate": 1.110178910801299e-05, + "loss": 0.6136, + "step": 6286 + }, + { + "epoch": 0.9620504973221117, + "grad_norm": 2.42416461169718, + "learning_rate": 1.1099325978901802e-05, + "loss": 0.6589, + "step": 6287 + }, + { + "epoch": 0.962203519510329, + "grad_norm": 2.3115177880706854, + "learning_rate": 1.1096862782276707e-05, + "loss": 0.7251, + "step": 6288 + }, + { + "epoch": 0.9623565416985463, + "grad_norm": 2.4133212993849997, + "learning_rate": 1.1094399518288975e-05, + "loss": 0.6165, + "step": 6289 + }, + { + "epoch": 0.9625095638867636, + "grad_norm": 2.476384822568825, + "learning_rate": 1.1091936187089886e-05, + "loss": 0.7031, + "step": 6290 + }, + { + "epoch": 0.9626625860749809, + "grad_norm": 2.497069760113835, + "learning_rate": 1.1089472788830728e-05, + "loss": 0.8166, + "step": 6291 + }, + { + "epoch": 0.9628156082631981, + "grad_norm": 2.1440996425297296, + "learning_rate": 1.1087009323662784e-05, + "loss": 0.6813, + "step": 6292 + }, + { + "epoch": 0.9629686304514155, + "grad_norm": 2.2455927624657135, + "learning_rate": 1.108454579173734e-05, + "loss": 0.6084, + "step": 6293 + }, + { + "epoch": 0.9631216526396328, + "grad_norm": 2.240564503687054, + "learning_rate": 1.1082082193205699e-05, + "loss": 0.5922, + "step": 6294 + }, + { + "epoch": 0.96327467482785, + "grad_norm": 2.232747072090611, + "learning_rate": 1.1079618528219159e-05, + "loss": 0.6345, + "step": 6295 + }, + { + "epoch": 0.9634276970160673, + "grad_norm": 2.3703209346787446, + "learning_rate": 1.107715479692902e-05, + "loss": 0.6895, + "step": 6296 + }, + { + "epoch": 0.9635807192042847, + "grad_norm": 2.241003272516828, + "learning_rate": 1.1074690999486591e-05, + "loss": 0.7439, + "step": 6297 + }, + { + "epoch": 0.9637337413925019, + "grad_norm": 2.0491357691467074, + "learning_rate": 1.1072227136043182e-05, + "loss": 0.6752, + "step": 6298 + }, + { + "epoch": 0.9638867635807192, + "grad_norm": 2.236282969158411, + "learning_rate": 1.1069763206750113e-05, + "loss": 0.616, + "step": 6299 + }, + { + "epoch": 0.9640397857689365, + "grad_norm": 2.4976383991206865, + "learning_rate": 1.10672992117587e-05, + "loss": 0.6593, + "step": 6300 + }, + { + "epoch": 0.9641928079571538, + "grad_norm": 2.2143008291617483, + "learning_rate": 1.1064835151220265e-05, + "loss": 0.6282, + "step": 6301 + }, + { + "epoch": 0.9643458301453711, + "grad_norm": 2.4819206347775427, + "learning_rate": 1.106237102528614e-05, + "loss": 0.6849, + "step": 6302 + }, + { + "epoch": 0.9644988523335883, + "grad_norm": 2.2845724829416985, + "learning_rate": 1.1059906834107652e-05, + "loss": 0.6554, + "step": 6303 + }, + { + "epoch": 0.9646518745218057, + "grad_norm": 2.536708963196747, + "learning_rate": 1.1057442577836141e-05, + "loss": 0.7574, + "step": 6304 + }, + { + "epoch": 0.964804896710023, + "grad_norm": 2.4234275083828565, + "learning_rate": 1.1054978256622946e-05, + "loss": 0.6173, + "step": 6305 + }, + { + "epoch": 0.9649579188982402, + "grad_norm": 2.3943143908981805, + "learning_rate": 
1.1052513870619403e-05, + "loss": 0.7037, + "step": 6306 + }, + { + "epoch": 0.9651109410864576, + "grad_norm": 2.3606987033507507, + "learning_rate": 1.1050049419976872e-05, + "loss": 0.695, + "step": 6307 + }, + { + "epoch": 0.9652639632746748, + "grad_norm": 2.5633579786892686, + "learning_rate": 1.1047584904846697e-05, + "loss": 0.6554, + "step": 6308 + }, + { + "epoch": 0.9654169854628921, + "grad_norm": 2.2020850155001543, + "learning_rate": 1.1045120325380233e-05, + "loss": 0.6187, + "step": 6309 + }, + { + "epoch": 0.9655700076511095, + "grad_norm": 2.1260293430856634, + "learning_rate": 1.1042655681728842e-05, + "loss": 0.6128, + "step": 6310 + }, + { + "epoch": 0.9657230298393267, + "grad_norm": 2.3544765145165045, + "learning_rate": 1.1040190974043887e-05, + "loss": 0.6527, + "step": 6311 + }, + { + "epoch": 0.965876052027544, + "grad_norm": 2.218314723977588, + "learning_rate": 1.1037726202476735e-05, + "loss": 0.7008, + "step": 6312 + }, + { + "epoch": 0.9660290742157613, + "grad_norm": 2.403254705515555, + "learning_rate": 1.1035261367178758e-05, + "loss": 0.6956, + "step": 6313 + }, + { + "epoch": 0.9661820964039786, + "grad_norm": 2.0814432448403006, + "learning_rate": 1.103279646830133e-05, + "loss": 0.6016, + "step": 6314 + }, + { + "epoch": 0.9663351185921959, + "grad_norm": 2.379683650495674, + "learning_rate": 1.103033150599583e-05, + "loss": 0.6538, + "step": 6315 + }, + { + "epoch": 0.9664881407804131, + "grad_norm": 2.286013236090055, + "learning_rate": 1.1027866480413642e-05, + "loss": 0.6498, + "step": 6316 + }, + { + "epoch": 0.9666411629686305, + "grad_norm": 2.3171666595250646, + "learning_rate": 1.1025401391706152e-05, + "loss": 0.6939, + "step": 6317 + }, + { + "epoch": 0.9667941851568478, + "grad_norm": 2.1385639093202724, + "learning_rate": 1.1022936240024754e-05, + "loss": 0.6075, + "step": 6318 + }, + { + "epoch": 0.966947207345065, + "grad_norm": 2.038129807160247, + "learning_rate": 1.1020471025520833e-05, + "loss": 0.5521, + "step": 6319 + }, + { + "epoch": 0.9671002295332823, + "grad_norm": 2.782764068823436, + "learning_rate": 1.1018005748345802e-05, + "loss": 0.7901, + "step": 6320 + }, + { + "epoch": 0.9672532517214996, + "grad_norm": 1.9353389662910547, + "learning_rate": 1.1015540408651051e-05, + "loss": 0.5659, + "step": 6321 + }, + { + "epoch": 0.9674062739097169, + "grad_norm": 2.364548658211014, + "learning_rate": 1.101307500658799e-05, + "loss": 0.6433, + "step": 6322 + }, + { + "epoch": 0.9675592960979342, + "grad_norm": 2.14311294534737, + "learning_rate": 1.101060954230803e-05, + "loss": 0.6272, + "step": 6323 + }, + { + "epoch": 0.9677123182861515, + "grad_norm": 2.3056064170443893, + "learning_rate": 1.100814401596259e-05, + "loss": 0.6459, + "step": 6324 + }, + { + "epoch": 0.9678653404743688, + "grad_norm": 2.02182565029233, + "learning_rate": 1.1005678427703075e-05, + "loss": 0.6518, + "step": 6325 + }, + { + "epoch": 0.9680183626625861, + "grad_norm": 2.3303131018908547, + "learning_rate": 1.1003212777680916e-05, + "loss": 0.6813, + "step": 6326 + }, + { + "epoch": 0.9681713848508033, + "grad_norm": 2.114523748506382, + "learning_rate": 1.1000747066047536e-05, + "loss": 0.5194, + "step": 6327 + }, + { + "epoch": 0.9683244070390207, + "grad_norm": 2.265418197230814, + "learning_rate": 1.0998281292954362e-05, + "loss": 0.6422, + "step": 6328 + }, + { + "epoch": 0.9684774292272379, + "grad_norm": 2.338030038622006, + "learning_rate": 1.0995815458552833e-05, + "loss": 0.7219, + "step": 6329 + }, + { + "epoch": 0.9686304514154552, + 
"grad_norm": 2.521459284136043, + "learning_rate": 1.0993349562994378e-05, + "loss": 0.6203, + "step": 6330 + }, + { + "epoch": 0.9687834736036726, + "grad_norm": 2.0493537019726564, + "learning_rate": 1.0990883606430439e-05, + "loss": 0.593, + "step": 6331 + }, + { + "epoch": 0.9689364957918898, + "grad_norm": 2.6405871157081777, + "learning_rate": 1.0988417589012464e-05, + "loss": 0.79, + "step": 6332 + }, + { + "epoch": 0.9690895179801071, + "grad_norm": 2.096405636694104, + "learning_rate": 1.0985951510891899e-05, + "loss": 0.7241, + "step": 6333 + }, + { + "epoch": 0.9692425401683245, + "grad_norm": 2.088604644544387, + "learning_rate": 1.098348537222019e-05, + "loss": 0.6886, + "step": 6334 + }, + { + "epoch": 0.9693955623565417, + "grad_norm": 1.9904441066693555, + "learning_rate": 1.0981019173148802e-05, + "loss": 0.5716, + "step": 6335 + }, + { + "epoch": 0.969548584544759, + "grad_norm": 2.062780568286232, + "learning_rate": 1.0978552913829185e-05, + "loss": 0.5858, + "step": 6336 + }, + { + "epoch": 0.9697016067329762, + "grad_norm": 2.2337258488936538, + "learning_rate": 1.0976086594412808e-05, + "loss": 0.6003, + "step": 6337 + }, + { + "epoch": 0.9698546289211936, + "grad_norm": 2.372510122203268, + "learning_rate": 1.0973620215051132e-05, + "loss": 0.7456, + "step": 6338 + }, + { + "epoch": 0.9700076511094109, + "grad_norm": 2.2495942249697083, + "learning_rate": 1.0971153775895633e-05, + "loss": 0.6003, + "step": 6339 + }, + { + "epoch": 0.9701606732976281, + "grad_norm": 2.239803426785077, + "learning_rate": 1.096868727709778e-05, + "loss": 0.5947, + "step": 6340 + }, + { + "epoch": 0.9703136954858455, + "grad_norm": 2.0374315924280446, + "learning_rate": 1.0966220718809054e-05, + "loss": 0.6142, + "step": 6341 + }, + { + "epoch": 0.9704667176740628, + "grad_norm": 2.1103707648171106, + "learning_rate": 1.096375410118093e-05, + "loss": 0.5306, + "step": 6342 + }, + { + "epoch": 0.97061973986228, + "grad_norm": 2.07129788644162, + "learning_rate": 1.0961287424364896e-05, + "loss": 0.6902, + "step": 6343 + }, + { + "epoch": 0.9707727620504973, + "grad_norm": 2.3083558399211164, + "learning_rate": 1.0958820688512441e-05, + "loss": 0.6831, + "step": 6344 + }, + { + "epoch": 0.9709257842387146, + "grad_norm": 2.426716481951558, + "learning_rate": 1.0956353893775062e-05, + "loss": 0.7141, + "step": 6345 + }, + { + "epoch": 0.9710788064269319, + "grad_norm": 2.162494654252418, + "learning_rate": 1.0953887040304245e-05, + "loss": 0.5416, + "step": 6346 + }, + { + "epoch": 0.9712318286151492, + "grad_norm": 2.1205750952379447, + "learning_rate": 1.0951420128251491e-05, + "loss": 0.6947, + "step": 6347 + }, + { + "epoch": 0.9713848508033665, + "grad_norm": 2.4172902478803358, + "learning_rate": 1.0948953157768309e-05, + "loss": 0.667, + "step": 6348 + }, + { + "epoch": 0.9715378729915838, + "grad_norm": 2.238302293539362, + "learning_rate": 1.0946486129006202e-05, + "loss": 0.666, + "step": 6349 + }, + { + "epoch": 0.9716908951798011, + "grad_norm": 2.2943963497156576, + "learning_rate": 1.0944019042116673e-05, + "loss": 0.6383, + "step": 6350 + }, + { + "epoch": 0.9718439173680183, + "grad_norm": 2.0424734728227354, + "learning_rate": 1.0941551897251248e-05, + "loss": 0.6146, + "step": 6351 + }, + { + "epoch": 0.9719969395562357, + "grad_norm": 2.348686525361142, + "learning_rate": 1.0939084694561434e-05, + "loss": 0.6622, + "step": 6352 + }, + { + "epoch": 0.9721499617444529, + "grad_norm": 1.9904593663363055, + "learning_rate": 1.0936617434198758e-05, + "loss": 0.5622, + "step": 
6353 + }, + { + "epoch": 0.9723029839326702, + "grad_norm": 2.2619272391880254, + "learning_rate": 1.093415011631474e-05, + "loss": 0.7357, + "step": 6354 + }, + { + "epoch": 0.9724560061208876, + "grad_norm": 2.142159801287082, + "learning_rate": 1.0931682741060907e-05, + "loss": 0.6831, + "step": 6355 + }, + { + "epoch": 0.9726090283091048, + "grad_norm": 2.4643906335625196, + "learning_rate": 1.0929215308588795e-05, + "loss": 0.715, + "step": 6356 + }, + { + "epoch": 0.9727620504973221, + "grad_norm": 2.0428082848603037, + "learning_rate": 1.0926747819049935e-05, + "loss": 0.6328, + "step": 6357 + }, + { + "epoch": 0.9729150726855395, + "grad_norm": 2.253982118695505, + "learning_rate": 1.0924280272595863e-05, + "loss": 0.7491, + "step": 6358 + }, + { + "epoch": 0.9730680948737567, + "grad_norm": 2.1914156685495443, + "learning_rate": 1.0921812669378126e-05, + "loss": 0.6725, + "step": 6359 + }, + { + "epoch": 0.973221117061974, + "grad_norm": 2.26967414402112, + "learning_rate": 1.0919345009548266e-05, + "loss": 0.7652, + "step": 6360 + }, + { + "epoch": 0.9733741392501912, + "grad_norm": 2.246918731454627, + "learning_rate": 1.0916877293257837e-05, + "loss": 0.635, + "step": 6361 + }, + { + "epoch": 0.9735271614384086, + "grad_norm": 2.237823796273232, + "learning_rate": 1.0914409520658382e-05, + "loss": 0.6303, + "step": 6362 + }, + { + "epoch": 0.9736801836266259, + "grad_norm": 2.3232073625174903, + "learning_rate": 1.091194169190146e-05, + "loss": 0.5327, + "step": 6363 + }, + { + "epoch": 0.9738332058148431, + "grad_norm": 2.387396773346526, + "learning_rate": 1.0909473807138633e-05, + "loss": 0.6605, + "step": 6364 + }, + { + "epoch": 0.9739862280030605, + "grad_norm": 2.3255204349093996, + "learning_rate": 1.0907005866521462e-05, + "loss": 0.6902, + "step": 6365 + }, + { + "epoch": 0.9741392501912778, + "grad_norm": 2.350835243846951, + "learning_rate": 1.0904537870201512e-05, + "loss": 0.6583, + "step": 6366 + }, + { + "epoch": 0.974292272379495, + "grad_norm": 2.0874447124765667, + "learning_rate": 1.0902069818330354e-05, + "loss": 0.6323, + "step": 6367 + }, + { + "epoch": 0.9744452945677123, + "grad_norm": 2.4843168652978216, + "learning_rate": 1.089960171105956e-05, + "loss": 0.7759, + "step": 6368 + }, + { + "epoch": 0.9745983167559296, + "grad_norm": 2.375106997044347, + "learning_rate": 1.0897133548540701e-05, + "loss": 0.6102, + "step": 6369 + }, + { + "epoch": 0.9747513389441469, + "grad_norm": 2.343614659524406, + "learning_rate": 1.0894665330925368e-05, + "loss": 0.6207, + "step": 6370 + }, + { + "epoch": 0.9749043611323642, + "grad_norm": 2.2805865460307237, + "learning_rate": 1.0892197058365135e-05, + "loss": 0.6677, + "step": 6371 + }, + { + "epoch": 0.9750573833205815, + "grad_norm": 2.235287175901108, + "learning_rate": 1.0889728731011587e-05, + "loss": 0.7031, + "step": 6372 + }, + { + "epoch": 0.9752104055087988, + "grad_norm": 2.104978197412232, + "learning_rate": 1.0887260349016318e-05, + "loss": 0.5624, + "step": 6373 + }, + { + "epoch": 0.9753634276970161, + "grad_norm": 2.3101831680546323, + "learning_rate": 1.0884791912530923e-05, + "loss": 0.5689, + "step": 6374 + }, + { + "epoch": 0.9755164498852333, + "grad_norm": 2.3371258325914255, + "learning_rate": 1.0882323421706991e-05, + "loss": 0.7777, + "step": 6375 + }, + { + "epoch": 0.9756694720734507, + "grad_norm": 2.7997626109199745, + "learning_rate": 1.087985487669613e-05, + "loss": 0.8104, + "step": 6376 + }, + { + "epoch": 0.9758224942616679, + "grad_norm": 2.412073839993447, + "learning_rate": 
1.0877386277649938e-05, + "loss": 0.7437, + "step": 6377 + }, + { + "epoch": 0.9759755164498852, + "grad_norm": 2.316799724033941, + "learning_rate": 1.087491762472002e-05, + "loss": 0.725, + "step": 6378 + }, + { + "epoch": 0.9761285386381026, + "grad_norm": 2.2241974714922486, + "learning_rate": 1.0872448918057989e-05, + "loss": 0.768, + "step": 6379 + }, + { + "epoch": 0.9762815608263198, + "grad_norm": 2.0678765281664386, + "learning_rate": 1.0869980157815458e-05, + "loss": 0.5852, + "step": 6380 + }, + { + "epoch": 0.9764345830145371, + "grad_norm": 2.211865767184328, + "learning_rate": 1.0867511344144042e-05, + "loss": 0.6851, + "step": 6381 + }, + { + "epoch": 0.9765876052027544, + "grad_norm": 2.113420351755564, + "learning_rate": 1.0865042477195362e-05, + "loss": 0.6036, + "step": 6382 + }, + { + "epoch": 0.9767406273909717, + "grad_norm": 2.238136535568473, + "learning_rate": 1.0862573557121036e-05, + "loss": 0.6267, + "step": 6383 + }, + { + "epoch": 0.976893649579189, + "grad_norm": 2.2475432663562107, + "learning_rate": 1.0860104584072698e-05, + "loss": 0.7343, + "step": 6384 + }, + { + "epoch": 0.9770466717674062, + "grad_norm": 2.332824885622173, + "learning_rate": 1.0857635558201966e-05, + "loss": 0.6669, + "step": 6385 + }, + { + "epoch": 0.9771996939556236, + "grad_norm": 2.1477623949536624, + "learning_rate": 1.0855166479660487e-05, + "loss": 0.5838, + "step": 6386 + }, + { + "epoch": 0.9773527161438409, + "grad_norm": 2.2525788281704773, + "learning_rate": 1.0852697348599883e-05, + "loss": 0.7541, + "step": 6387 + }, + { + "epoch": 0.9775057383320581, + "grad_norm": 2.2180595252269955, + "learning_rate": 1.08502281651718e-05, + "loss": 0.6829, + "step": 6388 + }, + { + "epoch": 0.9776587605202754, + "grad_norm": 2.216679839529683, + "learning_rate": 1.0847758929527881e-05, + "loss": 0.6328, + "step": 6389 + }, + { + "epoch": 0.9778117827084928, + "grad_norm": 2.094749687529566, + "learning_rate": 1.0845289641819772e-05, + "loss": 0.5704, + "step": 6390 + }, + { + "epoch": 0.97796480489671, + "grad_norm": 2.158104895092101, + "learning_rate": 1.0842820302199115e-05, + "loss": 0.6678, + "step": 6391 + }, + { + "epoch": 0.9781178270849273, + "grad_norm": 2.5011576786078495, + "learning_rate": 1.0840350910817568e-05, + "loss": 0.6058, + "step": 6392 + }, + { + "epoch": 0.9782708492731446, + "grad_norm": 2.1857763444580067, + "learning_rate": 1.0837881467826783e-05, + "loss": 0.6703, + "step": 6393 + }, + { + "epoch": 0.9784238714613619, + "grad_norm": 2.2003107696201027, + "learning_rate": 1.0835411973378421e-05, + "loss": 0.7169, + "step": 6394 + }, + { + "epoch": 0.9785768936495792, + "grad_norm": 2.0337957252981096, + "learning_rate": 1.083294242762414e-05, + "loss": 0.5438, + "step": 6395 + }, + { + "epoch": 0.9787299158377964, + "grad_norm": 2.150512225972815, + "learning_rate": 1.0830472830715605e-05, + "loss": 0.6302, + "step": 6396 + }, + { + "epoch": 0.9788829380260138, + "grad_norm": 2.484740579652343, + "learning_rate": 1.0828003182804488e-05, + "loss": 0.6283, + "step": 6397 + }, + { + "epoch": 0.9790359602142311, + "grad_norm": 2.5080302326286668, + "learning_rate": 1.0825533484042451e-05, + "loss": 0.7363, + "step": 6398 + }, + { + "epoch": 0.9791889824024483, + "grad_norm": 2.4527364589897527, + "learning_rate": 1.082306373458118e-05, + "loss": 0.6692, + "step": 6399 + }, + { + "epoch": 0.9793420045906657, + "grad_norm": 2.5425671821277587, + "learning_rate": 1.0820593934572344e-05, + "loss": 0.5979, + "step": 6400 + }, + { + "epoch": 0.9794950267788829, + 
"grad_norm": 2.3776505208003607, + "learning_rate": 1.0818124084167619e-05, + "loss": 0.6738, + "step": 6401 + }, + { + "epoch": 0.9796480489671002, + "grad_norm": 2.2778353809646887, + "learning_rate": 1.0815654183518698e-05, + "loss": 0.7114, + "step": 6402 + }, + { + "epoch": 0.9798010711553176, + "grad_norm": 2.193755629471049, + "learning_rate": 1.0813184232777262e-05, + "loss": 0.6681, + "step": 6403 + }, + { + "epoch": 0.9799540933435348, + "grad_norm": 1.9695219026147595, + "learning_rate": 1.0810714232095001e-05, + "loss": 0.6318, + "step": 6404 + }, + { + "epoch": 0.9801071155317521, + "grad_norm": 2.279528997785273, + "learning_rate": 1.0808244181623606e-05, + "loss": 0.6502, + "step": 6405 + }, + { + "epoch": 0.9802601377199694, + "grad_norm": 2.3554587060275716, + "learning_rate": 1.0805774081514774e-05, + "loss": 0.7645, + "step": 6406 + }, + { + "epoch": 0.9804131599081867, + "grad_norm": 2.178746017369633, + "learning_rate": 1.0803303931920206e-05, + "loss": 0.5972, + "step": 6407 + }, + { + "epoch": 0.980566182096404, + "grad_norm": 2.200882154882437, + "learning_rate": 1.08008337329916e-05, + "loss": 0.6289, + "step": 6408 + }, + { + "epoch": 0.9807192042846212, + "grad_norm": 2.1322827299103064, + "learning_rate": 1.079836348488066e-05, + "loss": 0.5765, + "step": 6409 + }, + { + "epoch": 0.9808722264728386, + "grad_norm": 2.264692330253768, + "learning_rate": 1.0795893187739095e-05, + "loss": 0.728, + "step": 6410 + }, + { + "epoch": 0.9810252486610559, + "grad_norm": 2.2316755970527984, + "learning_rate": 1.079342284171862e-05, + "loss": 0.6295, + "step": 6411 + }, + { + "epoch": 0.9811782708492731, + "grad_norm": 2.2053391147834756, + "learning_rate": 1.0790952446970942e-05, + "loss": 0.6607, + "step": 6412 + }, + { + "epoch": 0.9813312930374904, + "grad_norm": 2.1109423143732897, + "learning_rate": 1.0788482003647775e-05, + "loss": 0.6516, + "step": 6413 + }, + { + "epoch": 0.9814843152257078, + "grad_norm": 2.268297928669402, + "learning_rate": 1.078601151190085e-05, + "loss": 0.7403, + "step": 6414 + }, + { + "epoch": 0.981637337413925, + "grad_norm": 2.2137529570759122, + "learning_rate": 1.0783540971881882e-05, + "loss": 0.6743, + "step": 6415 + }, + { + "epoch": 0.9817903596021423, + "grad_norm": 2.0507644073612137, + "learning_rate": 1.0781070383742595e-05, + "loss": 0.4971, + "step": 6416 + }, + { + "epoch": 0.9819433817903596, + "grad_norm": 2.088792875167074, + "learning_rate": 1.0778599747634723e-05, + "loss": 0.6713, + "step": 6417 + }, + { + "epoch": 0.9820964039785769, + "grad_norm": 2.3516082787434063, + "learning_rate": 1.0776129063709994e-05, + "loss": 0.6598, + "step": 6418 + }, + { + "epoch": 0.9822494261667942, + "grad_norm": 2.6734111206794915, + "learning_rate": 1.0773658332120143e-05, + "loss": 0.6645, + "step": 6419 + }, + { + "epoch": 0.9824024483550114, + "grad_norm": 2.3219289364900053, + "learning_rate": 1.0771187553016907e-05, + "loss": 0.6809, + "step": 6420 + }, + { + "epoch": 0.9825554705432288, + "grad_norm": 2.295162797643992, + "learning_rate": 1.0768716726552027e-05, + "loss": 0.6209, + "step": 6421 + }, + { + "epoch": 0.9827084927314461, + "grad_norm": 2.1896216122076155, + "learning_rate": 1.0766245852877244e-05, + "loss": 0.633, + "step": 6422 + }, + { + "epoch": 0.9828615149196633, + "grad_norm": 2.423235832841934, + "learning_rate": 1.0763774932144306e-05, + "loss": 0.634, + "step": 6423 + }, + { + "epoch": 0.9830145371078807, + "grad_norm": 2.352333616434348, + "learning_rate": 1.0761303964504963e-05, + "loss": 0.6987, + 
"step": 6424 + }, + { + "epoch": 0.9831675592960979, + "grad_norm": 2.223375288114711, + "learning_rate": 1.0758832950110965e-05, + "loss": 0.5861, + "step": 6425 + }, + { + "epoch": 0.9833205814843152, + "grad_norm": 2.2183349950150646, + "learning_rate": 1.0756361889114065e-05, + "loss": 0.6365, + "step": 6426 + }, + { + "epoch": 0.9834736036725326, + "grad_norm": 1.9636195416029705, + "learning_rate": 1.0753890781666025e-05, + "loss": 0.6346, + "step": 6427 + }, + { + "epoch": 0.9836266258607498, + "grad_norm": 2.2297263596321404, + "learning_rate": 1.0751419627918602e-05, + "loss": 0.666, + "step": 6428 + }, + { + "epoch": 0.9837796480489671, + "grad_norm": 2.4350159907453146, + "learning_rate": 1.0748948428023557e-05, + "loss": 0.6205, + "step": 6429 + }, + { + "epoch": 0.9839326702371843, + "grad_norm": 2.181247738797785, + "learning_rate": 1.0746477182132665e-05, + "loss": 0.6318, + "step": 6430 + }, + { + "epoch": 0.9840856924254017, + "grad_norm": 2.646466395978632, + "learning_rate": 1.0744005890397686e-05, + "loss": 0.6104, + "step": 6431 + }, + { + "epoch": 0.984238714613619, + "grad_norm": 2.2751928031916804, + "learning_rate": 1.0741534552970393e-05, + "loss": 0.6352, + "step": 6432 + }, + { + "epoch": 0.9843917368018362, + "grad_norm": 19.9658720243908, + "learning_rate": 1.0739063170002564e-05, + "loss": 0.6467, + "step": 6433 + }, + { + "epoch": 0.9845447589900536, + "grad_norm": 2.084851787789992, + "learning_rate": 1.0736591741645974e-05, + "loss": 0.4974, + "step": 6434 + }, + { + "epoch": 0.9846977811782709, + "grad_norm": 2.0840003737302095, + "learning_rate": 1.0734120268052403e-05, + "loss": 0.6906, + "step": 6435 + }, + { + "epoch": 0.9848508033664881, + "grad_norm": 2.2573509449638096, + "learning_rate": 1.0731648749373637e-05, + "loss": 0.6702, + "step": 6436 + }, + { + "epoch": 0.9850038255547054, + "grad_norm": 2.105354426708069, + "learning_rate": 1.0729177185761457e-05, + "loss": 0.571, + "step": 6437 + }, + { + "epoch": 0.9851568477429227, + "grad_norm": 2.288305079827146, + "learning_rate": 1.0726705577367652e-05, + "loss": 0.7155, + "step": 6438 + }, + { + "epoch": 0.98530986993114, + "grad_norm": 2.2850243056547734, + "learning_rate": 1.0724233924344016e-05, + "loss": 0.6159, + "step": 6439 + }, + { + "epoch": 0.9854628921193573, + "grad_norm": 1.9649566640224054, + "learning_rate": 1.0721762226842344e-05, + "loss": 0.5237, + "step": 6440 + }, + { + "epoch": 0.9856159143075746, + "grad_norm": 1.9893769537451282, + "learning_rate": 1.0719290485014428e-05, + "loss": 0.5814, + "step": 6441 + }, + { + "epoch": 0.9857689364957919, + "grad_norm": 2.564034080142649, + "learning_rate": 1.0716818699012067e-05, + "loss": 0.7568, + "step": 6442 + }, + { + "epoch": 0.9859219586840092, + "grad_norm": 2.295834576636577, + "learning_rate": 1.0714346868987068e-05, + "loss": 0.7564, + "step": 6443 + }, + { + "epoch": 0.9860749808722264, + "grad_norm": 2.0971214868536894, + "learning_rate": 1.0711874995091239e-05, + "loss": 0.6822, + "step": 6444 + }, + { + "epoch": 0.9862280030604438, + "grad_norm": 2.21508227273722, + "learning_rate": 1.0709403077476372e-05, + "loss": 0.6229, + "step": 6445 + }, + { + "epoch": 0.986381025248661, + "grad_norm": 2.043342706115372, + "learning_rate": 1.070693111629429e-05, + "loss": 0.5323, + "step": 6446 + }, + { + "epoch": 0.9865340474368783, + "grad_norm": 2.2277205626682277, + "learning_rate": 1.0704459111696805e-05, + "loss": 0.6203, + "step": 6447 + }, + { + "epoch": 0.9866870696250957, + "grad_norm": 2.5719639860424133, + 
"learning_rate": 1.0701987063835728e-05, + "loss": 0.784, + "step": 6448 + }, + { + "epoch": 0.9868400918133129, + "grad_norm": 2.4296363914023047, + "learning_rate": 1.069951497286288e-05, + "loss": 0.6575, + "step": 6449 + }, + { + "epoch": 0.9869931140015302, + "grad_norm": 2.7202807521864147, + "learning_rate": 1.069704283893008e-05, + "loss": 0.6461, + "step": 6450 + }, + { + "epoch": 0.9871461361897476, + "grad_norm": 2.143158373415149, + "learning_rate": 1.0694570662189152e-05, + "loss": 0.6412, + "step": 6451 + }, + { + "epoch": 0.9872991583779648, + "grad_norm": 2.3625004784021257, + "learning_rate": 1.0692098442791925e-05, + "loss": 0.6861, + "step": 6452 + }, + { + "epoch": 0.9874521805661821, + "grad_norm": 2.5257222065048013, + "learning_rate": 1.0689626180890225e-05, + "loss": 0.7539, + "step": 6453 + }, + { + "epoch": 0.9876052027543993, + "grad_norm": 2.1250124084674864, + "learning_rate": 1.0687153876635878e-05, + "loss": 0.7075, + "step": 6454 + }, + { + "epoch": 0.9877582249426167, + "grad_norm": 2.409259571306803, + "learning_rate": 1.068468153018073e-05, + "loss": 0.6375, + "step": 6455 + }, + { + "epoch": 0.987911247130834, + "grad_norm": 2.32874679222682, + "learning_rate": 1.068220914167661e-05, + "loss": 0.6387, + "step": 6456 + }, + { + "epoch": 0.9880642693190512, + "grad_norm": 2.2644969331295517, + "learning_rate": 1.0679736711275352e-05, + "loss": 0.6875, + "step": 6457 + }, + { + "epoch": 0.9882172915072686, + "grad_norm": 1.9025109306992631, + "learning_rate": 1.0677264239128809e-05, + "loss": 0.5653, + "step": 6458 + }, + { + "epoch": 0.9883703136954859, + "grad_norm": 2.1561627079866503, + "learning_rate": 1.0674791725388818e-05, + "loss": 0.6281, + "step": 6459 + }, + { + "epoch": 0.9885233358837031, + "grad_norm": 2.061516117776862, + "learning_rate": 1.0672319170207227e-05, + "loss": 0.6177, + "step": 6460 + }, + { + "epoch": 0.9886763580719204, + "grad_norm": 2.2521575998416066, + "learning_rate": 1.0669846573735885e-05, + "loss": 0.6979, + "step": 6461 + }, + { + "epoch": 0.9888293802601377, + "grad_norm": 2.1209188840044573, + "learning_rate": 1.0667373936126646e-05, + "loss": 0.7254, + "step": 6462 + }, + { + "epoch": 0.988982402448355, + "grad_norm": 2.4464366863064515, + "learning_rate": 1.0664901257531362e-05, + "loss": 0.6411, + "step": 6463 + }, + { + "epoch": 0.9891354246365723, + "grad_norm": 2.2092005460424895, + "learning_rate": 1.0662428538101893e-05, + "loss": 0.6615, + "step": 6464 + }, + { + "epoch": 0.9892884468247896, + "grad_norm": 2.298112700131103, + "learning_rate": 1.0659955777990092e-05, + "loss": 0.6517, + "step": 6465 + }, + { + "epoch": 0.9894414690130069, + "grad_norm": 2.51742007501707, + "learning_rate": 1.0657482977347826e-05, + "loss": 0.6929, + "step": 6466 + }, + { + "epoch": 0.9895944912012242, + "grad_norm": 2.2902881205759833, + "learning_rate": 1.0655010136326957e-05, + "loss": 0.6197, + "step": 6467 + }, + { + "epoch": 0.9897475133894414, + "grad_norm": 2.0662944894518684, + "learning_rate": 1.0652537255079359e-05, + "loss": 0.6396, + "step": 6468 + }, + { + "epoch": 0.9899005355776588, + "grad_norm": 2.3815178778296033, + "learning_rate": 1.0650064333756892e-05, + "loss": 0.7265, + "step": 6469 + }, + { + "epoch": 0.990053557765876, + "grad_norm": 2.332863841758195, + "learning_rate": 1.0647591372511427e-05, + "loss": 0.6633, + "step": 6470 + }, + { + "epoch": 0.9902065799540933, + "grad_norm": 2.289934714096384, + "learning_rate": 1.0645118371494847e-05, + "loss": 0.6172, + "step": 6471 + }, + { + "epoch": 
0.9903596021423107, + "grad_norm": 2.2808303258671625, + "learning_rate": 1.0642645330859023e-05, + "loss": 0.647, + "step": 6472 + }, + { + "epoch": 0.9905126243305279, + "grad_norm": 2.001627220573405, + "learning_rate": 1.0640172250755836e-05, + "loss": 0.6933, + "step": 6473 + }, + { + "epoch": 0.9906656465187452, + "grad_norm": 2.133895070860213, + "learning_rate": 1.0637699131337167e-05, + "loss": 0.6485, + "step": 6474 + }, + { + "epoch": 0.9908186687069626, + "grad_norm": 2.306956253050283, + "learning_rate": 1.06352259727549e-05, + "loss": 0.6595, + "step": 6475 + }, + { + "epoch": 0.9909716908951798, + "grad_norm": 2.1617438230826407, + "learning_rate": 1.0632752775160917e-05, + "loss": 0.5378, + "step": 6476 + }, + { + "epoch": 0.9911247130833971, + "grad_norm": 2.5429299085792296, + "learning_rate": 1.0630279538707116e-05, + "loss": 0.7616, + "step": 6477 + }, + { + "epoch": 0.9912777352716143, + "grad_norm": 2.252167453852857, + "learning_rate": 1.0627806263545379e-05, + "loss": 0.6494, + "step": 6478 + }, + { + "epoch": 0.9914307574598317, + "grad_norm": 2.2880745559100775, + "learning_rate": 1.0625332949827604e-05, + "loss": 0.6506, + "step": 6479 + }, + { + "epoch": 0.991583779648049, + "grad_norm": 2.476915922169972, + "learning_rate": 1.0622859597705686e-05, + "loss": 0.7807, + "step": 6480 + }, + { + "epoch": 0.9917368018362662, + "grad_norm": 2.4263691944067034, + "learning_rate": 1.0620386207331528e-05, + "loss": 0.6673, + "step": 6481 + }, + { + "epoch": 0.9918898240244836, + "grad_norm": 2.288249879251959, + "learning_rate": 1.0617912778857022e-05, + "loss": 0.5908, + "step": 6482 + }, + { + "epoch": 0.9920428462127009, + "grad_norm": 2.1329413418096625, + "learning_rate": 1.0615439312434073e-05, + "loss": 0.6263, + "step": 6483 + }, + { + "epoch": 0.9921958684009181, + "grad_norm": 2.3367729594747804, + "learning_rate": 1.061296580821459e-05, + "loss": 0.6701, + "step": 6484 + }, + { + "epoch": 0.9923488905891354, + "grad_norm": 2.0871602655046506, + "learning_rate": 1.0610492266350484e-05, + "loss": 0.5942, + "step": 6485 + }, + { + "epoch": 0.9925019127773527, + "grad_norm": 2.1715923797764236, + "learning_rate": 1.0608018686993651e-05, + "loss": 0.592, + "step": 6486 + }, + { + "epoch": 0.99265493496557, + "grad_norm": 2.1720410847528266, + "learning_rate": 1.0605545070296017e-05, + "loss": 0.6236, + "step": 6487 + }, + { + "epoch": 0.9928079571537873, + "grad_norm": 2.199533292871803, + "learning_rate": 1.060307141640949e-05, + "loss": 0.5939, + "step": 6488 + }, + { + "epoch": 0.9929609793420046, + "grad_norm": 2.314051801230297, + "learning_rate": 1.0600597725485988e-05, + "loss": 0.6454, + "step": 6489 + }, + { + "epoch": 0.9931140015302219, + "grad_norm": 2.414577584604856, + "learning_rate": 1.0598123997677432e-05, + "loss": 0.5892, + "step": 6490 + }, + { + "epoch": 0.9932670237184392, + "grad_norm": 2.220440454756989, + "learning_rate": 1.059565023313574e-05, + "loss": 0.6345, + "step": 6491 + }, + { + "epoch": 0.9934200459066564, + "grad_norm": 2.359228123391197, + "learning_rate": 1.0593176432012836e-05, + "loss": 0.6315, + "step": 6492 + }, + { + "epoch": 0.9935730680948738, + "grad_norm": 2.2007100542708797, + "learning_rate": 1.0590702594460649e-05, + "loss": 0.672, + "step": 6493 + }, + { + "epoch": 0.993726090283091, + "grad_norm": 2.162309831580927, + "learning_rate": 1.0588228720631104e-05, + "loss": 0.6506, + "step": 6494 + }, + { + "epoch": 0.9938791124713083, + "grad_norm": 2.5394139530332698, + "learning_rate": 1.0585754810676127e-05, + 
"loss": 0.8196, + "step": 6495 + }, + { + "epoch": 0.9940321346595257, + "grad_norm": 2.1750634220281104, + "learning_rate": 1.0583280864747663e-05, + "loss": 0.5415, + "step": 6496 + }, + { + "epoch": 0.9941851568477429, + "grad_norm": 2.056184817961102, + "learning_rate": 1.0580806882997638e-05, + "loss": 0.5811, + "step": 6497 + }, + { + "epoch": 0.9943381790359602, + "grad_norm": 1.9628993588402721, + "learning_rate": 1.0578332865577987e-05, + "loss": 0.5865, + "step": 6498 + }, + { + "epoch": 0.9944912012241776, + "grad_norm": 2.0909315968389572, + "learning_rate": 1.0575858812640653e-05, + "loss": 0.6559, + "step": 6499 + }, + { + "epoch": 0.9946442234123948, + "grad_norm": 2.1491534931389547, + "learning_rate": 1.057338472433758e-05, + "loss": 0.5262, + "step": 6500 + }, + { + "epoch": 0.9947972456006121, + "grad_norm": 2.25180793663901, + "learning_rate": 1.0570910600820703e-05, + "loss": 0.7352, + "step": 6501 + }, + { + "epoch": 0.9949502677888293, + "grad_norm": 2.1499742816028613, + "learning_rate": 1.0568436442241975e-05, + "loss": 0.6087, + "step": 6502 + }, + { + "epoch": 0.9951032899770467, + "grad_norm": 2.202787694955412, + "learning_rate": 1.0565962248753344e-05, + "loss": 0.6215, + "step": 6503 + }, + { + "epoch": 0.995256312165264, + "grad_norm": 2.225784673710967, + "learning_rate": 1.0563488020506754e-05, + "loss": 0.7141, + "step": 6504 + }, + { + "epoch": 0.9954093343534812, + "grad_norm": 2.3329954360329843, + "learning_rate": 1.056101375765416e-05, + "loss": 0.7444, + "step": 6505 + }, + { + "epoch": 0.9955623565416986, + "grad_norm": 2.0139835559989505, + "learning_rate": 1.0558539460347518e-05, + "loss": 0.5935, + "step": 6506 + }, + { + "epoch": 0.9957153787299159, + "grad_norm": 2.263146610045272, + "learning_rate": 1.0556065128738782e-05, + "loss": 0.6425, + "step": 6507 + }, + { + "epoch": 0.9958684009181331, + "grad_norm": 2.2635096597755426, + "learning_rate": 1.055359076297991e-05, + "loss": 0.6783, + "step": 6508 + }, + { + "epoch": 0.9960214231063504, + "grad_norm": 2.2452785958982764, + "learning_rate": 1.0551116363222864e-05, + "loss": 0.8419, + "step": 6509 + }, + { + "epoch": 0.9961744452945677, + "grad_norm": 2.447402852546272, + "learning_rate": 1.054864192961961e-05, + "loss": 0.6773, + "step": 6510 + }, + { + "epoch": 0.996327467482785, + "grad_norm": 1.9661543729169815, + "learning_rate": 1.0546167462322103e-05, + "loss": 0.5886, + "step": 6511 + }, + { + "epoch": 0.9964804896710023, + "grad_norm": 2.0809576482884693, + "learning_rate": 1.054369296148232e-05, + "loss": 0.6409, + "step": 6512 + }, + { + "epoch": 0.9966335118592196, + "grad_norm": 1.9729856131001111, + "learning_rate": 1.0541218427252222e-05, + "loss": 0.6166, + "step": 6513 + }, + { + "epoch": 0.9967865340474369, + "grad_norm": 2.206970740539544, + "learning_rate": 1.0538743859783787e-05, + "loss": 0.6654, + "step": 6514 + }, + { + "epoch": 0.9969395562356542, + "grad_norm": 2.0815380641893793, + "learning_rate": 1.053626925922898e-05, + "loss": 0.564, + "step": 6515 + }, + { + "epoch": 0.9970925784238714, + "grad_norm": 2.043032926313095, + "learning_rate": 1.0533794625739782e-05, + "loss": 0.6125, + "step": 6516 + }, + { + "epoch": 0.9972456006120888, + "grad_norm": 2.08135336365385, + "learning_rate": 1.0531319959468167e-05, + "loss": 0.5659, + "step": 6517 + }, + { + "epoch": 0.997398622800306, + "grad_norm": 2.0383774964273287, + "learning_rate": 1.0528845260566116e-05, + "loss": 0.5637, + "step": 6518 + }, + { + "epoch": 0.9975516449885233, + "grad_norm": 
2.066268610798444, + "learning_rate": 1.0526370529185608e-05, + "loss": 0.6478, + "step": 6519 + }, + { + "epoch": 0.9977046671767407, + "grad_norm": 2.5050073606635817, + "learning_rate": 1.0523895765478627e-05, + "loss": 0.6566, + "step": 6520 + }, + { + "epoch": 0.9978576893649579, + "grad_norm": 2.5001747882359084, + "learning_rate": 1.0521420969597156e-05, + "loss": 0.6971, + "step": 6521 + }, + { + "epoch": 0.9980107115531752, + "grad_norm": 2.21713217988516, + "learning_rate": 1.0518946141693185e-05, + "loss": 0.5671, + "step": 6522 + }, + { + "epoch": 0.9981637337413926, + "grad_norm": 2.216049429630443, + "learning_rate": 1.0516471281918702e-05, + "loss": 0.6914, + "step": 6523 + }, + { + "epoch": 0.9983167559296098, + "grad_norm": 2.2994572356248106, + "learning_rate": 1.0513996390425694e-05, + "loss": 0.6335, + "step": 6524 + }, + { + "epoch": 0.9984697781178271, + "grad_norm": 2.1282739516172238, + "learning_rate": 1.051152146736616e-05, + "loss": 0.5977, + "step": 6525 + }, + { + "epoch": 0.9986228003060443, + "grad_norm": 2.4591726916231615, + "learning_rate": 1.0509046512892092e-05, + "loss": 0.684, + "step": 6526 + }, + { + "epoch": 0.9987758224942617, + "grad_norm": 2.64549225493789, + "learning_rate": 1.0506571527155482e-05, + "loss": 0.6533, + "step": 6527 + }, + { + "epoch": 0.998928844682479, + "grad_norm": 2.3069809469331384, + "learning_rate": 1.0504096510308335e-05, + "loss": 0.601, + "step": 6528 + }, + { + "epoch": 0.9990818668706962, + "grad_norm": 2.144064608208212, + "learning_rate": 1.0501621462502651e-05, + "loss": 0.6259, + "step": 6529 + }, + { + "epoch": 0.9992348890589136, + "grad_norm": 2.4104646760425394, + "learning_rate": 1.049914638389043e-05, + "loss": 0.7642, + "step": 6530 + }, + { + "epoch": 0.9993879112471309, + "grad_norm": 2.048851027289111, + "learning_rate": 1.0496671274623675e-05, + "loss": 0.6663, + "step": 6531 + }, + { + "epoch": 0.9995409334353481, + "grad_norm": 2.531917512682566, + "learning_rate": 1.0494196134854395e-05, + "loss": 0.8121, + "step": 6532 + }, + { + "epoch": 0.9996939556235654, + "grad_norm": 2.7016640424743534, + "learning_rate": 1.0491720964734595e-05, + "loss": 0.7759, + "step": 6533 + }, + { + "epoch": 0.9998469778117827, + "grad_norm": 2.227959878473994, + "learning_rate": 1.0489245764416294e-05, + "loss": 0.6892, + "step": 6534 + }, + { + "epoch": 1.0, + "grad_norm": 2.2604243565912374, + "learning_rate": 1.0486770534051492e-05, + "loss": 0.7176, + "step": 6535 + }, + { + "epoch": 1.0001530221882173, + "grad_norm": 2.033100428541894, + "learning_rate": 1.0484295273792204e-05, + "loss": 0.6903, + "step": 6536 + }, + { + "epoch": 1.0003060443764347, + "grad_norm": 2.1314696539623275, + "learning_rate": 1.0481819983790454e-05, + "loss": 0.5955, + "step": 6537 + }, + { + "epoch": 1.0004590665646518, + "grad_norm": 2.5135837640358405, + "learning_rate": 1.0479344664198251e-05, + "loss": 0.6917, + "step": 6538 + }, + { + "epoch": 1.000612088752869, + "grad_norm": 2.133925287881583, + "learning_rate": 1.0476869315167617e-05, + "loss": 0.616, + "step": 6539 + }, + { + "epoch": 1.0007651109410864, + "grad_norm": 2.1679524566869377, + "learning_rate": 1.0474393936850573e-05, + "loss": 0.7227, + "step": 6540 + }, + { + "epoch": 1.0009181331293038, + "grad_norm": 2.3980955664439105, + "learning_rate": 1.0471918529399143e-05, + "loss": 0.7499, + "step": 6541 + }, + { + "epoch": 1.001071155317521, + "grad_norm": 2.3699351005706255, + "learning_rate": 1.046944309296535e-05, + "loss": 0.6967, + "step": 6542 + }, + { + "epoch": 
1.0012241775057382, + "grad_norm": 2.647323810066508, + "learning_rate": 1.0466967627701219e-05, + "loss": 0.7663, + "step": 6543 + }, + { + "epoch": 1.0013771996939556, + "grad_norm": 2.137859552412195, + "learning_rate": 1.0464492133758779e-05, + "loss": 0.5863, + "step": 6544 + }, + { + "epoch": 1.0015302218821729, + "grad_norm": 2.0219765558236285, + "learning_rate": 1.046201661129006e-05, + "loss": 0.6693, + "step": 6545 + }, + { + "epoch": 1.0016832440703902, + "grad_norm": 2.4366531420683137, + "learning_rate": 1.0459541060447091e-05, + "loss": 0.703, + "step": 6546 + }, + { + "epoch": 1.0018362662586076, + "grad_norm": 1.9828507027857656, + "learning_rate": 1.0457065481381913e-05, + "loss": 0.582, + "step": 6547 + }, + { + "epoch": 1.0019892884468249, + "grad_norm": 2.0692223786471624, + "learning_rate": 1.0454589874246555e-05, + "loss": 0.5675, + "step": 6548 + }, + { + "epoch": 1.002142310635042, + "grad_norm": 1.9859837618858298, + "learning_rate": 1.0452114239193049e-05, + "loss": 0.5558, + "step": 6549 + }, + { + "epoch": 1.0022953328232593, + "grad_norm": 2.272954348610601, + "learning_rate": 1.0449638576373445e-05, + "loss": 0.6769, + "step": 6550 + }, + { + "epoch": 1.0024483550114767, + "grad_norm": 2.420602351894704, + "learning_rate": 1.0447162885939776e-05, + "loss": 0.6877, + "step": 6551 + }, + { + "epoch": 1.002601377199694, + "grad_norm": 2.494825351434444, + "learning_rate": 1.0444687168044082e-05, + "loss": 0.7181, + "step": 6552 + }, + { + "epoch": 1.0027543993879113, + "grad_norm": 2.285517189923609, + "learning_rate": 1.044221142283841e-05, + "loss": 0.6558, + "step": 6553 + }, + { + "epoch": 1.0029074215761284, + "grad_norm": 2.044150801571359, + "learning_rate": 1.0439735650474807e-05, + "loss": 0.5108, + "step": 6554 + }, + { + "epoch": 1.0030604437643458, + "grad_norm": 2.26517016730885, + "learning_rate": 1.0437259851105318e-05, + "loss": 0.6558, + "step": 6555 + }, + { + "epoch": 1.003213465952563, + "grad_norm": 2.24291348116444, + "learning_rate": 1.0434784024881988e-05, + "loss": 0.6491, + "step": 6556 + }, + { + "epoch": 1.0033664881407804, + "grad_norm": 2.187640461685535, + "learning_rate": 1.043230817195687e-05, + "loss": 0.6019, + "step": 6557 + }, + { + "epoch": 1.0035195103289978, + "grad_norm": 2.3819794343778278, + "learning_rate": 1.0429832292482019e-05, + "loss": 0.7496, + "step": 6558 + }, + { + "epoch": 1.0036725325172149, + "grad_norm": 2.1489073395127356, + "learning_rate": 1.0427356386609482e-05, + "loss": 0.6125, + "step": 6559 + }, + { + "epoch": 1.0038255547054322, + "grad_norm": 2.0082206501261686, + "learning_rate": 1.042488045449132e-05, + "loss": 0.5703, + "step": 6560 + }, + { + "epoch": 1.0039785768936496, + "grad_norm": 2.142001534829241, + "learning_rate": 1.0422404496279585e-05, + "loss": 0.5988, + "step": 6561 + }, + { + "epoch": 1.0041315990818669, + "grad_norm": 2.1385641923885648, + "learning_rate": 1.0419928512126337e-05, + "loss": 0.6956, + "step": 6562 + }, + { + "epoch": 1.0042846212700842, + "grad_norm": 2.3824901717985436, + "learning_rate": 1.041745250218364e-05, + "loss": 0.6451, + "step": 6563 + }, + { + "epoch": 1.0044376434583016, + "grad_norm": 2.4015296436629776, + "learning_rate": 1.0414976466603551e-05, + "loss": 0.7278, + "step": 6564 + }, + { + "epoch": 1.0045906656465187, + "grad_norm": 2.1534435564300347, + "learning_rate": 1.0412500405538129e-05, + "loss": 0.6363, + "step": 6565 + }, + { + "epoch": 1.004743687834736, + "grad_norm": 2.48153539686987, + "learning_rate": 1.0410024319139447e-05, + "loss": 
0.6363, + "step": 6566 + }, + { + "epoch": 1.0048967100229533, + "grad_norm": 2.172810737488868, + "learning_rate": 1.040754820755957e-05, + "loss": 0.6484, + "step": 6567 + }, + { + "epoch": 1.0050497322111707, + "grad_norm": 2.5460489768001144, + "learning_rate": 1.040507207095056e-05, + "loss": 0.8215, + "step": 6568 + }, + { + "epoch": 1.005202754399388, + "grad_norm": 2.16079862891005, + "learning_rate": 1.0402595909464489e-05, + "loss": 0.5477, + "step": 6569 + }, + { + "epoch": 1.005355776587605, + "grad_norm": 2.202814891087014, + "learning_rate": 1.0400119723253428e-05, + "loss": 0.5666, + "step": 6570 + }, + { + "epoch": 1.0055087987758224, + "grad_norm": 2.212828269877084, + "learning_rate": 1.039764351246945e-05, + "loss": 0.5264, + "step": 6571 + }, + { + "epoch": 1.0056618209640398, + "grad_norm": 2.117103171907091, + "learning_rate": 1.039516727726463e-05, + "loss": 0.5496, + "step": 6572 + }, + { + "epoch": 1.005814843152257, + "grad_norm": 2.2084826072189996, + "learning_rate": 1.0392691017791041e-05, + "loss": 0.5141, + "step": 6573 + }, + { + "epoch": 1.0059678653404744, + "grad_norm": 2.32130852843778, + "learning_rate": 1.0390214734200758e-05, + "loss": 0.6334, + "step": 6574 + }, + { + "epoch": 1.0061208875286916, + "grad_norm": 2.2964584356167284, + "learning_rate": 1.0387738426645865e-05, + "loss": 0.6497, + "step": 6575 + }, + { + "epoch": 1.0062739097169089, + "grad_norm": 2.4158826799520434, + "learning_rate": 1.0385262095278438e-05, + "loss": 0.6641, + "step": 6576 + }, + { + "epoch": 1.0064269319051262, + "grad_norm": 2.167843641637399, + "learning_rate": 1.0382785740250557e-05, + "loss": 0.5314, + "step": 6577 + }, + { + "epoch": 1.0065799540933436, + "grad_norm": 2.283231101433695, + "learning_rate": 1.0380309361714306e-05, + "loss": 0.6207, + "step": 6578 + }, + { + "epoch": 1.0067329762815609, + "grad_norm": 2.536910953794421, + "learning_rate": 1.0377832959821773e-05, + "loss": 0.7732, + "step": 6579 + }, + { + "epoch": 1.0068859984697782, + "grad_norm": 2.194479016477561, + "learning_rate": 1.0375356534725036e-05, + "loss": 0.6016, + "step": 6580 + }, + { + "epoch": 1.0070390206579953, + "grad_norm": 2.568553979829469, + "learning_rate": 1.0372880086576187e-05, + "loss": 0.5968, + "step": 6581 + }, + { + "epoch": 1.0071920428462127, + "grad_norm": 2.2861022154819017, + "learning_rate": 1.0370403615527311e-05, + "loss": 0.6048, + "step": 6582 + }, + { + "epoch": 1.00734506503443, + "grad_norm": 2.302797866266313, + "learning_rate": 1.0367927121730503e-05, + "loss": 0.622, + "step": 6583 + }, + { + "epoch": 1.0074980872226473, + "grad_norm": 2.0879458077730715, + "learning_rate": 1.036545060533785e-05, + "loss": 0.5533, + "step": 6584 + }, + { + "epoch": 1.0076511094108647, + "grad_norm": 2.557603243001996, + "learning_rate": 1.0362974066501445e-05, + "loss": 0.7397, + "step": 6585 + }, + { + "epoch": 1.0078041315990818, + "grad_norm": 2.312252735085079, + "learning_rate": 1.0360497505373386e-05, + "loss": 0.7411, + "step": 6586 + }, + { + "epoch": 1.007957153787299, + "grad_norm": 2.3173357806675736, + "learning_rate": 1.035802092210576e-05, + "loss": 0.6168, + "step": 6587 + }, + { + "epoch": 1.0081101759755164, + "grad_norm": 2.2189725542214678, + "learning_rate": 1.0355544316850671e-05, + "loss": 0.6329, + "step": 6588 + }, + { + "epoch": 1.0082631981637338, + "grad_norm": 2.204599066701292, + "learning_rate": 1.0353067689760217e-05, + "loss": 0.5969, + "step": 6589 + }, + { + "epoch": 1.008416220351951, + "grad_norm": 2.6794556741279414, + 
"learning_rate": 1.0350591040986489e-05, + "loss": 0.6986, + "step": 6590 + }, + { + "epoch": 1.0085692425401682, + "grad_norm": 2.3559628832205797, + "learning_rate": 1.03481143706816e-05, + "loss": 0.6133, + "step": 6591 + }, + { + "epoch": 1.0087222647283856, + "grad_norm": 2.759684199213884, + "learning_rate": 1.0345637678997642e-05, + "loss": 0.6948, + "step": 6592 + }, + { + "epoch": 1.0088752869166029, + "grad_norm": 2.051050778484884, + "learning_rate": 1.0343160966086723e-05, + "loss": 0.6322, + "step": 6593 + }, + { + "epoch": 1.0090283091048202, + "grad_norm": 2.375738470682142, + "learning_rate": 1.0340684232100946e-05, + "loss": 0.6488, + "step": 6594 + }, + { + "epoch": 1.0091813312930376, + "grad_norm": 2.2479561575153966, + "learning_rate": 1.0338207477192417e-05, + "loss": 0.6915, + "step": 6595 + }, + { + "epoch": 1.0093343534812549, + "grad_norm": 2.1508005625889686, + "learning_rate": 1.0335730701513245e-05, + "loss": 0.5837, + "step": 6596 + }, + { + "epoch": 1.009487375669472, + "grad_norm": 2.4929163407137604, + "learning_rate": 1.0333253905215537e-05, + "loss": 0.7049, + "step": 6597 + }, + { + "epoch": 1.0096403978576893, + "grad_norm": 2.322330911445932, + "learning_rate": 1.0330777088451402e-05, + "loss": 0.6233, + "step": 6598 + }, + { + "epoch": 1.0097934200459067, + "grad_norm": 2.283433120066431, + "learning_rate": 1.0328300251372954e-05, + "loss": 0.5305, + "step": 6599 + }, + { + "epoch": 1.009946442234124, + "grad_norm": 2.17015984089956, + "learning_rate": 1.0325823394132301e-05, + "loss": 0.5958, + "step": 6600 + }, + { + "epoch": 1.0100994644223413, + "grad_norm": 2.436120696696377, + "learning_rate": 1.032334651688156e-05, + "loss": 0.6444, + "step": 6601 + }, + { + "epoch": 1.0102524866105584, + "grad_norm": 2.3607087470720938, + "learning_rate": 1.0320869619772847e-05, + "loss": 0.6319, + "step": 6602 + }, + { + "epoch": 1.0104055087987758, + "grad_norm": 2.2905924990988273, + "learning_rate": 1.0318392702958268e-05, + "loss": 0.6301, + "step": 6603 + }, + { + "epoch": 1.010558530986993, + "grad_norm": 2.44559146610296, + "learning_rate": 1.0315915766589957e-05, + "loss": 0.6883, + "step": 6604 + }, + { + "epoch": 1.0107115531752104, + "grad_norm": 2.4650413110292977, + "learning_rate": 1.0313438810820018e-05, + "loss": 0.5647, + "step": 6605 + }, + { + "epoch": 1.0108645753634278, + "grad_norm": 2.1910260377940567, + "learning_rate": 1.0310961835800572e-05, + "loss": 0.577, + "step": 6606 + }, + { + "epoch": 1.0110175975516449, + "grad_norm": 2.38892339041073, + "learning_rate": 1.0308484841683749e-05, + "loss": 0.5909, + "step": 6607 + }, + { + "epoch": 1.0111706197398622, + "grad_norm": 2.4041673025346832, + "learning_rate": 1.0306007828621665e-05, + "loss": 0.6755, + "step": 6608 + }, + { + "epoch": 1.0113236419280796, + "grad_norm": 2.2844546507484287, + "learning_rate": 1.0303530796766439e-05, + "loss": 0.5774, + "step": 6609 + }, + { + "epoch": 1.0114766641162969, + "grad_norm": 2.116403540663527, + "learning_rate": 1.03010537462702e-05, + "loss": 0.515, + "step": 6610 + }, + { + "epoch": 1.0116296863045142, + "grad_norm": 2.1836805111004374, + "learning_rate": 1.0298576677285076e-05, + "loss": 0.5105, + "step": 6611 + }, + { + "epoch": 1.0117827084927316, + "grad_norm": 2.507796655419912, + "learning_rate": 1.0296099589963185e-05, + "loss": 0.6309, + "step": 6612 + }, + { + "epoch": 1.0119357306809487, + "grad_norm": 2.0527464335910413, + "learning_rate": 1.0293622484456665e-05, + "loss": 0.6004, + "step": 6613 + }, + { + "epoch": 
1.012088752869166, + "grad_norm": 2.334226311405419, + "learning_rate": 1.0291145360917639e-05, + "loss": 0.5387, + "step": 6614 + }, + { + "epoch": 1.0122417750573833, + "grad_norm": 2.204170110208716, + "learning_rate": 1.0288668219498232e-05, + "loss": 0.6759, + "step": 6615 + }, + { + "epoch": 1.0123947972456007, + "grad_norm": 2.3108595745215776, + "learning_rate": 1.0286191060350586e-05, + "loss": 0.6272, + "step": 6616 + }, + { + "epoch": 1.012547819433818, + "grad_norm": 2.2782166898829077, + "learning_rate": 1.0283713883626829e-05, + "loss": 0.6998, + "step": 6617 + }, + { + "epoch": 1.012700841622035, + "grad_norm": 2.2993109109204903, + "learning_rate": 1.0281236689479086e-05, + "loss": 0.6204, + "step": 6618 + }, + { + "epoch": 1.0128538638102524, + "grad_norm": 2.3404054714266582, + "learning_rate": 1.0278759478059502e-05, + "loss": 0.6709, + "step": 6619 + }, + { + "epoch": 1.0130068859984698, + "grad_norm": 2.5884362282825637, + "learning_rate": 1.0276282249520207e-05, + "loss": 0.7326, + "step": 6620 + }, + { + "epoch": 1.013159908186687, + "grad_norm": 1.9300570634216025, + "learning_rate": 1.0273805004013337e-05, + "loss": 0.5012, + "step": 6621 + }, + { + "epoch": 1.0133129303749044, + "grad_norm": 2.377275021865992, + "learning_rate": 1.0271327741691032e-05, + "loss": 0.6814, + "step": 6622 + }, + { + "epoch": 1.0134659525631216, + "grad_norm": 2.182477995481934, + "learning_rate": 1.0268850462705431e-05, + "loss": 0.5598, + "step": 6623 + }, + { + "epoch": 1.0136189747513389, + "grad_norm": 2.3066689578762207, + "learning_rate": 1.0266373167208668e-05, + "loss": 0.6129, + "step": 6624 + }, + { + "epoch": 1.0137719969395562, + "grad_norm": 2.307693232138134, + "learning_rate": 1.0263895855352887e-05, + "loss": 0.5754, + "step": 6625 + }, + { + "epoch": 1.0139250191277736, + "grad_norm": 2.292009668838204, + "learning_rate": 1.0261418527290233e-05, + "loss": 0.6121, + "step": 6626 + }, + { + "epoch": 1.0140780413159909, + "grad_norm": 2.492568937733668, + "learning_rate": 1.025894118317284e-05, + "loss": 0.6629, + "step": 6627 + }, + { + "epoch": 1.0142310635042082, + "grad_norm": 2.2340661216832096, + "learning_rate": 1.0256463823152855e-05, + "loss": 0.5866, + "step": 6628 + }, + { + "epoch": 1.0143840856924253, + "grad_norm": 2.110334526976937, + "learning_rate": 1.0253986447382429e-05, + "loss": 0.5203, + "step": 6629 + }, + { + "epoch": 1.0145371078806427, + "grad_norm": 2.692658979279218, + "learning_rate": 1.0251509056013697e-05, + "loss": 0.5705, + "step": 6630 + }, + { + "epoch": 1.01469013006886, + "grad_norm": 2.428970298715567, + "learning_rate": 1.024903164919881e-05, + "loss": 0.6109, + "step": 6631 + }, + { + "epoch": 1.0148431522570773, + "grad_norm": 2.238146074078266, + "learning_rate": 1.0246554227089918e-05, + "loss": 0.6261, + "step": 6632 + }, + { + "epoch": 1.0149961744452947, + "grad_norm": 2.573325893114674, + "learning_rate": 1.0244076789839169e-05, + "loss": 0.6369, + "step": 6633 + }, + { + "epoch": 1.0151491966335118, + "grad_norm": 2.3159256199731053, + "learning_rate": 1.0241599337598702e-05, + "loss": 0.5916, + "step": 6634 + }, + { + "epoch": 1.015302218821729, + "grad_norm": 2.2308322352798444, + "learning_rate": 1.0239121870520679e-05, + "loss": 0.5703, + "step": 6635 + }, + { + "epoch": 1.0154552410099464, + "grad_norm": 2.329808400919156, + "learning_rate": 1.0236644388757245e-05, + "loss": 0.6378, + "step": 6636 + }, + { + "epoch": 1.0156082631981638, + "grad_norm": 2.435679508309255, + "learning_rate": 1.0234166892460554e-05, + 
"loss": 0.6254, + "step": 6637 + }, + { + "epoch": 1.015761285386381, + "grad_norm": 2.4224361592010295, + "learning_rate": 1.0231689381782756e-05, + "loss": 0.5599, + "step": 6638 + }, + { + "epoch": 1.0159143075745982, + "grad_norm": 2.1885969206768032, + "learning_rate": 1.0229211856876011e-05, + "loss": 0.5952, + "step": 6639 + }, + { + "epoch": 1.0160673297628156, + "grad_norm": 2.3897857156470255, + "learning_rate": 1.0226734317892466e-05, + "loss": 0.5323, + "step": 6640 + }, + { + "epoch": 1.0162203519510329, + "grad_norm": 2.092213659667365, + "learning_rate": 1.0224256764984282e-05, + "loss": 0.4703, + "step": 6641 + }, + { + "epoch": 1.0163733741392502, + "grad_norm": 2.1854404332671185, + "learning_rate": 1.022177919830361e-05, + "loss": 0.6041, + "step": 6642 + }, + { + "epoch": 1.0165263963274676, + "grad_norm": 2.4022816997942593, + "learning_rate": 1.0219301618002611e-05, + "loss": 0.64, + "step": 6643 + }, + { + "epoch": 1.0166794185156847, + "grad_norm": 2.3801526159134982, + "learning_rate": 1.021682402423344e-05, + "loss": 0.5524, + "step": 6644 + }, + { + "epoch": 1.016832440703902, + "grad_norm": 2.218696255320501, + "learning_rate": 1.021434641714826e-05, + "loss": 0.6162, + "step": 6645 + }, + { + "epoch": 1.0169854628921193, + "grad_norm": 2.7939201783520056, + "learning_rate": 1.0211868796899229e-05, + "loss": 0.6066, + "step": 6646 + }, + { + "epoch": 1.0171384850803367, + "grad_norm": 2.264811330544672, + "learning_rate": 1.0209391163638503e-05, + "loss": 0.539, + "step": 6647 + }, + { + "epoch": 1.017291507268554, + "grad_norm": 2.318419121835664, + "learning_rate": 1.0206913517518246e-05, + "loss": 0.6371, + "step": 6648 + }, + { + "epoch": 1.0174445294567713, + "grad_norm": 2.206422409859083, + "learning_rate": 1.0204435858690625e-05, + "loss": 0.5619, + "step": 6649 + }, + { + "epoch": 1.0175975516449884, + "grad_norm": 2.626006555589893, + "learning_rate": 1.0201958187307794e-05, + "loss": 0.5531, + "step": 6650 + }, + { + "epoch": 1.0177505738332058, + "grad_norm": 2.344140763614317, + "learning_rate": 1.0199480503521924e-05, + "loss": 0.6345, + "step": 6651 + }, + { + "epoch": 1.017903596021423, + "grad_norm": 2.2889519250935177, + "learning_rate": 1.0197002807485175e-05, + "loss": 0.5799, + "step": 6652 + }, + { + "epoch": 1.0180566182096404, + "grad_norm": 2.6253194916546154, + "learning_rate": 1.0194525099349708e-05, + "loss": 0.6793, + "step": 6653 + }, + { + "epoch": 1.0182096403978578, + "grad_norm": 2.3472924349764344, + "learning_rate": 1.01920473792677e-05, + "loss": 0.5215, + "step": 6654 + }, + { + "epoch": 1.0183626625860749, + "grad_norm": 2.22200896592304, + "learning_rate": 1.0189569647391308e-05, + "loss": 0.5543, + "step": 6655 + }, + { + "epoch": 1.0185156847742922, + "grad_norm": 2.0095456114314585, + "learning_rate": 1.0187091903872703e-05, + "loss": 0.5121, + "step": 6656 + }, + { + "epoch": 1.0186687069625096, + "grad_norm": 2.297998178865159, + "learning_rate": 1.0184614148864052e-05, + "loss": 0.584, + "step": 6657 + }, + { + "epoch": 1.0188217291507269, + "grad_norm": 2.332497200625412, + "learning_rate": 1.0182136382517526e-05, + "loss": 0.5356, + "step": 6658 + }, + { + "epoch": 1.0189747513389442, + "grad_norm": 2.4417067324261064, + "learning_rate": 1.017965860498529e-05, + "loss": 0.543, + "step": 6659 + }, + { + "epoch": 1.0191277735271613, + "grad_norm": 2.15499036744487, + "learning_rate": 1.0177180816419516e-05, + "loss": 0.5874, + "step": 6660 + }, + { + "epoch": 1.0192807957153787, + "grad_norm": 2.108874428758097, + 
"learning_rate": 1.0174703016972376e-05, + "loss": 0.4502, + "step": 6661 + }, + { + "epoch": 1.019433817903596, + "grad_norm": 2.548469773857961, + "learning_rate": 1.017222520679604e-05, + "loss": 0.5865, + "step": 6662 + }, + { + "epoch": 1.0195868400918133, + "grad_norm": 2.513582400490902, + "learning_rate": 1.0169747386042681e-05, + "loss": 0.653, + "step": 6663 + }, + { + "epoch": 1.0197398622800307, + "grad_norm": 2.492061054460402, + "learning_rate": 1.0167269554864472e-05, + "loss": 0.6406, + "step": 6664 + }, + { + "epoch": 1.019892884468248, + "grad_norm": 2.180717896254185, + "learning_rate": 1.0164791713413583e-05, + "loss": 0.5883, + "step": 6665 + }, + { + "epoch": 1.020045906656465, + "grad_norm": 2.2006717979763786, + "learning_rate": 1.016231386184219e-05, + "loss": 0.5509, + "step": 6666 + }, + { + "epoch": 1.0201989288446824, + "grad_norm": 2.3252357391678586, + "learning_rate": 1.015983600030247e-05, + "loss": 0.6702, + "step": 6667 + }, + { + "epoch": 1.0203519510328998, + "grad_norm": 2.14665930839897, + "learning_rate": 1.0157358128946596e-05, + "loss": 0.5438, + "step": 6668 + }, + { + "epoch": 1.020504973221117, + "grad_norm": 2.3069969574975002, + "learning_rate": 1.0154880247926739e-05, + "loss": 0.5291, + "step": 6669 + }, + { + "epoch": 1.0206579954093344, + "grad_norm": 2.413058002612041, + "learning_rate": 1.0152402357395086e-05, + "loss": 0.5105, + "step": 6670 + }, + { + "epoch": 1.0208110175975516, + "grad_norm": 2.272849768241214, + "learning_rate": 1.0149924457503806e-05, + "loss": 0.6272, + "step": 6671 + }, + { + "epoch": 1.0209640397857689, + "grad_norm": 2.122826286435337, + "learning_rate": 1.0147446548405076e-05, + "loss": 0.516, + "step": 6672 + }, + { + "epoch": 1.0211170619739862, + "grad_norm": 2.329647854250823, + "learning_rate": 1.0144968630251078e-05, + "loss": 0.5394, + "step": 6673 + }, + { + "epoch": 1.0212700841622036, + "grad_norm": 2.1569395923518138, + "learning_rate": 1.0142490703193992e-05, + "loss": 0.6182, + "step": 6674 + }, + { + "epoch": 1.0214231063504209, + "grad_norm": 2.241545575430639, + "learning_rate": 1.014001276738599e-05, + "loss": 0.5988, + "step": 6675 + }, + { + "epoch": 1.021576128538638, + "grad_norm": 2.129116623058144, + "learning_rate": 1.0137534822979258e-05, + "loss": 0.539, + "step": 6676 + }, + { + "epoch": 1.0217291507268553, + "grad_norm": 2.256824403314073, + "learning_rate": 1.0135056870125976e-05, + "loss": 0.6206, + "step": 6677 + }, + { + "epoch": 1.0218821729150727, + "grad_norm": 2.354857212906966, + "learning_rate": 1.0132578908978323e-05, + "loss": 0.6072, + "step": 6678 + }, + { + "epoch": 1.02203519510329, + "grad_norm": 2.2770225052597994, + "learning_rate": 1.0130100939688478e-05, + "loss": 0.6071, + "step": 6679 + }, + { + "epoch": 1.0221882172915073, + "grad_norm": 2.1691193316787336, + "learning_rate": 1.012762296240863e-05, + "loss": 0.5668, + "step": 6680 + }, + { + "epoch": 1.0223412394797247, + "grad_norm": 2.138908409021962, + "learning_rate": 1.0125144977290952e-05, + "loss": 0.5591, + "step": 6681 + }, + { + "epoch": 1.0224942616679418, + "grad_norm": 2.3379186543767854, + "learning_rate": 1.0122666984487632e-05, + "loss": 0.5591, + "step": 6682 + }, + { + "epoch": 1.022647283856159, + "grad_norm": 2.480112583086363, + "learning_rate": 1.0120188984150857e-05, + "loss": 0.6374, + "step": 6683 + }, + { + "epoch": 1.0228003060443764, + "grad_norm": 2.2457431345148935, + "learning_rate": 1.0117710976432802e-05, + "loss": 0.603, + "step": 6684 + }, + { + "epoch": 1.0229533282325938, 
+ "grad_norm": 2.3766699997888945, + "learning_rate": 1.0115232961485655e-05, + "loss": 0.553, + "step": 6685 + }, + { + "epoch": 1.023106350420811, + "grad_norm": 2.1244860650080652, + "learning_rate": 1.0112754939461603e-05, + "loss": 0.5574, + "step": 6686 + }, + { + "epoch": 1.0232593726090282, + "grad_norm": 2.5816730909218815, + "learning_rate": 1.011027691051283e-05, + "loss": 0.6624, + "step": 6687 + }, + { + "epoch": 1.0234123947972456, + "grad_norm": 2.2465134024423703, + "learning_rate": 1.010779887479152e-05, + "loss": 0.6497, + "step": 6688 + }, + { + "epoch": 1.0235654169854629, + "grad_norm": 2.193620461985959, + "learning_rate": 1.0105320832449856e-05, + "loss": 0.5496, + "step": 6689 + }, + { + "epoch": 1.0237184391736802, + "grad_norm": 2.3333263624843212, + "learning_rate": 1.0102842783640032e-05, + "loss": 0.6004, + "step": 6690 + }, + { + "epoch": 1.0238714613618976, + "grad_norm": 2.3841598412894442, + "learning_rate": 1.0100364728514228e-05, + "loss": 0.6371, + "step": 6691 + }, + { + "epoch": 1.0240244835501147, + "grad_norm": 2.223066765452285, + "learning_rate": 1.0097886667224634e-05, + "loss": 0.5576, + "step": 6692 + }, + { + "epoch": 1.024177505738332, + "grad_norm": 2.5087968900231012, + "learning_rate": 1.0095408599923438e-05, + "loss": 0.5812, + "step": 6693 + }, + { + "epoch": 1.0243305279265493, + "grad_norm": 2.57215618551181, + "learning_rate": 1.0092930526762824e-05, + "loss": 0.6023, + "step": 6694 + }, + { + "epoch": 1.0244835501147667, + "grad_norm": 2.402925023331007, + "learning_rate": 1.0090452447894985e-05, + "loss": 0.5092, + "step": 6695 + }, + { + "epoch": 1.024636572302984, + "grad_norm": 2.2994948689718275, + "learning_rate": 1.0087974363472107e-05, + "loss": 0.6011, + "step": 6696 + }, + { + "epoch": 1.0247895944912013, + "grad_norm": 2.220827506450029, + "learning_rate": 1.0085496273646377e-05, + "loss": 0.5272, + "step": 6697 + }, + { + "epoch": 1.0249426166794184, + "grad_norm": 2.337313218700674, + "learning_rate": 1.0083018178569987e-05, + "loss": 0.5645, + "step": 6698 + }, + { + "epoch": 1.0250956388676358, + "grad_norm": 2.3675969691011254, + "learning_rate": 1.008054007839513e-05, + "loss": 0.6337, + "step": 6699 + }, + { + "epoch": 1.025248661055853, + "grad_norm": 2.150314678408855, + "learning_rate": 1.0078061973273986e-05, + "loss": 0.5149, + "step": 6700 + }, + { + "epoch": 1.0254016832440704, + "grad_norm": 2.5780589470023942, + "learning_rate": 1.0075583863358756e-05, + "loss": 0.608, + "step": 6701 + }, + { + "epoch": 1.0255547054322878, + "grad_norm": 2.147503852334118, + "learning_rate": 1.0073105748801622e-05, + "loss": 0.4818, + "step": 6702 + }, + { + "epoch": 1.0257077276205049, + "grad_norm": 2.4672879810343504, + "learning_rate": 1.0070627629754778e-05, + "loss": 0.4973, + "step": 6703 + }, + { + "epoch": 1.0258607498087222, + "grad_norm": 2.2872325728792804, + "learning_rate": 1.0068149506370418e-05, + "loss": 0.4786, + "step": 6704 + }, + { + "epoch": 1.0260137719969395, + "grad_norm": 2.8109264658399375, + "learning_rate": 1.0065671378800725e-05, + "loss": 0.5947, + "step": 6705 + }, + { + "epoch": 1.0261667941851569, + "grad_norm": 2.12874627307908, + "learning_rate": 1.00631932471979e-05, + "loss": 0.475, + "step": 6706 + }, + { + "epoch": 1.0263198163733742, + "grad_norm": 2.3004995092458516, + "learning_rate": 1.006071511171413e-05, + "loss": 0.577, + "step": 6707 + }, + { + "epoch": 1.0264728385615913, + "grad_norm": 2.530976996073714, + "learning_rate": 1.0058236972501607e-05, + "loss": 0.7009, + "step": 
6708 + }, + { + "epoch": 1.0266258607498087, + "grad_norm": 2.3849208819910586, + "learning_rate": 1.0055758829712522e-05, + "loss": 0.5976, + "step": 6709 + }, + { + "epoch": 1.026778882938026, + "grad_norm": 2.141201510237259, + "learning_rate": 1.0053280683499069e-05, + "loss": 0.5255, + "step": 6710 + }, + { + "epoch": 1.0269319051262433, + "grad_norm": 2.3014505047534985, + "learning_rate": 1.0050802534013444e-05, + "loss": 0.5084, + "step": 6711 + }, + { + "epoch": 1.0270849273144607, + "grad_norm": 2.6769632245570394, + "learning_rate": 1.0048324381407837e-05, + "loss": 0.6121, + "step": 6712 + }, + { + "epoch": 1.027237949502678, + "grad_norm": 2.2107203457710756, + "learning_rate": 1.0045846225834434e-05, + "loss": 0.5219, + "step": 6713 + }, + { + "epoch": 1.027390971690895, + "grad_norm": 2.2878697948454403, + "learning_rate": 1.004336806744544e-05, + "loss": 0.5553, + "step": 6714 + }, + { + "epoch": 1.0275439938791124, + "grad_norm": 2.1499839688324327, + "learning_rate": 1.0040889906393044e-05, + "loss": 0.5307, + "step": 6715 + }, + { + "epoch": 1.0276970160673298, + "grad_norm": 2.3165989907378037, + "learning_rate": 1.0038411742829437e-05, + "loss": 0.5045, + "step": 6716 + }, + { + "epoch": 1.027850038255547, + "grad_norm": 2.1737988536205797, + "learning_rate": 1.0035933576906815e-05, + "loss": 0.5147, + "step": 6717 + }, + { + "epoch": 1.0280030604437644, + "grad_norm": 2.489338280509698, + "learning_rate": 1.0033455408777374e-05, + "loss": 0.5733, + "step": 6718 + }, + { + "epoch": 1.0281560826319815, + "grad_norm": 2.3878592780046497, + "learning_rate": 1.0030977238593303e-05, + "loss": 0.5415, + "step": 6719 + }, + { + "epoch": 1.0283091048201989, + "grad_norm": 2.4479340330867587, + "learning_rate": 1.0028499066506799e-05, + "loss": 0.6103, + "step": 6720 + }, + { + "epoch": 1.0284621270084162, + "grad_norm": 2.3518568222133633, + "learning_rate": 1.0026020892670056e-05, + "loss": 0.5775, + "step": 6721 + }, + { + "epoch": 1.0286151491966335, + "grad_norm": 2.938195775116116, + "learning_rate": 1.0023542717235268e-05, + "loss": 0.5382, + "step": 6722 + }, + { + "epoch": 1.0287681713848509, + "grad_norm": 2.3623126529241554, + "learning_rate": 1.0021064540354627e-05, + "loss": 0.5309, + "step": 6723 + }, + { + "epoch": 1.028921193573068, + "grad_norm": 2.4496299533549166, + "learning_rate": 1.0018586362180335e-05, + "loss": 0.6377, + "step": 6724 + }, + { + "epoch": 1.0290742157612853, + "grad_norm": 2.573262792528309, + "learning_rate": 1.0016108182864578e-05, + "loss": 0.5198, + "step": 6725 + }, + { + "epoch": 1.0292272379495027, + "grad_norm": 2.29374203363862, + "learning_rate": 1.0013630002559556e-05, + "loss": 0.5467, + "step": 6726 + }, + { + "epoch": 1.02938026013772, + "grad_norm": 2.544986422031994, + "learning_rate": 1.0011151821417462e-05, + "loss": 0.6458, + "step": 6727 + }, + { + "epoch": 1.0295332823259373, + "grad_norm": 2.355349246426756, + "learning_rate": 1.0008673639590493e-05, + "loss": 0.533, + "step": 6728 + }, + { + "epoch": 1.0296863045141547, + "grad_norm": 2.4051355343169605, + "learning_rate": 1.0006195457230838e-05, + "loss": 0.5813, + "step": 6729 + }, + { + "epoch": 1.0298393267023718, + "grad_norm": 2.2601662708897052, + "learning_rate": 1.0003717274490698e-05, + "loss": 0.503, + "step": 6730 + }, + { + "epoch": 1.029992348890589, + "grad_norm": 2.6026331972423145, + "learning_rate": 1.0001239091522266e-05, + "loss": 0.6445, + "step": 6731 + }, + { + "epoch": 1.0301453710788064, + "grad_norm": 2.4502665700638664, + "learning_rate": 
9.998760908477734e-06, + "loss": 0.6071, + "step": 6732 + }, + { + "epoch": 1.0302983932670238, + "grad_norm": 2.313243201112418, + "learning_rate": 9.996282725509305e-06, + "loss": 0.6863, + "step": 6733 + }, + { + "epoch": 1.030451415455241, + "grad_norm": 2.1587677649613357, + "learning_rate": 9.993804542769167e-06, + "loss": 0.5032, + "step": 6734 + }, + { + "epoch": 1.0306044376434582, + "grad_norm": 2.4333278255649176, + "learning_rate": 9.991326360409509e-06, + "loss": 0.5529, + "step": 6735 + }, + { + "epoch": 1.0307574598316755, + "grad_norm": 2.4443357441123292, + "learning_rate": 9.988848178582541e-06, + "loss": 0.5748, + "step": 6736 + }, + { + "epoch": 1.0309104820198929, + "grad_norm": 2.190378352896407, + "learning_rate": 9.986369997440445e-06, + "loss": 0.5912, + "step": 6737 + }, + { + "epoch": 1.0310635042081102, + "grad_norm": 2.2856778913735143, + "learning_rate": 9.983891817135423e-06, + "loss": 0.6169, + "step": 6738 + }, + { + "epoch": 1.0312165263963275, + "grad_norm": 2.3383660191650604, + "learning_rate": 9.98141363781967e-06, + "loss": 0.5175, + "step": 6739 + }, + { + "epoch": 1.0313695485845447, + "grad_norm": 2.4669727398099264, + "learning_rate": 9.978935459645374e-06, + "loss": 0.5764, + "step": 6740 + }, + { + "epoch": 1.031522570772762, + "grad_norm": 2.454183738970548, + "learning_rate": 9.976457282764735e-06, + "loss": 0.5029, + "step": 6741 + }, + { + "epoch": 1.0316755929609793, + "grad_norm": 2.146600754225679, + "learning_rate": 9.97397910732995e-06, + "loss": 0.4988, + "step": 6742 + }, + { + "epoch": 1.0318286151491967, + "grad_norm": 2.544317459747779, + "learning_rate": 9.971500933493203e-06, + "loss": 0.5676, + "step": 6743 + }, + { + "epoch": 1.031981637337414, + "grad_norm": 2.129239502795901, + "learning_rate": 9.9690227614067e-06, + "loss": 0.5151, + "step": 6744 + }, + { + "epoch": 1.032134659525631, + "grad_norm": 2.5537722341810354, + "learning_rate": 9.966544591222626e-06, + "loss": 0.515, + "step": 6745 + }, + { + "epoch": 1.0322876817138484, + "grad_norm": 2.4961159403188464, + "learning_rate": 9.964066423093186e-06, + "loss": 0.6098, + "step": 6746 + }, + { + "epoch": 1.0324407039020658, + "grad_norm": 2.26264700212384, + "learning_rate": 9.961588257170565e-06, + "loss": 0.4847, + "step": 6747 + }, + { + "epoch": 1.032593726090283, + "grad_norm": 2.3122842418111667, + "learning_rate": 9.959110093606956e-06, + "loss": 0.5332, + "step": 6748 + }, + { + "epoch": 1.0327467482785004, + "grad_norm": 2.3473237403635676, + "learning_rate": 9.95663193255456e-06, + "loss": 0.4923, + "step": 6749 + }, + { + "epoch": 1.0328997704667178, + "grad_norm": 2.493445708562281, + "learning_rate": 9.954153774165564e-06, + "loss": 0.6176, + "step": 6750 + }, + { + "epoch": 1.0330527926549349, + "grad_norm": 2.0928945035756823, + "learning_rate": 9.951675618592168e-06, + "loss": 0.4642, + "step": 6751 + }, + { + "epoch": 1.0332058148431522, + "grad_norm": 2.3615436131724716, + "learning_rate": 9.94919746598656e-06, + "loss": 0.4763, + "step": 6752 + }, + { + "epoch": 1.0333588370313695, + "grad_norm": 2.4754545731602575, + "learning_rate": 9.946719316500931e-06, + "loss": 0.5336, + "step": 6753 + }, + { + "epoch": 1.0335118592195869, + "grad_norm": 2.442943084544504, + "learning_rate": 9.94424117028748e-06, + "loss": 0.5174, + "step": 6754 + }, + { + "epoch": 1.0336648814078042, + "grad_norm": 2.682564815574864, + "learning_rate": 9.941763027498398e-06, + "loss": 0.6083, + "step": 6755 + }, + { + "epoch": 1.0338179035960213, + "grad_norm": 
2.6595816044960965, + "learning_rate": 9.939284888285872e-06, + "loss": 0.5766, + "step": 6756 + }, + { + "epoch": 1.0339709257842387, + "grad_norm": 2.5083499459558656, + "learning_rate": 9.936806752802103e-06, + "loss": 0.5509, + "step": 6757 + }, + { + "epoch": 1.034123947972456, + "grad_norm": 2.275555021513342, + "learning_rate": 9.934328621199273e-06, + "loss": 0.5047, + "step": 6758 + }, + { + "epoch": 1.0342769701606733, + "grad_norm": 2.028981076722719, + "learning_rate": 9.931850493629587e-06, + "loss": 0.4789, + "step": 6759 + }, + { + "epoch": 1.0344299923488907, + "grad_norm": 2.0211581388756037, + "learning_rate": 9.929372370245225e-06, + "loss": 0.5024, + "step": 6760 + }, + { + "epoch": 1.034583014537108, + "grad_norm": 2.324035265435848, + "learning_rate": 9.92689425119838e-06, + "loss": 0.5728, + "step": 6761 + }, + { + "epoch": 1.034736036725325, + "grad_norm": 2.242721972544215, + "learning_rate": 9.924416136641249e-06, + "loss": 0.4953, + "step": 6762 + }, + { + "epoch": 1.0348890589135424, + "grad_norm": 2.432779427177171, + "learning_rate": 9.921938026726015e-06, + "loss": 0.6302, + "step": 6763 + }, + { + "epoch": 1.0350420811017598, + "grad_norm": 2.2598617265081153, + "learning_rate": 9.919459921604872e-06, + "loss": 0.5825, + "step": 6764 + }, + { + "epoch": 1.035195103289977, + "grad_norm": 2.5631858944269412, + "learning_rate": 9.916981821430016e-06, + "loss": 0.6006, + "step": 6765 + }, + { + "epoch": 1.0353481254781944, + "grad_norm": 2.50597329793168, + "learning_rate": 9.914503726353623e-06, + "loss": 0.5106, + "step": 6766 + }, + { + "epoch": 1.0355011476664115, + "grad_norm": 2.5551518698686464, + "learning_rate": 9.912025636527897e-06, + "loss": 0.5799, + "step": 6767 + }, + { + "epoch": 1.0356541698546289, + "grad_norm": 2.716633047615582, + "learning_rate": 9.90954755210502e-06, + "loss": 0.5963, + "step": 6768 + }, + { + "epoch": 1.0358071920428462, + "grad_norm": 2.187872786560717, + "learning_rate": 9.907069473237178e-06, + "loss": 0.5286, + "step": 6769 + }, + { + "epoch": 1.0359602142310635, + "grad_norm": 2.27583781573365, + "learning_rate": 9.904591400076567e-06, + "loss": 0.5259, + "step": 6770 + }, + { + "epoch": 1.0361132364192809, + "grad_norm": 2.2572498780404486, + "learning_rate": 9.902113332775372e-06, + "loss": 0.507, + "step": 6771 + }, + { + "epoch": 1.036266258607498, + "grad_norm": 2.270017680287196, + "learning_rate": 9.899635271485774e-06, + "loss": 0.5607, + "step": 6772 + }, + { + "epoch": 1.0364192807957153, + "grad_norm": 2.23521431485531, + "learning_rate": 9.897157216359972e-06, + "loss": 0.5248, + "step": 6773 + }, + { + "epoch": 1.0365723029839327, + "grad_norm": 2.3389129894008693, + "learning_rate": 9.894679167550143e-06, + "loss": 0.4359, + "step": 6774 + }, + { + "epoch": 1.03672532517215, + "grad_norm": 2.2620396345267344, + "learning_rate": 9.892201125208484e-06, + "loss": 0.5479, + "step": 6775 + }, + { + "epoch": 1.0368783473603673, + "grad_norm": 2.728001867195493, + "learning_rate": 9.889723089487175e-06, + "loss": 0.6709, + "step": 6776 + }, + { + "epoch": 1.0370313695485844, + "grad_norm": 2.1827657385278445, + "learning_rate": 9.887245060538397e-06, + "loss": 0.496, + "step": 6777 + }, + { + "epoch": 1.0371843917368018, + "grad_norm": 2.3223435649613973, + "learning_rate": 9.884767038514348e-06, + "loss": 0.5999, + "step": 6778 + }, + { + "epoch": 1.037337413925019, + "grad_norm": 2.2614536615688663, + "learning_rate": 9.882289023567203e-06, + "loss": 0.5056, + "step": 6779 + }, + { + "epoch": 
1.0374904361132364, + "grad_norm": 2.73646991422452, + "learning_rate": 9.879811015849147e-06, + "loss": 0.6103, + "step": 6780 + }, + { + "epoch": 1.0376434583014538, + "grad_norm": 2.5056718112340093, + "learning_rate": 9.87733301551237e-06, + "loss": 0.5605, + "step": 6781 + }, + { + "epoch": 1.037796480489671, + "grad_norm": 2.48718682079311, + "learning_rate": 9.87485502270905e-06, + "loss": 0.5004, + "step": 6782 + }, + { + "epoch": 1.0379495026778882, + "grad_norm": 2.581830328744179, + "learning_rate": 9.872377037591374e-06, + "loss": 0.5789, + "step": 6783 + }, + { + "epoch": 1.0381025248661055, + "grad_norm": 2.368386825181525, + "learning_rate": 9.869899060311525e-06, + "loss": 0.5313, + "step": 6784 + }, + { + "epoch": 1.0382555470543229, + "grad_norm": 2.4010158149306884, + "learning_rate": 9.86742109102168e-06, + "loss": 0.5985, + "step": 6785 + }, + { + "epoch": 1.0384085692425402, + "grad_norm": 2.483515581888754, + "learning_rate": 9.864943129874027e-06, + "loss": 0.517, + "step": 6786 + }, + { + "epoch": 1.0385615914307575, + "grad_norm": 2.694645694193147, + "learning_rate": 9.862465177020742e-06, + "loss": 0.5725, + "step": 6787 + }, + { + "epoch": 1.0387146136189747, + "grad_norm": 2.00193207158381, + "learning_rate": 9.859987232614012e-06, + "loss": 0.4655, + "step": 6788 + }, + { + "epoch": 1.038867635807192, + "grad_norm": 2.297472219311569, + "learning_rate": 9.857509296806014e-06, + "loss": 0.5012, + "step": 6789 + }, + { + "epoch": 1.0390206579954093, + "grad_norm": 2.0400785317490127, + "learning_rate": 9.855031369748922e-06, + "loss": 0.5425, + "step": 6790 + }, + { + "epoch": 1.0391736801836267, + "grad_norm": 2.162087427618444, + "learning_rate": 9.852553451594929e-06, + "loss": 0.4972, + "step": 6791 + }, + { + "epoch": 1.039326702371844, + "grad_norm": 2.297269429343117, + "learning_rate": 9.8500755424962e-06, + "loss": 0.4935, + "step": 6792 + }, + { + "epoch": 1.039479724560061, + "grad_norm": 2.4070600512021785, + "learning_rate": 9.847597642604917e-06, + "loss": 0.5847, + "step": 6793 + }, + { + "epoch": 1.0396327467482784, + "grad_norm": 2.517284455670916, + "learning_rate": 9.845119752073265e-06, + "loss": 0.5392, + "step": 6794 + }, + { + "epoch": 1.0397857689364958, + "grad_norm": 2.3382151130233817, + "learning_rate": 9.84264187105341e-06, + "loss": 0.5164, + "step": 6795 + }, + { + "epoch": 1.039938791124713, + "grad_norm": 2.2612802000483216, + "learning_rate": 9.840163999697532e-06, + "loss": 0.5151, + "step": 6796 + }, + { + "epoch": 1.0400918133129304, + "grad_norm": 2.2601432005493263, + "learning_rate": 9.837686138157813e-06, + "loss": 0.5168, + "step": 6797 + }, + { + "epoch": 1.0402448355011478, + "grad_norm": 2.6100201009422235, + "learning_rate": 9.835208286586419e-06, + "loss": 0.6259, + "step": 6798 + }, + { + "epoch": 1.0403978576893649, + "grad_norm": 2.4883767574053706, + "learning_rate": 9.832730445135531e-06, + "loss": 0.5042, + "step": 6799 + }, + { + "epoch": 1.0405508798775822, + "grad_norm": 2.2820139860370183, + "learning_rate": 9.830252613957322e-06, + "loss": 0.4631, + "step": 6800 + }, + { + "epoch": 1.0407039020657995, + "grad_norm": 2.25483309950987, + "learning_rate": 9.827774793203961e-06, + "loss": 0.5455, + "step": 6801 + }, + { + "epoch": 1.0408569242540169, + "grad_norm": 2.6447960053774993, + "learning_rate": 9.825296983027625e-06, + "loss": 0.6592, + "step": 6802 + }, + { + "epoch": 1.0410099464422342, + "grad_norm": 2.053680781948206, + "learning_rate": 9.822819183580484e-06, + "loss": 0.5091, + "step": 6803 + 
}, + { + "epoch": 1.0411629686304513, + "grad_norm": 2.395301936062382, + "learning_rate": 9.820341395014713e-06, + "loss": 0.546, + "step": 6804 + }, + { + "epoch": 1.0413159908186687, + "grad_norm": 2.033876449030659, + "learning_rate": 9.817863617482479e-06, + "loss": 0.4092, + "step": 6805 + }, + { + "epoch": 1.041469013006886, + "grad_norm": 2.413056030738432, + "learning_rate": 9.815385851135948e-06, + "loss": 0.652, + "step": 6806 + }, + { + "epoch": 1.0416220351951033, + "grad_norm": 2.448201678783148, + "learning_rate": 9.8129080961273e-06, + "loss": 0.4885, + "step": 6807 + }, + { + "epoch": 1.0417750573833207, + "grad_norm": 2.518063374444106, + "learning_rate": 9.810430352608695e-06, + "loss": 0.5058, + "step": 6808 + }, + { + "epoch": 1.0419280795715378, + "grad_norm": 2.000437078764586, + "learning_rate": 9.807952620732302e-06, + "loss": 0.4609, + "step": 6809 + }, + { + "epoch": 1.042081101759755, + "grad_norm": 2.602756111222112, + "learning_rate": 9.805474900650296e-06, + "loss": 0.6179, + "step": 6810 + }, + { + "epoch": 1.0422341239479724, + "grad_norm": 2.4146641274588654, + "learning_rate": 9.802997192514827e-06, + "loss": 0.5391, + "step": 6811 + }, + { + "epoch": 1.0423871461361898, + "grad_norm": 2.4073284310419356, + "learning_rate": 9.80051949647808e-06, + "loss": 0.5885, + "step": 6812 + }, + { + "epoch": 1.042540168324407, + "grad_norm": 2.3612873008392166, + "learning_rate": 9.798041812692211e-06, + "loss": 0.5162, + "step": 6813 + }, + { + "epoch": 1.0426931905126244, + "grad_norm": 2.5148254857651042, + "learning_rate": 9.795564141309376e-06, + "loss": 0.5554, + "step": 6814 + }, + { + "epoch": 1.0428462127008415, + "grad_norm": 2.193824725601136, + "learning_rate": 9.793086482481755e-06, + "loss": 0.4637, + "step": 6815 + }, + { + "epoch": 1.0429992348890589, + "grad_norm": 2.195282093201084, + "learning_rate": 9.790608836361502e-06, + "loss": 0.5601, + "step": 6816 + }, + { + "epoch": 1.0431522570772762, + "grad_norm": 2.3815272787115522, + "learning_rate": 9.788131203100774e-06, + "loss": 0.5558, + "step": 6817 + }, + { + "epoch": 1.0433052792654935, + "grad_norm": 2.579512220117735, + "learning_rate": 9.785653582851745e-06, + "loss": 0.5961, + "step": 6818 + }, + { + "epoch": 1.0434583014537109, + "grad_norm": 2.3883347162224244, + "learning_rate": 9.783175975766561e-06, + "loss": 0.6108, + "step": 6819 + }, + { + "epoch": 1.043611323641928, + "grad_norm": 2.3174436373500686, + "learning_rate": 9.78069838199739e-06, + "loss": 0.5733, + "step": 6820 + }, + { + "epoch": 1.0437643458301453, + "grad_norm": 2.2060498412069744, + "learning_rate": 9.778220801696395e-06, + "loss": 0.5214, + "step": 6821 + }, + { + "epoch": 1.0439173680183627, + "grad_norm": 2.2232999074708797, + "learning_rate": 9.775743235015721e-06, + "loss": 0.5198, + "step": 6822 + }, + { + "epoch": 1.04407039020658, + "grad_norm": 2.3554090047539535, + "learning_rate": 9.773265682107538e-06, + "loss": 0.5541, + "step": 6823 + }, + { + "epoch": 1.0442234123947973, + "grad_norm": 2.16785446117587, + "learning_rate": 9.77078814312399e-06, + "loss": 0.4758, + "step": 6824 + }, + { + "epoch": 1.0443764345830144, + "grad_norm": 2.454126851389489, + "learning_rate": 9.768310618217246e-06, + "loss": 0.5185, + "step": 6825 + }, + { + "epoch": 1.0445294567712318, + "grad_norm": 2.6924414731654864, + "learning_rate": 9.76583310753945e-06, + "loss": 0.639, + "step": 6826 + }, + { + "epoch": 1.044682478959449, + "grad_norm": 2.265111260087062, + "learning_rate": 9.763355611242757e-06, + "loss": 0.4323, + 
"step": 6827 + }, + { + "epoch": 1.0448355011476664, + "grad_norm": 2.277809428494371, + "learning_rate": 9.760878129479325e-06, + "loss": 0.4809, + "step": 6828 + }, + { + "epoch": 1.0449885233358838, + "grad_norm": 2.2456569046235453, + "learning_rate": 9.758400662401301e-06, + "loss": 0.4522, + "step": 6829 + }, + { + "epoch": 1.045141545524101, + "grad_norm": 2.453577541760022, + "learning_rate": 9.755923210160836e-06, + "loss": 0.4833, + "step": 6830 + }, + { + "epoch": 1.0452945677123182, + "grad_norm": 2.482612822718974, + "learning_rate": 9.753445772910085e-06, + "loss": 0.5288, + "step": 6831 + }, + { + "epoch": 1.0454475899005355, + "grad_norm": 2.2688767121063216, + "learning_rate": 9.75096835080119e-06, + "loss": 0.5265, + "step": 6832 + }, + { + "epoch": 1.0456006120887529, + "grad_norm": 2.3742525332643485, + "learning_rate": 9.748490943986304e-06, + "loss": 0.5401, + "step": 6833 + }, + { + "epoch": 1.0457536342769702, + "grad_norm": 2.297535035475942, + "learning_rate": 9.746013552617576e-06, + "loss": 0.5176, + "step": 6834 + }, + { + "epoch": 1.0459066564651875, + "grad_norm": 2.2986799616130615, + "learning_rate": 9.743536176847145e-06, + "loss": 0.6058, + "step": 6835 + }, + { + "epoch": 1.0460596786534047, + "grad_norm": 2.6040700168531097, + "learning_rate": 9.741058816827162e-06, + "loss": 0.4995, + "step": 6836 + }, + { + "epoch": 1.046212700841622, + "grad_norm": 2.305543948337454, + "learning_rate": 9.738581472709774e-06, + "loss": 0.5135, + "step": 6837 + }, + { + "epoch": 1.0463657230298393, + "grad_norm": 2.4772310516819034, + "learning_rate": 9.736104144647114e-06, + "loss": 0.5416, + "step": 6838 + }, + { + "epoch": 1.0465187452180567, + "grad_norm": 2.292649813878011, + "learning_rate": 9.733626832791336e-06, + "loss": 0.4853, + "step": 6839 + }, + { + "epoch": 1.046671767406274, + "grad_norm": 2.7654251480866803, + "learning_rate": 9.73114953729457e-06, + "loss": 0.4503, + "step": 6840 + }, + { + "epoch": 1.046824789594491, + "grad_norm": 2.4129007595804945, + "learning_rate": 9.72867225830897e-06, + "loss": 0.547, + "step": 6841 + }, + { + "epoch": 1.0469778117827084, + "grad_norm": 2.565444877722573, + "learning_rate": 9.726194995986665e-06, + "loss": 0.5448, + "step": 6842 + }, + { + "epoch": 1.0471308339709258, + "grad_norm": 2.3819084893973965, + "learning_rate": 9.723717750479793e-06, + "loss": 0.4662, + "step": 6843 + }, + { + "epoch": 1.047283856159143, + "grad_norm": 1.9489195524522251, + "learning_rate": 9.721240521940501e-06, + "loss": 0.5165, + "step": 6844 + }, + { + "epoch": 1.0474368783473604, + "grad_norm": 2.5377098649664727, + "learning_rate": 9.718763310520916e-06, + "loss": 0.6242, + "step": 6845 + }, + { + "epoch": 1.0475899005355775, + "grad_norm": 2.280532226310244, + "learning_rate": 9.716286116373174e-06, + "loss": 0.6094, + "step": 6846 + }, + { + "epoch": 1.0477429227237949, + "grad_norm": 2.3174754567622773, + "learning_rate": 9.713808939649417e-06, + "loss": 0.4831, + "step": 6847 + }, + { + "epoch": 1.0478959449120122, + "grad_norm": 2.5142215573367515, + "learning_rate": 9.711331780501766e-06, + "loss": 0.5593, + "step": 6848 + }, + { + "epoch": 1.0480489671002295, + "grad_norm": 2.1926272912275264, + "learning_rate": 9.708854639082364e-06, + "loss": 0.4583, + "step": 6849 + }, + { + "epoch": 1.0482019892884469, + "grad_norm": 2.226115977484892, + "learning_rate": 9.70637751554334e-06, + "loss": 0.537, + "step": 6850 + }, + { + "epoch": 1.0483550114766642, + "grad_norm": 2.115691597008675, + "learning_rate": 
9.703900410036815e-06, + "loss": 0.5448, + "step": 6851 + }, + { + "epoch": 1.0485080336648813, + "grad_norm": 2.180567669515745, + "learning_rate": 9.701423322714928e-06, + "loss": 0.5617, + "step": 6852 + }, + { + "epoch": 1.0486610558530987, + "grad_norm": 2.382574902525598, + "learning_rate": 9.698946253729804e-06, + "loss": 0.5341, + "step": 6853 + }, + { + "epoch": 1.048814078041316, + "grad_norm": 2.239884675264611, + "learning_rate": 9.696469203233565e-06, + "loss": 0.534, + "step": 6854 + }, + { + "epoch": 1.0489671002295333, + "grad_norm": 2.2210027907755334, + "learning_rate": 9.693992171378342e-06, + "loss": 0.4105, + "step": 6855 + }, + { + "epoch": 1.0491201224177507, + "grad_norm": 2.1536780424328894, + "learning_rate": 9.691515158316253e-06, + "loss": 0.4926, + "step": 6856 + }, + { + "epoch": 1.0492731446059678, + "grad_norm": 2.533943071647297, + "learning_rate": 9.68903816419943e-06, + "loss": 0.6514, + "step": 6857 + }, + { + "epoch": 1.049426166794185, + "grad_norm": 2.292108221676427, + "learning_rate": 9.686561189179989e-06, + "loss": 0.4834, + "step": 6858 + }, + { + "epoch": 1.0495791889824024, + "grad_norm": 2.1828211174386403, + "learning_rate": 9.684084233410048e-06, + "loss": 0.4351, + "step": 6859 + }, + { + "epoch": 1.0497322111706198, + "grad_norm": 2.3255510612548007, + "learning_rate": 9.681607297041734e-06, + "loss": 0.4607, + "step": 6860 + }, + { + "epoch": 1.049885233358837, + "grad_norm": 2.1176499482167475, + "learning_rate": 9.679130380227158e-06, + "loss": 0.4311, + "step": 6861 + }, + { + "epoch": 1.0500382555470544, + "grad_norm": 2.317475873274802, + "learning_rate": 9.676653483118441e-06, + "loss": 0.4604, + "step": 6862 + }, + { + "epoch": 1.0501912777352715, + "grad_norm": 2.855097863082061, + "learning_rate": 9.674176605867702e-06, + "loss": 0.5651, + "step": 6863 + }, + { + "epoch": 1.0503442999234889, + "grad_norm": 2.2833665485609074, + "learning_rate": 9.67169974862705e-06, + "loss": 0.4677, + "step": 6864 + }, + { + "epoch": 1.0504973221117062, + "grad_norm": 2.606929688063134, + "learning_rate": 9.6692229115486e-06, + "loss": 0.597, + "step": 6865 + }, + { + "epoch": 1.0506503442999235, + "grad_norm": 2.390847436864242, + "learning_rate": 9.666746094784468e-06, + "loss": 0.5273, + "step": 6866 + }, + { + "epoch": 1.0508033664881409, + "grad_norm": 2.5605618344036545, + "learning_rate": 9.664269298486759e-06, + "loss": 0.572, + "step": 6867 + }, + { + "epoch": 1.050956388676358, + "grad_norm": 2.254858739297128, + "learning_rate": 9.661792522807586e-06, + "loss": 0.4767, + "step": 6868 + }, + { + "epoch": 1.0511094108645753, + "grad_norm": 2.4423773067977423, + "learning_rate": 9.659315767899055e-06, + "loss": 0.5312, + "step": 6869 + }, + { + "epoch": 1.0512624330527927, + "grad_norm": 2.5114329776973423, + "learning_rate": 9.656839033913282e-06, + "loss": 0.5757, + "step": 6870 + }, + { + "epoch": 1.05141545524101, + "grad_norm": 2.17181345130201, + "learning_rate": 9.654362321002363e-06, + "loss": 0.4097, + "step": 6871 + }, + { + "epoch": 1.0515684774292273, + "grad_norm": 2.0419482932100363, + "learning_rate": 9.651885629318402e-06, + "loss": 0.4301, + "step": 6872 + }, + { + "epoch": 1.0517214996174444, + "grad_norm": 2.266374883329687, + "learning_rate": 9.649408959013513e-06, + "loss": 0.4822, + "step": 6873 + }, + { + "epoch": 1.0518745218056618, + "grad_norm": 2.205778807974861, + "learning_rate": 9.64693231023979e-06, + "loss": 0.5209, + "step": 6874 + }, + { + "epoch": 1.052027543993879, + "grad_norm": 2.260631995522916, + 
"learning_rate": 9.64445568314933e-06, + "loss": 0.44, + "step": 6875 + }, + { + "epoch": 1.0521805661820964, + "grad_norm": 2.3307479914634466, + "learning_rate": 9.641979077894244e-06, + "loss": 0.4535, + "step": 6876 + }, + { + "epoch": 1.0523335883703138, + "grad_norm": 2.2111017557168826, + "learning_rate": 9.639502494626618e-06, + "loss": 0.5096, + "step": 6877 + }, + { + "epoch": 1.0524866105585309, + "grad_norm": 2.366404180462465, + "learning_rate": 9.637025933498556e-06, + "loss": 0.5082, + "step": 6878 + }, + { + "epoch": 1.0526396327467482, + "grad_norm": 2.51983682850274, + "learning_rate": 9.634549394662154e-06, + "loss": 0.5402, + "step": 6879 + }, + { + "epoch": 1.0527926549349655, + "grad_norm": 2.605006859877165, + "learning_rate": 9.6320728782695e-06, + "loss": 0.5005, + "step": 6880 + }, + { + "epoch": 1.0529456771231829, + "grad_norm": 1.9584831153342839, + "learning_rate": 9.62959638447269e-06, + "loss": 0.4544, + "step": 6881 + }, + { + "epoch": 1.0530986993114002, + "grad_norm": 2.503851041495755, + "learning_rate": 9.62711991342382e-06, + "loss": 0.4845, + "step": 6882 + }, + { + "epoch": 1.0532517214996175, + "grad_norm": 2.197647946951353, + "learning_rate": 9.624643465274968e-06, + "loss": 0.5008, + "step": 6883 + }, + { + "epoch": 1.0534047436878347, + "grad_norm": 2.6562728932565185, + "learning_rate": 9.622167040178233e-06, + "loss": 0.5155, + "step": 6884 + }, + { + "epoch": 1.053557765876052, + "grad_norm": 2.2358372881033945, + "learning_rate": 9.619690638285694e-06, + "loss": 0.526, + "step": 6885 + }, + { + "epoch": 1.0537107880642693, + "grad_norm": 2.278041438282626, + "learning_rate": 9.617214259749445e-06, + "loss": 0.4483, + "step": 6886 + }, + { + "epoch": 1.0538638102524867, + "grad_norm": 3.765678149655499, + "learning_rate": 9.614737904721567e-06, + "loss": 0.4127, + "step": 6887 + }, + { + "epoch": 1.054016832440704, + "grad_norm": 2.345689494157314, + "learning_rate": 9.612261573354137e-06, + "loss": 0.4669, + "step": 6888 + }, + { + "epoch": 1.054169854628921, + "grad_norm": 2.215666024931825, + "learning_rate": 9.609785265799243e-06, + "loss": 0.4277, + "step": 6889 + }, + { + "epoch": 1.0543228768171384, + "grad_norm": 2.1371736190516355, + "learning_rate": 9.607308982208959e-06, + "loss": 0.4995, + "step": 6890 + }, + { + "epoch": 1.0544758990053558, + "grad_norm": 2.5607618798633167, + "learning_rate": 9.604832722735373e-06, + "loss": 0.4405, + "step": 6891 + }, + { + "epoch": 1.054628921193573, + "grad_norm": 2.2072868289383214, + "learning_rate": 9.602356487530553e-06, + "loss": 0.4679, + "step": 6892 + }, + { + "epoch": 1.0547819433817904, + "grad_norm": 2.417589786036523, + "learning_rate": 9.599880276746572e-06, + "loss": 0.4911, + "step": 6893 + }, + { + "epoch": 1.0549349655700078, + "grad_norm": 2.0872106304582982, + "learning_rate": 9.597404090535515e-06, + "loss": 0.4545, + "step": 6894 + }, + { + "epoch": 1.0550879877582249, + "grad_norm": 2.46713381700791, + "learning_rate": 9.594927929049447e-06, + "loss": 0.4741, + "step": 6895 + }, + { + "epoch": 1.0552410099464422, + "grad_norm": 2.306058536221254, + "learning_rate": 9.592451792440433e-06, + "loss": 0.4439, + "step": 6896 + }, + { + "epoch": 1.0553940321346595, + "grad_norm": 3.022745301253467, + "learning_rate": 9.589975680860556e-06, + "loss": 0.5254, + "step": 6897 + }, + { + "epoch": 1.0555470543228769, + "grad_norm": 2.345844867262732, + "learning_rate": 9.587499594461871e-06, + "loss": 0.3766, + "step": 6898 + }, + { + "epoch": 1.0557000765110942, + "grad_norm": 
2.223361710672595, + "learning_rate": 9.585023533396452e-06, + "loss": 0.4829, + "step": 6899 + }, + { + "epoch": 1.0558530986993113, + "grad_norm": 2.328724388731237, + "learning_rate": 9.582547497816364e-06, + "loss": 0.5028, + "step": 6900 + }, + { + "epoch": 1.0560061208875287, + "grad_norm": 2.2646982308630363, + "learning_rate": 9.580071487873663e-06, + "loss": 0.4627, + "step": 6901 + }, + { + "epoch": 1.056159143075746, + "grad_norm": 2.2351291324961093, + "learning_rate": 9.577595503720417e-06, + "loss": 0.5294, + "step": 6902 + }, + { + "epoch": 1.0563121652639633, + "grad_norm": 2.317554741459491, + "learning_rate": 9.575119545508686e-06, + "loss": 0.4855, + "step": 6903 + }, + { + "epoch": 1.0564651874521807, + "grad_norm": 2.335702932508031, + "learning_rate": 9.572643613390521e-06, + "loss": 0.4449, + "step": 6904 + }, + { + "epoch": 1.0566182096403978, + "grad_norm": 2.3390630669388472, + "learning_rate": 9.570167707517986e-06, + "loss": 0.497, + "step": 6905 + }, + { + "epoch": 1.056771231828615, + "grad_norm": 2.0915049224848734, + "learning_rate": 9.567691828043131e-06, + "loss": 0.4223, + "step": 6906 + }, + { + "epoch": 1.0569242540168324, + "grad_norm": 2.4297434477300177, + "learning_rate": 9.565215975118016e-06, + "loss": 0.5876, + "step": 6907 + }, + { + "epoch": 1.0570772762050498, + "grad_norm": 2.4562615398492014, + "learning_rate": 9.562740148894687e-06, + "loss": 0.4997, + "step": 6908 + }, + { + "epoch": 1.057230298393267, + "grad_norm": 2.5933852297859024, + "learning_rate": 9.560264349525193e-06, + "loss": 0.5457, + "step": 6909 + }, + { + "epoch": 1.0573833205814842, + "grad_norm": 2.107241488019224, + "learning_rate": 9.557788577161592e-06, + "loss": 0.4718, + "step": 6910 + }, + { + "epoch": 1.0575363427697015, + "grad_norm": 2.3210855880243195, + "learning_rate": 9.555312831955921e-06, + "loss": 0.4619, + "step": 6911 + }, + { + "epoch": 1.0576893649579189, + "grad_norm": 2.2978307140280405, + "learning_rate": 9.552837114060226e-06, + "loss": 0.5951, + "step": 6912 + }, + { + "epoch": 1.0578423871461362, + "grad_norm": 2.2115010649647653, + "learning_rate": 9.550361423626558e-06, + "loss": 0.484, + "step": 6913 + }, + { + "epoch": 1.0579954093343535, + "grad_norm": 2.003534556604302, + "learning_rate": 9.54788576080695e-06, + "loss": 0.3852, + "step": 6914 + }, + { + "epoch": 1.0581484315225709, + "grad_norm": 2.429515724313838, + "learning_rate": 9.545410125753448e-06, + "loss": 0.5315, + "step": 6915 + }, + { + "epoch": 1.058301453710788, + "grad_norm": 2.173108303541178, + "learning_rate": 9.542934518618092e-06, + "loss": 0.4607, + "step": 6916 + }, + { + "epoch": 1.0584544758990053, + "grad_norm": 2.232859421941343, + "learning_rate": 9.540458939552907e-06, + "loss": 0.4832, + "step": 6917 + }, + { + "epoch": 1.0586074980872227, + "grad_norm": 2.1284743617910205, + "learning_rate": 9.537983388709943e-06, + "loss": 0.4607, + "step": 6918 + }, + { + "epoch": 1.05876052027544, + "grad_norm": 2.1196258230083673, + "learning_rate": 9.535507866241226e-06, + "loss": 0.4231, + "step": 6919 + }, + { + "epoch": 1.0589135424636573, + "grad_norm": 2.532512044150864, + "learning_rate": 9.533032372298784e-06, + "loss": 0.5098, + "step": 6920 + }, + { + "epoch": 1.0590665646518744, + "grad_norm": 2.2109378116606604, + "learning_rate": 9.530556907034653e-06, + "loss": 0.4378, + "step": 6921 + }, + { + "epoch": 1.0592195868400918, + "grad_norm": 2.3043607556194154, + "learning_rate": 9.528081470600857e-06, + "loss": 0.4155, + "step": 6922 + }, + { + "epoch": 
1.059372609028309, + "grad_norm": 2.319007189741688, + "learning_rate": 9.525606063149429e-06, + "loss": 0.4975, + "step": 6923 + }, + { + "epoch": 1.0595256312165264, + "grad_norm": 2.2928485267167447, + "learning_rate": 9.523130684832386e-06, + "loss": 0.5141, + "step": 6924 + }, + { + "epoch": 1.0596786534047438, + "grad_norm": 2.4235326737855254, + "learning_rate": 9.52065533580175e-06, + "loss": 0.4867, + "step": 6925 + }, + { + "epoch": 1.0598316755929609, + "grad_norm": 2.168516218109533, + "learning_rate": 9.518180016209551e-06, + "loss": 0.4407, + "step": 6926 + }, + { + "epoch": 1.0599846977811782, + "grad_norm": 2.069744482004005, + "learning_rate": 9.515704726207796e-06, + "loss": 0.3991, + "step": 6927 + }, + { + "epoch": 1.0601377199693955, + "grad_norm": 2.1506333934930257, + "learning_rate": 9.513229465948511e-06, + "loss": 0.4787, + "step": 6928 + }, + { + "epoch": 1.0602907421576129, + "grad_norm": 2.2749822448358814, + "learning_rate": 9.510754235583713e-06, + "loss": 0.4889, + "step": 6929 + }, + { + "epoch": 1.0604437643458302, + "grad_norm": 2.018532232312824, + "learning_rate": 9.508279035265405e-06, + "loss": 0.4167, + "step": 6930 + }, + { + "epoch": 1.0605967865340475, + "grad_norm": 2.120989225905039, + "learning_rate": 9.505803865145606e-06, + "loss": 0.4536, + "step": 6931 + }, + { + "epoch": 1.0607498087222647, + "grad_norm": 2.1292285782630396, + "learning_rate": 9.50332872537633e-06, + "loss": 0.4711, + "step": 6932 + }, + { + "epoch": 1.060902830910482, + "grad_norm": 2.5222440705575466, + "learning_rate": 9.500853616109572e-06, + "loss": 0.536, + "step": 6933 + }, + { + "epoch": 1.0610558530986993, + "grad_norm": 2.396454824637159, + "learning_rate": 9.498378537497352e-06, + "loss": 0.5248, + "step": 6934 + }, + { + "epoch": 1.0612088752869167, + "grad_norm": 2.42334812785671, + "learning_rate": 9.495903489691665e-06, + "loss": 0.5531, + "step": 6935 + }, + { + "epoch": 1.061361897475134, + "grad_norm": 2.217976890998022, + "learning_rate": 9.493428472844521e-06, + "loss": 0.4391, + "step": 6936 + }, + { + "epoch": 1.061514919663351, + "grad_norm": 2.170116082357399, + "learning_rate": 9.490953487107913e-06, + "loss": 0.3839, + "step": 6937 + }, + { + "epoch": 1.0616679418515684, + "grad_norm": 2.2091807909239263, + "learning_rate": 9.488478532633842e-06, + "loss": 0.4498, + "step": 6938 + }, + { + "epoch": 1.0618209640397858, + "grad_norm": 2.455234760203659, + "learning_rate": 9.48600360957431e-06, + "loss": 0.5034, + "step": 6939 + }, + { + "epoch": 1.061973986228003, + "grad_norm": 2.6446464434159487, + "learning_rate": 9.483528718081303e-06, + "loss": 0.5489, + "step": 6940 + }, + { + "epoch": 1.0621270084162204, + "grad_norm": 2.493535788047184, + "learning_rate": 9.481053858306816e-06, + "loss": 0.5001, + "step": 6941 + }, + { + "epoch": 1.0622800306044375, + "grad_norm": 2.679684989109719, + "learning_rate": 9.478579030402849e-06, + "loss": 0.602, + "step": 6942 + }, + { + "epoch": 1.0624330527926549, + "grad_norm": 2.2237217293867833, + "learning_rate": 9.476104234521376e-06, + "loss": 0.4686, + "step": 6943 + }, + { + "epoch": 1.0625860749808722, + "grad_norm": 2.2552698510205804, + "learning_rate": 9.473629470814395e-06, + "loss": 0.482, + "step": 6944 + }, + { + "epoch": 1.0627390971690895, + "grad_norm": 2.3382411136262915, + "learning_rate": 9.471154739433889e-06, + "loss": 0.5241, + "step": 6945 + }, + { + "epoch": 1.0628921193573069, + "grad_norm": 2.277125644575901, + "learning_rate": 9.468680040531834e-06, + "loss": 0.4437, + "step": 6946 
+ }, + { + "epoch": 1.063045141545524, + "grad_norm": 2.078240326941889, + "learning_rate": 9.46620537426022e-06, + "loss": 0.4313, + "step": 6947 + }, + { + "epoch": 1.0631981637337413, + "grad_norm": 2.130682788759023, + "learning_rate": 9.463730740771025e-06, + "loss": 0.4409, + "step": 6948 + }, + { + "epoch": 1.0633511859219587, + "grad_norm": 2.30060169305822, + "learning_rate": 9.461256140216218e-06, + "loss": 0.4819, + "step": 6949 + }, + { + "epoch": 1.063504208110176, + "grad_norm": 2.519758537925622, + "learning_rate": 9.45878157274778e-06, + "loss": 0.5226, + "step": 6950 + }, + { + "epoch": 1.0636572302983933, + "grad_norm": 2.017059855190871, + "learning_rate": 9.456307038517682e-06, + "loss": 0.3927, + "step": 6951 + }, + { + "epoch": 1.0638102524866107, + "grad_norm": 2.4161156149987413, + "learning_rate": 9.453832537677899e-06, + "loss": 0.4671, + "step": 6952 + }, + { + "epoch": 1.0639632746748278, + "grad_norm": 2.4352087813015295, + "learning_rate": 9.451358070380395e-06, + "loss": 0.5307, + "step": 6953 + }, + { + "epoch": 1.064116296863045, + "grad_norm": 2.4821772090895924, + "learning_rate": 9.448883636777136e-06, + "loss": 0.6598, + "step": 6954 + }, + { + "epoch": 1.0642693190512624, + "grad_norm": 2.393312834043304, + "learning_rate": 9.446409237020093e-06, + "loss": 0.5168, + "step": 6955 + }, + { + "epoch": 1.0644223412394798, + "grad_norm": 2.321195584837522, + "learning_rate": 9.443934871261223e-06, + "loss": 0.4782, + "step": 6956 + }, + { + "epoch": 1.064575363427697, + "grad_norm": 2.083851712582457, + "learning_rate": 9.441460539652485e-06, + "loss": 0.3878, + "step": 6957 + }, + { + "epoch": 1.0647283856159142, + "grad_norm": 2.2876127565948106, + "learning_rate": 9.438986242345844e-06, + "loss": 0.4808, + "step": 6958 + }, + { + "epoch": 1.0648814078041315, + "grad_norm": 2.328857160236901, + "learning_rate": 9.436511979493249e-06, + "loss": 0.4874, + "step": 6959 + }, + { + "epoch": 1.0650344299923489, + "grad_norm": 2.483784227392007, + "learning_rate": 9.43403775124666e-06, + "loss": 0.4848, + "step": 6960 + }, + { + "epoch": 1.0651874521805662, + "grad_norm": 2.139738952325263, + "learning_rate": 9.43156355775803e-06, + "loss": 0.4908, + "step": 6961 + }, + { + "epoch": 1.0653404743687835, + "grad_norm": 2.215075116418688, + "learning_rate": 9.429089399179298e-06, + "loss": 0.4778, + "step": 6962 + }, + { + "epoch": 1.0654934965570009, + "grad_norm": 1.9814013494006328, + "learning_rate": 9.426615275662426e-06, + "loss": 0.441, + "step": 6963 + }, + { + "epoch": 1.065646518745218, + "grad_norm": 2.275252087564853, + "learning_rate": 9.424141187359347e-06, + "loss": 0.5158, + "step": 6964 + }, + { + "epoch": 1.0657995409334353, + "grad_norm": 2.2604361487894638, + "learning_rate": 9.421667134422018e-06, + "loss": 0.4746, + "step": 6965 + }, + { + "epoch": 1.0659525631216527, + "grad_norm": 2.539253336905643, + "learning_rate": 9.419193117002367e-06, + "loss": 0.4601, + "step": 6966 + }, + { + "epoch": 1.06610558530987, + "grad_norm": 2.3308452441992413, + "learning_rate": 9.416719135252338e-06, + "loss": 0.469, + "step": 6967 + }, + { + "epoch": 1.0662586074980873, + "grad_norm": 2.1799849355110417, + "learning_rate": 9.414245189323875e-06, + "loss": 0.4691, + "step": 6968 + }, + { + "epoch": 1.0664116296863044, + "grad_norm": 2.438373820976639, + "learning_rate": 9.411771279368902e-06, + "loss": 0.4941, + "step": 6969 + }, + { + "epoch": 1.0665646518745218, + "grad_norm": 2.1442590483045816, + "learning_rate": 9.409297405539355e-06, + "loss": 
0.3921, + "step": 6970 + }, + { + "epoch": 1.066717674062739, + "grad_norm": 2.3576024377638167, + "learning_rate": 9.40682356798717e-06, + "loss": 0.4943, + "step": 6971 + }, + { + "epoch": 1.0668706962509564, + "grad_norm": 2.3218424806222435, + "learning_rate": 9.404349766864262e-06, + "loss": 0.5805, + "step": 6972 + }, + { + "epoch": 1.0670237184391738, + "grad_norm": 2.4645763128765745, + "learning_rate": 9.401876002322573e-06, + "loss": 0.4609, + "step": 6973 + }, + { + "epoch": 1.0671767406273909, + "grad_norm": 2.3130841744661073, + "learning_rate": 9.399402274514017e-06, + "loss": 0.5233, + "step": 6974 + }, + { + "epoch": 1.0673297628156082, + "grad_norm": 2.1876222922209703, + "learning_rate": 9.39692858359051e-06, + "loss": 0.4513, + "step": 6975 + }, + { + "epoch": 1.0674827850038255, + "grad_norm": 2.033847811009393, + "learning_rate": 9.394454929703987e-06, + "loss": 0.3781, + "step": 6976 + }, + { + "epoch": 1.0676358071920429, + "grad_norm": 2.482097524069422, + "learning_rate": 9.391981313006354e-06, + "loss": 0.4744, + "step": 6977 + }, + { + "epoch": 1.0677888293802602, + "grad_norm": 2.346499473043113, + "learning_rate": 9.38950773364952e-06, + "loss": 0.4851, + "step": 6978 + }, + { + "epoch": 1.0679418515684773, + "grad_norm": 2.157264052254601, + "learning_rate": 9.387034191785413e-06, + "loss": 0.4382, + "step": 6979 + }, + { + "epoch": 1.0680948737566947, + "grad_norm": 2.3721231776727056, + "learning_rate": 9.384560687565927e-06, + "loss": 0.485, + "step": 6980 + }, + { + "epoch": 1.068247895944912, + "grad_norm": 2.2298549333189306, + "learning_rate": 9.382087221142981e-06, + "loss": 0.4579, + "step": 6981 + }, + { + "epoch": 1.0684009181331293, + "grad_norm": 2.0889715056745377, + "learning_rate": 9.379613792668478e-06, + "loss": 0.4872, + "step": 6982 + }, + { + "epoch": 1.0685539403213467, + "grad_norm": 2.0541752026840365, + "learning_rate": 9.377140402294315e-06, + "loss": 0.4358, + "step": 6983 + }, + { + "epoch": 1.068706962509564, + "grad_norm": 2.327139410292757, + "learning_rate": 9.374667050172397e-06, + "loss": 0.4975, + "step": 6984 + }, + { + "epoch": 1.068859984697781, + "grad_norm": 2.367702726919621, + "learning_rate": 9.372193736454626e-06, + "loss": 0.5055, + "step": 6985 + }, + { + "epoch": 1.0690130068859984, + "grad_norm": 2.178558568246285, + "learning_rate": 9.369720461292889e-06, + "loss": 0.4234, + "step": 6986 + }, + { + "epoch": 1.0691660290742158, + "grad_norm": 2.2406729508801653, + "learning_rate": 9.367247224839084e-06, + "loss": 0.4451, + "step": 6987 + }, + { + "epoch": 1.069319051262433, + "grad_norm": 2.2256944403639953, + "learning_rate": 9.364774027245102e-06, + "loss": 0.443, + "step": 6988 + }, + { + "epoch": 1.0694720734506504, + "grad_norm": 2.0697411441110463, + "learning_rate": 9.362300868662837e-06, + "loss": 0.441, + "step": 6989 + }, + { + "epoch": 1.0696250956388675, + "grad_norm": 2.2535106828944182, + "learning_rate": 9.359827749244168e-06, + "loss": 0.3842, + "step": 6990 + }, + { + "epoch": 1.0697781178270849, + "grad_norm": 2.3355363703052214, + "learning_rate": 9.357354669140977e-06, + "loss": 0.4452, + "step": 6991 + }, + { + "epoch": 1.0699311400153022, + "grad_norm": 2.144156733012759, + "learning_rate": 9.354881628505156e-06, + "loss": 0.4491, + "step": 6992 + }, + { + "epoch": 1.0700841622035195, + "grad_norm": 2.304338238051539, + "learning_rate": 9.352408627488574e-06, + "loss": 0.4392, + "step": 6993 + }, + { + "epoch": 1.0702371843917369, + "grad_norm": 2.207433271716185, + "learning_rate": 
9.349935666243112e-06, + "loss": 0.3962, + "step": 6994 + }, + { + "epoch": 1.0703902065799542, + "grad_norm": 2.339465394300809, + "learning_rate": 9.347462744920646e-06, + "loss": 0.5691, + "step": 6995 + }, + { + "epoch": 1.0705432287681713, + "grad_norm": 2.2116556612187286, + "learning_rate": 9.344989863673043e-06, + "loss": 0.3849, + "step": 6996 + }, + { + "epoch": 1.0706962509563887, + "grad_norm": 2.2286947116609244, + "learning_rate": 9.342517022652176e-06, + "loss": 0.4345, + "step": 6997 + }, + { + "epoch": 1.070849273144606, + "grad_norm": 2.523389393218545, + "learning_rate": 9.340044222009913e-06, + "loss": 0.5253, + "step": 6998 + }, + { + "epoch": 1.0710022953328233, + "grad_norm": 2.2312942926442703, + "learning_rate": 9.337571461898112e-06, + "loss": 0.504, + "step": 6999 + }, + { + "epoch": 1.0711553175210407, + "grad_norm": 2.2598765293416876, + "learning_rate": 9.33509874246864e-06, + "loss": 0.5225, + "step": 7000 + }, + { + "epoch": 1.0713083397092578, + "grad_norm": 2.503438620842996, + "learning_rate": 9.332626063873354e-06, + "loss": 0.5315, + "step": 7001 + }, + { + "epoch": 1.071461361897475, + "grad_norm": 2.5807369463416916, + "learning_rate": 9.330153426264117e-06, + "loss": 0.4877, + "step": 7002 + }, + { + "epoch": 1.0716143840856924, + "grad_norm": 2.568301950382052, + "learning_rate": 9.327680829792775e-06, + "loss": 0.5205, + "step": 7003 + }, + { + "epoch": 1.0717674062739098, + "grad_norm": 2.182053222736725, + "learning_rate": 9.325208274611184e-06, + "loss": 0.436, + "step": 7004 + }, + { + "epoch": 1.071920428462127, + "grad_norm": 2.265886222181865, + "learning_rate": 9.322735760871194e-06, + "loss": 0.4027, + "step": 7005 + }, + { + "epoch": 1.0720734506503442, + "grad_norm": 2.2566761587437125, + "learning_rate": 9.32026328872465e-06, + "loss": 0.4805, + "step": 7006 + }, + { + "epoch": 1.0722264728385615, + "grad_norm": 2.5470439085229133, + "learning_rate": 9.317790858323393e-06, + "loss": 0.5402, + "step": 7007 + }, + { + "epoch": 1.0723794950267789, + "grad_norm": 2.473910127790221, + "learning_rate": 9.315318469819275e-06, + "loss": 0.4526, + "step": 7008 + }, + { + "epoch": 1.0725325172149962, + "grad_norm": 2.1728995220377776, + "learning_rate": 9.31284612336412e-06, + "loss": 0.4184, + "step": 7009 + }, + { + "epoch": 1.0726855394032135, + "grad_norm": 2.166777588898116, + "learning_rate": 9.310373819109779e-06, + "loss": 0.4201, + "step": 7010 + }, + { + "epoch": 1.0728385615914307, + "grad_norm": 2.2110283208174177, + "learning_rate": 9.30790155720808e-06, + "loss": 0.4783, + "step": 7011 + }, + { + "epoch": 1.072991583779648, + "grad_norm": 2.2649284810005117, + "learning_rate": 9.30542933781085e-06, + "loss": 0.3991, + "step": 7012 + }, + { + "epoch": 1.0731446059678653, + "grad_norm": 2.1904501370858522, + "learning_rate": 9.302957161069921e-06, + "loss": 0.4015, + "step": 7013 + }, + { + "epoch": 1.0732976281560827, + "grad_norm": 2.1730718132967017, + "learning_rate": 9.300485027137125e-06, + "loss": 0.4699, + "step": 7014 + }, + { + "epoch": 1.0734506503443, + "grad_norm": 2.3653172817867314, + "learning_rate": 9.298012936164275e-06, + "loss": 0.5384, + "step": 7015 + }, + { + "epoch": 1.0736036725325173, + "grad_norm": 2.1901146142058394, + "learning_rate": 9.295540888303198e-06, + "loss": 0.5113, + "step": 7016 + }, + { + "epoch": 1.0737566947207344, + "grad_norm": 2.384766068642065, + "learning_rate": 9.29306888370571e-06, + "loss": 0.5185, + "step": 7017 + }, + { + "epoch": 1.0739097169089518, + "grad_norm": 
2.6746763962083167, + "learning_rate": 9.29059692252363e-06, + "loss": 0.4647, + "step": 7018 + }, + { + "epoch": 1.074062739097169, + "grad_norm": 2.353980841278368, + "learning_rate": 9.288125004908768e-06, + "loss": 0.4805, + "step": 7019 + }, + { + "epoch": 1.0742157612853864, + "grad_norm": 2.311592396149525, + "learning_rate": 9.28565313101293e-06, + "loss": 0.5211, + "step": 7020 + }, + { + "epoch": 1.0743687834736038, + "grad_norm": 2.252228528893847, + "learning_rate": 9.283181300987934e-06, + "loss": 0.4451, + "step": 7021 + }, + { + "epoch": 1.0745218056618209, + "grad_norm": 2.2982993979133925, + "learning_rate": 9.280709514985575e-06, + "loss": 0.3907, + "step": 7022 + }, + { + "epoch": 1.0746748278500382, + "grad_norm": 2.3070546754874295, + "learning_rate": 9.278237773157657e-06, + "loss": 0.4716, + "step": 7023 + }, + { + "epoch": 1.0748278500382555, + "grad_norm": 2.6131421219502498, + "learning_rate": 9.275766075655986e-06, + "loss": 0.5161, + "step": 7024 + }, + { + "epoch": 1.0749808722264729, + "grad_norm": 2.4042427042020873, + "learning_rate": 9.27329442263235e-06, + "loss": 0.5227, + "step": 7025 + }, + { + "epoch": 1.0751338944146902, + "grad_norm": 2.3104549373393652, + "learning_rate": 9.270822814238547e-06, + "loss": 0.4751, + "step": 7026 + }, + { + "epoch": 1.0752869166029075, + "grad_norm": 2.1835962666687254, + "learning_rate": 9.268351250626368e-06, + "loss": 0.5067, + "step": 7027 + }, + { + "epoch": 1.0754399387911246, + "grad_norm": 2.2162958243208735, + "learning_rate": 9.265879731947599e-06, + "loss": 0.4971, + "step": 7028 + }, + { + "epoch": 1.075592960979342, + "grad_norm": 2.342103971102228, + "learning_rate": 9.263408258354028e-06, + "loss": 0.4492, + "step": 7029 + }, + { + "epoch": 1.0757459831675593, + "grad_norm": 2.5572314302614387, + "learning_rate": 9.260936829997437e-06, + "loss": 0.5074, + "step": 7030 + }, + { + "epoch": 1.0758990053557766, + "grad_norm": 2.2950024298977123, + "learning_rate": 9.258465447029609e-06, + "loss": 0.3915, + "step": 7031 + }, + { + "epoch": 1.076052027543994, + "grad_norm": 2.3647579223440163, + "learning_rate": 9.255994109602319e-06, + "loss": 0.5398, + "step": 7032 + }, + { + "epoch": 1.076205049732211, + "grad_norm": 2.336276675467057, + "learning_rate": 9.253522817867337e-06, + "loss": 0.4975, + "step": 7033 + }, + { + "epoch": 1.0763580719204284, + "grad_norm": 2.569841908595689, + "learning_rate": 9.251051571976444e-06, + "loss": 0.5007, + "step": 7034 + }, + { + "epoch": 1.0765110941086458, + "grad_norm": 1.9321188143755093, + "learning_rate": 9.248580372081403e-06, + "loss": 0.4187, + "step": 7035 + }, + { + "epoch": 1.076664116296863, + "grad_norm": 1.993423653114654, + "learning_rate": 9.246109218333978e-06, + "loss": 0.4471, + "step": 7036 + }, + { + "epoch": 1.0768171384850804, + "grad_norm": 2.8320947879207923, + "learning_rate": 9.243638110885938e-06, + "loss": 0.3765, + "step": 7037 + }, + { + "epoch": 1.0769701606732975, + "grad_norm": 2.121544366157612, + "learning_rate": 9.241167049889038e-06, + "loss": 0.4361, + "step": 7038 + }, + { + "epoch": 1.0771231828615149, + "grad_norm": 2.1231269443520153, + "learning_rate": 9.23869603549504e-06, + "loss": 0.4724, + "step": 7039 + }, + { + "epoch": 1.0772762050497322, + "grad_norm": 2.1396557730209924, + "learning_rate": 9.236225067855697e-06, + "loss": 0.4575, + "step": 7040 + }, + { + "epoch": 1.0774292272379495, + "grad_norm": 2.444995460337678, + "learning_rate": 9.233754147122759e-06, + "loss": 0.5192, + "step": 7041 + }, + { + "epoch": 
1.0775822494261669, + "grad_norm": 2.09131681001774, + "learning_rate": 9.231283273447976e-06, + "loss": 0.417, + "step": 7042 + }, + { + "epoch": 1.077735271614384, + "grad_norm": 2.412492987813149, + "learning_rate": 9.228812446983098e-06, + "loss": 0.4716, + "step": 7043 + }, + { + "epoch": 1.0778882938026013, + "grad_norm": 2.376792204202723, + "learning_rate": 9.22634166787986e-06, + "loss": 0.5042, + "step": 7044 + }, + { + "epoch": 1.0780413159908186, + "grad_norm": 2.258267038660303, + "learning_rate": 9.22387093629001e-06, + "loss": 0.5356, + "step": 7045 + }, + { + "epoch": 1.078194338179036, + "grad_norm": 2.140885159688897, + "learning_rate": 9.221400252365279e-06, + "loss": 0.402, + "step": 7046 + }, + { + "epoch": 1.0783473603672533, + "grad_norm": 2.299639935119907, + "learning_rate": 9.218929616257406e-06, + "loss": 0.4874, + "step": 7047 + }, + { + "epoch": 1.0785003825554704, + "grad_norm": 2.0884411596909005, + "learning_rate": 9.216459028118123e-06, + "loss": 0.4117, + "step": 7048 + }, + { + "epoch": 1.0786534047436878, + "grad_norm": 2.3604023546791613, + "learning_rate": 9.213988488099152e-06, + "loss": 0.4785, + "step": 7049 + }, + { + "epoch": 1.078806426931905, + "grad_norm": 2.2206333646096197, + "learning_rate": 9.211517996352226e-06, + "loss": 0.5144, + "step": 7050 + }, + { + "epoch": 1.0789594491201224, + "grad_norm": 2.4418928939273794, + "learning_rate": 9.209047553029065e-06, + "loss": 0.5097, + "step": 7051 + }, + { + "epoch": 1.0791124713083398, + "grad_norm": 2.4200213816818756, + "learning_rate": 9.206577158281384e-06, + "loss": 0.4596, + "step": 7052 + }, + { + "epoch": 1.079265493496557, + "grad_norm": 2.1039066805422553, + "learning_rate": 9.204106812260908e-06, + "loss": 0.5227, + "step": 7053 + }, + { + "epoch": 1.0794185156847742, + "grad_norm": 2.3042584412347136, + "learning_rate": 9.20163651511934e-06, + "loss": 0.4305, + "step": 7054 + }, + { + "epoch": 1.0795715378729915, + "grad_norm": 2.4025169673069118, + "learning_rate": 9.199166267008404e-06, + "loss": 0.513, + "step": 7055 + }, + { + "epoch": 1.0797245600612089, + "grad_norm": 2.46461175249102, + "learning_rate": 9.196696068079799e-06, + "loss": 0.4345, + "step": 7056 + }, + { + "epoch": 1.0798775822494262, + "grad_norm": 2.0642559537388334, + "learning_rate": 9.194225918485226e-06, + "loss": 0.3485, + "step": 7057 + }, + { + "epoch": 1.0800306044376435, + "grad_norm": 2.305597405190117, + "learning_rate": 9.191755818376397e-06, + "loss": 0.4684, + "step": 7058 + }, + { + "epoch": 1.0801836266258606, + "grad_norm": 2.153729065281024, + "learning_rate": 9.189285767905006e-06, + "loss": 0.4609, + "step": 7059 + }, + { + "epoch": 1.080336648814078, + "grad_norm": 2.151763990460353, + "learning_rate": 9.18681576722274e-06, + "loss": 0.5002, + "step": 7060 + }, + { + "epoch": 1.0804896710022953, + "grad_norm": 2.121608247611442, + "learning_rate": 9.184345816481307e-06, + "loss": 0.6741, + "step": 7061 + }, + { + "epoch": 1.0806426931905126, + "grad_norm": 2.585835936462805, + "learning_rate": 9.181875915832381e-06, + "loss": 0.5757, + "step": 7062 + }, + { + "epoch": 1.08079571537873, + "grad_norm": 2.5048579827782746, + "learning_rate": 9.17940606542766e-06, + "loss": 0.4583, + "step": 7063 + }, + { + "epoch": 1.0809487375669473, + "grad_norm": 2.149409121009293, + "learning_rate": 9.176936265418825e-06, + "loss": 0.5809, + "step": 7064 + }, + { + "epoch": 1.0811017597551644, + "grad_norm": 2.1026339389941575, + "learning_rate": 9.174466515957548e-06, + "loss": 0.3908, + "step": 7065 + }, 
+ { + "epoch": 1.0812547819433818, + "grad_norm": 2.281688039861431, + "learning_rate": 9.171996817195516e-06, + "loss": 0.4972, + "step": 7066 + }, + { + "epoch": 1.081407804131599, + "grad_norm": 2.359209844798118, + "learning_rate": 9.169527169284395e-06, + "loss": 0.5039, + "step": 7067 + }, + { + "epoch": 1.0815608263198164, + "grad_norm": 2.284727340548648, + "learning_rate": 9.167057572375863e-06, + "loss": 0.4852, + "step": 7068 + }, + { + "epoch": 1.0817138485080338, + "grad_norm": 2.4648971458789917, + "learning_rate": 9.164588026621582e-06, + "loss": 0.4096, + "step": 7069 + }, + { + "epoch": 1.0818668706962509, + "grad_norm": 2.050205220971181, + "learning_rate": 9.162118532173217e-06, + "loss": 0.4125, + "step": 7070 + }, + { + "epoch": 1.0820198928844682, + "grad_norm": 2.0811476169259704, + "learning_rate": 9.159649089182436e-06, + "loss": 0.4692, + "step": 7071 + }, + { + "epoch": 1.0821729150726855, + "grad_norm": 2.217536427398098, + "learning_rate": 9.157179697800889e-06, + "loss": 0.4477, + "step": 7072 + }, + { + "epoch": 1.0823259372609029, + "grad_norm": 2.542074757986943, + "learning_rate": 9.154710358180231e-06, + "loss": 0.5461, + "step": 7073 + }, + { + "epoch": 1.0824789594491202, + "grad_norm": 2.050089241280042, + "learning_rate": 9.152241070472122e-06, + "loss": 0.4656, + "step": 7074 + }, + { + "epoch": 1.0826319816373373, + "grad_norm": 2.2914035705247695, + "learning_rate": 9.149771834828201e-06, + "loss": 0.491, + "step": 7075 + }, + { + "epoch": 1.0827850038255546, + "grad_norm": 2.243598099910007, + "learning_rate": 9.147302651400119e-06, + "loss": 0.5017, + "step": 7076 + }, + { + "epoch": 1.082938026013772, + "grad_norm": 2.6116613995542837, + "learning_rate": 9.144833520339518e-06, + "loss": 0.5296, + "step": 7077 + }, + { + "epoch": 1.0830910482019893, + "grad_norm": 2.2604014415370592, + "learning_rate": 9.142364441798035e-06, + "loss": 0.4793, + "step": 7078 + }, + { + "epoch": 1.0832440703902066, + "grad_norm": 2.140404673851035, + "learning_rate": 9.139895415927307e-06, + "loss": 0.3525, + "step": 7079 + }, + { + "epoch": 1.0833970925784238, + "grad_norm": 2.2760551176385913, + "learning_rate": 9.137426442878969e-06, + "loss": 0.4887, + "step": 7080 + }, + { + "epoch": 1.083550114766641, + "grad_norm": 2.3819989085455298, + "learning_rate": 9.134957522804641e-06, + "loss": 0.4958, + "step": 7081 + }, + { + "epoch": 1.0837031369548584, + "grad_norm": 2.4328489903808523, + "learning_rate": 9.13248865585596e-06, + "loss": 0.4582, + "step": 7082 + }, + { + "epoch": 1.0838561591430758, + "grad_norm": 2.1001441662606575, + "learning_rate": 9.130019842184542e-06, + "loss": 0.4808, + "step": 7083 + }, + { + "epoch": 1.084009181331293, + "grad_norm": 2.126839624383373, + "learning_rate": 9.127551081942013e-06, + "loss": 0.505, + "step": 7084 + }, + { + "epoch": 1.0841622035195104, + "grad_norm": 2.0661438691181386, + "learning_rate": 9.125082375279983e-06, + "loss": 0.4676, + "step": 7085 + }, + { + "epoch": 1.0843152257077275, + "grad_norm": 2.104196112200205, + "learning_rate": 9.122613722350064e-06, + "loss": 0.383, + "step": 7086 + }, + { + "epoch": 1.0844682478959449, + "grad_norm": 2.2568665355471746, + "learning_rate": 9.120145123303874e-06, + "loss": 0.5048, + "step": 7087 + }, + { + "epoch": 1.0846212700841622, + "grad_norm": 2.2024165587118176, + "learning_rate": 9.11767657829301e-06, + "loss": 0.4727, + "step": 7088 + }, + { + "epoch": 1.0847742922723795, + "grad_norm": 2.612232217407369, + "learning_rate": 9.11520808746908e-06, + "loss": 
0.5607, + "step": 7089 + }, + { + "epoch": 1.0849273144605969, + "grad_norm": 2.1450307502448847, + "learning_rate": 9.112739650983685e-06, + "loss": 0.5029, + "step": 7090 + }, + { + "epoch": 1.085080336648814, + "grad_norm": 2.578778549735402, + "learning_rate": 9.110271268988415e-06, + "loss": 0.5127, + "step": 7091 + }, + { + "epoch": 1.0852333588370313, + "grad_norm": 2.1489916563880165, + "learning_rate": 9.107802941634869e-06, + "loss": 0.4452, + "step": 7092 + }, + { + "epoch": 1.0853863810252486, + "grad_norm": 2.2916701107274147, + "learning_rate": 9.105334669074637e-06, + "loss": 0.4841, + "step": 7093 + }, + { + "epoch": 1.085539403213466, + "grad_norm": 2.152474576028727, + "learning_rate": 9.102866451459299e-06, + "loss": 0.416, + "step": 7094 + }, + { + "epoch": 1.0856924254016833, + "grad_norm": 1.9391836681067787, + "learning_rate": 9.100398288940443e-06, + "loss": 0.3911, + "step": 7095 + }, + { + "epoch": 1.0858454475899006, + "grad_norm": 2.284477748078566, + "learning_rate": 9.09793018166965e-06, + "loss": 0.4323, + "step": 7096 + }, + { + "epoch": 1.0859984697781178, + "grad_norm": 2.0810449204826558, + "learning_rate": 9.09546212979849e-06, + "loss": 0.4461, + "step": 7097 + }, + { + "epoch": 1.086151491966335, + "grad_norm": 2.102591065248399, + "learning_rate": 9.09299413347854e-06, + "loss": 0.4236, + "step": 7098 + }, + { + "epoch": 1.0863045141545524, + "grad_norm": 2.285514466521769, + "learning_rate": 9.090526192861367e-06, + "loss": 0.4432, + "step": 7099 + }, + { + "epoch": 1.0864575363427698, + "grad_norm": 2.2247005114609064, + "learning_rate": 9.088058308098542e-06, + "loss": 0.4757, + "step": 7100 + }, + { + "epoch": 1.086610558530987, + "grad_norm": 2.379153491410123, + "learning_rate": 9.085590479341623e-06, + "loss": 0.4765, + "step": 7101 + }, + { + "epoch": 1.0867635807192042, + "grad_norm": 2.321895207959366, + "learning_rate": 9.083122706742166e-06, + "loss": 0.5086, + "step": 7102 + }, + { + "epoch": 1.0869166029074215, + "grad_norm": 2.3449443560981345, + "learning_rate": 9.080654990451736e-06, + "loss": 0.4743, + "step": 7103 + }, + { + "epoch": 1.0870696250956389, + "grad_norm": 2.169152507590791, + "learning_rate": 9.078187330621876e-06, + "loss": 0.4113, + "step": 7104 + }, + { + "epoch": 1.0872226472838562, + "grad_norm": 2.2823786746054187, + "learning_rate": 9.075719727404138e-06, + "loss": 0.4769, + "step": 7105 + }, + { + "epoch": 1.0873756694720735, + "grad_norm": 2.2438777723684966, + "learning_rate": 9.073252180950072e-06, + "loss": 0.4935, + "step": 7106 + }, + { + "epoch": 1.0875286916602906, + "grad_norm": 2.191000328520221, + "learning_rate": 9.070784691411208e-06, + "loss": 0.4083, + "step": 7107 + }, + { + "epoch": 1.087681713848508, + "grad_norm": 2.2185239900801275, + "learning_rate": 9.068317258939096e-06, + "loss": 0.4524, + "step": 7108 + }, + { + "epoch": 1.0878347360367253, + "grad_norm": 2.469228148557464, + "learning_rate": 9.065849883685265e-06, + "loss": 0.4708, + "step": 7109 + }, + { + "epoch": 1.0879877582249426, + "grad_norm": 2.148317257106404, + "learning_rate": 9.063382565801247e-06, + "loss": 0.4117, + "step": 7110 + }, + { + "epoch": 1.08814078041316, + "grad_norm": 2.46960126434831, + "learning_rate": 9.060915305438567e-06, + "loss": 0.5797, + "step": 7111 + }, + { + "epoch": 1.088293802601377, + "grad_norm": 2.5654903458195295, + "learning_rate": 9.058448102748753e-06, + "loss": 0.5204, + "step": 7112 + }, + { + "epoch": 1.0884468247895944, + "grad_norm": 2.5226254011755085, + "learning_rate": 
9.055980957883329e-06, + "loss": 0.5013, + "step": 7113 + }, + { + "epoch": 1.0885998469778118, + "grad_norm": 2.444152682210665, + "learning_rate": 9.053513870993803e-06, + "loss": 0.4974, + "step": 7114 + }, + { + "epoch": 1.088752869166029, + "grad_norm": 2.1433965549708223, + "learning_rate": 9.051046842231693e-06, + "loss": 0.3761, + "step": 7115 + }, + { + "epoch": 1.0889058913542464, + "grad_norm": 2.2244339091179848, + "learning_rate": 9.04857987174851e-06, + "loss": 0.4533, + "step": 7116 + }, + { + "epoch": 1.0890589135424638, + "grad_norm": 2.3900548171797342, + "learning_rate": 9.04611295969576e-06, + "loss": 0.4394, + "step": 7117 + }, + { + "epoch": 1.0892119357306809, + "grad_norm": 2.477441551752149, + "learning_rate": 9.043646106224942e-06, + "loss": 0.4756, + "step": 7118 + }, + { + "epoch": 1.0893649579188982, + "grad_norm": 2.2301190063141707, + "learning_rate": 9.04117931148756e-06, + "loss": 0.4872, + "step": 7119 + }, + { + "epoch": 1.0895179801071155, + "grad_norm": 2.2765796549488373, + "learning_rate": 9.038712575635105e-06, + "loss": 0.5097, + "step": 7120 + }, + { + "epoch": 1.0896710022953329, + "grad_norm": 2.516283088910062, + "learning_rate": 9.036245898819074e-06, + "loss": 0.4335, + "step": 7121 + }, + { + "epoch": 1.0898240244835502, + "grad_norm": 2.279848242904234, + "learning_rate": 9.033779281190953e-06, + "loss": 0.5019, + "step": 7122 + }, + { + "epoch": 1.0899770466717673, + "grad_norm": 2.1190201556445114, + "learning_rate": 9.031312722902223e-06, + "loss": 0.4462, + "step": 7123 + }, + { + "epoch": 1.0901300688599846, + "grad_norm": 2.3137880793300463, + "learning_rate": 9.028846224104369e-06, + "loss": 0.4546, + "step": 7124 + }, + { + "epoch": 1.090283091048202, + "grad_norm": 2.1680262820024723, + "learning_rate": 9.026379784948873e-06, + "loss": 0.4073, + "step": 7125 + }, + { + "epoch": 1.0904361132364193, + "grad_norm": 2.2731966671711166, + "learning_rate": 9.023913405587196e-06, + "loss": 0.4626, + "step": 7126 + }, + { + "epoch": 1.0905891354246366, + "grad_norm": 2.066266313161121, + "learning_rate": 9.021447086170818e-06, + "loss": 0.4734, + "step": 7127 + }, + { + "epoch": 1.090742157612854, + "grad_norm": 1.9706951129586714, + "learning_rate": 9.0189808268512e-06, + "loss": 0.4781, + "step": 7128 + }, + { + "epoch": 1.090895179801071, + "grad_norm": 2.141784621387068, + "learning_rate": 9.016514627779811e-06, + "loss": 0.484, + "step": 7129 + }, + { + "epoch": 1.0910482019892884, + "grad_norm": 2.63631118781373, + "learning_rate": 9.014048489108106e-06, + "loss": 0.4505, + "step": 7130 + }, + { + "epoch": 1.0912012241775058, + "grad_norm": 2.1987115618847795, + "learning_rate": 9.011582410987538e-06, + "loss": 0.4639, + "step": 7131 + }, + { + "epoch": 1.091354246365723, + "grad_norm": 2.134264485035292, + "learning_rate": 9.009116393569563e-06, + "loss": 0.4645, + "step": 7132 + }, + { + "epoch": 1.0915072685539404, + "grad_norm": 2.0750373860179785, + "learning_rate": 9.006650437005627e-06, + "loss": 0.4651, + "step": 7133 + }, + { + "epoch": 1.0916602907421575, + "grad_norm": 2.2855081732746036, + "learning_rate": 9.004184541447169e-06, + "loss": 0.4995, + "step": 7134 + }, + { + "epoch": 1.0918133129303749, + "grad_norm": 2.4098402902812914, + "learning_rate": 9.00171870704564e-06, + "loss": 0.4734, + "step": 7135 + }, + { + "epoch": 1.0919663351185922, + "grad_norm": 2.032376714257287, + "learning_rate": 8.999252933952465e-06, + "loss": 0.3866, + "step": 7136 + }, + { + "epoch": 1.0921193573068095, + "grad_norm": 
2.15493218358809, + "learning_rate": 8.996787222319087e-06, + "loss": 0.5154, + "step": 7137 + }, + { + "epoch": 1.0922723794950269, + "grad_norm": 2.0858765860099173, + "learning_rate": 8.99432157229693e-06, + "loss": 0.3879, + "step": 7138 + }, + { + "epoch": 1.092425401683244, + "grad_norm": 2.5591535079462098, + "learning_rate": 8.991855984037414e-06, + "loss": 0.4634, + "step": 7139 + }, + { + "epoch": 1.0925784238714613, + "grad_norm": 2.4454327491285417, + "learning_rate": 8.989390457691973e-06, + "loss": 0.4858, + "step": 7140 + }, + { + "epoch": 1.0927314460596786, + "grad_norm": 1.8496428093473942, + "learning_rate": 8.986924993412012e-06, + "loss": 0.4182, + "step": 7141 + }, + { + "epoch": 1.092884468247896, + "grad_norm": 1.9629732376566862, + "learning_rate": 8.984459591348952e-06, + "loss": 0.414, + "step": 7142 + }, + { + "epoch": 1.0930374904361133, + "grad_norm": 2.382561085581707, + "learning_rate": 8.981994251654205e-06, + "loss": 0.4709, + "step": 7143 + }, + { + "epoch": 1.0931905126243304, + "grad_norm": 2.076307911023831, + "learning_rate": 8.979528974479169e-06, + "loss": 0.4202, + "step": 7144 + }, + { + "epoch": 1.0933435348125478, + "grad_norm": 2.1286263317947687, + "learning_rate": 8.977063759975251e-06, + "loss": 0.4613, + "step": 7145 + }, + { + "epoch": 1.093496557000765, + "grad_norm": 2.4719234461371986, + "learning_rate": 8.974598608293853e-06, + "loss": 0.4872, + "step": 7146 + }, + { + "epoch": 1.0936495791889824, + "grad_norm": 1.9484181799070321, + "learning_rate": 8.972133519586361e-06, + "loss": 0.349, + "step": 7147 + }, + { + "epoch": 1.0938026013771998, + "grad_norm": 2.149297644516257, + "learning_rate": 8.969668494004173e-06, + "loss": 0.4462, + "step": 7148 + }, + { + "epoch": 1.0939556235654169, + "grad_norm": 2.3154019028958643, + "learning_rate": 8.967203531698672e-06, + "loss": 0.4999, + "step": 7149 + }, + { + "epoch": 1.0941086457536342, + "grad_norm": 2.345475674509572, + "learning_rate": 8.964738632821245e-06, + "loss": 0.4747, + "step": 7150 + }, + { + "epoch": 1.0942616679418515, + "grad_norm": 2.028434119282612, + "learning_rate": 8.962273797523268e-06, + "loss": 0.3636, + "step": 7151 + }, + { + "epoch": 1.0944146901300689, + "grad_norm": 2.5352442491743523, + "learning_rate": 8.959809025956113e-06, + "loss": 0.4937, + "step": 7152 + }, + { + "epoch": 1.0945677123182862, + "grad_norm": 2.0639018071881408, + "learning_rate": 8.957344318271161e-06, + "loss": 0.3728, + "step": 7153 + }, + { + "epoch": 1.0947207345065035, + "grad_norm": 2.3340432211508357, + "learning_rate": 8.954879674619769e-06, + "loss": 0.4858, + "step": 7154 + }, + { + "epoch": 1.0948737566947206, + "grad_norm": 2.0559018209376685, + "learning_rate": 8.952415095153305e-06, + "loss": 0.4086, + "step": 7155 + }, + { + "epoch": 1.095026778882938, + "grad_norm": 2.316442921819214, + "learning_rate": 8.949950580023131e-06, + "loss": 0.4435, + "step": 7156 + }, + { + "epoch": 1.0951798010711553, + "grad_norm": 1.9374221563323648, + "learning_rate": 8.947486129380597e-06, + "loss": 0.4009, + "step": 7157 + }, + { + "epoch": 1.0953328232593726, + "grad_norm": 2.030063875991149, + "learning_rate": 8.945021743377057e-06, + "loss": 0.3603, + "step": 7158 + }, + { + "epoch": 1.09548584544759, + "grad_norm": 2.1826213973706667, + "learning_rate": 8.942557422163864e-06, + "loss": 0.4447, + "step": 7159 + }, + { + "epoch": 1.095638867635807, + "grad_norm": 2.252983268260625, + "learning_rate": 8.94009316589235e-06, + "loss": 0.4963, + "step": 7160 + }, + { + "epoch": 
1.0957918898240244, + "grad_norm": 2.6669262080305516, + "learning_rate": 8.937628974713864e-06, + "loss": 0.5089, + "step": 7161 + }, + { + "epoch": 1.0959449120122418, + "grad_norm": 2.12773257212889, + "learning_rate": 8.93516484877974e-06, + "loss": 0.4969, + "step": 7162 + }, + { + "epoch": 1.096097934200459, + "grad_norm": 2.3072830017793313, + "learning_rate": 8.932700788241304e-06, + "loss": 0.479, + "step": 7163 + }, + { + "epoch": 1.0962509563886764, + "grad_norm": 2.2042128737534004, + "learning_rate": 8.93023679324989e-06, + "loss": 0.4839, + "step": 7164 + }, + { + "epoch": 1.0964039785768938, + "grad_norm": 2.793680246758663, + "learning_rate": 8.92777286395682e-06, + "loss": 0.4981, + "step": 7165 + }, + { + "epoch": 1.0965570007651109, + "grad_norm": 2.225104256306868, + "learning_rate": 8.925309000513414e-06, + "loss": 0.4377, + "step": 7166 + }, + { + "epoch": 1.0967100229533282, + "grad_norm": 1.9745454245625953, + "learning_rate": 8.922845203070984e-06, + "loss": 0.4017, + "step": 7167 + }, + { + "epoch": 1.0968630451415455, + "grad_norm": 2.368420047885716, + "learning_rate": 8.920381471780843e-06, + "loss": 0.5215, + "step": 7168 + }, + { + "epoch": 1.0970160673297629, + "grad_norm": 2.051795870663101, + "learning_rate": 8.917917806794305e-06, + "loss": 0.3828, + "step": 7169 + }, + { + "epoch": 1.0971690895179802, + "grad_norm": 2.123677254807295, + "learning_rate": 8.915454208262664e-06, + "loss": 0.4633, + "step": 7170 + }, + { + "epoch": 1.0973221117061973, + "grad_norm": 2.2627608124003604, + "learning_rate": 8.912990676337221e-06, + "loss": 0.4726, + "step": 7171 + }, + { + "epoch": 1.0974751338944146, + "grad_norm": 2.103757846488755, + "learning_rate": 8.910527211169277e-06, + "loss": 0.4227, + "step": 7172 + }, + { + "epoch": 1.097628156082632, + "grad_norm": 2.2143920295035544, + "learning_rate": 8.908063812910116e-06, + "loss": 0.4625, + "step": 7173 + }, + { + "epoch": 1.0977811782708493, + "grad_norm": 2.100586428663275, + "learning_rate": 8.905600481711027e-06, + "loss": 0.4364, + "step": 7174 + }, + { + "epoch": 1.0979342004590666, + "grad_norm": 2.058446058812767, + "learning_rate": 8.9031372177233e-06, + "loss": 0.4965, + "step": 7175 + }, + { + "epoch": 1.0980872226472838, + "grad_norm": 2.588950853621863, + "learning_rate": 8.9006740210982e-06, + "loss": 0.3974, + "step": 7176 + }, + { + "epoch": 1.098240244835501, + "grad_norm": 2.1324434817566065, + "learning_rate": 8.898210891987012e-06, + "loss": 0.5475, + "step": 7177 + }, + { + "epoch": 1.0983932670237184, + "grad_norm": 2.0296371639639674, + "learning_rate": 8.895747830541001e-06, + "loss": 0.4481, + "step": 7178 + }, + { + "epoch": 1.0985462892119358, + "grad_norm": 2.0897999254760973, + "learning_rate": 8.89328483691144e-06, + "loss": 0.4616, + "step": 7179 + }, + { + "epoch": 1.098699311400153, + "grad_norm": 2.14171265984956, + "learning_rate": 8.890821911249585e-06, + "loss": 0.4869, + "step": 7180 + }, + { + "epoch": 1.0988523335883702, + "grad_norm": 2.1623838358382805, + "learning_rate": 8.888359053706694e-06, + "loss": 0.4638, + "step": 7181 + }, + { + "epoch": 1.0990053557765875, + "grad_norm": 1.956343477779932, + "learning_rate": 8.885896264434025e-06, + "loss": 0.3838, + "step": 7182 + }, + { + "epoch": 1.0991583779648049, + "grad_norm": 2.3400499233335768, + "learning_rate": 8.883433543582824e-06, + "loss": 0.4655, + "step": 7183 + }, + { + "epoch": 1.0993114001530222, + "grad_norm": 2.399181740585056, + "learning_rate": 8.880970891304334e-06, + "loss": 0.4578, + "step": 7184 + 
}, + { + "epoch": 1.0994644223412395, + "grad_norm": 2.250008234169996, + "learning_rate": 8.878508307749804e-06, + "loss": 0.428, + "step": 7185 + }, + { + "epoch": 1.0996174445294569, + "grad_norm": 2.290696083811117, + "learning_rate": 8.87604579307046e-06, + "loss": 0.4269, + "step": 7186 + }, + { + "epoch": 1.099770466717674, + "grad_norm": 2.505122870008909, + "learning_rate": 8.873583347417546e-06, + "loss": 0.5127, + "step": 7187 + }, + { + "epoch": 1.0999234889058913, + "grad_norm": 2.353030872555066, + "learning_rate": 8.871120970942285e-06, + "loss": 0.4578, + "step": 7188 + }, + { + "epoch": 1.1000765110941086, + "grad_norm": 2.301167013530359, + "learning_rate": 8.868658663795898e-06, + "loss": 0.3812, + "step": 7189 + }, + { + "epoch": 1.100229533282326, + "grad_norm": 2.2533182412177513, + "learning_rate": 8.866196426129612e-06, + "loss": 0.427, + "step": 7190 + }, + { + "epoch": 1.1003825554705433, + "grad_norm": 2.4637131571486406, + "learning_rate": 8.863734258094638e-06, + "loss": 0.4376, + "step": 7191 + }, + { + "epoch": 1.1005355776587604, + "grad_norm": 2.0197232101443223, + "learning_rate": 8.861272159842186e-06, + "loss": 0.4344, + "step": 7192 + }, + { + "epoch": 1.1006885998469778, + "grad_norm": 2.251483371734288, + "learning_rate": 8.858810131523467e-06, + "loss": 0.4298, + "step": 7193 + }, + { + "epoch": 1.100841622035195, + "grad_norm": 2.287008056541385, + "learning_rate": 8.85634817328968e-06, + "loss": 0.4718, + "step": 7194 + }, + { + "epoch": 1.1009946442234124, + "grad_norm": 2.209324158741729, + "learning_rate": 8.85388628529203e-06, + "loss": 0.4657, + "step": 7195 + }, + { + "epoch": 1.1011476664116298, + "grad_norm": 2.1505467139385304, + "learning_rate": 8.851424467681705e-06, + "loss": 0.4322, + "step": 7196 + }, + { + "epoch": 1.101300688599847, + "grad_norm": 2.1982904432190984, + "learning_rate": 8.848962720609894e-06, + "loss": 0.4675, + "step": 7197 + }, + { + "epoch": 1.1014537107880642, + "grad_norm": 2.1486253183804767, + "learning_rate": 8.84650104422779e-06, + "loss": 0.4145, + "step": 7198 + }, + { + "epoch": 1.1016067329762815, + "grad_norm": 2.0660350104739864, + "learning_rate": 8.844039438686566e-06, + "loss": 0.4289, + "step": 7199 + }, + { + "epoch": 1.1017597551644989, + "grad_norm": 2.2958785347357793, + "learning_rate": 8.841577904137402e-06, + "loss": 0.4849, + "step": 7200 + }, + { + "epoch": 1.1019127773527162, + "grad_norm": 2.3249009204416216, + "learning_rate": 8.839116440731475e-06, + "loss": 0.4445, + "step": 7201 + }, + { + "epoch": 1.1020657995409335, + "grad_norm": 2.2802870963931188, + "learning_rate": 8.836655048619943e-06, + "loss": 0.4219, + "step": 7202 + }, + { + "epoch": 1.1022188217291506, + "grad_norm": 2.1524504956749304, + "learning_rate": 8.834193727953977e-06, + "loss": 0.4425, + "step": 7203 + }, + { + "epoch": 1.102371843917368, + "grad_norm": 2.3059346698587024, + "learning_rate": 8.831732478884739e-06, + "loss": 0.481, + "step": 7204 + }, + { + "epoch": 1.1025248661055853, + "grad_norm": 2.017182531907496, + "learning_rate": 8.829271301563375e-06, + "loss": 0.3548, + "step": 7205 + }, + { + "epoch": 1.1026778882938026, + "grad_norm": 2.1409796994991286, + "learning_rate": 8.826810196141042e-06, + "loss": 0.5375, + "step": 7206 + }, + { + "epoch": 1.10283091048202, + "grad_norm": 2.1409288991943924, + "learning_rate": 8.824349162768882e-06, + "loss": 0.4341, + "step": 7207 + }, + { + "epoch": 1.102983932670237, + "grad_norm": 2.198060299539323, + "learning_rate": 8.821888201598044e-06, + "loss": 
0.4512, + "step": 7208 + }, + { + "epoch": 1.1031369548584544, + "grad_norm": 2.262625789778196, + "learning_rate": 8.819427312779658e-06, + "loss": 0.4792, + "step": 7209 + }, + { + "epoch": 1.1032899770466718, + "grad_norm": 2.2566093525445594, + "learning_rate": 8.816966496464858e-06, + "loss": 0.4995, + "step": 7210 + }, + { + "epoch": 1.103442999234889, + "grad_norm": 2.4283834426361994, + "learning_rate": 8.814505752804776e-06, + "loss": 0.4806, + "step": 7211 + }, + { + "epoch": 1.1035960214231064, + "grad_norm": 2.170904697292499, + "learning_rate": 8.812045081950532e-06, + "loss": 0.4919, + "step": 7212 + }, + { + "epoch": 1.1037490436113235, + "grad_norm": 1.9863702432643318, + "learning_rate": 8.809584484053245e-06, + "loss": 0.375, + "step": 7213 + }, + { + "epoch": 1.1039020657995409, + "grad_norm": 2.5916602921862286, + "learning_rate": 8.807123959264036e-06, + "loss": 0.4797, + "step": 7214 + }, + { + "epoch": 1.1040550879877582, + "grad_norm": 2.434544800191652, + "learning_rate": 8.804663507734004e-06, + "loss": 0.3995, + "step": 7215 + }, + { + "epoch": 1.1042081101759755, + "grad_norm": 2.499246869230129, + "learning_rate": 8.802203129614269e-06, + "loss": 0.5353, + "step": 7216 + }, + { + "epoch": 1.1043611323641929, + "grad_norm": 2.0781080712876117, + "learning_rate": 8.799742825055927e-06, + "loss": 0.38, + "step": 7217 + }, + { + "epoch": 1.1045141545524102, + "grad_norm": 2.394442587470399, + "learning_rate": 8.797282594210066e-06, + "loss": 0.4759, + "step": 7218 + }, + { + "epoch": 1.1046671767406273, + "grad_norm": 2.51859505988219, + "learning_rate": 8.79482243722779e-06, + "loss": 0.4989, + "step": 7219 + }, + { + "epoch": 1.1048201989288446, + "grad_norm": 2.075859944947023, + "learning_rate": 8.792362354260185e-06, + "loss": 0.4434, + "step": 7220 + }, + { + "epoch": 1.104973221117062, + "grad_norm": 2.4451130046615592, + "learning_rate": 8.789902345458326e-06, + "loss": 0.4663, + "step": 7221 + }, + { + "epoch": 1.1051262433052793, + "grad_norm": 2.3113134861788036, + "learning_rate": 8.787442410973305e-06, + "loss": 0.4555, + "step": 7222 + }, + { + "epoch": 1.1052792654934966, + "grad_norm": 2.1639552175738856, + "learning_rate": 8.784982550956184e-06, + "loss": 0.424, + "step": 7223 + }, + { + "epoch": 1.1054322876817138, + "grad_norm": 1.9803878884912123, + "learning_rate": 8.782522765558039e-06, + "loss": 0.4537, + "step": 7224 + }, + { + "epoch": 1.105585309869931, + "grad_norm": 2.2787793998286823, + "learning_rate": 8.780063054929937e-06, + "loss": 0.4371, + "step": 7225 + }, + { + "epoch": 1.1057383320581484, + "grad_norm": 2.3573504102686713, + "learning_rate": 8.77760341922293e-06, + "loss": 0.5008, + "step": 7226 + }, + { + "epoch": 1.1058913542463658, + "grad_norm": 2.1996380165093177, + "learning_rate": 8.775143858588082e-06, + "loss": 0.4169, + "step": 7227 + }, + { + "epoch": 1.106044376434583, + "grad_norm": 2.616789474953216, + "learning_rate": 8.772684373176443e-06, + "loss": 0.4648, + "step": 7228 + }, + { + "epoch": 1.1061973986228004, + "grad_norm": 2.128350016291785, + "learning_rate": 8.770224963139054e-06, + "loss": 0.4217, + "step": 7229 + }, + { + "epoch": 1.1063504208110175, + "grad_norm": 2.579626546907676, + "learning_rate": 8.767765628626962e-06, + "loss": 0.5056, + "step": 7230 + }, + { + "epoch": 1.1065034429992349, + "grad_norm": 2.2513999128668636, + "learning_rate": 8.765306369791204e-06, + "loss": 0.4425, + "step": 7231 + }, + { + "epoch": 1.1066564651874522, + "grad_norm": 2.442135717799566, + "learning_rate": 
8.762847186782813e-06, + "loss": 0.506, + "step": 7232 + }, + { + "epoch": 1.1068094873756695, + "grad_norm": 2.090965212912735, + "learning_rate": 8.760388079752815e-06, + "loss": 0.4475, + "step": 7233 + }, + { + "epoch": 1.1069625095638869, + "grad_norm": 2.2371641896853802, + "learning_rate": 8.757929048852233e-06, + "loss": 0.484, + "step": 7234 + }, + { + "epoch": 1.107115531752104, + "grad_norm": 2.3009833508605007, + "learning_rate": 8.75547009423209e-06, + "loss": 0.3488, + "step": 7235 + }, + { + "epoch": 1.1072685539403213, + "grad_norm": 2.1163473200298957, + "learning_rate": 8.753011216043396e-06, + "loss": 0.3902, + "step": 7236 + }, + { + "epoch": 1.1074215761285386, + "grad_norm": 2.3626068037558685, + "learning_rate": 8.75055241443716e-06, + "loss": 0.4614, + "step": 7237 + }, + { + "epoch": 1.107574598316756, + "grad_norm": 1.8373619367737042, + "learning_rate": 8.748093689564392e-06, + "loss": 0.3726, + "step": 7238 + }, + { + "epoch": 1.1077276205049733, + "grad_norm": 1.9346840039209652, + "learning_rate": 8.745635041576082e-06, + "loss": 0.3809, + "step": 7239 + }, + { + "epoch": 1.1078806426931904, + "grad_norm": 2.1799202758607312, + "learning_rate": 8.743176470623236e-06, + "loss": 0.4697, + "step": 7240 + }, + { + "epoch": 1.1080336648814078, + "grad_norm": 2.351668788150053, + "learning_rate": 8.74071797685684e-06, + "loss": 0.3959, + "step": 7241 + }, + { + "epoch": 1.108186687069625, + "grad_norm": 2.0925064474546686, + "learning_rate": 8.738259560427876e-06, + "loss": 0.41, + "step": 7242 + }, + { + "epoch": 1.1083397092578424, + "grad_norm": 2.260684681391703, + "learning_rate": 8.73580122148733e-06, + "loss": 0.4818, + "step": 7243 + }, + { + "epoch": 1.1084927314460598, + "grad_norm": 2.1853032894152995, + "learning_rate": 8.733342960186174e-06, + "loss": 0.4506, + "step": 7244 + }, + { + "epoch": 1.1086457536342769, + "grad_norm": 1.9906423609607566, + "learning_rate": 8.730884776675386e-06, + "loss": 0.3836, + "step": 7245 + }, + { + "epoch": 1.1087987758224942, + "grad_norm": 1.9829231974861674, + "learning_rate": 8.728426671105929e-06, + "loss": 0.497, + "step": 7246 + }, + { + "epoch": 1.1089517980107115, + "grad_norm": 2.0781493892344316, + "learning_rate": 8.72596864362876e-06, + "loss": 0.4439, + "step": 7247 + }, + { + "epoch": 1.1091048201989289, + "grad_norm": 2.380914255234149, + "learning_rate": 8.723510694394845e-06, + "loss": 0.4626, + "step": 7248 + }, + { + "epoch": 1.1092578423871462, + "grad_norm": 2.430407170746528, + "learning_rate": 8.721052823555131e-06, + "loss": 0.4854, + "step": 7249 + }, + { + "epoch": 1.1094108645753635, + "grad_norm": 2.4319507114869667, + "learning_rate": 8.718595031260564e-06, + "loss": 0.3742, + "step": 7250 + }, + { + "epoch": 1.1095638867635806, + "grad_norm": 2.093265509251606, + "learning_rate": 8.716137317662093e-06, + "loss": 0.388, + "step": 7251 + }, + { + "epoch": 1.109716908951798, + "grad_norm": 2.4559992923329106, + "learning_rate": 8.713679682910648e-06, + "loss": 0.4244, + "step": 7252 + }, + { + "epoch": 1.1098699311400153, + "grad_norm": 2.1829029164513982, + "learning_rate": 8.711222127157166e-06, + "loss": 0.3357, + "step": 7253 + }, + { + "epoch": 1.1100229533282326, + "grad_norm": 2.3403552031528903, + "learning_rate": 8.70876465055258e-06, + "loss": 0.4064, + "step": 7254 + }, + { + "epoch": 1.11017597551645, + "grad_norm": 1.989255895084661, + "learning_rate": 8.7063072532478e-06, + "loss": 0.3819, + "step": 7255 + }, + { + "epoch": 1.110328997704667, + "grad_norm": 2.184660067497534, + 
"learning_rate": 8.703849935393758e-06, + "loss": 0.4368, + "step": 7256 + }, + { + "epoch": 1.1104820198928844, + "grad_norm": 2.349449352310267, + "learning_rate": 8.701392697141363e-06, + "loss": 0.4845, + "step": 7257 + }, + { + "epoch": 1.1106350420811018, + "grad_norm": 2.198702813821092, + "learning_rate": 8.698935538641519e-06, + "loss": 0.4256, + "step": 7258 + }, + { + "epoch": 1.110788064269319, + "grad_norm": 2.342120943064589, + "learning_rate": 8.696478460045134e-06, + "loss": 0.451, + "step": 7259 + }, + { + "epoch": 1.1109410864575364, + "grad_norm": 2.187839864774295, + "learning_rate": 8.694021461503105e-06, + "loss": 0.4097, + "step": 7260 + }, + { + "epoch": 1.1110941086457535, + "grad_norm": 2.165677195044475, + "learning_rate": 8.691564543166332e-06, + "loss": 0.4467, + "step": 7261 + }, + { + "epoch": 1.1112471308339709, + "grad_norm": 2.278960948831545, + "learning_rate": 8.689107705185697e-06, + "loss": 0.4216, + "step": 7262 + }, + { + "epoch": 1.1114001530221882, + "grad_norm": 2.1925602427988085, + "learning_rate": 8.686650947712084e-06, + "loss": 0.4818, + "step": 7263 + }, + { + "epoch": 1.1115531752104055, + "grad_norm": 2.4280978672138365, + "learning_rate": 8.684194270896376e-06, + "loss": 0.4793, + "step": 7264 + }, + { + "epoch": 1.1117061973986229, + "grad_norm": 2.413291630851117, + "learning_rate": 8.681737674889445e-06, + "loss": 0.436, + "step": 7265 + }, + { + "epoch": 1.1118592195868402, + "grad_norm": 2.182619190908532, + "learning_rate": 8.679281159842157e-06, + "loss": 0.4091, + "step": 7266 + }, + { + "epoch": 1.1120122417750573, + "grad_norm": 2.352484482668082, + "learning_rate": 8.676824725905386e-06, + "loss": 0.4431, + "step": 7267 + }, + { + "epoch": 1.1121652639632746, + "grad_norm": 2.104077049629743, + "learning_rate": 8.67436837322998e-06, + "loss": 0.4664, + "step": 7268 + }, + { + "epoch": 1.112318286151492, + "grad_norm": 1.9325614114153729, + "learning_rate": 8.671912101966799e-06, + "loss": 0.4152, + "step": 7269 + }, + { + "epoch": 1.1124713083397093, + "grad_norm": 2.2218232070237933, + "learning_rate": 8.669455912266696e-06, + "loss": 0.4681, + "step": 7270 + }, + { + "epoch": 1.1126243305279266, + "grad_norm": 2.3511511595238925, + "learning_rate": 8.666999804280503e-06, + "loss": 0.4319, + "step": 7271 + }, + { + "epoch": 1.1127773527161438, + "grad_norm": 2.5154012800065515, + "learning_rate": 8.664543778159071e-06, + "loss": 0.5433, + "step": 7272 + }, + { + "epoch": 1.112930374904361, + "grad_norm": 2.094015164774007, + "learning_rate": 8.662087834053232e-06, + "loss": 0.4057, + "step": 7273 + }, + { + "epoch": 1.1130833970925784, + "grad_norm": 2.407236738502629, + "learning_rate": 8.659631972113809e-06, + "loss": 0.4519, + "step": 7274 + }, + { + "epoch": 1.1132364192807958, + "grad_norm": 2.3070387985152503, + "learning_rate": 8.65717619249163e-06, + "loss": 0.4029, + "step": 7275 + }, + { + "epoch": 1.113389441469013, + "grad_norm": 2.2050815747283057, + "learning_rate": 8.654720495337514e-06, + "loss": 0.4293, + "step": 7276 + }, + { + "epoch": 1.1135424636572302, + "grad_norm": 1.9874516443915964, + "learning_rate": 8.652264880802279e-06, + "loss": 0.3515, + "step": 7277 + }, + { + "epoch": 1.1136954858454475, + "grad_norm": 2.283468114473653, + "learning_rate": 8.649809349036729e-06, + "loss": 0.5043, + "step": 7278 + }, + { + "epoch": 1.1138485080336649, + "grad_norm": 2.228147809122041, + "learning_rate": 8.647353900191665e-06, + "loss": 0.4794, + "step": 7279 + }, + { + "epoch": 1.1140015302218822, + 
"grad_norm": 2.181698558558112, + "learning_rate": 8.644898534417893e-06, + "loss": 0.4362, + "step": 7280 + }, + { + "epoch": 1.1141545524100995, + "grad_norm": 2.7327555296956514, + "learning_rate": 8.642443251866202e-06, + "loss": 0.5515, + "step": 7281 + }, + { + "epoch": 1.1143075745983166, + "grad_norm": 2.172183101436314, + "learning_rate": 8.639988052687383e-06, + "loss": 0.5035, + "step": 7282 + }, + { + "epoch": 1.114460596786534, + "grad_norm": 2.3012502821953564, + "learning_rate": 8.637532937032218e-06, + "loss": 0.42, + "step": 7283 + }, + { + "epoch": 1.1146136189747513, + "grad_norm": 2.3750682620962333, + "learning_rate": 8.635077905051484e-06, + "loss": 0.4683, + "step": 7284 + }, + { + "epoch": 1.1147666411629686, + "grad_norm": 2.295739800025103, + "learning_rate": 8.632622956895956e-06, + "loss": 0.4134, + "step": 7285 + }, + { + "epoch": 1.114919663351186, + "grad_norm": 2.5572348746935862, + "learning_rate": 8.630168092716403e-06, + "loss": 0.4558, + "step": 7286 + }, + { + "epoch": 1.1150726855394033, + "grad_norm": 2.2569408220861042, + "learning_rate": 8.627713312663582e-06, + "loss": 0.4033, + "step": 7287 + }, + { + "epoch": 1.1152257077276204, + "grad_norm": 2.270829840814031, + "learning_rate": 8.625258616888258e-06, + "loss": 0.5059, + "step": 7288 + }, + { + "epoch": 1.1153787299158378, + "grad_norm": 2.402699754110219, + "learning_rate": 8.622804005541179e-06, + "loss": 0.462, + "step": 7289 + }, + { + "epoch": 1.115531752104055, + "grad_norm": 2.4722485610391782, + "learning_rate": 8.620349478773095e-06, + "loss": 0.4529, + "step": 7290 + }, + { + "epoch": 1.1156847742922724, + "grad_norm": 2.1893417797352344, + "learning_rate": 8.617895036734747e-06, + "loss": 0.4421, + "step": 7291 + }, + { + "epoch": 1.1158377964804898, + "grad_norm": 2.1580635465283633, + "learning_rate": 8.61544067957687e-06, + "loss": 0.3836, + "step": 7292 + }, + { + "epoch": 1.1159908186687069, + "grad_norm": 2.4058476302646374, + "learning_rate": 8.6129864074502e-06, + "loss": 0.4373, + "step": 7293 + }, + { + "epoch": 1.1161438408569242, + "grad_norm": 2.3222777156880037, + "learning_rate": 8.610532220505459e-06, + "loss": 0.5089, + "step": 7294 + }, + { + "epoch": 1.1162968630451415, + "grad_norm": 2.1307415290698852, + "learning_rate": 8.608078118893368e-06, + "loss": 0.5199, + "step": 7295 + }, + { + "epoch": 1.1164498852333589, + "grad_norm": 2.383794896617956, + "learning_rate": 8.60562410276465e-06, + "loss": 0.4593, + "step": 7296 + }, + { + "epoch": 1.1166029074215762, + "grad_norm": 2.5837278205838663, + "learning_rate": 8.603170172270002e-06, + "loss": 0.4638, + "step": 7297 + }, + { + "epoch": 1.1167559296097935, + "grad_norm": 2.2525760948688944, + "learning_rate": 8.600716327560146e-06, + "loss": 0.4474, + "step": 7298 + }, + { + "epoch": 1.1169089517980106, + "grad_norm": 2.615571933223899, + "learning_rate": 8.598262568785775e-06, + "loss": 0.4843, + "step": 7299 + }, + { + "epoch": 1.117061973986228, + "grad_norm": 2.439123207803759, + "learning_rate": 8.595808896097575e-06, + "loss": 0.4902, + "step": 7300 + }, + { + "epoch": 1.1172149961744453, + "grad_norm": 2.2731590292985677, + "learning_rate": 8.593355309646252e-06, + "loss": 0.514, + "step": 7301 + }, + { + "epoch": 1.1173680183626626, + "grad_norm": 2.2254431206595773, + "learning_rate": 8.590901809582482e-06, + "loss": 0.4406, + "step": 7302 + }, + { + "epoch": 1.11752104055088, + "grad_norm": 2.1556714339537226, + "learning_rate": 8.588448396056938e-06, + "loss": 0.4179, + "step": 7303 + }, + { + 
"epoch": 1.117674062739097, + "grad_norm": 2.2555805850430817, + "learning_rate": 8.585995069220305e-06, + "loss": 0.4607, + "step": 7304 + }, + { + "epoch": 1.1178270849273144, + "grad_norm": 2.095833604865627, + "learning_rate": 8.583541829223243e-06, + "loss": 0.5086, + "step": 7305 + }, + { + "epoch": 1.1179801071155318, + "grad_norm": 2.3270010835496397, + "learning_rate": 8.581088676216421e-06, + "loss": 0.4744, + "step": 7306 + }, + { + "epoch": 1.118133129303749, + "grad_norm": 2.491796490650458, + "learning_rate": 8.578635610350497e-06, + "loss": 0.4065, + "step": 7307 + }, + { + "epoch": 1.1182861514919664, + "grad_norm": 2.182575815083912, + "learning_rate": 8.576182631776115e-06, + "loss": 0.4834, + "step": 7308 + }, + { + "epoch": 1.1184391736801835, + "grad_norm": 2.173635352835729, + "learning_rate": 8.57372974064393e-06, + "loss": 0.4341, + "step": 7309 + }, + { + "epoch": 1.1185921958684009, + "grad_norm": 2.3043141043629274, + "learning_rate": 8.571276937104584e-06, + "loss": 0.464, + "step": 7310 + }, + { + "epoch": 1.1187452180566182, + "grad_norm": 2.316805699809123, + "learning_rate": 8.568824221308705e-06, + "loss": 0.4992, + "step": 7311 + }, + { + "epoch": 1.1188982402448355, + "grad_norm": 2.100553392834741, + "learning_rate": 8.566371593406933e-06, + "loss": 0.447, + "step": 7312 + }, + { + "epoch": 1.1190512624330529, + "grad_norm": 2.3347884127487055, + "learning_rate": 8.563919053549887e-06, + "loss": 0.4517, + "step": 7313 + }, + { + "epoch": 1.11920428462127, + "grad_norm": 2.011899264289727, + "learning_rate": 8.561466601888195e-06, + "loss": 0.381, + "step": 7314 + }, + { + "epoch": 1.1193573068094873, + "grad_norm": 2.264732519715413, + "learning_rate": 8.559014238572463e-06, + "loss": 0.4625, + "step": 7315 + }, + { + "epoch": 1.1195103289977046, + "grad_norm": 2.1982141461822944, + "learning_rate": 8.556561963753303e-06, + "loss": 0.433, + "step": 7316 + }, + { + "epoch": 1.119663351185922, + "grad_norm": 2.2287969140588038, + "learning_rate": 8.554109777581322e-06, + "loss": 0.4322, + "step": 7317 + }, + { + "epoch": 1.1198163733741393, + "grad_norm": 2.0805687752430218, + "learning_rate": 8.551657680207114e-06, + "loss": 0.4368, + "step": 7318 + }, + { + "epoch": 1.1199693955623566, + "grad_norm": 2.167889830590893, + "learning_rate": 8.549205671781273e-06, + "loss": 0.6545, + "step": 7319 + }, + { + "epoch": 1.1201224177505738, + "grad_norm": 2.528070985327336, + "learning_rate": 8.54675375245439e-06, + "loss": 0.5258, + "step": 7320 + }, + { + "epoch": 1.120275439938791, + "grad_norm": 1.787445525421544, + "learning_rate": 8.544301922377043e-06, + "loss": 0.3962, + "step": 7321 + }, + { + "epoch": 1.1204284621270084, + "grad_norm": 2.244017891063301, + "learning_rate": 8.54185018169981e-06, + "loss": 0.4742, + "step": 7322 + }, + { + "epoch": 1.1205814843152258, + "grad_norm": 2.249034278168932, + "learning_rate": 8.539398530573264e-06, + "loss": 0.4758, + "step": 7323 + }, + { + "epoch": 1.120734506503443, + "grad_norm": 2.0505223499372796, + "learning_rate": 8.536946969147963e-06, + "loss": 0.4034, + "step": 7324 + }, + { + "epoch": 1.1208875286916602, + "grad_norm": 2.433144003092916, + "learning_rate": 8.534495497574472e-06, + "loss": 0.5097, + "step": 7325 + }, + { + "epoch": 1.1210405508798775, + "grad_norm": 2.136664385459225, + "learning_rate": 8.532044116003347e-06, + "loss": 0.4433, + "step": 7326 + }, + { + "epoch": 1.1211935730680949, + "grad_norm": 2.360936102413662, + "learning_rate": 8.52959282458514e-06, + "loss": 0.4784, + "step": 
7327 + }, + { + "epoch": 1.1213465952563122, + "grad_norm": 2.1996065493633106, + "learning_rate": 8.527141623470387e-06, + "loss": 0.4174, + "step": 7328 + }, + { + "epoch": 1.1214996174445295, + "grad_norm": 2.2465195864524463, + "learning_rate": 8.524690512809626e-06, + "loss": 0.3862, + "step": 7329 + }, + { + "epoch": 1.1216526396327469, + "grad_norm": 2.1053183068145045, + "learning_rate": 8.522239492753395e-06, + "loss": 0.475, + "step": 7330 + }, + { + "epoch": 1.121805661820964, + "grad_norm": 2.418720533759001, + "learning_rate": 8.519788563452218e-06, + "loss": 0.4415, + "step": 7331 + }, + { + "epoch": 1.1219586840091813, + "grad_norm": 1.9996187128183676, + "learning_rate": 8.517337725056613e-06, + "loss": 0.4324, + "step": 7332 + }, + { + "epoch": 1.1221117061973986, + "grad_norm": 2.1481582833504693, + "learning_rate": 8.514886977717102e-06, + "loss": 0.4025, + "step": 7333 + }, + { + "epoch": 1.122264728385616, + "grad_norm": 2.2852880745037676, + "learning_rate": 8.51243632158419e-06, + "loss": 0.5204, + "step": 7334 + }, + { + "epoch": 1.1224177505738333, + "grad_norm": 1.9810063469638823, + "learning_rate": 8.509985756808382e-06, + "loss": 0.417, + "step": 7335 + }, + { + "epoch": 1.1225707727620504, + "grad_norm": 2.354682978168584, + "learning_rate": 8.507535283540181e-06, + "loss": 0.4445, + "step": 7336 + }, + { + "epoch": 1.1227237949502678, + "grad_norm": 2.049284352078451, + "learning_rate": 8.505084901930073e-06, + "loss": 0.4622, + "step": 7337 + }, + { + "epoch": 1.122876817138485, + "grad_norm": 2.2985255632558106, + "learning_rate": 8.502634612128551e-06, + "loss": 0.4512, + "step": 7338 + }, + { + "epoch": 1.1230298393267024, + "grad_norm": 2.289298123164532, + "learning_rate": 8.500184414286096e-06, + "loss": 0.4481, + "step": 7339 + }, + { + "epoch": 1.1231828615149198, + "grad_norm": 2.066046084729003, + "learning_rate": 8.49773430855318e-06, + "loss": 0.5145, + "step": 7340 + }, + { + "epoch": 1.1233358837031369, + "grad_norm": 2.4768352381868155, + "learning_rate": 8.495284295080281e-06, + "loss": 0.507, + "step": 7341 + }, + { + "epoch": 1.1234889058913542, + "grad_norm": 2.379390037498325, + "learning_rate": 8.492834374017856e-06, + "loss": 0.4829, + "step": 7342 + }, + { + "epoch": 1.1236419280795715, + "grad_norm": 2.107590759872093, + "learning_rate": 8.490384545516373e-06, + "loss": 0.411, + "step": 7343 + }, + { + "epoch": 1.1237949502677889, + "grad_norm": 1.9761377140115508, + "learning_rate": 8.487934809726277e-06, + "loss": 0.4598, + "step": 7344 + }, + { + "epoch": 1.1239479724560062, + "grad_norm": 1.9653246388829446, + "learning_rate": 8.485485166798019e-06, + "loss": 0.4399, + "step": 7345 + }, + { + "epoch": 1.1241009946442233, + "grad_norm": 2.3005849600149917, + "learning_rate": 8.483035616882046e-06, + "loss": 0.4133, + "step": 7346 + }, + { + "epoch": 1.1242540168324406, + "grad_norm": 2.108212619768816, + "learning_rate": 8.480586160128785e-06, + "loss": 0.4129, + "step": 7347 + }, + { + "epoch": 1.124407039020658, + "grad_norm": 2.189179438990085, + "learning_rate": 8.478136796688674e-06, + "loss": 0.4236, + "step": 7348 + }, + { + "epoch": 1.1245600612088753, + "grad_norm": 2.0614034092253117, + "learning_rate": 8.47568752671214e-06, + "loss": 0.4482, + "step": 7349 + }, + { + "epoch": 1.1247130833970926, + "grad_norm": 2.3106721350735357, + "learning_rate": 8.473238350349593e-06, + "loss": 0.4582, + "step": 7350 + }, + { + "epoch": 1.12486610558531, + "grad_norm": 2.069049495107061, + "learning_rate": 8.470789267751452e-06, + 
"loss": 0.4362, + "step": 7351 + }, + { + "epoch": 1.125019127773527, + "grad_norm": 2.232653664335247, + "learning_rate": 8.468340279068128e-06, + "loss": 0.4467, + "step": 7352 + }, + { + "epoch": 1.1251721499617444, + "grad_norm": 2.2760784233432263, + "learning_rate": 8.465891384450015e-06, + "loss": 0.4328, + "step": 7353 + }, + { + "epoch": 1.1253251721499617, + "grad_norm": 1.9416601069747, + "learning_rate": 8.463442584047516e-06, + "loss": 0.3905, + "step": 7354 + }, + { + "epoch": 1.125478194338179, + "grad_norm": 2.023979021978067, + "learning_rate": 8.460993878011014e-06, + "loss": 0.4174, + "step": 7355 + }, + { + "epoch": 1.1256312165263964, + "grad_norm": 2.585206276544525, + "learning_rate": 8.458545266490906e-06, + "loss": 0.454, + "step": 7356 + }, + { + "epoch": 1.1257842387146135, + "grad_norm": 2.0345591716158298, + "learning_rate": 8.456096749637561e-06, + "loss": 0.3921, + "step": 7357 + }, + { + "epoch": 1.1259372609028309, + "grad_norm": 2.470455072554232, + "learning_rate": 8.453648327601352e-06, + "loss": 0.4929, + "step": 7358 + }, + { + "epoch": 1.1260902830910482, + "grad_norm": 2.3686162736588385, + "learning_rate": 8.451200000532653e-06, + "loss": 0.4353, + "step": 7359 + }, + { + "epoch": 1.1262433052792655, + "grad_norm": 2.135323906416676, + "learning_rate": 8.448751768581818e-06, + "loss": 0.3795, + "step": 7360 + }, + { + "epoch": 1.1263963274674829, + "grad_norm": 2.33267928641562, + "learning_rate": 8.446303631899203e-06, + "loss": 0.4306, + "step": 7361 + }, + { + "epoch": 1.1265493496557002, + "grad_norm": 2.4218289282958114, + "learning_rate": 8.443855590635165e-06, + "loss": 0.451, + "step": 7362 + }, + { + "epoch": 1.1267023718439173, + "grad_norm": 2.132079052539921, + "learning_rate": 8.441407644940038e-06, + "loss": 0.4516, + "step": 7363 + }, + { + "epoch": 1.1268553940321346, + "grad_norm": 2.270796945887167, + "learning_rate": 8.438959794964168e-06, + "loss": 0.4633, + "step": 7364 + }, + { + "epoch": 1.127008416220352, + "grad_norm": 2.23165428476424, + "learning_rate": 8.436512040857885e-06, + "loss": 0.4663, + "step": 7365 + }, + { + "epoch": 1.1271614384085693, + "grad_norm": 2.474571748588033, + "learning_rate": 8.43406438277151e-06, + "loss": 0.4026, + "step": 7366 + }, + { + "epoch": 1.1273144605967866, + "grad_norm": 1.9023084237155796, + "learning_rate": 8.43161682085537e-06, + "loss": 0.4036, + "step": 7367 + }, + { + "epoch": 1.1274674827850037, + "grad_norm": 2.2417421961042034, + "learning_rate": 8.429169355259778e-06, + "loss": 0.401, + "step": 7368 + }, + { + "epoch": 1.127620504973221, + "grad_norm": 2.4185358282453864, + "learning_rate": 8.426721986135037e-06, + "loss": 0.5283, + "step": 7369 + }, + { + "epoch": 1.1277735271614384, + "grad_norm": 2.27465022088665, + "learning_rate": 8.424274713631455e-06, + "loss": 0.4105, + "step": 7370 + }, + { + "epoch": 1.1279265493496557, + "grad_norm": 2.2414361092641815, + "learning_rate": 8.421827537899325e-06, + "loss": 0.4595, + "step": 7371 + }, + { + "epoch": 1.128079571537873, + "grad_norm": 2.2130236838711457, + "learning_rate": 8.419380459088943e-06, + "loss": 0.3782, + "step": 7372 + }, + { + "epoch": 1.1282325937260902, + "grad_norm": 2.289444586623291, + "learning_rate": 8.41693347735059e-06, + "loss": 0.4152, + "step": 7373 + }, + { + "epoch": 1.1283856159143075, + "grad_norm": 2.1547623456356573, + "learning_rate": 8.414486592834544e-06, + "loss": 0.3986, + "step": 7374 + }, + { + "epoch": 1.1285386381025249, + "grad_norm": 2.3965413396527033, + "learning_rate": 
8.412039805691082e-06, + "loss": 0.4543, + "step": 7375 + }, + { + "epoch": 1.1286916602907422, + "grad_norm": 2.0682256832583192, + "learning_rate": 8.409593116070465e-06, + "loss": 0.3893, + "step": 7376 + }, + { + "epoch": 1.1288446824789595, + "grad_norm": 2.1555593129350967, + "learning_rate": 8.407146524122956e-06, + "loss": 0.4843, + "step": 7377 + }, + { + "epoch": 1.1289977046671766, + "grad_norm": 2.235134611714402, + "learning_rate": 8.404700029998813e-06, + "loss": 0.4193, + "step": 7378 + }, + { + "epoch": 1.129150726855394, + "grad_norm": 2.0948079300685025, + "learning_rate": 8.402253633848275e-06, + "loss": 0.4189, + "step": 7379 + }, + { + "epoch": 1.1293037490436113, + "grad_norm": 2.5618929392411385, + "learning_rate": 8.399807335821599e-06, + "loss": 0.4643, + "step": 7380 + }, + { + "epoch": 1.1294567712318286, + "grad_norm": 2.1511509812008645, + "learning_rate": 8.397361136069016e-06, + "loss": 0.4467, + "step": 7381 + }, + { + "epoch": 1.129609793420046, + "grad_norm": 2.386196178306402, + "learning_rate": 8.394915034740746e-06, + "loss": 0.4485, + "step": 7382 + }, + { + "epoch": 1.129762815608263, + "grad_norm": 2.102940746700613, + "learning_rate": 8.39246903198703e-06, + "loss": 0.4044, + "step": 7383 + }, + { + "epoch": 1.1299158377964804, + "grad_norm": 2.122835102459708, + "learning_rate": 8.390023127958076e-06, + "loss": 0.3913, + "step": 7384 + }, + { + "epoch": 1.1300688599846977, + "grad_norm": 2.1996709406789043, + "learning_rate": 8.387577322804102e-06, + "loss": 0.4211, + "step": 7385 + }, + { + "epoch": 1.130221882172915, + "grad_norm": 2.23042537125408, + "learning_rate": 8.385131616675316e-06, + "loss": 0.3713, + "step": 7386 + }, + { + "epoch": 1.1303749043611324, + "grad_norm": 2.1043667069097745, + "learning_rate": 8.382686009721907e-06, + "loss": 0.4529, + "step": 7387 + }, + { + "epoch": 1.1305279265493497, + "grad_norm": 2.227960780238535, + "learning_rate": 8.380240502094083e-06, + "loss": 0.4243, + "step": 7388 + }, + { + "epoch": 1.1306809487375669, + "grad_norm": 2.4418394916459008, + "learning_rate": 8.377795093942026e-06, + "loss": 0.48, + "step": 7389 + }, + { + "epoch": 1.1308339709257842, + "grad_norm": 2.484896572042387, + "learning_rate": 8.375349785415915e-06, + "loss": 0.376, + "step": 7390 + }, + { + "epoch": 1.1309869931140015, + "grad_norm": 2.1153869841882553, + "learning_rate": 8.372904576665931e-06, + "loss": 0.4547, + "step": 7391 + }, + { + "epoch": 1.1311400153022189, + "grad_norm": 2.135980963386847, + "learning_rate": 8.37045946784224e-06, + "loss": 0.3852, + "step": 7392 + }, + { + "epoch": 1.1312930374904362, + "grad_norm": 2.3502007142725856, + "learning_rate": 8.368014459095014e-06, + "loss": 0.4678, + "step": 7393 + }, + { + "epoch": 1.1314460596786535, + "grad_norm": 2.2129245462799525, + "learning_rate": 8.3655695505744e-06, + "loss": 0.3617, + "step": 7394 + }, + { + "epoch": 1.1315990818668706, + "grad_norm": 2.4771896435377085, + "learning_rate": 8.363124742430552e-06, + "loss": 0.4377, + "step": 7395 + }, + { + "epoch": 1.131752104055088, + "grad_norm": 2.1686976601970414, + "learning_rate": 8.360680034813621e-06, + "loss": 0.4344, + "step": 7396 + }, + { + "epoch": 1.1319051262433053, + "grad_norm": 2.219669874900676, + "learning_rate": 8.358235427873741e-06, + "loss": 0.3737, + "step": 7397 + }, + { + "epoch": 1.1320581484315226, + "grad_norm": 2.7668635869641043, + "learning_rate": 8.355790921761045e-06, + "loss": 0.4628, + "step": 7398 + }, + { + "epoch": 1.13221117061974, + "grad_norm": 
2.257255357947881, + "learning_rate": 8.353346516625662e-06, + "loss": 0.4475, + "step": 7399 + }, + { + "epoch": 1.132364192807957, + "grad_norm": 1.7893405512069935, + "learning_rate": 8.350902212617708e-06, + "loss": 0.3202, + "step": 7400 + }, + { + "epoch": 1.1325172149961744, + "grad_norm": 2.188151848584422, + "learning_rate": 8.348458009887305e-06, + "loss": 0.4079, + "step": 7401 + }, + { + "epoch": 1.1326702371843917, + "grad_norm": 2.1347935928648623, + "learning_rate": 8.346013908584556e-06, + "loss": 0.4305, + "step": 7402 + }, + { + "epoch": 1.132823259372609, + "grad_norm": 2.3676766269499026, + "learning_rate": 8.343569908859559e-06, + "loss": 0.5252, + "step": 7403 + }, + { + "epoch": 1.1329762815608264, + "grad_norm": 2.284607042130858, + "learning_rate": 8.341126010862417e-06, + "loss": 0.4101, + "step": 7404 + }, + { + "epoch": 1.1331293037490435, + "grad_norm": 2.3344145212373126, + "learning_rate": 8.33868221474322e-06, + "loss": 0.4269, + "step": 7405 + }, + { + "epoch": 1.1332823259372609, + "grad_norm": 2.371166608832448, + "learning_rate": 8.33623852065204e-06, + "loss": 0.4966, + "step": 7406 + }, + { + "epoch": 1.1334353481254782, + "grad_norm": 2.0768928444278565, + "learning_rate": 8.333794928738963e-06, + "loss": 0.4451, + "step": 7407 + }, + { + "epoch": 1.1335883703136955, + "grad_norm": 2.2826255992028726, + "learning_rate": 8.331351439154058e-06, + "loss": 0.4068, + "step": 7408 + }, + { + "epoch": 1.1337413925019129, + "grad_norm": 2.1691141270829264, + "learning_rate": 8.328908052047392e-06, + "loss": 0.449, + "step": 7409 + }, + { + "epoch": 1.13389441469013, + "grad_norm": 2.0987622141051334, + "learning_rate": 8.326464767569018e-06, + "loss": 0.4117, + "step": 7410 + }, + { + "epoch": 1.1340474368783473, + "grad_norm": 1.9073261756337854, + "learning_rate": 8.324021585868987e-06, + "loss": 0.3589, + "step": 7411 + }, + { + "epoch": 1.1342004590665646, + "grad_norm": 2.145654462179916, + "learning_rate": 8.321578507097351e-06, + "loss": 0.4972, + "step": 7412 + }, + { + "epoch": 1.134353481254782, + "grad_norm": 2.285381702770092, + "learning_rate": 8.319135531404143e-06, + "loss": 0.4493, + "step": 7413 + }, + { + "epoch": 1.1345065034429993, + "grad_norm": 2.0970514541562153, + "learning_rate": 8.316692658939396e-06, + "loss": 0.4087, + "step": 7414 + }, + { + "epoch": 1.1346595256312164, + "grad_norm": 2.2048734136664607, + "learning_rate": 8.314249889853141e-06, + "loss": 0.4177, + "step": 7415 + }, + { + "epoch": 1.1348125478194337, + "grad_norm": 2.1074902936965225, + "learning_rate": 8.311807224295391e-06, + "loss": 0.4048, + "step": 7416 + }, + { + "epoch": 1.134965570007651, + "grad_norm": 2.4923095199826726, + "learning_rate": 8.309364662416167e-06, + "loss": 0.4147, + "step": 7417 + }, + { + "epoch": 1.1351185921958684, + "grad_norm": 2.108320231994888, + "learning_rate": 8.306922204365476e-06, + "loss": 0.3964, + "step": 7418 + }, + { + "epoch": 1.1352716143840857, + "grad_norm": 2.3299560327530404, + "learning_rate": 8.304479850293308e-06, + "loss": 0.4453, + "step": 7419 + }, + { + "epoch": 1.135424636572303, + "grad_norm": 1.9868538731801448, + "learning_rate": 8.302037600349669e-06, + "loss": 0.3799, + "step": 7420 + }, + { + "epoch": 1.1355776587605202, + "grad_norm": 1.9462273485980057, + "learning_rate": 8.29959545468454e-06, + "loss": 0.4148, + "step": 7421 + }, + { + "epoch": 1.1357306809487375, + "grad_norm": 2.234376101383108, + "learning_rate": 8.297153413447914e-06, + "loss": 0.4836, + "step": 7422 + }, + { + "epoch": 
1.1358837031369549, + "grad_norm": 2.0793008207140047, + "learning_rate": 8.294711476789754e-06, + "loss": 0.4197, + "step": 7423 + }, + { + "epoch": 1.1360367253251722, + "grad_norm": 2.257538544402379, + "learning_rate": 8.292269644860032e-06, + "loss": 0.4509, + "step": 7424 + }, + { + "epoch": 1.1361897475133895, + "grad_norm": 2.200890531426487, + "learning_rate": 8.289827917808715e-06, + "loss": 0.472, + "step": 7425 + }, + { + "epoch": 1.1363427697016066, + "grad_norm": 2.278989470946727, + "learning_rate": 8.287386295785755e-06, + "loss": 0.3801, + "step": 7426 + }, + { + "epoch": 1.136495791889824, + "grad_norm": 2.39384075061946, + "learning_rate": 8.284944778941098e-06, + "loss": 0.5339, + "step": 7427 + }, + { + "epoch": 1.1366488140780413, + "grad_norm": 2.1283550645597367, + "learning_rate": 8.282503367424697e-06, + "loss": 0.4964, + "step": 7428 + }, + { + "epoch": 1.1368018362662586, + "grad_norm": 2.0117879432634798, + "learning_rate": 8.280062061386478e-06, + "loss": 0.4534, + "step": 7429 + }, + { + "epoch": 1.136954858454476, + "grad_norm": 2.2323821418668883, + "learning_rate": 8.277620860976382e-06, + "loss": 0.468, + "step": 7430 + }, + { + "epoch": 1.1371078806426933, + "grad_norm": 2.1809204167250678, + "learning_rate": 8.275179766344325e-06, + "loss": 0.4446, + "step": 7431 + }, + { + "epoch": 1.1372609028309104, + "grad_norm": 2.193452628067021, + "learning_rate": 8.272738777640224e-06, + "loss": 0.4116, + "step": 7432 + }, + { + "epoch": 1.1374139250191277, + "grad_norm": 2.288415436946979, + "learning_rate": 8.270297895013992e-06, + "loss": 0.496, + "step": 7433 + }, + { + "epoch": 1.137566947207345, + "grad_norm": 2.18335735896481, + "learning_rate": 8.267857118615536e-06, + "loss": 0.4222, + "step": 7434 + }, + { + "epoch": 1.1377199693955624, + "grad_norm": 2.181193321791857, + "learning_rate": 8.265416448594747e-06, + "loss": 0.372, + "step": 7435 + }, + { + "epoch": 1.1378729915837797, + "grad_norm": 2.268232897097735, + "learning_rate": 8.26297588510152e-06, + "loss": 0.494, + "step": 7436 + }, + { + "epoch": 1.1380260137719969, + "grad_norm": 2.122086095027104, + "learning_rate": 8.260535428285739e-06, + "loss": 0.3753, + "step": 7437 + }, + { + "epoch": 1.1381790359602142, + "grad_norm": 2.526263316541223, + "learning_rate": 8.258095078297286e-06, + "loss": 0.442, + "step": 7438 + }, + { + "epoch": 1.1383320581484315, + "grad_norm": 2.1160673691906653, + "learning_rate": 8.255654835286025e-06, + "loss": 0.3435, + "step": 7439 + }, + { + "epoch": 1.1384850803366489, + "grad_norm": 2.114820062384801, + "learning_rate": 8.253214699401824e-06, + "loss": 0.4397, + "step": 7440 + }, + { + "epoch": 1.1386381025248662, + "grad_norm": 2.3148968139526325, + "learning_rate": 8.250774670794545e-06, + "loss": 0.4095, + "step": 7441 + }, + { + "epoch": 1.1387911247130833, + "grad_norm": 2.3769438544271257, + "learning_rate": 8.248334749614035e-06, + "loss": 0.4592, + "step": 7442 + }, + { + "epoch": 1.1389441469013006, + "grad_norm": 1.9651717159386775, + "learning_rate": 8.245894936010139e-06, + "loss": 0.4159, + "step": 7443 + }, + { + "epoch": 1.139097169089518, + "grad_norm": 1.93039744862964, + "learning_rate": 8.243455230132699e-06, + "loss": 0.389, + "step": 7444 + }, + { + "epoch": 1.1392501912777353, + "grad_norm": 2.2701040105758095, + "learning_rate": 8.241015632131543e-06, + "loss": 0.4002, + "step": 7445 + }, + { + "epoch": 1.1394032134659526, + "grad_norm": 2.456302414590909, + "learning_rate": 8.2385761421565e-06, + "loss": 0.5128, + "step": 7446 + }, 
+ { + "epoch": 1.1395562356541697, + "grad_norm": 2.441590053887338, + "learning_rate": 8.236136760357388e-06, + "loss": 0.4405, + "step": 7447 + }, + { + "epoch": 1.139709257842387, + "grad_norm": 2.142496840381271, + "learning_rate": 8.233697486884014e-06, + "loss": 0.3596, + "step": 7448 + }, + { + "epoch": 1.1398622800306044, + "grad_norm": 2.2436694830487847, + "learning_rate": 8.23125832188619e-06, + "loss": 0.3891, + "step": 7449 + }, + { + "epoch": 1.1400153022188217, + "grad_norm": 2.3067821364697085, + "learning_rate": 8.228819265513713e-06, + "loss": 0.356, + "step": 7450 + }, + { + "epoch": 1.140168324407039, + "grad_norm": 2.2973291155628903, + "learning_rate": 8.22638031791637e-06, + "loss": 0.4147, + "step": 7451 + }, + { + "epoch": 1.1403213465952562, + "grad_norm": 2.3150537810209344, + "learning_rate": 8.223941479243952e-06, + "loss": 0.4021, + "step": 7452 + }, + { + "epoch": 1.1404743687834735, + "grad_norm": 2.1101437473072373, + "learning_rate": 8.221502749646233e-06, + "loss": 0.3884, + "step": 7453 + }, + { + "epoch": 1.1406273909716909, + "grad_norm": 2.2940668477947015, + "learning_rate": 8.219064129272993e-06, + "loss": 0.4888, + "step": 7454 + }, + { + "epoch": 1.1407804131599082, + "grad_norm": 2.5443419586116107, + "learning_rate": 8.21662561827399e-06, + "loss": 0.4524, + "step": 7455 + }, + { + "epoch": 1.1409334353481255, + "grad_norm": 2.2535777983806433, + "learning_rate": 8.214187216798982e-06, + "loss": 0.4283, + "step": 7456 + }, + { + "epoch": 1.1410864575363429, + "grad_norm": 2.2630948114110048, + "learning_rate": 8.211748924997727e-06, + "loss": 0.3295, + "step": 7457 + }, + { + "epoch": 1.14123947972456, + "grad_norm": 2.3979824581281535, + "learning_rate": 8.20931074301996e-06, + "loss": 0.4793, + "step": 7458 + }, + { + "epoch": 1.1413925019127773, + "grad_norm": 2.849113718168752, + "learning_rate": 8.206872671015431e-06, + "loss": 0.4199, + "step": 7459 + }, + { + "epoch": 1.1415455241009946, + "grad_norm": 2.4386612095527807, + "learning_rate": 8.204434709133868e-06, + "loss": 0.3872, + "step": 7460 + }, + { + "epoch": 1.141698546289212, + "grad_norm": 2.6483087030550845, + "learning_rate": 8.201996857524985e-06, + "loss": 0.4643, + "step": 7461 + }, + { + "epoch": 1.1418515684774293, + "grad_norm": 1.8880437723686634, + "learning_rate": 8.199559116338516e-06, + "loss": 0.3426, + "step": 7462 + }, + { + "epoch": 1.1420045906656466, + "grad_norm": 2.252181667335893, + "learning_rate": 8.197121485724165e-06, + "loss": 0.3879, + "step": 7463 + }, + { + "epoch": 1.1421576128538637, + "grad_norm": 2.2676544247977684, + "learning_rate": 8.194683965831632e-06, + "loss": 0.4494, + "step": 7464 + }, + { + "epoch": 1.142310635042081, + "grad_norm": 2.5427993887949705, + "learning_rate": 8.192246556810623e-06, + "loss": 0.4561, + "step": 7465 + }, + { + "epoch": 1.1424636572302984, + "grad_norm": 2.4818084453252025, + "learning_rate": 8.189809258810821e-06, + "loss": 0.4889, + "step": 7466 + }, + { + "epoch": 1.1426166794185157, + "grad_norm": 2.1968397362218144, + "learning_rate": 8.187372071981918e-06, + "loss": 0.4769, + "step": 7467 + }, + { + "epoch": 1.142769701606733, + "grad_norm": 2.382228924959736, + "learning_rate": 8.184934996473588e-06, + "loss": 0.4139, + "step": 7468 + }, + { + "epoch": 1.1429227237949502, + "grad_norm": 2.2384393655517107, + "learning_rate": 8.182498032435495e-06, + "loss": 0.4245, + "step": 7469 + }, + { + "epoch": 1.1430757459831675, + "grad_norm": 2.1015126974026064, + "learning_rate": 8.180061180017312e-06, + "loss": 
0.3678, + "step": 7470 + }, + { + "epoch": 1.1432287681713849, + "grad_norm": 2.428458221772654, + "learning_rate": 8.177624439368692e-06, + "loss": 0.378, + "step": 7471 + }, + { + "epoch": 1.1433817903596022, + "grad_norm": 2.2063195875131902, + "learning_rate": 8.175187810639281e-06, + "loss": 0.4007, + "step": 7472 + }, + { + "epoch": 1.1435348125478195, + "grad_norm": 2.1957528996819184, + "learning_rate": 8.172751293978727e-06, + "loss": 0.4849, + "step": 7473 + }, + { + "epoch": 1.1436878347360366, + "grad_norm": 2.3310595610922826, + "learning_rate": 8.170314889536663e-06, + "loss": 0.4719, + "step": 7474 + }, + { + "epoch": 1.143840856924254, + "grad_norm": 2.1956159717965815, + "learning_rate": 8.167878597462724e-06, + "loss": 0.4452, + "step": 7475 + }, + { + "epoch": 1.1439938791124713, + "grad_norm": 2.6239595075786855, + "learning_rate": 8.165442417906523e-06, + "loss": 0.401, + "step": 7476 + }, + { + "epoch": 1.1441469013006886, + "grad_norm": 2.4291514293723875, + "learning_rate": 8.163006351017681e-06, + "loss": 0.4379, + "step": 7477 + }, + { + "epoch": 1.144299923488906, + "grad_norm": 2.1139992285994014, + "learning_rate": 8.160570396945808e-06, + "loss": 0.4388, + "step": 7478 + }, + { + "epoch": 1.144452945677123, + "grad_norm": 2.1633213676632015, + "learning_rate": 8.1581345558405e-06, + "loss": 0.4151, + "step": 7479 + }, + { + "epoch": 1.1446059678653404, + "grad_norm": 2.2390982385776397, + "learning_rate": 8.155698827851354e-06, + "loss": 0.412, + "step": 7480 + }, + { + "epoch": 1.1447589900535577, + "grad_norm": 2.2089163980451088, + "learning_rate": 8.153263213127961e-06, + "loss": 0.4565, + "step": 7481 + }, + { + "epoch": 1.144912012241775, + "grad_norm": 2.176734428087088, + "learning_rate": 8.150827711819894e-06, + "loss": 0.4647, + "step": 7482 + }, + { + "epoch": 1.1450650344299924, + "grad_norm": 2.253278709863533, + "learning_rate": 8.148392324076733e-06, + "loss": 0.4017, + "step": 7483 + }, + { + "epoch": 1.1452180566182095, + "grad_norm": 2.152174433588315, + "learning_rate": 8.145957050048047e-06, + "loss": 0.3346, + "step": 7484 + }, + { + "epoch": 1.1453710788064269, + "grad_norm": 2.047079847330625, + "learning_rate": 8.143521889883385e-06, + "loss": 0.3476, + "step": 7485 + }, + { + "epoch": 1.1455241009946442, + "grad_norm": 2.4767797743598443, + "learning_rate": 8.141086843732311e-06, + "loss": 0.4391, + "step": 7486 + }, + { + "epoch": 1.1456771231828615, + "grad_norm": 2.0852254844893476, + "learning_rate": 8.138651911744362e-06, + "loss": 0.3421, + "step": 7487 + }, + { + "epoch": 1.1458301453710789, + "grad_norm": 2.3333992177155394, + "learning_rate": 8.136217094069085e-06, + "loss": 0.4186, + "step": 7488 + }, + { + "epoch": 1.1459831675592962, + "grad_norm": 2.242432354816371, + "learning_rate": 8.133782390856007e-06, + "loss": 0.4564, + "step": 7489 + }, + { + "epoch": 1.1461361897475133, + "grad_norm": 2.1801434144194447, + "learning_rate": 8.131347802254649e-06, + "loss": 0.4897, + "step": 7490 + }, + { + "epoch": 1.1462892119357306, + "grad_norm": 2.7231872288313315, + "learning_rate": 8.128913328414538e-06, + "loss": 0.417, + "step": 7491 + }, + { + "epoch": 1.146442234123948, + "grad_norm": 2.113303644811708, + "learning_rate": 8.126478969485176e-06, + "loss": 0.5045, + "step": 7492 + }, + { + "epoch": 1.1465952563121653, + "grad_norm": 2.095819049578236, + "learning_rate": 8.124044725616069e-06, + "loss": 0.426, + "step": 7493 + }, + { + "epoch": 1.1467482785003826, + "grad_norm": 2.040810152227828, + "learning_rate": 
8.121610596956718e-06, + "loss": 0.4056, + "step": 7494 + }, + { + "epoch": 1.1469013006886, + "grad_norm": 2.1692836388967125, + "learning_rate": 8.119176583656603e-06, + "loss": 0.44, + "step": 7495 + }, + { + "epoch": 1.147054322876817, + "grad_norm": 2.2409161920316434, + "learning_rate": 8.116742685865217e-06, + "loss": 0.3894, + "step": 7496 + }, + { + "epoch": 1.1472073450650344, + "grad_norm": 2.1774102605411776, + "learning_rate": 8.114308903732028e-06, + "loss": 0.4041, + "step": 7497 + }, + { + "epoch": 1.1473603672532517, + "grad_norm": 1.728871470396512, + "learning_rate": 8.111875237406506e-06, + "loss": 0.3032, + "step": 7498 + }, + { + "epoch": 1.147513389441469, + "grad_norm": 2.3391781721148743, + "learning_rate": 8.109441687038111e-06, + "loss": 0.4852, + "step": 7499 + }, + { + "epoch": 1.1476664116296864, + "grad_norm": 2.104355287358558, + "learning_rate": 8.107008252776301e-06, + "loss": 0.4481, + "step": 7500 + }, + { + "epoch": 1.1478194338179035, + "grad_norm": 2.2728732595750474, + "learning_rate": 8.104574934770516e-06, + "loss": 0.3932, + "step": 7501 + }, + { + "epoch": 1.1479724560061209, + "grad_norm": 2.7508028520265686, + "learning_rate": 8.102141733170202e-06, + "loss": 0.422, + "step": 7502 + }, + { + "epoch": 1.1481254781943382, + "grad_norm": 2.3486796594495885, + "learning_rate": 8.099708648124785e-06, + "loss": 0.542, + "step": 7503 + }, + { + "epoch": 1.1482785003825555, + "grad_norm": 2.2815892384437806, + "learning_rate": 8.097275679783698e-06, + "loss": 0.4602, + "step": 7504 + }, + { + "epoch": 1.1484315225707729, + "grad_norm": 2.371707040162622, + "learning_rate": 8.094842828296354e-06, + "loss": 0.4856, + "step": 7505 + }, + { + "epoch": 1.14858454475899, + "grad_norm": 1.8999703078986259, + "learning_rate": 8.092410093812161e-06, + "loss": 0.3464, + "step": 7506 + }, + { + "epoch": 1.1487375669472073, + "grad_norm": 1.9999448169209695, + "learning_rate": 8.089977476480533e-06, + "loss": 0.3793, + "step": 7507 + }, + { + "epoch": 1.1488905891354246, + "grad_norm": 2.101338405608802, + "learning_rate": 8.08754497645086e-06, + "loss": 0.367, + "step": 7508 + }, + { + "epoch": 1.149043611323642, + "grad_norm": 2.254916559111806, + "learning_rate": 8.085112593872524e-06, + "loss": 0.4823, + "step": 7509 + }, + { + "epoch": 1.1491966335118593, + "grad_norm": 2.2298089722186103, + "learning_rate": 8.082680328894923e-06, + "loss": 0.4959, + "step": 7510 + }, + { + "epoch": 1.1493496557000764, + "grad_norm": 2.017233130477621, + "learning_rate": 8.080248181667417e-06, + "loss": 0.4274, + "step": 7511 + }, + { + "epoch": 1.1495026778882937, + "grad_norm": 2.358574713364482, + "learning_rate": 8.077816152339383e-06, + "loss": 0.5025, + "step": 7512 + }, + { + "epoch": 1.149655700076511, + "grad_norm": 2.0398205910476657, + "learning_rate": 8.075384241060183e-06, + "loss": 0.4287, + "step": 7513 + }, + { + "epoch": 1.1498087222647284, + "grad_norm": 2.0429358861791305, + "learning_rate": 8.072952447979159e-06, + "loss": 0.3928, + "step": 7514 + }, + { + "epoch": 1.1499617444529457, + "grad_norm": 2.141370554482417, + "learning_rate": 8.070520773245667e-06, + "loss": 0.3852, + "step": 7515 + }, + { + "epoch": 1.1501147666411629, + "grad_norm": 2.169035469722936, + "learning_rate": 8.068089217009045e-06, + "loss": 0.3634, + "step": 7516 + }, + { + "epoch": 1.1502677888293802, + "grad_norm": 2.4022699362669058, + "learning_rate": 8.065657779418617e-06, + "loss": 0.4754, + "step": 7517 + }, + { + "epoch": 1.1504208110175975, + "grad_norm": 
2.0431104935895847, + "learning_rate": 8.063226460623714e-06, + "loss": 0.371, + "step": 7518 + }, + { + "epoch": 1.1505738332058149, + "grad_norm": 2.0616275176271235, + "learning_rate": 8.060795260773649e-06, + "loss": 0.4364, + "step": 7519 + }, + { + "epoch": 1.1507268553940322, + "grad_norm": 2.389435632037756, + "learning_rate": 8.058364180017738e-06, + "loss": 0.4303, + "step": 7520 + }, + { + "epoch": 1.1508798775822495, + "grad_norm": 2.390938065826764, + "learning_rate": 8.055933218505275e-06, + "loss": 0.4162, + "step": 7521 + }, + { + "epoch": 1.1510328997704666, + "grad_norm": 2.0817572989263735, + "learning_rate": 8.053502376385555e-06, + "loss": 0.4079, + "step": 7522 + }, + { + "epoch": 1.151185921958684, + "grad_norm": 1.9572766628941676, + "learning_rate": 8.051071653807877e-06, + "loss": 0.3664, + "step": 7523 + }, + { + "epoch": 1.1513389441469013, + "grad_norm": 1.97525311017114, + "learning_rate": 8.048641050921505e-06, + "loss": 0.3817, + "step": 7524 + }, + { + "epoch": 1.1514919663351186, + "grad_norm": 2.189527479601311, + "learning_rate": 8.046210567875725e-06, + "loss": 0.4619, + "step": 7525 + }, + { + "epoch": 1.151644988523336, + "grad_norm": 2.0684633513104136, + "learning_rate": 8.043780204819796e-06, + "loss": 0.3981, + "step": 7526 + }, + { + "epoch": 1.1517980107115533, + "grad_norm": 2.199448040768017, + "learning_rate": 8.041349961902976e-06, + "loss": 0.3961, + "step": 7527 + }, + { + "epoch": 1.1519510328997704, + "grad_norm": 2.2150480648254236, + "learning_rate": 8.038919839274519e-06, + "loss": 0.4216, + "step": 7528 + }, + { + "epoch": 1.1521040550879877, + "grad_norm": 2.292813109070026, + "learning_rate": 8.036489837083668e-06, + "loss": 0.3829, + "step": 7529 + }, + { + "epoch": 1.152257077276205, + "grad_norm": 2.065860761981022, + "learning_rate": 8.034059955479652e-06, + "loss": 0.398, + "step": 7530 + }, + { + "epoch": 1.1524100994644224, + "grad_norm": 2.1957137938311764, + "learning_rate": 8.031630194611708e-06, + "loss": 0.3777, + "step": 7531 + }, + { + "epoch": 1.1525631216526397, + "grad_norm": 2.395586780136042, + "learning_rate": 8.029200554629052e-06, + "loss": 0.4595, + "step": 7532 + }, + { + "epoch": 1.1527161438408569, + "grad_norm": 2.354007174674176, + "learning_rate": 8.026771035680905e-06, + "loss": 0.4501, + "step": 7533 + }, + { + "epoch": 1.1528691660290742, + "grad_norm": 2.1309239235307276, + "learning_rate": 8.024341637916465e-06, + "loss": 0.4385, + "step": 7534 + }, + { + "epoch": 1.1530221882172915, + "grad_norm": 2.0989633908232554, + "learning_rate": 8.02191236148493e-06, + "loss": 0.4596, + "step": 7535 + }, + { + "epoch": 1.1531752104055089, + "grad_norm": 2.0051365553355307, + "learning_rate": 8.019483206535501e-06, + "loss": 0.3603, + "step": 7536 + }, + { + "epoch": 1.1533282325937262, + "grad_norm": 2.1379711591576602, + "learning_rate": 8.017054173217354e-06, + "loss": 0.4969, + "step": 7537 + }, + { + "epoch": 1.1534812547819433, + "grad_norm": 2.0414756729412007, + "learning_rate": 8.014625261679666e-06, + "loss": 0.3722, + "step": 7538 + }, + { + "epoch": 1.1536342769701606, + "grad_norm": 1.960059689751741, + "learning_rate": 8.012196472071612e-06, + "loss": 0.5155, + "step": 7539 + }, + { + "epoch": 1.153787299158378, + "grad_norm": 1.9148456988264915, + "learning_rate": 8.009767804542341e-06, + "loss": 0.3806, + "step": 7540 + }, + { + "epoch": 1.1539403213465953, + "grad_norm": 2.1346428516068507, + "learning_rate": 8.007339259241022e-06, + "loss": 0.3781, + "step": 7541 + }, + { + "epoch": 
1.1540933435348126, + "grad_norm": 2.341555927456598, + "learning_rate": 8.004910836316796e-06, + "loss": 0.4134, + "step": 7542 + }, + { + "epoch": 1.1542463657230297, + "grad_norm": 2.1533856307933448, + "learning_rate": 8.002482535918792e-06, + "loss": 0.3918, + "step": 7543 + }, + { + "epoch": 1.154399387911247, + "grad_norm": 2.203898129272653, + "learning_rate": 8.000054358196156e-06, + "loss": 0.3875, + "step": 7544 + }, + { + "epoch": 1.1545524100994644, + "grad_norm": 2.0530530827506106, + "learning_rate": 7.997626303298008e-06, + "loss": 0.4047, + "step": 7545 + }, + { + "epoch": 1.1547054322876817, + "grad_norm": 2.1846781568623026, + "learning_rate": 7.995198371373455e-06, + "loss": 0.4748, + "step": 7546 + }, + { + "epoch": 1.154858454475899, + "grad_norm": 2.369642955439486, + "learning_rate": 7.992770562571616e-06, + "loss": 0.4418, + "step": 7547 + }, + { + "epoch": 1.1550114766641162, + "grad_norm": 2.876693910486257, + "learning_rate": 7.990342877041588e-06, + "loss": 0.3995, + "step": 7548 + }, + { + "epoch": 1.1551644988523335, + "grad_norm": 2.2881944470526503, + "learning_rate": 7.987915314932467e-06, + "loss": 0.4277, + "step": 7549 + }, + { + "epoch": 1.1553175210405509, + "grad_norm": 2.082026776655421, + "learning_rate": 7.98548787639334e-06, + "loss": 0.474, + "step": 7550 + }, + { + "epoch": 1.1554705432287682, + "grad_norm": 2.4770772431505623, + "learning_rate": 7.98306056157328e-06, + "loss": 0.4932, + "step": 7551 + }, + { + "epoch": 1.1556235654169855, + "grad_norm": 2.400411126727818, + "learning_rate": 7.980633370621361e-06, + "loss": 0.417, + "step": 7552 + }, + { + "epoch": 1.1557765876052026, + "grad_norm": 2.249144501782921, + "learning_rate": 7.97820630368665e-06, + "loss": 0.4198, + "step": 7553 + }, + { + "epoch": 1.15592960979342, + "grad_norm": 2.4416647224932526, + "learning_rate": 7.975779360918196e-06, + "loss": 0.4977, + "step": 7554 + }, + { + "epoch": 1.1560826319816373, + "grad_norm": 2.315267686111871, + "learning_rate": 7.973352542465052e-06, + "loss": 0.3786, + "step": 7555 + }, + { + "epoch": 1.1562356541698546, + "grad_norm": 2.12994967417012, + "learning_rate": 7.970925848476253e-06, + "loss": 0.4149, + "step": 7556 + }, + { + "epoch": 1.156388676358072, + "grad_norm": 2.4649859720017604, + "learning_rate": 7.968499279100841e-06, + "loss": 0.5204, + "step": 7557 + }, + { + "epoch": 1.1565416985462893, + "grad_norm": 2.546695379639944, + "learning_rate": 7.966072834487832e-06, + "loss": 0.4823, + "step": 7558 + }, + { + "epoch": 1.1566947207345064, + "grad_norm": 2.111908023754244, + "learning_rate": 7.963646514786246e-06, + "loss": 0.4027, + "step": 7559 + }, + { + "epoch": 1.1568477429227237, + "grad_norm": 2.079272062182409, + "learning_rate": 7.961220320145099e-06, + "loss": 0.3618, + "step": 7560 + }, + { + "epoch": 1.157000765110941, + "grad_norm": 2.0099331247968517, + "learning_rate": 7.958794250713381e-06, + "loss": 0.3389, + "step": 7561 + }, + { + "epoch": 1.1571537872991584, + "grad_norm": 2.3728785319967693, + "learning_rate": 7.956368306640097e-06, + "loss": 0.4472, + "step": 7562 + }, + { + "epoch": 1.1573068094873757, + "grad_norm": 2.124001664400862, + "learning_rate": 7.953942488074233e-06, + "loss": 0.4035, + "step": 7563 + }, + { + "epoch": 1.157459831675593, + "grad_norm": 2.2562599575262685, + "learning_rate": 7.95151679516476e-06, + "loss": 0.4425, + "step": 7564 + }, + { + "epoch": 1.1576128538638102, + "grad_norm": 2.1706628764944798, + "learning_rate": 7.949091228060657e-06, + "loss": 0.4145, + "step": 7565 + 
}, + { + "epoch": 1.1577658760520275, + "grad_norm": 1.9386706013096928, + "learning_rate": 7.946665786910885e-06, + "loss": 0.3171, + "step": 7566 + }, + { + "epoch": 1.1579188982402449, + "grad_norm": 2.2644379442856297, + "learning_rate": 7.944240471864398e-06, + "loss": 0.4577, + "step": 7567 + }, + { + "epoch": 1.1580719204284622, + "grad_norm": 2.2324278833799864, + "learning_rate": 7.941815283070147e-06, + "loss": 0.4197, + "step": 7568 + }, + { + "epoch": 1.1582249426166795, + "grad_norm": 2.4366820568521512, + "learning_rate": 7.939390220677068e-06, + "loss": 0.4956, + "step": 7569 + }, + { + "epoch": 1.1583779648048966, + "grad_norm": 2.274130967656845, + "learning_rate": 7.936965284834102e-06, + "loss": 0.4817, + "step": 7570 + }, + { + "epoch": 1.158530986993114, + "grad_norm": 2.214947057393825, + "learning_rate": 7.934540475690167e-06, + "loss": 0.4042, + "step": 7571 + }, + { + "epoch": 1.1586840091813313, + "grad_norm": 2.2361223952851823, + "learning_rate": 7.932115793394177e-06, + "loss": 0.3838, + "step": 7572 + }, + { + "epoch": 1.1588370313695486, + "grad_norm": 2.4647857683566823, + "learning_rate": 7.929691238095053e-06, + "loss": 0.4323, + "step": 7573 + }, + { + "epoch": 1.158990053557766, + "grad_norm": 2.2605032776577287, + "learning_rate": 7.927266809941684e-06, + "loss": 0.4403, + "step": 7574 + }, + { + "epoch": 1.159143075745983, + "grad_norm": 2.307062509681339, + "learning_rate": 7.924842509082968e-06, + "loss": 0.4219, + "step": 7575 + }, + { + "epoch": 1.1592960979342004, + "grad_norm": 2.2580402550302336, + "learning_rate": 7.922418335667796e-06, + "loss": 0.4076, + "step": 7576 + }, + { + "epoch": 1.1594491201224177, + "grad_norm": 1.926770123976774, + "learning_rate": 7.919994289845038e-06, + "loss": 0.3209, + "step": 7577 + }, + { + "epoch": 1.159602142310635, + "grad_norm": 1.88033873862793, + "learning_rate": 7.917570371763568e-06, + "loss": 0.3106, + "step": 7578 + }, + { + "epoch": 1.1597551644988524, + "grad_norm": 2.3143555284094726, + "learning_rate": 7.915146581572253e-06, + "loss": 0.4199, + "step": 7579 + }, + { + "epoch": 1.1599081866870695, + "grad_norm": 2.5111769532452524, + "learning_rate": 7.912722919419936e-06, + "loss": 0.4582, + "step": 7580 + }, + { + "epoch": 1.1600612088752869, + "grad_norm": 2.159383285210872, + "learning_rate": 7.910299385455472e-06, + "loss": 0.3882, + "step": 7581 + }, + { + "epoch": 1.1602142310635042, + "grad_norm": 2.0463292222580227, + "learning_rate": 7.9078759798277e-06, + "loss": 0.3195, + "step": 7582 + }, + { + "epoch": 1.1603672532517215, + "grad_norm": 2.315690123330286, + "learning_rate": 7.905452702685446e-06, + "loss": 0.4594, + "step": 7583 + }, + { + "epoch": 1.1605202754399389, + "grad_norm": 2.4911195808536464, + "learning_rate": 7.903029554177535e-06, + "loss": 0.4797, + "step": 7584 + }, + { + "epoch": 1.160673297628156, + "grad_norm": 2.0446690195381847, + "learning_rate": 7.900606534452782e-06, + "loss": 0.4061, + "step": 7585 + }, + { + "epoch": 1.1608263198163733, + "grad_norm": 2.1561715447958574, + "learning_rate": 7.898183643659998e-06, + "loss": 0.4537, + "step": 7586 + }, + { + "epoch": 1.1609793420045906, + "grad_norm": 2.3006046638561366, + "learning_rate": 7.895760881947976e-06, + "loss": 0.5039, + "step": 7587 + }, + { + "epoch": 1.161132364192808, + "grad_norm": 2.495860237065185, + "learning_rate": 7.89333824946551e-06, + "loss": 0.4654, + "step": 7588 + }, + { + "epoch": 1.1612853863810253, + "grad_norm": 2.1959113314709873, + "learning_rate": 7.890915746361388e-06, + 
"loss": 0.3754, + "step": 7589 + }, + { + "epoch": 1.1614384085692426, + "grad_norm": 2.2157690704589954, + "learning_rate": 7.888493372784375e-06, + "loss": 0.4643, + "step": 7590 + }, + { + "epoch": 1.1615914307574597, + "grad_norm": 2.1585183274845083, + "learning_rate": 7.886071128883245e-06, + "loss": 0.454, + "step": 7591 + }, + { + "epoch": 1.161744452945677, + "grad_norm": 2.3488517000482165, + "learning_rate": 7.883649014806762e-06, + "loss": 0.4765, + "step": 7592 + }, + { + "epoch": 1.1618974751338944, + "grad_norm": 2.3071998807798417, + "learning_rate": 7.881227030703666e-06, + "loss": 0.4174, + "step": 7593 + }, + { + "epoch": 1.1620504973221117, + "grad_norm": 2.0712309131234066, + "learning_rate": 7.878805176722708e-06, + "loss": 0.4144, + "step": 7594 + }, + { + "epoch": 1.162203519510329, + "grad_norm": 2.09316779209673, + "learning_rate": 7.876383453012626e-06, + "loss": 0.4189, + "step": 7595 + }, + { + "epoch": 1.1623565416985464, + "grad_norm": 1.9814309260192482, + "learning_rate": 7.873961859722139e-06, + "loss": 0.3391, + "step": 7596 + }, + { + "epoch": 1.1625095638867635, + "grad_norm": 2.1146901984425783, + "learning_rate": 7.871540396999974e-06, + "loss": 0.3784, + "step": 7597 + }, + { + "epoch": 1.1626625860749809, + "grad_norm": 2.0404107507386264, + "learning_rate": 7.869119064994836e-06, + "loss": 0.4149, + "step": 7598 + }, + { + "epoch": 1.1628156082631982, + "grad_norm": 2.083732346909204, + "learning_rate": 7.866697863855439e-06, + "loss": 0.4429, + "step": 7599 + }, + { + "epoch": 1.1629686304514155, + "grad_norm": 2.3775803032947636, + "learning_rate": 7.864276793730468e-06, + "loss": 0.3781, + "step": 7600 + }, + { + "epoch": 1.1631216526396329, + "grad_norm": 2.115787174371097, + "learning_rate": 7.861855854768611e-06, + "loss": 0.3634, + "step": 7601 + }, + { + "epoch": 1.16327467482785, + "grad_norm": 2.203471890719275, + "learning_rate": 7.859435047118558e-06, + "loss": 0.407, + "step": 7602 + }, + { + "epoch": 1.1634276970160673, + "grad_norm": 2.121708542637144, + "learning_rate": 7.857014370928968e-06, + "loss": 0.4098, + "step": 7603 + }, + { + "epoch": 1.1635807192042846, + "grad_norm": 2.324513780877982, + "learning_rate": 7.854593826348506e-06, + "loss": 0.4553, + "step": 7604 + }, + { + "epoch": 1.163733741392502, + "grad_norm": 2.1593388977675976, + "learning_rate": 7.852173413525837e-06, + "loss": 0.3437, + "step": 7605 + }, + { + "epoch": 1.1638867635807193, + "grad_norm": 2.2737818262158362, + "learning_rate": 7.849753132609595e-06, + "loss": 0.4773, + "step": 7606 + }, + { + "epoch": 1.1640397857689364, + "grad_norm": 2.4196407421903534, + "learning_rate": 7.847332983748427e-06, + "loss": 0.4916, + "step": 7607 + }, + { + "epoch": 1.1641928079571537, + "grad_norm": 2.2265966790800853, + "learning_rate": 7.844912967090965e-06, + "loss": 0.3524, + "step": 7608 + }, + { + "epoch": 1.164345830145371, + "grad_norm": 2.300774220942024, + "learning_rate": 7.842493082785823e-06, + "loss": 0.4525, + "step": 7609 + }, + { + "epoch": 1.1644988523335884, + "grad_norm": 2.1765958482202246, + "learning_rate": 7.840073330981623e-06, + "loss": 0.4291, + "step": 7610 + }, + { + "epoch": 1.1646518745218057, + "grad_norm": 2.545845268184133, + "learning_rate": 7.837653711826973e-06, + "loss": 0.4652, + "step": 7611 + }, + { + "epoch": 1.1648048967100229, + "grad_norm": 2.2236738456132437, + "learning_rate": 7.835234225470462e-06, + "loss": 0.4389, + "step": 7612 + }, + { + "epoch": 1.1649579188982402, + "grad_norm": 2.161295283860229, + 
"learning_rate": 7.832814872060688e-06, + "loss": 0.429, + "step": 7613 + }, + { + "epoch": 1.1651109410864575, + "grad_norm": 2.267543706470478, + "learning_rate": 7.83039565174623e-06, + "loss": 0.4238, + "step": 7614 + }, + { + "epoch": 1.1652639632746749, + "grad_norm": 2.096609865562028, + "learning_rate": 7.827976564675666e-06, + "loss": 0.4182, + "step": 7615 + }, + { + "epoch": 1.1654169854628922, + "grad_norm": 1.9474587819076785, + "learning_rate": 7.825557610997556e-06, + "loss": 0.4151, + "step": 7616 + }, + { + "epoch": 1.1655700076511093, + "grad_norm": 2.231859405584811, + "learning_rate": 7.823138790860457e-06, + "loss": 0.4394, + "step": 7617 + }, + { + "epoch": 1.1657230298393266, + "grad_norm": 2.1358801598651036, + "learning_rate": 7.820720104412926e-06, + "loss": 0.4202, + "step": 7618 + }, + { + "epoch": 1.165876052027544, + "grad_norm": 2.273267880308986, + "learning_rate": 7.818301551803495e-06, + "loss": 0.4307, + "step": 7619 + }, + { + "epoch": 1.1660290742157613, + "grad_norm": 2.2176807501043183, + "learning_rate": 7.8158831331807e-06, + "loss": 0.3873, + "step": 7620 + }, + { + "epoch": 1.1661820964039786, + "grad_norm": 2.164388307223667, + "learning_rate": 7.813464848693071e-06, + "loss": 0.418, + "step": 7621 + }, + { + "epoch": 1.166335118592196, + "grad_norm": 2.3247978450960405, + "learning_rate": 7.81104669848911e-06, + "loss": 0.4521, + "step": 7622 + }, + { + "epoch": 1.166488140780413, + "grad_norm": 2.1664042575123195, + "learning_rate": 7.808628682717344e-06, + "loss": 0.4207, + "step": 7623 + }, + { + "epoch": 1.1666411629686304, + "grad_norm": 2.0898246135348004, + "learning_rate": 7.806210801526262e-06, + "loss": 0.3832, + "step": 7624 + }, + { + "epoch": 1.1667941851568477, + "grad_norm": 2.0408662892538523, + "learning_rate": 7.80379305506435e-06, + "loss": 0.3524, + "step": 7625 + }, + { + "epoch": 1.166947207345065, + "grad_norm": 2.2001590021513397, + "learning_rate": 7.801375443480106e-06, + "loss": 0.4377, + "step": 7626 + }, + { + "epoch": 1.1671002295332824, + "grad_norm": 2.35247171593286, + "learning_rate": 7.798957966921992e-06, + "loss": 0.4511, + "step": 7627 + }, + { + "epoch": 1.1672532517214997, + "grad_norm": 2.093124942157242, + "learning_rate": 7.796540625538482e-06, + "loss": 0.3984, + "step": 7628 + }, + { + "epoch": 1.1674062739097169, + "grad_norm": 2.3270121913506743, + "learning_rate": 7.794123419478034e-06, + "loss": 0.4626, + "step": 7629 + }, + { + "epoch": 1.1675592960979342, + "grad_norm": 2.235583289520111, + "learning_rate": 7.791706348889092e-06, + "loss": 0.421, + "step": 7630 + }, + { + "epoch": 1.1677123182861515, + "grad_norm": 2.2800471006152336, + "learning_rate": 7.789289413920104e-06, + "loss": 0.4149, + "step": 7631 + }, + { + "epoch": 1.1678653404743689, + "grad_norm": 2.11975063545846, + "learning_rate": 7.786872614719504e-06, + "loss": 0.4212, + "step": 7632 + }, + { + "epoch": 1.1680183626625862, + "grad_norm": 1.976637872464587, + "learning_rate": 7.78445595143571e-06, + "loss": 0.3641, + "step": 7633 + }, + { + "epoch": 1.1681713848508033, + "grad_norm": 2.5678946494338426, + "learning_rate": 7.782039424217145e-06, + "loss": 0.3952, + "step": 7634 + }, + { + "epoch": 1.1683244070390206, + "grad_norm": 2.4311254543526712, + "learning_rate": 7.779623033212213e-06, + "loss": 0.5068, + "step": 7635 + }, + { + "epoch": 1.168477429227238, + "grad_norm": 2.125309875951746, + "learning_rate": 7.777206778569323e-06, + "loss": 0.39, + "step": 7636 + }, + { + "epoch": 1.1686304514154553, + "grad_norm": 
1.9029565175583325, + "learning_rate": 7.774790660436857e-06, + "loss": 0.3747, + "step": 7637 + }, + { + "epoch": 1.1687834736036726, + "grad_norm": 1.8974621969583034, + "learning_rate": 7.772374678963199e-06, + "loss": 0.3954, + "step": 7638 + }, + { + "epoch": 1.1689364957918897, + "grad_norm": 2.1731954934867397, + "learning_rate": 7.769958834296733e-06, + "loss": 0.4806, + "step": 7639 + }, + { + "epoch": 1.169089517980107, + "grad_norm": 2.164959683654725, + "learning_rate": 7.767543126585815e-06, + "loss": 0.3921, + "step": 7640 + }, + { + "epoch": 1.1692425401683244, + "grad_norm": 2.0519145883295917, + "learning_rate": 7.765127555978805e-06, + "loss": 0.3916, + "step": 7641 + }, + { + "epoch": 1.1693955623565417, + "grad_norm": 2.022149191067335, + "learning_rate": 7.762712122624059e-06, + "loss": 0.5355, + "step": 7642 + }, + { + "epoch": 1.169548584544759, + "grad_norm": 2.2553693306957934, + "learning_rate": 7.76029682666991e-06, + "loss": 0.4656, + "step": 7643 + }, + { + "epoch": 1.1697016067329762, + "grad_norm": 2.1677346659488395, + "learning_rate": 7.757881668264696e-06, + "loss": 0.4061, + "step": 7644 + }, + { + "epoch": 1.1698546289211935, + "grad_norm": 2.2303487915393183, + "learning_rate": 7.755466647556742e-06, + "loss": 0.4142, + "step": 7645 + }, + { + "epoch": 1.1700076511094109, + "grad_norm": 2.2087393847961745, + "learning_rate": 7.753051764694358e-06, + "loss": 0.4007, + "step": 7646 + }, + { + "epoch": 1.1701606732976282, + "grad_norm": 2.1866091937779326, + "learning_rate": 7.750637019825858e-06, + "loss": 0.3785, + "step": 7647 + }, + { + "epoch": 1.1703136954858455, + "grad_norm": 2.333612890088411, + "learning_rate": 7.74822241309954e-06, + "loss": 0.488, + "step": 7648 + }, + { + "epoch": 1.1704667176740626, + "grad_norm": 2.152719088963061, + "learning_rate": 7.745807944663688e-06, + "loss": 0.421, + "step": 7649 + }, + { + "epoch": 1.17061973986228, + "grad_norm": 2.322270994926128, + "learning_rate": 7.74339361466659e-06, + "loss": 0.4393, + "step": 7650 + }, + { + "epoch": 1.1707727620504973, + "grad_norm": 2.195968124463279, + "learning_rate": 7.740979423256518e-06, + "loss": 0.4031, + "step": 7651 + }, + { + "epoch": 1.1709257842387146, + "grad_norm": 2.136519735206509, + "learning_rate": 7.738565370581739e-06, + "loss": 0.3473, + "step": 7652 + }, + { + "epoch": 1.171078806426932, + "grad_norm": 2.2023901399428305, + "learning_rate": 7.736151456790506e-06, + "loss": 0.4934, + "step": 7653 + }, + { + "epoch": 1.171231828615149, + "grad_norm": 2.2416044379867994, + "learning_rate": 7.733737682031066e-06, + "loss": 0.4688, + "step": 7654 + }, + { + "epoch": 1.1713848508033664, + "grad_norm": 1.9033203376997059, + "learning_rate": 7.731324046451665e-06, + "loss": 0.4178, + "step": 7655 + }, + { + "epoch": 1.1715378729915837, + "grad_norm": 2.007408800033132, + "learning_rate": 7.728910550200528e-06, + "loss": 0.4644, + "step": 7656 + }, + { + "epoch": 1.171690895179801, + "grad_norm": 2.187477421883523, + "learning_rate": 7.726497193425875e-06, + "loss": 0.4285, + "step": 7657 + }, + { + "epoch": 1.1718439173680184, + "grad_norm": 2.3383291826778363, + "learning_rate": 7.72408397627593e-06, + "loss": 0.4946, + "step": 7658 + }, + { + "epoch": 1.1719969395562357, + "grad_norm": 2.097809814469768, + "learning_rate": 7.721670898898886e-06, + "loss": 0.4198, + "step": 7659 + }, + { + "epoch": 1.1721499617444529, + "grad_norm": 2.134611181161027, + "learning_rate": 7.719257961442946e-06, + "loss": 0.3677, + "step": 7660 + }, + { + "epoch": 
1.1723029839326702, + "grad_norm": 2.0635524558988165, + "learning_rate": 7.716845164056301e-06, + "loss": 0.3861, + "step": 7661 + }, + { + "epoch": 1.1724560061208875, + "grad_norm": 2.1668260879700285, + "learning_rate": 7.714432506887119e-06, + "loss": 0.3923, + "step": 7662 + }, + { + "epoch": 1.1726090283091049, + "grad_norm": 2.165575599965999, + "learning_rate": 7.712019990083583e-06, + "loss": 0.3705, + "step": 7663 + }, + { + "epoch": 1.1727620504973222, + "grad_norm": 2.3510883434857717, + "learning_rate": 7.709607613793847e-06, + "loss": 0.4414, + "step": 7664 + }, + { + "epoch": 1.1729150726855395, + "grad_norm": 2.4418747152055733, + "learning_rate": 7.707195378166071e-06, + "loss": 0.4232, + "step": 7665 + }, + { + "epoch": 1.1730680948737566, + "grad_norm": 1.8774606626370103, + "learning_rate": 7.704783283348396e-06, + "loss": 0.3496, + "step": 7666 + }, + { + "epoch": 1.173221117061974, + "grad_norm": 2.4097174796880125, + "learning_rate": 7.702371329488954e-06, + "loss": 0.402, + "step": 7667 + }, + { + "epoch": 1.1733741392501913, + "grad_norm": 2.141900723164932, + "learning_rate": 7.699959516735884e-06, + "loss": 0.4609, + "step": 7668 + }, + { + "epoch": 1.1735271614384086, + "grad_norm": 1.9648011311206592, + "learning_rate": 7.697547845237294e-06, + "loss": 0.307, + "step": 7669 + }, + { + "epoch": 1.173680183626626, + "grad_norm": 2.014416906522851, + "learning_rate": 7.695136315141295e-06, + "loss": 0.4412, + "step": 7670 + }, + { + "epoch": 1.173833205814843, + "grad_norm": 1.9892079843533914, + "learning_rate": 7.692724926595998e-06, + "loss": 0.386, + "step": 7671 + }, + { + "epoch": 1.1739862280030604, + "grad_norm": 2.123848925579515, + "learning_rate": 7.690313679749484e-06, + "loss": 0.4088, + "step": 7672 + }, + { + "epoch": 1.1741392501912777, + "grad_norm": 2.4469979298476146, + "learning_rate": 7.687902574749844e-06, + "loss": 0.538, + "step": 7673 + }, + { + "epoch": 1.174292272379495, + "grad_norm": 2.1653847556064414, + "learning_rate": 7.685491611745155e-06, + "loss": 0.4599, + "step": 7674 + }, + { + "epoch": 1.1744452945677124, + "grad_norm": 2.1231595403483468, + "learning_rate": 7.683080790883477e-06, + "loss": 0.4341, + "step": 7675 + }, + { + "epoch": 1.1745983167559295, + "grad_norm": 2.180437643743477, + "learning_rate": 7.680670112312871e-06, + "loss": 0.387, + "step": 7676 + }, + { + "epoch": 1.1747513389441469, + "grad_norm": 2.1797587090293455, + "learning_rate": 7.678259576181391e-06, + "loss": 0.4162, + "step": 7677 + }, + { + "epoch": 1.1749043611323642, + "grad_norm": 1.9300874158917527, + "learning_rate": 7.675849182637069e-06, + "loss": 0.3977, + "step": 7678 + }, + { + "epoch": 1.1750573833205815, + "grad_norm": 2.176277192004729, + "learning_rate": 7.67343893182794e-06, + "loss": 0.3371, + "step": 7679 + }, + { + "epoch": 1.1752104055087988, + "grad_norm": 2.1934070557560537, + "learning_rate": 7.671028823902029e-06, + "loss": 0.4144, + "step": 7680 + }, + { + "epoch": 1.175363427697016, + "grad_norm": 2.2349384227823914, + "learning_rate": 7.668618859007351e-06, + "loss": 0.3794, + "step": 7681 + }, + { + "epoch": 1.1755164498852333, + "grad_norm": 2.2353031611686416, + "learning_rate": 7.666209037291909e-06, + "loss": 0.439, + "step": 7682 + }, + { + "epoch": 1.1756694720734506, + "grad_norm": 2.511919213185073, + "learning_rate": 7.663799358903698e-06, + "loss": 0.455, + "step": 7683 + }, + { + "epoch": 1.175822494261668, + "grad_norm": 2.0920595984190626, + "learning_rate": 7.661389823990711e-06, + "loss": 0.3794, + "step": 
7684 + }, + { + "epoch": 1.1759755164498853, + "grad_norm": 2.247879750828007, + "learning_rate": 7.65898043270092e-06, + "loss": 0.4846, + "step": 7685 + }, + { + "epoch": 1.1761285386381024, + "grad_norm": 2.1945571365625653, + "learning_rate": 7.656571185182298e-06, + "loss": 0.4249, + "step": 7686 + }, + { + "epoch": 1.1762815608263197, + "grad_norm": 2.043387306270332, + "learning_rate": 7.654162081582812e-06, + "loss": 0.3923, + "step": 7687 + }, + { + "epoch": 1.176434583014537, + "grad_norm": 2.1797498439512193, + "learning_rate": 7.651753122050404e-06, + "loss": 0.4203, + "step": 7688 + }, + { + "epoch": 1.1765876052027544, + "grad_norm": 1.9302525156159196, + "learning_rate": 7.649344306733026e-06, + "loss": 0.3811, + "step": 7689 + }, + { + "epoch": 1.1767406273909717, + "grad_norm": 2.3062243258549735, + "learning_rate": 7.646935635778612e-06, + "loss": 0.4832, + "step": 7690 + }, + { + "epoch": 1.176893649579189, + "grad_norm": 2.2193174384523315, + "learning_rate": 7.64452710933508e-06, + "loss": 0.4152, + "step": 7691 + }, + { + "epoch": 1.1770466717674062, + "grad_norm": 2.1635717890959754, + "learning_rate": 7.642118727550358e-06, + "loss": 0.3953, + "step": 7692 + }, + { + "epoch": 1.1771996939556235, + "grad_norm": 1.9823843125750913, + "learning_rate": 7.639710490572348e-06, + "loss": 0.3899, + "step": 7693 + }, + { + "epoch": 1.1773527161438408, + "grad_norm": 2.0966731774522684, + "learning_rate": 7.637302398548949e-06, + "loss": 0.3648, + "step": 7694 + }, + { + "epoch": 1.1775057383320582, + "grad_norm": 2.159828350633099, + "learning_rate": 7.634894451628053e-06, + "loss": 0.4285, + "step": 7695 + }, + { + "epoch": 1.1776587605202755, + "grad_norm": 2.314588251676099, + "learning_rate": 7.632486649957539e-06, + "loss": 0.4209, + "step": 7696 + }, + { + "epoch": 1.1778117827084928, + "grad_norm": 2.234225308705491, + "learning_rate": 7.630078993685286e-06, + "loss": 0.3981, + "step": 7697 + }, + { + "epoch": 1.17796480489671, + "grad_norm": 2.360790111872005, + "learning_rate": 7.627671482959152e-06, + "loss": 0.4557, + "step": 7698 + }, + { + "epoch": 1.1781178270849273, + "grad_norm": 2.206115497805982, + "learning_rate": 7.625264117926989e-06, + "loss": 0.4567, + "step": 7699 + }, + { + "epoch": 1.1782708492731446, + "grad_norm": 2.7009488220205395, + "learning_rate": 7.622856898736652e-06, + "loss": 0.3969, + "step": 7700 + }, + { + "epoch": 1.178423871461362, + "grad_norm": 1.9240114837643656, + "learning_rate": 7.6204498255359674e-06, + "loss": 0.4017, + "step": 7701 + }, + { + "epoch": 1.1785768936495793, + "grad_norm": 2.0628563892650558, + "learning_rate": 7.618042898472771e-06, + "loss": 0.4437, + "step": 7702 + }, + { + "epoch": 1.1787299158377964, + "grad_norm": 1.8012689942102933, + "learning_rate": 7.6156361176948804e-06, + "loss": 0.3697, + "step": 7703 + }, + { + "epoch": 1.1788829380260137, + "grad_norm": 2.22070307433343, + "learning_rate": 7.613229483350095e-06, + "loss": 0.4171, + "step": 7704 + }, + { + "epoch": 1.179035960214231, + "grad_norm": 2.312836250788219, + "learning_rate": 7.610822995586234e-06, + "loss": 0.4379, + "step": 7705 + }, + { + "epoch": 1.1791889824024484, + "grad_norm": 2.2321765546942665, + "learning_rate": 7.608416654551077e-06, + "loss": 0.3946, + "step": 7706 + }, + { + "epoch": 1.1793420045906657, + "grad_norm": 2.1449678132243677, + "learning_rate": 7.606010460392402e-06, + "loss": 0.3959, + "step": 7707 + }, + { + "epoch": 1.1794950267788828, + "grad_norm": 2.1178482207877836, + "learning_rate": 
7.603604413257998e-06, + "loss": 0.4641, + "step": 7708 + }, + { + "epoch": 1.1796480489671002, + "grad_norm": 2.277472552097201, + "learning_rate": 7.601198513295618e-06, + "loss": 0.4133, + "step": 7709 + }, + { + "epoch": 1.1798010711553175, + "grad_norm": 2.3060438068546674, + "learning_rate": 7.598792760653022e-06, + "loss": 0.3717, + "step": 7710 + }, + { + "epoch": 1.1799540933435348, + "grad_norm": 2.212645777195339, + "learning_rate": 7.5963871554779586e-06, + "loss": 0.4233, + "step": 7711 + }, + { + "epoch": 1.1801071155317522, + "grad_norm": 2.288782015626369, + "learning_rate": 7.593981697918159e-06, + "loss": 0.3845, + "step": 7712 + }, + { + "epoch": 1.1802601377199693, + "grad_norm": 1.9850372928797795, + "learning_rate": 7.5915763881213576e-06, + "loss": 0.3853, + "step": 7713 + }, + { + "epoch": 1.1804131599081866, + "grad_norm": 2.1714931935713127, + "learning_rate": 7.589171226235276e-06, + "loss": 0.3907, + "step": 7714 + }, + { + "epoch": 1.180566182096404, + "grad_norm": 1.9973204967252072, + "learning_rate": 7.586766212407615e-06, + "loss": 0.3507, + "step": 7715 + }, + { + "epoch": 1.1807192042846213, + "grad_norm": 2.041853555787875, + "learning_rate": 7.584361346786082e-06, + "loss": 0.4383, + "step": 7716 + }, + { + "epoch": 1.1808722264728386, + "grad_norm": 2.0279848709164843, + "learning_rate": 7.581956629518369e-06, + "loss": 0.4229, + "step": 7717 + }, + { + "epoch": 1.1810252486610557, + "grad_norm": 2.06147392563384, + "learning_rate": 7.579552060752162e-06, + "loss": 0.4688, + "step": 7718 + }, + { + "epoch": 1.181178270849273, + "grad_norm": 2.257617092359455, + "learning_rate": 7.57714764063513e-06, + "loss": 0.4485, + "step": 7719 + }, + { + "epoch": 1.1813312930374904, + "grad_norm": 2.089931676977981, + "learning_rate": 7.574743369314937e-06, + "loss": 0.4228, + "step": 7720 + }, + { + "epoch": 1.1814843152257077, + "grad_norm": 2.018731297347733, + "learning_rate": 7.572339246939247e-06, + "loss": 0.3592, + "step": 7721 + }, + { + "epoch": 1.181637337413925, + "grad_norm": 1.9133562808684277, + "learning_rate": 7.569935273655696e-06, + "loss": 0.349, + "step": 7722 + }, + { + "epoch": 1.1817903596021424, + "grad_norm": 2.3225293651885828, + "learning_rate": 7.567531449611927e-06, + "loss": 0.3854, + "step": 7723 + }, + { + "epoch": 1.1819433817903595, + "grad_norm": 2.080329898307183, + "learning_rate": 7.565127774955571e-06, + "loss": 0.4338, + "step": 7724 + }, + { + "epoch": 1.1820964039785768, + "grad_norm": 2.127662122852471, + "learning_rate": 7.562724249834239e-06, + "loss": 0.4683, + "step": 7725 + }, + { + "epoch": 1.1822494261667942, + "grad_norm": 2.1114355637464133, + "learning_rate": 7.560320874395547e-06, + "loss": 0.3616, + "step": 7726 + }, + { + "epoch": 1.1824024483550115, + "grad_norm": 2.3223541842836237, + "learning_rate": 7.557917648787097e-06, + "loss": 0.4541, + "step": 7727 + }, + { + "epoch": 1.1825554705432288, + "grad_norm": 2.153187629446042, + "learning_rate": 7.555514573156474e-06, + "loss": 0.4665, + "step": 7728 + }, + { + "epoch": 1.1827084927314462, + "grad_norm": 2.115658998412094, + "learning_rate": 7.553111647651266e-06, + "loss": 0.3996, + "step": 7729 + }, + { + "epoch": 1.1828615149196633, + "grad_norm": 2.293884325074409, + "learning_rate": 7.5507088724190445e-06, + "loss": 0.4079, + "step": 7730 + }, + { + "epoch": 1.1830145371078806, + "grad_norm": 2.3765090288857658, + "learning_rate": 7.54830624760737e-06, + "loss": 0.4425, + "step": 7731 + }, + { + "epoch": 1.183167559296098, + "grad_norm": 
1.9207125051904854, + "learning_rate": 7.5459037733638016e-06, + "loss": 0.3427, + "step": 7732 + }, + { + "epoch": 1.1833205814843153, + "grad_norm": 2.115216638237201, + "learning_rate": 7.54350144983588e-06, + "loss": 0.4216, + "step": 7733 + }, + { + "epoch": 1.1834736036725326, + "grad_norm": 2.072322814839418, + "learning_rate": 7.54109927717115e-06, + "loss": 0.4091, + "step": 7734 + }, + { + "epoch": 1.1836266258607497, + "grad_norm": 1.972905904780838, + "learning_rate": 7.53869725551713e-06, + "loss": 0.3766, + "step": 7735 + }, + { + "epoch": 1.183779648048967, + "grad_norm": 2.169516411743189, + "learning_rate": 7.536295385021338e-06, + "loss": 0.3662, + "step": 7736 + }, + { + "epoch": 1.1839326702371844, + "grad_norm": 2.0957775433273538, + "learning_rate": 7.5338936658312885e-06, + "loss": 0.398, + "step": 7737 + }, + { + "epoch": 1.1840856924254017, + "grad_norm": 2.1343320274405793, + "learning_rate": 7.531492098094473e-06, + "loss": 0.4014, + "step": 7738 + }, + { + "epoch": 1.184238714613619, + "grad_norm": 2.378363664272812, + "learning_rate": 7.5290906819583865e-06, + "loss": 0.4425, + "step": 7739 + }, + { + "epoch": 1.1843917368018362, + "grad_norm": 2.2596488259073535, + "learning_rate": 7.52668941757051e-06, + "loss": 0.4086, + "step": 7740 + }, + { + "epoch": 1.1845447589900535, + "grad_norm": 2.2518267482599135, + "learning_rate": 7.524288305078307e-06, + "loss": 0.3888, + "step": 7741 + }, + { + "epoch": 1.1846977811782708, + "grad_norm": 2.2319015517268386, + "learning_rate": 7.521887344629246e-06, + "loss": 0.4249, + "step": 7742 + }, + { + "epoch": 1.1848508033664882, + "grad_norm": 2.099102832105148, + "learning_rate": 7.519486536370779e-06, + "loss": 0.3981, + "step": 7743 + }, + { + "epoch": 1.1850038255547055, + "grad_norm": 2.326704059378749, + "learning_rate": 7.517085880450345e-06, + "loss": 0.4581, + "step": 7744 + }, + { + "epoch": 1.1851568477429226, + "grad_norm": 2.3914827027493297, + "learning_rate": 7.514685377015383e-06, + "loss": 0.394, + "step": 7745 + }, + { + "epoch": 1.18530986993114, + "grad_norm": 1.8983373970935058, + "learning_rate": 7.512285026213311e-06, + "loss": 0.3382, + "step": 7746 + }, + { + "epoch": 1.1854628921193573, + "grad_norm": 2.3908709484472523, + "learning_rate": 7.509884828191552e-06, + "loss": 0.4461, + "step": 7747 + }, + { + "epoch": 1.1856159143075746, + "grad_norm": 2.483737125826371, + "learning_rate": 7.5074847830975054e-06, + "loss": 0.4697, + "step": 7748 + }, + { + "epoch": 1.185768936495792, + "grad_norm": 2.1147090426635224, + "learning_rate": 7.505084891078566e-06, + "loss": 0.3533, + "step": 7749 + }, + { + "epoch": 1.185921958684009, + "grad_norm": 2.1135289311255407, + "learning_rate": 7.5026851522821286e-06, + "loss": 0.4444, + "step": 7750 + }, + { + "epoch": 1.1860749808722264, + "grad_norm": 2.1675730484291864, + "learning_rate": 7.500285566855564e-06, + "loss": 0.3763, + "step": 7751 + }, + { + "epoch": 1.1862280030604437, + "grad_norm": 2.1575555766890706, + "learning_rate": 7.497886134946238e-06, + "loss": 0.4758, + "step": 7752 + }, + { + "epoch": 1.186381025248661, + "grad_norm": 1.7799980041205554, + "learning_rate": 7.49548685670152e-06, + "loss": 0.3235, + "step": 7753 + }, + { + "epoch": 1.1865340474368784, + "grad_norm": 2.344961863500604, + "learning_rate": 7.493087732268744e-06, + "loss": 0.4517, + "step": 7754 + }, + { + "epoch": 1.1866870696250955, + "grad_norm": 2.3828288288807684, + "learning_rate": 7.490688761795262e-06, + "loss": 0.4251, + "step": 7755 + }, + { + "epoch": 
1.1868400918133128, + "grad_norm": 2.224048831830438, + "learning_rate": 7.4882899454284e-06, + "loss": 0.4147, + "step": 7756 + }, + { + "epoch": 1.1869931140015302, + "grad_norm": 2.3063506729017655, + "learning_rate": 7.485891283315475e-06, + "loss": 0.3875, + "step": 7757 + }, + { + "epoch": 1.1871461361897475, + "grad_norm": 2.369613366127151, + "learning_rate": 7.483492775603803e-06, + "loss": 0.4198, + "step": 7758 + }, + { + "epoch": 1.1872991583779648, + "grad_norm": 2.2857022695312095, + "learning_rate": 7.481094422440688e-06, + "loss": 0.4651, + "step": 7759 + }, + { + "epoch": 1.1874521805661822, + "grad_norm": 1.9585673421612289, + "learning_rate": 7.478696223973413e-06, + "loss": 0.3196, + "step": 7760 + }, + { + "epoch": 1.1876052027543993, + "grad_norm": 2.103682155279868, + "learning_rate": 7.476298180349269e-06, + "loss": 0.403, + "step": 7761 + }, + { + "epoch": 1.1877582249426166, + "grad_norm": 1.8589192664213134, + "learning_rate": 7.4739002917155235e-06, + "loss": 0.3739, + "step": 7762 + }, + { + "epoch": 1.187911247130834, + "grad_norm": 2.2039683638286203, + "learning_rate": 7.471502558219448e-06, + "loss": 0.4164, + "step": 7763 + }, + { + "epoch": 1.1880642693190513, + "grad_norm": 2.285658684641684, + "learning_rate": 7.469104980008291e-06, + "loss": 0.4457, + "step": 7764 + }, + { + "epoch": 1.1882172915072686, + "grad_norm": 2.0870031789407077, + "learning_rate": 7.466707557229294e-06, + "loss": 0.4712, + "step": 7765 + }, + { + "epoch": 1.188370313695486, + "grad_norm": 1.981110909098681, + "learning_rate": 7.464310290029702e-06, + "loss": 0.442, + "step": 7766 + }, + { + "epoch": 1.188523335883703, + "grad_norm": 2.146376730190204, + "learning_rate": 7.461913178556731e-06, + "loss": 0.4094, + "step": 7767 + }, + { + "epoch": 1.1886763580719204, + "grad_norm": 2.1228310670875827, + "learning_rate": 7.4595162229576005e-06, + "loss": 0.425, + "step": 7768 + }, + { + "epoch": 1.1888293802601377, + "grad_norm": 2.0450434435134115, + "learning_rate": 7.45711942337952e-06, + "loss": 0.3992, + "step": 7769 + }, + { + "epoch": 1.188982402448355, + "grad_norm": 2.072140769338632, + "learning_rate": 7.45472277996968e-06, + "loss": 0.3567, + "step": 7770 + }, + { + "epoch": 1.1891354246365724, + "grad_norm": 2.1452549233035163, + "learning_rate": 7.452326292875273e-06, + "loss": 0.3996, + "step": 7771 + }, + { + "epoch": 1.1892884468247895, + "grad_norm": 2.0564488471141025, + "learning_rate": 7.449929962243478e-06, + "loss": 0.4003, + "step": 7772 + }, + { + "epoch": 1.1894414690130068, + "grad_norm": 2.2765367450207177, + "learning_rate": 7.447533788221454e-06, + "loss": 0.4523, + "step": 7773 + }, + { + "epoch": 1.1895944912012242, + "grad_norm": 2.474886870340549, + "learning_rate": 7.445137770956368e-06, + "loss": 0.4561, + "step": 7774 + }, + { + "epoch": 1.1897475133894415, + "grad_norm": 2.296110569783041, + "learning_rate": 7.442741910595365e-06, + "loss": 0.4152, + "step": 7775 + }, + { + "epoch": 1.1899005355776588, + "grad_norm": 2.3301560185251726, + "learning_rate": 7.440346207285589e-06, + "loss": 0.4871, + "step": 7776 + }, + { + "epoch": 1.190053557765876, + "grad_norm": 1.8039552048846907, + "learning_rate": 7.437950661174164e-06, + "loss": 0.3184, + "step": 7777 + }, + { + "epoch": 1.1902065799540933, + "grad_norm": 2.1926490325759, + "learning_rate": 7.435555272408208e-06, + "loss": 0.4312, + "step": 7778 + }, + { + "epoch": 1.1903596021423106, + "grad_norm": 2.104249637935987, + "learning_rate": 7.433160041134841e-06, + "loss": 0.3409, + "step": 
7779 + }, + { + "epoch": 1.190512624330528, + "grad_norm": 2.20756794295377, + "learning_rate": 7.430764967501155e-06, + "loss": 0.3749, + "step": 7780 + }, + { + "epoch": 1.1906656465187453, + "grad_norm": 2.069229329976173, + "learning_rate": 7.428370051654241e-06, + "loss": 0.3903, + "step": 7781 + }, + { + "epoch": 1.1908186687069624, + "grad_norm": 2.2207924833034087, + "learning_rate": 7.425975293741186e-06, + "loss": 0.4389, + "step": 7782 + }, + { + "epoch": 1.1909716908951797, + "grad_norm": 2.2331530214251623, + "learning_rate": 7.4235806939090515e-06, + "loss": 0.4131, + "step": 7783 + }, + { + "epoch": 1.191124713083397, + "grad_norm": 2.2504109035986026, + "learning_rate": 7.421186252304913e-06, + "loss": 0.4731, + "step": 7784 + }, + { + "epoch": 1.1912777352716144, + "grad_norm": 2.371044724991984, + "learning_rate": 7.4187919690758145e-06, + "loss": 0.4377, + "step": 7785 + }, + { + "epoch": 1.1914307574598317, + "grad_norm": 2.242523834065211, + "learning_rate": 7.416397844368792e-06, + "loss": 0.3621, + "step": 7786 + }, + { + "epoch": 1.1915837796480488, + "grad_norm": 2.2945870940039668, + "learning_rate": 7.414003878330893e-06, + "loss": 0.3849, + "step": 7787 + }, + { + "epoch": 1.1917368018362662, + "grad_norm": 2.266870340168214, + "learning_rate": 7.411610071109131e-06, + "loss": 0.4038, + "step": 7788 + }, + { + "epoch": 1.1918898240244835, + "grad_norm": 2.1237321269660834, + "learning_rate": 7.409216422850514e-06, + "loss": 0.3927, + "step": 7789 + }, + { + "epoch": 1.1920428462127008, + "grad_norm": 2.3142228870616126, + "learning_rate": 7.406822933702058e-06, + "loss": 0.4434, + "step": 7790 + }, + { + "epoch": 1.1921958684009182, + "grad_norm": 2.0955604959527068, + "learning_rate": 7.404429603810747e-06, + "loss": 0.3222, + "step": 7791 + }, + { + "epoch": 1.1923488905891355, + "grad_norm": 1.9617282423466649, + "learning_rate": 7.402036433323569e-06, + "loss": 0.3517, + "step": 7792 + }, + { + "epoch": 1.1925019127773526, + "grad_norm": 1.9869255508557389, + "learning_rate": 7.399643422387499e-06, + "loss": 0.3596, + "step": 7793 + }, + { + "epoch": 1.19265493496557, + "grad_norm": 2.2481717873813367, + "learning_rate": 7.397250571149496e-06, + "loss": 0.4018, + "step": 7794 + }, + { + "epoch": 1.1928079571537873, + "grad_norm": 2.267844809299893, + "learning_rate": 7.3948578797565185e-06, + "loss": 0.4399, + "step": 7795 + }, + { + "epoch": 1.1929609793420046, + "grad_norm": 2.296245229262881, + "learning_rate": 7.392465348355512e-06, + "loss": 0.4375, + "step": 7796 + }, + { + "epoch": 1.193114001530222, + "grad_norm": 2.4275607536962585, + "learning_rate": 7.390072977093405e-06, + "loss": 0.4886, + "step": 7797 + }, + { + "epoch": 1.1932670237184393, + "grad_norm": 2.1878475452491633, + "learning_rate": 7.387680766117129e-06, + "loss": 0.4665, + "step": 7798 + }, + { + "epoch": 1.1934200459066564, + "grad_norm": 2.2456587752248445, + "learning_rate": 7.3852887155735955e-06, + "loss": 0.4437, + "step": 7799 + }, + { + "epoch": 1.1935730680948737, + "grad_norm": 2.395071968132589, + "learning_rate": 7.382896825609714e-06, + "loss": 0.456, + "step": 7800 + }, + { + "epoch": 1.193726090283091, + "grad_norm": 2.1758010578869738, + "learning_rate": 7.380505096372375e-06, + "loss": 0.3582, + "step": 7801 + }, + { + "epoch": 1.1938791124713084, + "grad_norm": 2.457008422878569, + "learning_rate": 7.3781135280084615e-06, + "loss": 0.4464, + "step": 7802 + }, + { + "epoch": 1.1940321346595257, + "grad_norm": 2.0253508595174394, + "learning_rate": 
7.375722120664859e-06, + "loss": 0.3641, + "step": 7803 + }, + { + "epoch": 1.1941851568477428, + "grad_norm": 2.207183732033762, + "learning_rate": 7.373330874488422e-06, + "loss": 0.4679, + "step": 7804 + }, + { + "epoch": 1.1943381790359602, + "grad_norm": 1.9874369076730363, + "learning_rate": 7.370939789626016e-06, + "loss": 0.3426, + "step": 7805 + }, + { + "epoch": 1.1944912012241775, + "grad_norm": 2.2001806026700153, + "learning_rate": 7.368548866224483e-06, + "loss": 0.3841, + "step": 7806 + }, + { + "epoch": 1.1946442234123948, + "grad_norm": 1.9436191856311789, + "learning_rate": 7.366158104430654e-06, + "loss": 0.3105, + "step": 7807 + }, + { + "epoch": 1.1947972456006122, + "grad_norm": 2.0716591900676145, + "learning_rate": 7.363767504391362e-06, + "loss": 0.3567, + "step": 7808 + }, + { + "epoch": 1.1949502677888293, + "grad_norm": 2.3756768448765504, + "learning_rate": 7.361377066253424e-06, + "loss": 0.4912, + "step": 7809 + }, + { + "epoch": 1.1951032899770466, + "grad_norm": 2.044315279391537, + "learning_rate": 7.358986790163636e-06, + "loss": 0.3538, + "step": 7810 + }, + { + "epoch": 1.195256312165264, + "grad_norm": 1.9855034404666652, + "learning_rate": 7.356596676268804e-06, + "loss": 0.3895, + "step": 7811 + }, + { + "epoch": 1.1954093343534813, + "grad_norm": 2.315701745879983, + "learning_rate": 7.354206724715709e-06, + "loss": 0.4436, + "step": 7812 + }, + { + "epoch": 1.1955623565416986, + "grad_norm": 2.134487707383376, + "learning_rate": 7.3518169356511335e-06, + "loss": 0.3905, + "step": 7813 + }, + { + "epoch": 1.1957153787299157, + "grad_norm": 2.0187548298884543, + "learning_rate": 7.349427309221838e-06, + "loss": 0.3984, + "step": 7814 + }, + { + "epoch": 1.195868400918133, + "grad_norm": 2.271455573960681, + "learning_rate": 7.347037845574578e-06, + "loss": 0.3748, + "step": 7815 + }, + { + "epoch": 1.1960214231063504, + "grad_norm": 2.5486034536590987, + "learning_rate": 7.344648544856108e-06, + "loss": 0.5062, + "step": 7816 + }, + { + "epoch": 1.1961744452945677, + "grad_norm": 2.222378931229507, + "learning_rate": 7.342259407213155e-06, + "loss": 0.3106, + "step": 7817 + }, + { + "epoch": 1.196327467482785, + "grad_norm": 2.384115619572824, + "learning_rate": 7.339870432792448e-06, + "loss": 0.4188, + "step": 7818 + }, + { + "epoch": 1.1964804896710022, + "grad_norm": 2.4136145453264652, + "learning_rate": 7.337481621740707e-06, + "loss": 0.4704, + "step": 7819 + }, + { + "epoch": 1.1966335118592195, + "grad_norm": 2.081016144614866, + "learning_rate": 7.335092974204632e-06, + "loss": 0.409, + "step": 7820 + }, + { + "epoch": 1.1967865340474368, + "grad_norm": 2.0694517272109563, + "learning_rate": 7.332704490330924e-06, + "loss": 0.3886, + "step": 7821 + }, + { + "epoch": 1.1969395562356542, + "grad_norm": 2.015225672566741, + "learning_rate": 7.3303161702662715e-06, + "loss": 0.3692, + "step": 7822 + }, + { + "epoch": 1.1970925784238715, + "grad_norm": 2.503758938970734, + "learning_rate": 7.327928014157341e-06, + "loss": 0.4168, + "step": 7823 + }, + { + "epoch": 1.1972456006120888, + "grad_norm": 2.1923521871806626, + "learning_rate": 7.3255400221508076e-06, + "loss": 0.4147, + "step": 7824 + }, + { + "epoch": 1.197398622800306, + "grad_norm": 1.9996996439024073, + "learning_rate": 7.323152194393326e-06, + "loss": 0.424, + "step": 7825 + }, + { + "epoch": 1.1975516449885233, + "grad_norm": 2.3176090940404004, + "learning_rate": 7.320764531031535e-06, + "loss": 0.4398, + "step": 7826 + }, + { + "epoch": 1.1977046671767406, + "grad_norm": 
2.0145284165200503, + "learning_rate": 7.318377032212078e-06, + "loss": 0.361, + "step": 7827 + }, + { + "epoch": 1.197857689364958, + "grad_norm": 1.818639838520028, + "learning_rate": 7.315989698081576e-06, + "loss": 0.3775, + "step": 7828 + }, + { + "epoch": 1.1980107115531753, + "grad_norm": 2.1098549855251676, + "learning_rate": 7.313602528786649e-06, + "loss": 0.3689, + "step": 7829 + }, + { + "epoch": 1.1981637337413926, + "grad_norm": 2.128136439868593, + "learning_rate": 7.3112155244739e-06, + "loss": 0.4344, + "step": 7830 + }, + { + "epoch": 1.1983167559296097, + "grad_norm": 2.4135696579714314, + "learning_rate": 7.3088286852899224e-06, + "loss": 0.4493, + "step": 7831 + }, + { + "epoch": 1.198469778117827, + "grad_norm": 2.0006879555739894, + "learning_rate": 7.306442011381307e-06, + "loss": 0.4525, + "step": 7832 + }, + { + "epoch": 1.1986228003060444, + "grad_norm": 2.051025939461601, + "learning_rate": 7.3040555028946225e-06, + "loss": 0.4063, + "step": 7833 + }, + { + "epoch": 1.1987758224942617, + "grad_norm": 2.034625467920058, + "learning_rate": 7.301669159976434e-06, + "loss": 0.4244, + "step": 7834 + }, + { + "epoch": 1.198928844682479, + "grad_norm": 2.0888212266302326, + "learning_rate": 7.299282982773301e-06, + "loss": 0.3443, + "step": 7835 + }, + { + "epoch": 1.1990818668706962, + "grad_norm": 1.9813426666478027, + "learning_rate": 7.296896971431764e-06, + "loss": 0.3808, + "step": 7836 + }, + { + "epoch": 1.1992348890589135, + "grad_norm": 2.1295621512831446, + "learning_rate": 7.2945111260983584e-06, + "loss": 0.3701, + "step": 7837 + }, + { + "epoch": 1.1993879112471308, + "grad_norm": 2.0436205396614646, + "learning_rate": 7.292125446919611e-06, + "loss": 0.3835, + "step": 7838 + }, + { + "epoch": 1.1995409334353482, + "grad_norm": 2.119557743892499, + "learning_rate": 7.289739934042031e-06, + "loss": 0.3901, + "step": 7839 + }, + { + "epoch": 1.1996939556235655, + "grad_norm": 2.300357163359858, + "learning_rate": 7.287354587612123e-06, + "loss": 0.424, + "step": 7840 + }, + { + "epoch": 1.1998469778117826, + "grad_norm": 2.085956552889289, + "learning_rate": 7.284969407776382e-06, + "loss": 0.3756, + "step": 7841 + }, + { + "epoch": 1.2, + "grad_norm": 2.3081369285656725, + "learning_rate": 7.282584394681294e-06, + "loss": 0.4268, + "step": 7842 + }, + { + "epoch": 1.2001530221882173, + "grad_norm": 2.353996381043388, + "learning_rate": 7.280199548473328e-06, + "loss": 0.4335, + "step": 7843 + }, + { + "epoch": 1.2003060443764346, + "grad_norm": 2.414828978611619, + "learning_rate": 7.277814869298945e-06, + "loss": 0.4901, + "step": 7844 + }, + { + "epoch": 1.200459066564652, + "grad_norm": 2.3718399813628452, + "learning_rate": 7.275430357304604e-06, + "loss": 0.4031, + "step": 7845 + }, + { + "epoch": 1.200612088752869, + "grad_norm": 2.2121027430213163, + "learning_rate": 7.273046012636742e-06, + "loss": 0.4049, + "step": 7846 + }, + { + "epoch": 1.2007651109410864, + "grad_norm": 2.0996973379693897, + "learning_rate": 7.270661835441789e-06, + "loss": 0.3518, + "step": 7847 + }, + { + "epoch": 1.2009181331293037, + "grad_norm": 2.3792243056952844, + "learning_rate": 7.268277825866175e-06, + "loss": 0.4506, + "step": 7848 + }, + { + "epoch": 1.201071155317521, + "grad_norm": 2.2470123496158294, + "learning_rate": 7.265893984056302e-06, + "loss": 0.4264, + "step": 7849 + }, + { + "epoch": 1.2012241775057384, + "grad_norm": 2.6193999301938273, + "learning_rate": 7.263510310158577e-06, + "loss": 0.4304, + "step": 7850 + }, + { + "epoch": 1.2013771996939555, 
+ "grad_norm": 1.9781200787965287, + "learning_rate": 7.261126804319391e-06, + "loss": 0.3284, + "step": 7851 + }, + { + "epoch": 1.2015302218821728, + "grad_norm": 2.0903308853535227, + "learning_rate": 7.258743466685119e-06, + "loss": 0.3898, + "step": 7852 + }, + { + "epoch": 1.2016832440703902, + "grad_norm": 2.0779995520765464, + "learning_rate": 7.2563602974021365e-06, + "loss": 0.3994, + "step": 7853 + }, + { + "epoch": 1.2018362662586075, + "grad_norm": 2.185208271217259, + "learning_rate": 7.253977296616802e-06, + "loss": 0.4303, + "step": 7854 + }, + { + "epoch": 1.2019892884468248, + "grad_norm": 2.093158167390441, + "learning_rate": 7.251594464475462e-06, + "loss": 0.4102, + "step": 7855 + }, + { + "epoch": 1.2021423106350422, + "grad_norm": 2.474345761675588, + "learning_rate": 7.249211801124459e-06, + "loss": 0.3867, + "step": 7856 + }, + { + "epoch": 1.2022953328232593, + "grad_norm": 2.354161944422409, + "learning_rate": 7.246829306710118e-06, + "loss": 0.4842, + "step": 7857 + }, + { + "epoch": 1.2024483550114766, + "grad_norm": 2.066474315946716, + "learning_rate": 7.244446981378764e-06, + "loss": 0.3929, + "step": 7858 + }, + { + "epoch": 1.202601377199694, + "grad_norm": 2.2841546405429742, + "learning_rate": 7.242064825276699e-06, + "loss": 0.3933, + "step": 7859 + }, + { + "epoch": 1.2027543993879113, + "grad_norm": 2.421878768693007, + "learning_rate": 7.239682838550219e-06, + "loss": 0.4667, + "step": 7860 + }, + { + "epoch": 1.2029074215761286, + "grad_norm": 2.121335990190176, + "learning_rate": 7.237301021345621e-06, + "loss": 0.4164, + "step": 7861 + }, + { + "epoch": 1.203060443764346, + "grad_norm": 2.2477816426607347, + "learning_rate": 7.23491937380917e-06, + "loss": 0.4141, + "step": 7862 + }, + { + "epoch": 1.203213465952563, + "grad_norm": 2.0842774961175965, + "learning_rate": 7.232537896087138e-06, + "loss": 0.4256, + "step": 7863 + }, + { + "epoch": 1.2033664881407804, + "grad_norm": 2.3760441894624034, + "learning_rate": 7.230156588325783e-06, + "loss": 0.4749, + "step": 7864 + }, + { + "epoch": 1.2035195103289977, + "grad_norm": 2.8127024874878117, + "learning_rate": 7.2277754506713415e-06, + "loss": 0.3896, + "step": 7865 + }, + { + "epoch": 1.203672532517215, + "grad_norm": 2.3056681652953546, + "learning_rate": 7.22539448327006e-06, + "loss": 0.4713, + "step": 7866 + }, + { + "epoch": 1.2038255547054324, + "grad_norm": 2.038914674694339, + "learning_rate": 7.223013686268159e-06, + "loss": 0.3701, + "step": 7867 + }, + { + "epoch": 1.2039785768936495, + "grad_norm": 2.279184568859362, + "learning_rate": 7.2206330598118435e-06, + "loss": 0.5052, + "step": 7868 + }, + { + "epoch": 1.2041315990818668, + "grad_norm": 2.362094871311031, + "learning_rate": 7.218252604047331e-06, + "loss": 0.4475, + "step": 7869 + }, + { + "epoch": 1.2042846212700842, + "grad_norm": 2.2299992118588188, + "learning_rate": 7.215872319120809e-06, + "loss": 0.4853, + "step": 7870 + }, + { + "epoch": 1.2044376434583015, + "grad_norm": 1.8836964001194594, + "learning_rate": 7.213492205178454e-06, + "loss": 0.2751, + "step": 7871 + }, + { + "epoch": 1.2045906656465188, + "grad_norm": 2.1731065858170515, + "learning_rate": 7.2111122623664486e-06, + "loss": 0.4221, + "step": 7872 + }, + { + "epoch": 1.204743687834736, + "grad_norm": 2.042405359275383, + "learning_rate": 7.208732490830945e-06, + "loss": 0.3539, + "step": 7873 + }, + { + "epoch": 1.2048967100229533, + "grad_norm": 1.916086959905358, + "learning_rate": 7.206352890718102e-06, + "loss": 0.3526, + "step": 7874 + }, + { 
+ "epoch": 1.2050497322111706, + "grad_norm": 2.08298656195251, + "learning_rate": 7.203973462174059e-06, + "loss": 0.3444, + "step": 7875 + }, + { + "epoch": 1.205202754399388, + "grad_norm": 2.127087834921326, + "learning_rate": 7.201594205344937e-06, + "loss": 0.3774, + "step": 7876 + }, + { + "epoch": 1.2053557765876053, + "grad_norm": 1.8822819513462488, + "learning_rate": 7.199215120376866e-06, + "loss": 0.4222, + "step": 7877 + }, + { + "epoch": 1.2055087987758224, + "grad_norm": 2.0069738268178043, + "learning_rate": 7.19683620741595e-06, + "loss": 0.3617, + "step": 7878 + }, + { + "epoch": 1.2056618209640397, + "grad_norm": 2.347032617268608, + "learning_rate": 7.1944574666082925e-06, + "loss": 0.4167, + "step": 7879 + }, + { + "epoch": 1.205814843152257, + "grad_norm": 1.9841895774201872, + "learning_rate": 7.192078898099975e-06, + "loss": 0.3959, + "step": 7880 + }, + { + "epoch": 1.2059678653404744, + "grad_norm": 2.1545952125519254, + "learning_rate": 7.189700502037073e-06, + "loss": 0.3581, + "step": 7881 + }, + { + "epoch": 1.2061208875286917, + "grad_norm": 2.2942907440257345, + "learning_rate": 7.187322278565665e-06, + "loss": 0.4123, + "step": 7882 + }, + { + "epoch": 1.2062739097169088, + "grad_norm": 2.3337214188107076, + "learning_rate": 7.184944227831794e-06, + "loss": 0.4186, + "step": 7883 + }, + { + "epoch": 1.2064269319051262, + "grad_norm": 2.1105765004274546, + "learning_rate": 7.18256634998151e-06, + "loss": 0.3696, + "step": 7884 + }, + { + "epoch": 1.2065799540933435, + "grad_norm": 1.8495228363779856, + "learning_rate": 7.180188645160851e-06, + "loss": 0.3263, + "step": 7885 + }, + { + "epoch": 1.2067329762815608, + "grad_norm": 2.178466457135846, + "learning_rate": 7.1778111135158355e-06, + "loss": 0.4567, + "step": 7886 + }, + { + "epoch": 1.2068859984697782, + "grad_norm": 2.329996121904899, + "learning_rate": 7.175433755192483e-06, + "loss": 0.3929, + "step": 7887 + }, + { + "epoch": 1.2070390206579953, + "grad_norm": 2.7057684229341143, + "learning_rate": 7.173056570336794e-06, + "loss": 0.4107, + "step": 7888 + }, + { + "epoch": 1.2071920428462126, + "grad_norm": 2.2813650165867045, + "learning_rate": 7.170679559094755e-06, + "loss": 0.3918, + "step": 7889 + }, + { + "epoch": 1.20734506503443, + "grad_norm": 2.2649980429048617, + "learning_rate": 7.168302721612357e-06, + "loss": 0.3625, + "step": 7890 + }, + { + "epoch": 1.2074980872226473, + "grad_norm": 1.9690419186555863, + "learning_rate": 7.165926058035566e-06, + "loss": 0.3964, + "step": 7891 + }, + { + "epoch": 1.2076511094108646, + "grad_norm": 2.402097955904389, + "learning_rate": 7.163549568510341e-06, + "loss": 0.4267, + "step": 7892 + }, + { + "epoch": 1.207804131599082, + "grad_norm": 2.241109535031182, + "learning_rate": 7.161173253182634e-06, + "loss": 0.3604, + "step": 7893 + }, + { + "epoch": 1.207957153787299, + "grad_norm": 2.3475139013875213, + "learning_rate": 7.1587971121983815e-06, + "loss": 0.4277, + "step": 7894 + }, + { + "epoch": 1.2081101759755164, + "grad_norm": 2.4465913526801844, + "learning_rate": 7.156421145703517e-06, + "loss": 0.3985, + "step": 7895 + }, + { + "epoch": 1.2082631981637337, + "grad_norm": 2.3863337251537318, + "learning_rate": 7.154045353843952e-06, + "loss": 0.3679, + "step": 7896 + }, + { + "epoch": 1.208416220351951, + "grad_norm": 2.126924455000003, + "learning_rate": 7.151669736765594e-06, + "loss": 0.3994, + "step": 7897 + }, + { + "epoch": 1.2085692425401684, + "grad_norm": 2.304629645464992, + "learning_rate": 7.149294294614344e-06, + "loss": 
0.3881, + "step": 7898 + }, + { + "epoch": 1.2087222647283857, + "grad_norm": 2.10307375531413, + "learning_rate": 7.146919027536081e-06, + "loss": 0.407, + "step": 7899 + }, + { + "epoch": 1.2088752869166028, + "grad_norm": 2.1356508233173357, + "learning_rate": 7.14454393567668e-06, + "loss": 0.3948, + "step": 7900 + }, + { + "epoch": 1.2090283091048202, + "grad_norm": 2.1661367713580146, + "learning_rate": 7.14216901918201e-06, + "loss": 0.369, + "step": 7901 + }, + { + "epoch": 1.2091813312930375, + "grad_norm": 2.073359287919676, + "learning_rate": 7.1397942781979165e-06, + "loss": 0.4001, + "step": 7902 + }, + { + "epoch": 1.2093343534812548, + "grad_norm": 2.285306774993513, + "learning_rate": 7.137419712870248e-06, + "loss": 0.4791, + "step": 7903 + }, + { + "epoch": 1.2094873756694722, + "grad_norm": 2.2565147922670272, + "learning_rate": 7.135045323344836e-06, + "loss": 0.4129, + "step": 7904 + }, + { + "epoch": 1.2096403978576893, + "grad_norm": 2.205771600598996, + "learning_rate": 7.132671109767493e-06, + "loss": 0.4151, + "step": 7905 + }, + { + "epoch": 1.2097934200459066, + "grad_norm": 2.3554198855954804, + "learning_rate": 7.130297072284037e-06, + "loss": 0.3735, + "step": 7906 + }, + { + "epoch": 1.209946442234124, + "grad_norm": 2.460816659992818, + "learning_rate": 7.127923211040267e-06, + "loss": 0.4421, + "step": 7907 + }, + { + "epoch": 1.2100994644223413, + "grad_norm": 2.2894306166581697, + "learning_rate": 7.125549526181963e-06, + "loss": 0.4287, + "step": 7908 + }, + { + "epoch": 1.2102524866105586, + "grad_norm": 2.2431820015839734, + "learning_rate": 7.123176017854909e-06, + "loss": 0.3583, + "step": 7909 + }, + { + "epoch": 1.2104055087987757, + "grad_norm": 2.3312818039925576, + "learning_rate": 7.120802686204869e-06, + "loss": 0.4208, + "step": 7910 + }, + { + "epoch": 1.210558530986993, + "grad_norm": 2.3032859188438737, + "learning_rate": 7.118429531377604e-06, + "loss": 0.4121, + "step": 7911 + }, + { + "epoch": 1.2107115531752104, + "grad_norm": 2.251670699371594, + "learning_rate": 7.116056553518851e-06, + "loss": 0.4033, + "step": 7912 + }, + { + "epoch": 1.2108645753634277, + "grad_norm": 2.0986050604327278, + "learning_rate": 7.113683752774345e-06, + "loss": 0.3579, + "step": 7913 + }, + { + "epoch": 1.211017597551645, + "grad_norm": 2.2418784167236545, + "learning_rate": 7.1113111292898174e-06, + "loss": 0.407, + "step": 7914 + }, + { + "epoch": 1.2111706197398622, + "grad_norm": 2.163620185778059, + "learning_rate": 7.108938683210968e-06, + "loss": 0.4055, + "step": 7915 + }, + { + "epoch": 1.2113236419280795, + "grad_norm": 2.355202682771081, + "learning_rate": 7.106566414683506e-06, + "loss": 0.4669, + "step": 7916 + }, + { + "epoch": 1.2114766641162968, + "grad_norm": 2.1271629737901625, + "learning_rate": 7.104194323853122e-06, + "loss": 0.3561, + "step": 7917 + }, + { + "epoch": 1.2116296863045142, + "grad_norm": 2.0575868876645096, + "learning_rate": 7.101822410865489e-06, + "loss": 0.3592, + "step": 7918 + }, + { + "epoch": 1.2117827084927315, + "grad_norm": 2.0538971546012696, + "learning_rate": 7.099450675866282e-06, + "loss": 0.4481, + "step": 7919 + }, + { + "epoch": 1.2119357306809486, + "grad_norm": 1.919676635657094, + "learning_rate": 7.0970791190011566e-06, + "loss": 0.3355, + "step": 7920 + }, + { + "epoch": 1.212088752869166, + "grad_norm": 2.204903650709108, + "learning_rate": 7.094707740415756e-06, + "loss": 0.4253, + "step": 7921 + }, + { + "epoch": 1.2122417750573833, + "grad_norm": 2.3272233967248335, + "learning_rate": 
7.09233654025572e-06, + "loss": 0.3645, + "step": 7922 + }, + { + "epoch": 1.2123947972456006, + "grad_norm": 2.116304203277378, + "learning_rate": 7.08996551866667e-06, + "loss": 0.4008, + "step": 7923 + }, + { + "epoch": 1.212547819433818, + "grad_norm": 2.12004466283652, + "learning_rate": 7.087594675794226e-06, + "loss": 0.3908, + "step": 7924 + }, + { + "epoch": 1.2127008416220353, + "grad_norm": 2.3012142929784414, + "learning_rate": 7.085224011783984e-06, + "loss": 0.4094, + "step": 7925 + }, + { + "epoch": 1.2128538638102524, + "grad_norm": 2.225388844694236, + "learning_rate": 7.082853526781537e-06, + "loss": 0.3643, + "step": 7926 + }, + { + "epoch": 1.2130068859984697, + "grad_norm": 2.076324760976028, + "learning_rate": 7.080483220932469e-06, + "loss": 0.5358, + "step": 7927 + }, + { + "epoch": 1.213159908186687, + "grad_norm": 2.108445837070922, + "learning_rate": 7.078113094382348e-06, + "loss": 0.3903, + "step": 7928 + }, + { + "epoch": 1.2133129303749044, + "grad_norm": 2.541491652948375, + "learning_rate": 7.075743147276728e-06, + "loss": 0.4761, + "step": 7929 + }, + { + "epoch": 1.2134659525631217, + "grad_norm": 2.1264106715257003, + "learning_rate": 7.073373379761166e-06, + "loss": 0.4052, + "step": 7930 + }, + { + "epoch": 1.213618974751339, + "grad_norm": 2.500809227928582, + "learning_rate": 7.07100379198119e-06, + "loss": 0.4861, + "step": 7931 + }, + { + "epoch": 1.2137719969395562, + "grad_norm": 2.0731214372985245, + "learning_rate": 7.068634384082331e-06, + "loss": 0.3271, + "step": 7932 + }, + { + "epoch": 1.2139250191277735, + "grad_norm": 2.2473665302879544, + "learning_rate": 7.066265156210105e-06, + "loss": 0.3925, + "step": 7933 + }, + { + "epoch": 1.2140780413159908, + "grad_norm": 2.023127298800279, + "learning_rate": 7.063896108510008e-06, + "loss": 0.374, + "step": 7934 + }, + { + "epoch": 1.2142310635042082, + "grad_norm": 2.1057736936995712, + "learning_rate": 7.0615272411275395e-06, + "loss": 0.4494, + "step": 7935 + }, + { + "epoch": 1.2143840856924255, + "grad_norm": 2.1942854844397583, + "learning_rate": 7.05915855420818e-06, + "loss": 0.4072, + "step": 7936 + }, + { + "epoch": 1.2145371078806426, + "grad_norm": 2.16395490685311, + "learning_rate": 7.056790047897394e-06, + "loss": 0.3736, + "step": 7937 + }, + { + "epoch": 1.21469013006886, + "grad_norm": 2.2678730333273793, + "learning_rate": 7.054421722340647e-06, + "loss": 0.444, + "step": 7938 + }, + { + "epoch": 1.2148431522570773, + "grad_norm": 2.200405364638579, + "learning_rate": 7.052053577683384e-06, + "loss": 0.3841, + "step": 7939 + }, + { + "epoch": 1.2149961744452946, + "grad_norm": 2.277747587318131, + "learning_rate": 7.049685614071047e-06, + "loss": 0.476, + "step": 7940 + }, + { + "epoch": 1.215149196633512, + "grad_norm": 2.010732008758074, + "learning_rate": 7.047317831649056e-06, + "loss": 0.3571, + "step": 7941 + }, + { + "epoch": 1.215302218821729, + "grad_norm": 2.252647658079372, + "learning_rate": 7.044950230562826e-06, + "loss": 0.4632, + "step": 7942 + }, + { + "epoch": 1.2154552410099464, + "grad_norm": 2.3309241879798486, + "learning_rate": 7.042582810957767e-06, + "loss": 0.407, + "step": 7943 + }, + { + "epoch": 1.2156082631981637, + "grad_norm": 1.9771222565408002, + "learning_rate": 7.040215572979262e-06, + "loss": 0.3727, + "step": 7944 + }, + { + "epoch": 1.215761285386381, + "grad_norm": 2.2011196818146517, + "learning_rate": 7.0378485167727e-06, + "loss": 0.3456, + "step": 7945 + }, + { + "epoch": 1.2159143075745984, + "grad_norm": 2.0983866502398154, + 
"learning_rate": 7.035481642483451e-06, + "loss": 0.3567, + "step": 7946 + }, + { + "epoch": 1.2160673297628155, + "grad_norm": 2.391056276231594, + "learning_rate": 7.033114950256865e-06, + "loss": 0.4006, + "step": 7947 + }, + { + "epoch": 1.2162203519510328, + "grad_norm": 2.3512508079427223, + "learning_rate": 7.0307484402383015e-06, + "loss": 0.4517, + "step": 7948 + }, + { + "epoch": 1.2163733741392502, + "grad_norm": 2.1246473090309577, + "learning_rate": 7.028382112573093e-06, + "loss": 0.3766, + "step": 7949 + }, + { + "epoch": 1.2165263963274675, + "grad_norm": 2.075108528173804, + "learning_rate": 7.026015967406559e-06, + "loss": 0.3935, + "step": 7950 + }, + { + "epoch": 1.2166794185156848, + "grad_norm": 2.121208516187529, + "learning_rate": 7.023650004884024e-06, + "loss": 0.3355, + "step": 7951 + }, + { + "epoch": 1.216832440703902, + "grad_norm": 2.3563143997973084, + "learning_rate": 7.021284225150782e-06, + "loss": 0.398, + "step": 7952 + }, + { + "epoch": 1.2169854628921193, + "grad_norm": 2.073709554473588, + "learning_rate": 7.0189186283521295e-06, + "loss": 0.3789, + "step": 7953 + }, + { + "epoch": 1.2171384850803366, + "grad_norm": 2.528122947279484, + "learning_rate": 7.016553214633348e-06, + "loss": 0.4205, + "step": 7954 + }, + { + "epoch": 1.217291507268554, + "grad_norm": 2.2205808799791393, + "learning_rate": 7.014187984139702e-06, + "loss": 0.3803, + "step": 7955 + }, + { + "epoch": 1.2174445294567713, + "grad_norm": 2.0751870027824526, + "learning_rate": 7.0118229370164545e-06, + "loss": 0.3547, + "step": 7956 + }, + { + "epoch": 1.2175975516449886, + "grad_norm": 2.2315342903135393, + "learning_rate": 7.009458073408852e-06, + "loss": 0.4626, + "step": 7957 + }, + { + "epoch": 1.2177505738332057, + "grad_norm": 2.1810279357679, + "learning_rate": 7.007093393462124e-06, + "loss": 0.4244, + "step": 7958 + }, + { + "epoch": 1.217903596021423, + "grad_norm": 2.0422908944115594, + "learning_rate": 7.004728897321501e-06, + "loss": 0.5326, + "step": 7959 + }, + { + "epoch": 1.2180566182096404, + "grad_norm": 2.20462359397819, + "learning_rate": 7.002364585132192e-06, + "loss": 0.3102, + "step": 7960 + }, + { + "epoch": 1.2182096403978577, + "grad_norm": 2.3536974554981125, + "learning_rate": 7.000000457039404e-06, + "loss": 0.4173, + "step": 7961 + }, + { + "epoch": 1.218362662586075, + "grad_norm": 2.038361961156172, + "learning_rate": 6.997636513188324e-06, + "loss": 0.39, + "step": 7962 + }, + { + "epoch": 1.2185156847742924, + "grad_norm": 2.210987365831342, + "learning_rate": 6.995272753724127e-06, + "loss": 0.4057, + "step": 7963 + }, + { + "epoch": 1.2186687069625095, + "grad_norm": 2.1544425173767907, + "learning_rate": 6.99290917879199e-06, + "loss": 0.3487, + "step": 7964 + }, + { + "epoch": 1.2188217291507268, + "grad_norm": 2.345572013946747, + "learning_rate": 6.990545788537062e-06, + "loss": 0.4078, + "step": 7965 + }, + { + "epoch": 1.2189747513389442, + "grad_norm": 2.0550826810477854, + "learning_rate": 6.988182583104488e-06, + "loss": 0.3286, + "step": 7966 + }, + { + "epoch": 1.2191277735271615, + "grad_norm": 2.474145020752616, + "learning_rate": 6.985819562639406e-06, + "loss": 0.3959, + "step": 7967 + }, + { + "epoch": 1.2192807957153788, + "grad_norm": 2.1499005174076764, + "learning_rate": 6.9834567272869345e-06, + "loss": 0.4703, + "step": 7968 + }, + { + "epoch": 1.219433817903596, + "grad_norm": 1.9949459289594664, + "learning_rate": 6.981094077192188e-06, + "loss": 0.4444, + "step": 7969 + }, + { + "epoch": 1.2195868400918133, + 
"grad_norm": 2.159287629056209, + "learning_rate": 6.978731612500266e-06, + "loss": 0.4424, + "step": 7970 + }, + { + "epoch": 1.2197398622800306, + "grad_norm": 2.319018078003906, + "learning_rate": 6.97636933335625e-06, + "loss": 0.456, + "step": 7971 + }, + { + "epoch": 1.219892884468248, + "grad_norm": 2.2501379725026194, + "learning_rate": 6.974007239905225e-06, + "loss": 0.3941, + "step": 7972 + }, + { + "epoch": 1.2200459066564653, + "grad_norm": 2.376289275218683, + "learning_rate": 6.971645332292255e-06, + "loss": 0.479, + "step": 7973 + }, + { + "epoch": 1.2201989288446824, + "grad_norm": 2.4438939561516597, + "learning_rate": 6.969283610662387e-06, + "loss": 0.4189, + "step": 7974 + }, + { + "epoch": 1.2203519510328997, + "grad_norm": 2.0860769225758347, + "learning_rate": 6.966922075160671e-06, + "loss": 0.3817, + "step": 7975 + }, + { + "epoch": 1.220504973221117, + "grad_norm": 2.2571172531938952, + "learning_rate": 6.964560725932136e-06, + "loss": 0.3865, + "step": 7976 + }, + { + "epoch": 1.2206579954093344, + "grad_norm": 2.0870809930095686, + "learning_rate": 6.962199563121803e-06, + "loss": 0.4098, + "step": 7977 + }, + { + "epoch": 1.2208110175975517, + "grad_norm": 2.3337013901306065, + "learning_rate": 6.959838586874679e-06, + "loss": 0.4347, + "step": 7978 + }, + { + "epoch": 1.2209640397857688, + "grad_norm": 1.6879922077725298, + "learning_rate": 6.957477797335757e-06, + "loss": 0.3284, + "step": 7979 + }, + { + "epoch": 1.2211170619739862, + "grad_norm": 2.21500011103774, + "learning_rate": 6.955117194650029e-06, + "loss": 0.4315, + "step": 7980 + }, + { + "epoch": 1.2212700841622035, + "grad_norm": 2.365885263450686, + "learning_rate": 6.952756778962463e-06, + "loss": 0.4451, + "step": 7981 + }, + { + "epoch": 1.2214231063504208, + "grad_norm": 2.1970439077941983, + "learning_rate": 6.950396550418026e-06, + "loss": 0.3978, + "step": 7982 + }, + { + "epoch": 1.2215761285386382, + "grad_norm": 2.00539883100848, + "learning_rate": 6.9480365091616685e-06, + "loss": 0.3948, + "step": 7983 + }, + { + "epoch": 1.2217291507268553, + "grad_norm": 2.4169358456913175, + "learning_rate": 6.945676655338324e-06, + "loss": 0.4454, + "step": 7984 + }, + { + "epoch": 1.2218821729150726, + "grad_norm": 1.9858033554942487, + "learning_rate": 6.943316989092928e-06, + "loss": 0.3527, + "step": 7985 + }, + { + "epoch": 1.22203519510329, + "grad_norm": 2.432567266913346, + "learning_rate": 6.940957510570395e-06, + "loss": 0.4424, + "step": 7986 + }, + { + "epoch": 1.2221882172915073, + "grad_norm": 2.0487590024042253, + "learning_rate": 6.938598219915624e-06, + "loss": 0.4262, + "step": 7987 + }, + { + "epoch": 1.2223412394797246, + "grad_norm": 2.1278806090330153, + "learning_rate": 6.9362391172735155e-06, + "loss": 0.4153, + "step": 7988 + }, + { + "epoch": 1.2224942616679417, + "grad_norm": 1.9392705667390682, + "learning_rate": 6.933880202788945e-06, + "loss": 0.2756, + "step": 7989 + }, + { + "epoch": 1.222647283856159, + "grad_norm": 2.0982874308568746, + "learning_rate": 6.931521476606791e-06, + "loss": 0.3839, + "step": 7990 + }, + { + "epoch": 1.2228003060443764, + "grad_norm": 1.9905106427152186, + "learning_rate": 6.929162938871905e-06, + "loss": 0.3439, + "step": 7991 + }, + { + "epoch": 1.2229533282325937, + "grad_norm": 2.137110660745122, + "learning_rate": 6.926804589729133e-06, + "loss": 0.3376, + "step": 7992 + }, + { + "epoch": 1.223106350420811, + "grad_norm": 2.0719186341939326, + "learning_rate": 6.924446429323318e-06, + "loss": 0.3714, + "step": 7993 + }, + { + 
"epoch": 1.2232593726090284, + "grad_norm": 1.9949333583357212, + "learning_rate": 6.922088457799278e-06, + "loss": 0.3984, + "step": 7994 + }, + { + "epoch": 1.2234123947972455, + "grad_norm": 2.0901002764905763, + "learning_rate": 6.919730675301824e-06, + "loss": 0.3392, + "step": 7995 + }, + { + "epoch": 1.2235654169854628, + "grad_norm": 2.0467946793168212, + "learning_rate": 6.917373081975764e-06, + "loss": 0.4335, + "step": 7996 + }, + { + "epoch": 1.2237184391736802, + "grad_norm": 2.099938498749769, + "learning_rate": 6.9150156779658775e-06, + "loss": 0.3334, + "step": 7997 + }, + { + "epoch": 1.2238714613618975, + "grad_norm": 2.315397070406997, + "learning_rate": 6.91265846341695e-06, + "loss": 0.4228, + "step": 7998 + }, + { + "epoch": 1.2240244835501148, + "grad_norm": 2.3280652559278465, + "learning_rate": 6.910301438473746e-06, + "loss": 0.4021, + "step": 7999 + }, + { + "epoch": 1.2241775057383322, + "grad_norm": 2.3985582799227076, + "learning_rate": 6.907944603281011e-06, + "loss": 0.3723, + "step": 8000 + }, + { + "epoch": 1.2243305279265493, + "grad_norm": 2.130601512927696, + "learning_rate": 6.905587957983499e-06, + "loss": 0.3039, + "step": 8001 + }, + { + "epoch": 1.2244835501147666, + "grad_norm": 2.1247620729874748, + "learning_rate": 6.903231502725936e-06, + "loss": 0.4024, + "step": 8002 + }, + { + "epoch": 1.224636572302984, + "grad_norm": 2.497906614459289, + "learning_rate": 6.900875237653039e-06, + "loss": 0.4527, + "step": 8003 + }, + { + "epoch": 1.2247895944912013, + "grad_norm": 2.234719038902452, + "learning_rate": 6.8985191629095185e-06, + "loss": 0.3417, + "step": 8004 + }, + { + "epoch": 1.2249426166794186, + "grad_norm": 2.5449686391944337, + "learning_rate": 6.8961632786400665e-06, + "loss": 0.479, + "step": 8005 + }, + { + "epoch": 1.2250956388676357, + "grad_norm": 2.0912533506618227, + "learning_rate": 6.893807584989375e-06, + "loss": 0.3885, + "step": 8006 + }, + { + "epoch": 1.225248661055853, + "grad_norm": 2.061740186429149, + "learning_rate": 6.891452082102108e-06, + "loss": 0.3069, + "step": 8007 + }, + { + "epoch": 1.2254016832440704, + "grad_norm": 2.4195881426112176, + "learning_rate": 6.889096770122928e-06, + "loss": 0.3886, + "step": 8008 + }, + { + "epoch": 1.2255547054322877, + "grad_norm": 2.257160486240451, + "learning_rate": 6.88674164919649e-06, + "loss": 0.442, + "step": 8009 + }, + { + "epoch": 1.225707727620505, + "grad_norm": 1.9940025514019444, + "learning_rate": 6.8843867194674244e-06, + "loss": 0.3627, + "step": 8010 + }, + { + "epoch": 1.2258607498087222, + "grad_norm": 2.0383222857719385, + "learning_rate": 6.882031981080355e-06, + "loss": 0.4049, + "step": 8011 + }, + { + "epoch": 1.2260137719969395, + "grad_norm": 2.0190329535880545, + "learning_rate": 6.879677434179904e-06, + "loss": 0.2951, + "step": 8012 + }, + { + "epoch": 1.2261667941851568, + "grad_norm": 2.055375099684583, + "learning_rate": 6.877323078910665e-06, + "loss": 0.3451, + "step": 8013 + }, + { + "epoch": 1.2263198163733742, + "grad_norm": 2.4333399428901563, + "learning_rate": 6.874968915417234e-06, + "loss": 0.3886, + "step": 8014 + }, + { + "epoch": 1.2264728385615915, + "grad_norm": 2.526027909918158, + "learning_rate": 6.872614943844189e-06, + "loss": 0.4255, + "step": 8015 + }, + { + "epoch": 1.2266258607498086, + "grad_norm": 2.1961298260330318, + "learning_rate": 6.870261164336089e-06, + "loss": 0.3917, + "step": 8016 + }, + { + "epoch": 1.226778882938026, + "grad_norm": 1.9173991175517395, + "learning_rate": 6.867907577037498e-06, + "loss": 
0.3306, + "step": 8017 + }, + { + "epoch": 1.2269319051262433, + "grad_norm": 1.9365177251844081, + "learning_rate": 6.865554182092954e-06, + "loss": 0.3061, + "step": 8018 + }, + { + "epoch": 1.2270849273144606, + "grad_norm": 1.8093240085090097, + "learning_rate": 6.8632009796469935e-06, + "loss": 0.4069, + "step": 8019 + }, + { + "epoch": 1.227237949502678, + "grad_norm": 2.149860287286806, + "learning_rate": 6.860847969844129e-06, + "loss": 0.4024, + "step": 8020 + }, + { + "epoch": 1.227390971690895, + "grad_norm": 2.2115737004229428, + "learning_rate": 6.85849515282887e-06, + "loss": 0.3812, + "step": 8021 + }, + { + "epoch": 1.2275439938791124, + "grad_norm": 1.9495383178771113, + "learning_rate": 6.856142528745717e-06, + "loss": 0.3352, + "step": 8022 + }, + { + "epoch": 1.2276970160673297, + "grad_norm": 2.271878221084577, + "learning_rate": 6.853790097739148e-06, + "loss": 0.392, + "step": 8023 + }, + { + "epoch": 1.227850038255547, + "grad_norm": 2.0960449848601144, + "learning_rate": 6.851437859953636e-06, + "loss": 0.4094, + "step": 8024 + }, + { + "epoch": 1.2280030604437644, + "grad_norm": 2.1636547073328654, + "learning_rate": 6.849085815533644e-06, + "loss": 0.4111, + "step": 8025 + }, + { + "epoch": 1.2281560826319817, + "grad_norm": 2.4084541084097606, + "learning_rate": 6.846733964623618e-06, + "loss": 0.3485, + "step": 8026 + }, + { + "epoch": 1.2283091048201988, + "grad_norm": 1.9854369939079013, + "learning_rate": 6.844382307367994e-06, + "loss": 0.42, + "step": 8027 + }, + { + "epoch": 1.2284621270084162, + "grad_norm": 2.1105981109720138, + "learning_rate": 6.842030843911201e-06, + "loss": 0.3964, + "step": 8028 + }, + { + "epoch": 1.2286151491966335, + "grad_norm": 2.218758898126614, + "learning_rate": 6.839679574397641e-06, + "loss": 0.3978, + "step": 8029 + }, + { + "epoch": 1.2287681713848508, + "grad_norm": 2.2227044245916154, + "learning_rate": 6.837328498971727e-06, + "loss": 0.4151, + "step": 8030 + }, + { + "epoch": 1.2289211935730682, + "grad_norm": 2.140680430133614, + "learning_rate": 6.834977617777844e-06, + "loss": 0.3219, + "step": 8031 + }, + { + "epoch": 1.2290742157612855, + "grad_norm": 1.9854371782404445, + "learning_rate": 6.83262693096036e-06, + "loss": 0.3803, + "step": 8032 + }, + { + "epoch": 1.2292272379495026, + "grad_norm": 2.205663362151382, + "learning_rate": 6.830276438663654e-06, + "loss": 0.3955, + "step": 8033 + }, + { + "epoch": 1.22938026013772, + "grad_norm": 2.10792051555875, + "learning_rate": 6.827926141032066e-06, + "loss": 0.3967, + "step": 8034 + }, + { + "epoch": 1.2295332823259373, + "grad_norm": 2.0734884900888386, + "learning_rate": 6.8255760382099465e-06, + "loss": 0.4093, + "step": 8035 + }, + { + "epoch": 1.2296863045141546, + "grad_norm": 2.2376128890445077, + "learning_rate": 6.823226130341623e-06, + "loss": 0.4023, + "step": 8036 + }, + { + "epoch": 1.229839326702372, + "grad_norm": 2.401436260509043, + "learning_rate": 6.820876417571405e-06, + "loss": 0.461, + "step": 8037 + }, + { + "epoch": 1.229992348890589, + "grad_norm": 2.0884246258149406, + "learning_rate": 6.818526900043606e-06, + "loss": 0.3659, + "step": 8038 + }, + { + "epoch": 1.2301453710788064, + "grad_norm": 2.0483232952456545, + "learning_rate": 6.816177577902518e-06, + "loss": 0.3641, + "step": 8039 + }, + { + "epoch": 1.2302983932670237, + "grad_norm": 2.382850648132974, + "learning_rate": 6.813828451292417e-06, + "loss": 0.4182, + "step": 8040 + }, + { + "epoch": 1.230451415455241, + "grad_norm": 2.2418447789270948, + "learning_rate": 
6.811479520357576e-06, + "loss": 0.4607, + "step": 8041 + }, + { + "epoch": 1.2306044376434584, + "grad_norm": 1.9382911340909974, + "learning_rate": 6.80913078524225e-06, + "loss": 0.3438, + "step": 8042 + }, + { + "epoch": 1.2307574598316755, + "grad_norm": 1.8422515197448954, + "learning_rate": 6.806782246090688e-06, + "loss": 0.3331, + "step": 8043 + }, + { + "epoch": 1.2309104820198928, + "grad_norm": 2.225159881582689, + "learning_rate": 6.804433903047118e-06, + "loss": 0.3753, + "step": 8044 + }, + { + "epoch": 1.2310635042081102, + "grad_norm": 2.1226629248453763, + "learning_rate": 6.802085756255763e-06, + "loss": 0.357, + "step": 8045 + }, + { + "epoch": 1.2312165263963275, + "grad_norm": 2.281272129316277, + "learning_rate": 6.7997378058608355e-06, + "loss": 0.4267, + "step": 8046 + }, + { + "epoch": 1.2313695485845448, + "grad_norm": 2.409255082789223, + "learning_rate": 6.797390052006526e-06, + "loss": 0.4512, + "step": 8047 + }, + { + "epoch": 1.231522570772762, + "grad_norm": 2.022442772507196, + "learning_rate": 6.7950424948370205e-06, + "loss": 0.3694, + "step": 8048 + }, + { + "epoch": 1.2316755929609793, + "grad_norm": 2.048689723681348, + "learning_rate": 6.792695134496497e-06, + "loss": 0.3168, + "step": 8049 + }, + { + "epoch": 1.2318286151491966, + "grad_norm": 2.1639453889819347, + "learning_rate": 6.7903479711291095e-06, + "loss": 0.373, + "step": 8050 + }, + { + "epoch": 1.231981637337414, + "grad_norm": 2.306482688733974, + "learning_rate": 6.78800100487901e-06, + "loss": 0.41, + "step": 8051 + }, + { + "epoch": 1.2321346595256313, + "grad_norm": 2.1460361773704255, + "learning_rate": 6.785654235890338e-06, + "loss": 0.3688, + "step": 8052 + }, + { + "epoch": 1.2322876817138484, + "grad_norm": 2.5735775872854028, + "learning_rate": 6.7833076643072085e-06, + "loss": 0.4729, + "step": 8053 + }, + { + "epoch": 1.2324407039020657, + "grad_norm": 2.332483797336673, + "learning_rate": 6.78096129027374e-06, + "loss": 0.4682, + "step": 8054 + }, + { + "epoch": 1.232593726090283, + "grad_norm": 2.271105481517045, + "learning_rate": 6.778615113934031e-06, + "loss": 0.4461, + "step": 8055 + }, + { + "epoch": 1.2327467482785004, + "grad_norm": 2.1420714053603236, + "learning_rate": 6.776269135432174e-06, + "loss": 0.3699, + "step": 8056 + }, + { + "epoch": 1.2328997704667177, + "grad_norm": 2.333313250766813, + "learning_rate": 6.773923354912239e-06, + "loss": 0.349, + "step": 8057 + }, + { + "epoch": 1.233052792654935, + "grad_norm": 2.3201571423666594, + "learning_rate": 6.7715777725182875e-06, + "loss": 0.4587, + "step": 8058 + }, + { + "epoch": 1.2332058148431522, + "grad_norm": 2.350700286167741, + "learning_rate": 6.76923238839438e-06, + "loss": 0.4382, + "step": 8059 + }, + { + "epoch": 1.2333588370313695, + "grad_norm": 2.4481880445812005, + "learning_rate": 6.766887202684549e-06, + "loss": 0.4408, + "step": 8060 + }, + { + "epoch": 1.2335118592195868, + "grad_norm": 2.1630515616270882, + "learning_rate": 6.76454221553282e-06, + "loss": 0.3706, + "step": 8061 + }, + { + "epoch": 1.2336648814078042, + "grad_norm": 2.0092456944627766, + "learning_rate": 6.7621974270832145e-06, + "loss": 0.3611, + "step": 8062 + }, + { + "epoch": 1.2338179035960215, + "grad_norm": 2.1523204344513838, + "learning_rate": 6.7598528374797275e-06, + "loss": 0.386, + "step": 8063 + }, + { + "epoch": 1.2339709257842388, + "grad_norm": 2.3086250501937777, + "learning_rate": 6.757508446866357e-06, + "loss": 0.4011, + "step": 8064 + }, + { + "epoch": 1.234123947972456, + "grad_norm": 
2.4251052079549575, + "learning_rate": 6.755164255387078e-06, + "loss": 0.3798, + "step": 8065 + }, + { + "epoch": 1.2342769701606733, + "grad_norm": 2.24168987184976, + "learning_rate": 6.752820263185851e-06, + "loss": 0.3947, + "step": 8066 + }, + { + "epoch": 1.2344299923488906, + "grad_norm": 2.097126542099647, + "learning_rate": 6.750476470406638e-06, + "loss": 0.4268, + "step": 8067 + }, + { + "epoch": 1.234583014537108, + "grad_norm": 2.289927186551288, + "learning_rate": 6.748132877193379e-06, + "loss": 0.3988, + "step": 8068 + }, + { + "epoch": 1.2347360367253253, + "grad_norm": 2.122315650716571, + "learning_rate": 6.745789483689997e-06, + "loss": 0.4257, + "step": 8069 + }, + { + "epoch": 1.2348890589135424, + "grad_norm": 1.9968670376809796, + "learning_rate": 6.743446290040417e-06, + "loss": 0.4458, + "step": 8070 + }, + { + "epoch": 1.2350420811017597, + "grad_norm": 1.8989966989084186, + "learning_rate": 6.7411032963885355e-06, + "loss": 0.411, + "step": 8071 + }, + { + "epoch": 1.235195103289977, + "grad_norm": 2.1196167992227637, + "learning_rate": 6.738760502878254e-06, + "loss": 0.3371, + "step": 8072 + }, + { + "epoch": 1.2353481254781944, + "grad_norm": 2.27319423233866, + "learning_rate": 6.736417909653448e-06, + "loss": 0.4286, + "step": 8073 + }, + { + "epoch": 1.2355011476664117, + "grad_norm": 2.1531922370964884, + "learning_rate": 6.734075516857981e-06, + "loss": 0.3935, + "step": 8074 + }, + { + "epoch": 1.2356541698546288, + "grad_norm": 2.169345191566362, + "learning_rate": 6.731733324635717e-06, + "loss": 0.3872, + "step": 8075 + }, + { + "epoch": 1.2358071920428462, + "grad_norm": 2.3456728853488804, + "learning_rate": 6.729391333130495e-06, + "loss": 0.4262, + "step": 8076 + }, + { + "epoch": 1.2359602142310635, + "grad_norm": 2.116438016922821, + "learning_rate": 6.727049542486142e-06, + "loss": 0.3507, + "step": 8077 + }, + { + "epoch": 1.2361132364192808, + "grad_norm": 2.2905850835193675, + "learning_rate": 6.724707952846485e-06, + "loss": 0.474, + "step": 8078 + }, + { + "epoch": 1.2362662586074982, + "grad_norm": 2.126345341606041, + "learning_rate": 6.722366564355322e-06, + "loss": 0.4478, + "step": 8079 + }, + { + "epoch": 1.2364192807957153, + "grad_norm": 2.2306359092945516, + "learning_rate": 6.720025377156453e-06, + "loss": 0.4082, + "step": 8080 + }, + { + "epoch": 1.2365723029839326, + "grad_norm": 2.4936280034611946, + "learning_rate": 6.7176843913936584e-06, + "loss": 0.3708, + "step": 8081 + }, + { + "epoch": 1.23672532517215, + "grad_norm": 2.060763607982872, + "learning_rate": 6.715343607210701e-06, + "loss": 0.4245, + "step": 8082 + }, + { + "epoch": 1.2368783473603673, + "grad_norm": 2.4140780179422334, + "learning_rate": 6.713003024751345e-06, + "loss": 0.4348, + "step": 8083 + }, + { + "epoch": 1.2370313695485846, + "grad_norm": 2.0437392400835552, + "learning_rate": 6.7106626441593315e-06, + "loss": 0.4317, + "step": 8084 + }, + { + "epoch": 1.2371843917368017, + "grad_norm": 2.166562578027147, + "learning_rate": 6.708322465578394e-06, + "loss": 0.412, + "step": 8085 + }, + { + "epoch": 1.237337413925019, + "grad_norm": 2.1138549681897807, + "learning_rate": 6.705982489152252e-06, + "loss": 0.4011, + "step": 8086 + }, + { + "epoch": 1.2374904361132364, + "grad_norm": 2.031131307063908, + "learning_rate": 6.703642715024608e-06, + "loss": 0.3462, + "step": 8087 + }, + { + "epoch": 1.2376434583014537, + "grad_norm": 2.1949317305374674, + "learning_rate": 6.701303143339164e-06, + "loss": 0.3736, + "step": 8088 + }, + { + "epoch": 
1.237796480489671, + "grad_norm": 2.1993897401590603, + "learning_rate": 6.6989637742395975e-06, + "loss": 0.3788, + "step": 8089 + }, + { + "epoch": 1.2379495026778882, + "grad_norm": 2.5296632495849334, + "learning_rate": 6.696624607869576e-06, + "loss": 0.4256, + "step": 8090 + }, + { + "epoch": 1.2381025248661055, + "grad_norm": 2.2823327820853794, + "learning_rate": 6.694285644372765e-06, + "loss": 0.3772, + "step": 8091 + }, + { + "epoch": 1.2382555470543228, + "grad_norm": 2.0040278866366528, + "learning_rate": 6.691946883892802e-06, + "loss": 0.3082, + "step": 8092 + }, + { + "epoch": 1.2384085692425402, + "grad_norm": 1.928916255534059, + "learning_rate": 6.689608326573322e-06, + "loss": 0.4043, + "step": 8093 + }, + { + "epoch": 1.2385615914307575, + "grad_norm": 1.92290953923648, + "learning_rate": 6.6872699725579485e-06, + "loss": 0.3487, + "step": 8094 + }, + { + "epoch": 1.2387146136189748, + "grad_norm": 2.3340519002998414, + "learning_rate": 6.684931821990279e-06, + "loss": 0.4364, + "step": 8095 + }, + { + "epoch": 1.238867635807192, + "grad_norm": 2.212440455612955, + "learning_rate": 6.6825938750139185e-06, + "loss": 0.4273, + "step": 8096 + }, + { + "epoch": 1.2390206579954093, + "grad_norm": 2.111364367290438, + "learning_rate": 6.680256131772449e-06, + "loss": 0.3948, + "step": 8097 + }, + { + "epoch": 1.2391736801836266, + "grad_norm": 2.2003773037744505, + "learning_rate": 6.677918592409432e-06, + "loss": 0.4448, + "step": 8098 + }, + { + "epoch": 1.239326702371844, + "grad_norm": 1.8536781014593984, + "learning_rate": 6.675581257068433e-06, + "loss": 0.3131, + "step": 8099 + }, + { + "epoch": 1.2394797245600613, + "grad_norm": 2.0146176424945805, + "learning_rate": 6.673244125892991e-06, + "loss": 0.3832, + "step": 8100 + }, + { + "epoch": 1.2396327467482786, + "grad_norm": 2.346769756302333, + "learning_rate": 6.670907199026647e-06, + "loss": 0.4161, + "step": 8101 + }, + { + "epoch": 1.2397857689364957, + "grad_norm": 2.1642485454000058, + "learning_rate": 6.668570476612912e-06, + "loss": 0.344, + "step": 8102 + }, + { + "epoch": 1.239938791124713, + "grad_norm": 2.0988507243659997, + "learning_rate": 6.6662339587952966e-06, + "loss": 0.3651, + "step": 8103 + }, + { + "epoch": 1.2400918133129304, + "grad_norm": 1.9921147928691025, + "learning_rate": 6.6638976457172985e-06, + "loss": 0.3652, + "step": 8104 + }, + { + "epoch": 1.2402448355011477, + "grad_norm": 2.0682253549190404, + "learning_rate": 6.6615615375223945e-06, + "loss": 0.3633, + "step": 8105 + }, + { + "epoch": 1.240397857689365, + "grad_norm": 2.0394982635306493, + "learning_rate": 6.659225634354055e-06, + "loss": 0.3469, + "step": 8106 + }, + { + "epoch": 1.2405508798775822, + "grad_norm": 2.017362842165284, + "learning_rate": 6.656889936355742e-06, + "loss": 0.3196, + "step": 8107 + }, + { + "epoch": 1.2407039020657995, + "grad_norm": 2.1998384971210565, + "learning_rate": 6.654554443670893e-06, + "loss": 0.4183, + "step": 8108 + }, + { + "epoch": 1.2408569242540168, + "grad_norm": 2.194051345196769, + "learning_rate": 6.652219156442945e-06, + "loss": 0.3716, + "step": 8109 + }, + { + "epoch": 1.2410099464422342, + "grad_norm": 1.8924912286597517, + "learning_rate": 6.649884074815316e-06, + "loss": 0.3739, + "step": 8110 + }, + { + "epoch": 1.2411629686304515, + "grad_norm": 2.105527117813446, + "learning_rate": 6.647549198931407e-06, + "loss": 0.358, + "step": 8111 + }, + { + "epoch": 1.2413159908186686, + "grad_norm": 2.2597521686833333, + "learning_rate": 6.645214528934623e-06, + "loss": 0.3437, 
+ "step": 8112 + }, + { + "epoch": 1.241469013006886, + "grad_norm": 2.200249191211589, + "learning_rate": 6.6428800649683374e-06, + "loss": 0.3841, + "step": 8113 + }, + { + "epoch": 1.2416220351951033, + "grad_norm": 2.3942357392351474, + "learning_rate": 6.640545807175914e-06, + "loss": 0.4409, + "step": 8114 + }, + { + "epoch": 1.2417750573833206, + "grad_norm": 2.2397158209349715, + "learning_rate": 6.638211755700722e-06, + "loss": 0.4461, + "step": 8115 + }, + { + "epoch": 1.241928079571538, + "grad_norm": 2.348508538360569, + "learning_rate": 6.6358779106860924e-06, + "loss": 0.4139, + "step": 8116 + }, + { + "epoch": 1.242081101759755, + "grad_norm": 2.189259772021628, + "learning_rate": 6.633544272275364e-06, + "loss": 0.4399, + "step": 8117 + }, + { + "epoch": 1.2422341239479724, + "grad_norm": 2.0453409050667197, + "learning_rate": 6.631210840611852e-06, + "loss": 0.4082, + "step": 8118 + }, + { + "epoch": 1.2423871461361897, + "grad_norm": 2.639988084591451, + "learning_rate": 6.628877615838859e-06, + "loss": 0.4131, + "step": 8119 + }, + { + "epoch": 1.242540168324407, + "grad_norm": 2.4274196063214677, + "learning_rate": 6.62654459809968e-06, + "loss": 0.4368, + "step": 8120 + }, + { + "epoch": 1.2426931905126244, + "grad_norm": 2.14359024994293, + "learning_rate": 6.624211787537594e-06, + "loss": 0.4156, + "step": 8121 + }, + { + "epoch": 1.2428462127008415, + "grad_norm": 2.292958397908944, + "learning_rate": 6.621879184295869e-06, + "loss": 0.4314, + "step": 8122 + }, + { + "epoch": 1.2429992348890588, + "grad_norm": 1.9339362064576864, + "learning_rate": 6.619546788517759e-06, + "loss": 0.3408, + "step": 8123 + }, + { + "epoch": 1.2431522570772762, + "grad_norm": 2.3692855461901927, + "learning_rate": 6.617214600346503e-06, + "loss": 0.4277, + "step": 8124 + }, + { + "epoch": 1.2433052792654935, + "grad_norm": 2.3900418782194226, + "learning_rate": 6.614882619925336e-06, + "loss": 0.4539, + "step": 8125 + }, + { + "epoch": 1.2434583014537108, + "grad_norm": 2.1281143454275444, + "learning_rate": 6.612550847397468e-06, + "loss": 0.3837, + "step": 8126 + }, + { + "epoch": 1.2436113236419282, + "grad_norm": 1.9287068128994407, + "learning_rate": 6.610219282906101e-06, + "loss": 0.391, + "step": 8127 + }, + { + "epoch": 1.2437643458301453, + "grad_norm": 2.2439153783777033, + "learning_rate": 6.607887926594434e-06, + "loss": 0.3952, + "step": 8128 + }, + { + "epoch": 1.2439173680183626, + "grad_norm": 2.4364819100764668, + "learning_rate": 6.605556778605635e-06, + "loss": 0.4225, + "step": 8129 + }, + { + "epoch": 1.24407039020658, + "grad_norm": 1.950810491447188, + "learning_rate": 6.603225839082875e-06, + "loss": 0.3582, + "step": 8130 + }, + { + "epoch": 1.2442234123947973, + "grad_norm": 2.1343110248130905, + "learning_rate": 6.600895108169308e-06, + "loss": 0.3882, + "step": 8131 + }, + { + "epoch": 1.2443764345830146, + "grad_norm": 2.133074845103394, + "learning_rate": 6.598564586008064e-06, + "loss": 0.3467, + "step": 8132 + }, + { + "epoch": 1.244529456771232, + "grad_norm": 2.4865499659321606, + "learning_rate": 6.596234272742278e-06, + "loss": 0.3426, + "step": 8133 + }, + { + "epoch": 1.244682478959449, + "grad_norm": 2.2551479642677403, + "learning_rate": 6.593904168515062e-06, + "loss": 0.3678, + "step": 8134 + }, + { + "epoch": 1.2448355011476664, + "grad_norm": 2.4241006823885494, + "learning_rate": 6.591574273469513e-06, + "loss": 0.3655, + "step": 8135 + }, + { + "epoch": 1.2449885233358837, + "grad_norm": 1.9992035540833695, + "learning_rate": 
6.589244587748723e-06, + "loss": 0.3564, + "step": 8136 + }, + { + "epoch": 1.245141545524101, + "grad_norm": 2.3241941093092335, + "learning_rate": 6.586915111495764e-06, + "loss": 0.4287, + "step": 8137 + }, + { + "epoch": 1.2452945677123184, + "grad_norm": 2.287427281357684, + "learning_rate": 6.5845858448537045e-06, + "loss": 0.4589, + "step": 8138 + }, + { + "epoch": 1.2454475899005355, + "grad_norm": 1.857212005781437, + "learning_rate": 6.582256787965586e-06, + "loss": 0.3074, + "step": 8139 + }, + { + "epoch": 1.2456006120887528, + "grad_norm": 2.195369821049141, + "learning_rate": 6.579927940974449e-06, + "loss": 0.3611, + "step": 8140 + }, + { + "epoch": 1.2457536342769702, + "grad_norm": 2.199397634076349, + "learning_rate": 6.577599304023319e-06, + "loss": 0.3836, + "step": 8141 + }, + { + "epoch": 1.2459066564651875, + "grad_norm": 2.3932894708522023, + "learning_rate": 6.575270877255202e-06, + "loss": 0.4379, + "step": 8142 + }, + { + "epoch": 1.2460596786534048, + "grad_norm": 2.4586397278695524, + "learning_rate": 6.572942660813096e-06, + "loss": 0.4097, + "step": 8143 + }, + { + "epoch": 1.246212700841622, + "grad_norm": 2.2564384469631396, + "learning_rate": 6.570614654839993e-06, + "loss": 0.616, + "step": 8144 + }, + { + "epoch": 1.2463657230298393, + "grad_norm": 2.027426896315064, + "learning_rate": 6.568286859478854e-06, + "loss": 0.2986, + "step": 8145 + }, + { + "epoch": 1.2465187452180566, + "grad_norm": 1.9998258578563708, + "learning_rate": 6.565959274872648e-06, + "loss": 0.3829, + "step": 8146 + }, + { + "epoch": 1.246671767406274, + "grad_norm": 2.294962618851666, + "learning_rate": 6.563631901164318e-06, + "loss": 0.3712, + "step": 8147 + }, + { + "epoch": 1.2468247895944913, + "grad_norm": 2.2436925088267703, + "learning_rate": 6.561304738496792e-06, + "loss": 0.3324, + "step": 8148 + }, + { + "epoch": 1.2469778117827084, + "grad_norm": 2.4639587946632284, + "learning_rate": 6.558977787012997e-06, + "loss": 0.4681, + "step": 8149 + }, + { + "epoch": 1.2471308339709257, + "grad_norm": 2.381414016939421, + "learning_rate": 6.556651046855838e-06, + "loss": 0.3656, + "step": 8150 + }, + { + "epoch": 1.247283856159143, + "grad_norm": 1.8783872615246844, + "learning_rate": 6.554324518168206e-06, + "loss": 0.2745, + "step": 8151 + }, + { + "epoch": 1.2474368783473604, + "grad_norm": 2.259284316104315, + "learning_rate": 6.551998201092986e-06, + "loss": 0.44, + "step": 8152 + }, + { + "epoch": 1.2475899005355777, + "grad_norm": 2.0064902849502424, + "learning_rate": 6.549672095773043e-06, + "loss": 0.3091, + "step": 8153 + }, + { + "epoch": 1.2477429227237948, + "grad_norm": 2.0524252981887967, + "learning_rate": 6.547346202351239e-06, + "loss": 0.3426, + "step": 8154 + }, + { + "epoch": 1.2478959449120122, + "grad_norm": 2.0469596722487564, + "learning_rate": 6.54502052097041e-06, + "loss": 0.3511, + "step": 8155 + }, + { + "epoch": 1.2480489671002295, + "grad_norm": 2.2105301690125487, + "learning_rate": 6.542695051773384e-06, + "loss": 0.3652, + "step": 8156 + }, + { + "epoch": 1.2482019892884468, + "grad_norm": 2.0530242393989413, + "learning_rate": 6.540369794902984e-06, + "loss": 0.3917, + "step": 8157 + }, + { + "epoch": 1.2483550114766642, + "grad_norm": 1.936389285927461, + "learning_rate": 6.538044750502005e-06, + "loss": 0.3831, + "step": 8158 + }, + { + "epoch": 1.2485080336648815, + "grad_norm": 2.1162540345200056, + "learning_rate": 6.535719918713244e-06, + "loss": 0.3493, + "step": 8159 + }, + { + "epoch": 1.2486610558530986, + "grad_norm": 
2.275333849051231, + "learning_rate": 6.533395299679475e-06, + "loss": 0.4457, + "step": 8160 + }, + { + "epoch": 1.248814078041316, + "grad_norm": 2.150509646064332, + "learning_rate": 6.531070893543461e-06, + "loss": 0.3997, + "step": 8161 + }, + { + "epoch": 1.2489671002295333, + "grad_norm": 2.0081462458065764, + "learning_rate": 6.5287467004479545e-06, + "loss": 0.3871, + "step": 8162 + }, + { + "epoch": 1.2491201224177506, + "grad_norm": 2.5256472370868592, + "learning_rate": 6.526422720535694e-06, + "loss": 0.4369, + "step": 8163 + }, + { + "epoch": 1.249273144605968, + "grad_norm": 2.264997586366189, + "learning_rate": 6.5240989539494e-06, + "loss": 0.3933, + "step": 8164 + }, + { + "epoch": 1.2494261667941853, + "grad_norm": 2.0935312426949495, + "learning_rate": 6.521775400831789e-06, + "loss": 0.4138, + "step": 8165 + }, + { + "epoch": 1.2495791889824024, + "grad_norm": 1.993012753242544, + "learning_rate": 6.519452061325555e-06, + "loss": 0.3732, + "step": 8166 + }, + { + "epoch": 1.2497322111706197, + "grad_norm": 2.0981709407547404, + "learning_rate": 6.517128935573389e-06, + "loss": 0.3587, + "step": 8167 + }, + { + "epoch": 1.249885233358837, + "grad_norm": 2.126075996875917, + "learning_rate": 6.51480602371796e-06, + "loss": 0.4134, + "step": 8168 + }, + { + "epoch": 1.2500382555470544, + "grad_norm": 2.2515728182518178, + "learning_rate": 6.5124833259019235e-06, + "loss": 0.4175, + "step": 8169 + }, + { + "epoch": 1.2501912777352717, + "grad_norm": 2.1486259337174594, + "learning_rate": 6.510160842267935e-06, + "loss": 0.3988, + "step": 8170 + }, + { + "epoch": 1.2503442999234888, + "grad_norm": 2.1867889880458597, + "learning_rate": 6.507838572958619e-06, + "loss": 0.3892, + "step": 8171 + }, + { + "epoch": 1.2504973221117062, + "grad_norm": 2.2552520050421427, + "learning_rate": 6.505516518116595e-06, + "loss": 0.3685, + "step": 8172 + }, + { + "epoch": 1.2506503442999235, + "grad_norm": 2.1887592404297695, + "learning_rate": 6.503194677884477e-06, + "loss": 0.4032, + "step": 8173 + }, + { + "epoch": 1.2508033664881408, + "grad_norm": 2.124074218625281, + "learning_rate": 6.500873052404848e-06, + "loss": 0.364, + "step": 8174 + }, + { + "epoch": 1.2509563886763582, + "grad_norm": 2.333056411610224, + "learning_rate": 6.4985516418202965e-06, + "loss": 0.3767, + "step": 8175 + }, + { + "epoch": 1.2511094108645753, + "grad_norm": 2.213655759545077, + "learning_rate": 6.496230446273389e-06, + "loss": 0.3687, + "step": 8176 + }, + { + "epoch": 1.2512624330527926, + "grad_norm": 2.192929258564723, + "learning_rate": 6.493909465906671e-06, + "loss": 0.3897, + "step": 8177 + }, + { + "epoch": 1.25141545524101, + "grad_norm": 1.9952666186839858, + "learning_rate": 6.491588700862692e-06, + "loss": 0.3551, + "step": 8178 + }, + { + "epoch": 1.2515684774292273, + "grad_norm": 2.2618334986409523, + "learning_rate": 6.489268151283978e-06, + "loss": 0.3391, + "step": 8179 + }, + { + "epoch": 1.2517214996174446, + "grad_norm": 2.0576909482129793, + "learning_rate": 6.486947817313036e-06, + "loss": 0.3226, + "step": 8180 + }, + { + "epoch": 1.2518745218056617, + "grad_norm": 2.7975000133563808, + "learning_rate": 6.484627699092374e-06, + "loss": 0.4377, + "step": 8181 + }, + { + "epoch": 1.252027543993879, + "grad_norm": 2.2709472936432453, + "learning_rate": 6.482307796764476e-06, + "loss": 0.3881, + "step": 8182 + }, + { + "epoch": 1.2521805661820964, + "grad_norm": 2.331279784622257, + "learning_rate": 6.4799881104718194e-06, + "loss": 0.4555, + "step": 8183 + }, + { + "epoch": 
1.2523335883703137, + "grad_norm": 1.9791757619354962, + "learning_rate": 6.477668640356862e-06, + "loss": 0.3695, + "step": 8184 + }, + { + "epoch": 1.252486610558531, + "grad_norm": 2.2578902118790785, + "learning_rate": 6.475349386562052e-06, + "loss": 0.3684, + "step": 8185 + }, + { + "epoch": 1.2526396327467482, + "grad_norm": 2.0440767072571435, + "learning_rate": 6.4730303492298276e-06, + "loss": 0.3873, + "step": 8186 + }, + { + "epoch": 1.2527926549349655, + "grad_norm": 2.2598153800347975, + "learning_rate": 6.4707115285026065e-06, + "loss": 0.4313, + "step": 8187 + }, + { + "epoch": 1.2529456771231828, + "grad_norm": 2.2472633970388163, + "learning_rate": 6.468392924522793e-06, + "loss": 0.3577, + "step": 8188 + }, + { + "epoch": 1.2530986993114002, + "grad_norm": 2.000066195188573, + "learning_rate": 6.4660745374327916e-06, + "loss": 0.3561, + "step": 8189 + }, + { + "epoch": 1.2532517214996175, + "grad_norm": 1.998668284331149, + "learning_rate": 6.46375636737497e-06, + "loss": 0.3622, + "step": 8190 + }, + { + "epoch": 1.2534047436878346, + "grad_norm": 2.2063495642394155, + "learning_rate": 6.461438414491712e-06, + "loss": 0.4527, + "step": 8191 + }, + { + "epoch": 1.2535577658760522, + "grad_norm": 2.246557833214377, + "learning_rate": 6.459120678925361e-06, + "loss": 0.4203, + "step": 8192 + }, + { + "epoch": 1.2537107880642693, + "grad_norm": 2.2063828436285715, + "learning_rate": 6.456803160818257e-06, + "loss": 0.3825, + "step": 8193 + }, + { + "epoch": 1.2538638102524866, + "grad_norm": 1.9092101174408787, + "learning_rate": 6.454485860312738e-06, + "loss": 0.3348, + "step": 8194 + }, + { + "epoch": 1.254016832440704, + "grad_norm": 2.0977756057796055, + "learning_rate": 6.452168777551108e-06, + "loss": 0.4035, + "step": 8195 + }, + { + "epoch": 1.2541698546289213, + "grad_norm": 2.1005447185578676, + "learning_rate": 6.449851912675674e-06, + "loss": 0.3674, + "step": 8196 + }, + { + "epoch": 1.2543228768171386, + "grad_norm": 2.558298193490183, + "learning_rate": 6.447535265828726e-06, + "loss": 0.4957, + "step": 8197 + }, + { + "epoch": 1.2544758990053557, + "grad_norm": 2.024092000979385, + "learning_rate": 6.44521883715253e-06, + "loss": 0.3693, + "step": 8198 + }, + { + "epoch": 1.254628921193573, + "grad_norm": 2.2401108910393646, + "learning_rate": 6.4429026267893515e-06, + "loss": 0.3757, + "step": 8199 + }, + { + "epoch": 1.2547819433817904, + "grad_norm": 2.2399359164291486, + "learning_rate": 6.440586634881443e-06, + "loss": 0.3903, + "step": 8200 + }, + { + "epoch": 1.2549349655700077, + "grad_norm": 2.231638113223591, + "learning_rate": 6.438270861571028e-06, + "loss": 0.3898, + "step": 8201 + }, + { + "epoch": 1.255087987758225, + "grad_norm": 2.3003887351119485, + "learning_rate": 6.435955307000334e-06, + "loss": 0.4559, + "step": 8202 + }, + { + "epoch": 1.2552410099464422, + "grad_norm": 2.342065540275981, + "learning_rate": 6.4336399713115675e-06, + "loss": 0.4731, + "step": 8203 + }, + { + "epoch": 1.2553940321346595, + "grad_norm": 2.159649433148729, + "learning_rate": 6.431324854646925e-06, + "loss": 0.4246, + "step": 8204 + }, + { + "epoch": 1.2555470543228768, + "grad_norm": 2.1847647079301815, + "learning_rate": 6.429009957148582e-06, + "loss": 0.4069, + "step": 8205 + }, + { + "epoch": 1.2557000765110942, + "grad_norm": 2.1882504529272904, + "learning_rate": 6.426695278958706e-06, + "loss": 0.3894, + "step": 8206 + }, + { + "epoch": 1.2558530986993115, + "grad_norm": 2.09420529387032, + "learning_rate": 6.424380820219455e-06, + "loss": 0.3681, 
+ "step": 8207 + }, + { + "epoch": 1.2560061208875286, + "grad_norm": 2.428772300102065, + "learning_rate": 6.422066581072964e-06, + "loss": 0.4136, + "step": 8208 + }, + { + "epoch": 1.256159143075746, + "grad_norm": 2.213077828037295, + "learning_rate": 6.419752561661358e-06, + "loss": 0.3422, + "step": 8209 + }, + { + "epoch": 1.2563121652639633, + "grad_norm": 2.0985586609316913, + "learning_rate": 6.41743876212676e-06, + "loss": 0.4081, + "step": 8210 + }, + { + "epoch": 1.2564651874521806, + "grad_norm": 2.2509752229595543, + "learning_rate": 6.415125182611257e-06, + "loss": 0.4167, + "step": 8211 + }, + { + "epoch": 1.256618209640398, + "grad_norm": 1.9046730532077025, + "learning_rate": 6.412811823256942e-06, + "loss": 0.3838, + "step": 8212 + }, + { + "epoch": 1.256771231828615, + "grad_norm": 2.2792001975357454, + "learning_rate": 6.410498684205889e-06, + "loss": 0.4713, + "step": 8213 + }, + { + "epoch": 1.2569242540168324, + "grad_norm": 2.3477391524234297, + "learning_rate": 6.40818576560015e-06, + "loss": 0.4213, + "step": 8214 + }, + { + "epoch": 1.2570772762050497, + "grad_norm": 2.1809286043256364, + "learning_rate": 6.405873067581776e-06, + "loss": 0.3181, + "step": 8215 + }, + { + "epoch": 1.257230298393267, + "grad_norm": 2.2438204832034225, + "learning_rate": 6.4035605902927985e-06, + "loss": 0.3751, + "step": 8216 + }, + { + "epoch": 1.2573833205814844, + "grad_norm": 2.2567114261781756, + "learning_rate": 6.401248333875232e-06, + "loss": 0.3622, + "step": 8217 + }, + { + "epoch": 1.2575363427697015, + "grad_norm": 1.9982564455204277, + "learning_rate": 6.398936298471086e-06, + "loss": 0.3942, + "step": 8218 + }, + { + "epoch": 1.2576893649579188, + "grad_norm": 2.2711651757471576, + "learning_rate": 6.3966244842223455e-06, + "loss": 0.4798, + "step": 8219 + }, + { + "epoch": 1.2578423871461362, + "grad_norm": 2.6053006150189892, + "learning_rate": 6.394312891270997e-06, + "loss": 0.4886, + "step": 8220 + }, + { + "epoch": 1.2579954093343535, + "grad_norm": 2.1850007261040543, + "learning_rate": 6.392001519758997e-06, + "loss": 0.3153, + "step": 8221 + }, + { + "epoch": 1.2581484315225708, + "grad_norm": 1.9624789585008473, + "learning_rate": 6.389690369828297e-06, + "loss": 0.3293, + "step": 8222 + }, + { + "epoch": 1.258301453710788, + "grad_norm": 1.8446939371406101, + "learning_rate": 6.387379441620838e-06, + "loss": 0.3596, + "step": 8223 + }, + { + "epoch": 1.2584544758990053, + "grad_norm": 1.9329320390958946, + "learning_rate": 6.3850687352785365e-06, + "loss": 0.3903, + "step": 8224 + }, + { + "epoch": 1.2586074980872226, + "grad_norm": 2.18800037719509, + "learning_rate": 6.3827582509433085e-06, + "loss": 0.4227, + "step": 8225 + }, + { + "epoch": 1.25876052027544, + "grad_norm": 1.9241949681406423, + "learning_rate": 6.380447988757048e-06, + "loss": 0.3641, + "step": 8226 + }, + { + "epoch": 1.2589135424636573, + "grad_norm": 2.172606220056416, + "learning_rate": 6.378137948861633e-06, + "loss": 0.3112, + "step": 8227 + }, + { + "epoch": 1.2590665646518744, + "grad_norm": 2.5231403921304647, + "learning_rate": 6.375828131398937e-06, + "loss": 0.4162, + "step": 8228 + }, + { + "epoch": 1.259219586840092, + "grad_norm": 2.196984189818374, + "learning_rate": 6.373518536510815e-06, + "loss": 0.4081, + "step": 8229 + }, + { + "epoch": 1.259372609028309, + "grad_norm": 2.262628444670184, + "learning_rate": 6.371209164339103e-06, + "loss": 0.3758, + "step": 8230 + }, + { + "epoch": 1.2595256312165264, + "grad_norm": 2.1348336472126883, + "learning_rate": 
6.3689000150256345e-06, + "loss": 0.4025, + "step": 8231 + }, + { + "epoch": 1.2596786534047437, + "grad_norm": 2.2223035828990434, + "learning_rate": 6.3665910887122175e-06, + "loss": 0.3868, + "step": 8232 + }, + { + "epoch": 1.259831675592961, + "grad_norm": 2.3435045332318207, + "learning_rate": 6.364282385540661e-06, + "loss": 0.3535, + "step": 8233 + }, + { + "epoch": 1.2599846977811784, + "grad_norm": 2.110334468447516, + "learning_rate": 6.361973905652743e-06, + "loss": 0.3774, + "step": 8234 + }, + { + "epoch": 1.2601377199693955, + "grad_norm": 2.3897983562604272, + "learning_rate": 6.359665649190239e-06, + "loss": 0.4594, + "step": 8235 + }, + { + "epoch": 1.2602907421576128, + "grad_norm": 2.402575115221835, + "learning_rate": 6.357357616294912e-06, + "loss": 0.4075, + "step": 8236 + }, + { + "epoch": 1.2604437643458302, + "grad_norm": 2.263508452347503, + "learning_rate": 6.3550498071085025e-06, + "loss": 0.3954, + "step": 8237 + }, + { + "epoch": 1.2605967865340475, + "grad_norm": 2.2037969854889, + "learning_rate": 6.3527422217727405e-06, + "loss": 0.427, + "step": 8238 + }, + { + "epoch": 1.2607498087222648, + "grad_norm": 2.2239380934976687, + "learning_rate": 6.350434860429351e-06, + "loss": 0.4106, + "step": 8239 + }, + { + "epoch": 1.260902830910482, + "grad_norm": 2.468996984247626, + "learning_rate": 6.34812772322003e-06, + "loss": 0.4408, + "step": 8240 + }, + { + "epoch": 1.2610558530986993, + "grad_norm": 2.0541356098332915, + "learning_rate": 6.3458208102864735e-06, + "loss": 0.3487, + "step": 8241 + }, + { + "epoch": 1.2612088752869166, + "grad_norm": 2.30173111222071, + "learning_rate": 6.343514121770358e-06, + "loss": 0.4123, + "step": 8242 + }, + { + "epoch": 1.261361897475134, + "grad_norm": 2.4270570534290963, + "learning_rate": 6.341207657813342e-06, + "loss": 0.4424, + "step": 8243 + }, + { + "epoch": 1.2615149196633513, + "grad_norm": 2.399051121363175, + "learning_rate": 6.338901418557078e-06, + "loss": 0.4351, + "step": 8244 + }, + { + "epoch": 1.2616679418515684, + "grad_norm": 2.0292806752648724, + "learning_rate": 6.336595404143202e-06, + "loss": 0.3252, + "step": 8245 + }, + { + "epoch": 1.2618209640397857, + "grad_norm": 2.3769670869429755, + "learning_rate": 6.334289614713329e-06, + "loss": 0.4465, + "step": 8246 + }, + { + "epoch": 1.261973986228003, + "grad_norm": 2.217733053161381, + "learning_rate": 6.331984050409075e-06, + "loss": 0.3758, + "step": 8247 + }, + { + "epoch": 1.2621270084162204, + "grad_norm": 2.142845566249004, + "learning_rate": 6.329678711372025e-06, + "loss": 0.3559, + "step": 8248 + }, + { + "epoch": 1.2622800306044377, + "grad_norm": 2.2208892699743004, + "learning_rate": 6.32737359774377e-06, + "loss": 0.4055, + "step": 8249 + }, + { + "epoch": 1.2624330527926548, + "grad_norm": 2.1323922539303015, + "learning_rate": 6.325068709665867e-06, + "loss": 0.3995, + "step": 8250 + }, + { + "epoch": 1.2625860749808722, + "grad_norm": 1.8538505473912326, + "learning_rate": 6.322764047279869e-06, + "loss": 0.3345, + "step": 8251 + }, + { + "epoch": 1.2627390971690895, + "grad_norm": 2.086198855742557, + "learning_rate": 6.320459610727321e-06, + "loss": 0.3561, + "step": 8252 + }, + { + "epoch": 1.2628921193573068, + "grad_norm": 2.293317461350589, + "learning_rate": 6.318155400149739e-06, + "loss": 0.4263, + "step": 8253 + }, + { + "epoch": 1.2630451415455242, + "grad_norm": 2.057507239735757, + "learning_rate": 6.315851415688637e-06, + "loss": 0.3783, + "step": 8254 + }, + { + "epoch": 1.2631981637337413, + "grad_norm": 
2.0939084818628655, + "learning_rate": 6.313547657485516e-06, + "loss": 0.4935, + "step": 8255 + }, + { + "epoch": 1.2633511859219586, + "grad_norm": 2.0975012438371197, + "learning_rate": 6.31124412568185e-06, + "loss": 0.4144, + "step": 8256 + }, + { + "epoch": 1.263504208110176, + "grad_norm": 2.212720946192638, + "learning_rate": 6.3089408204191146e-06, + "loss": 0.4442, + "step": 8257 + }, + { + "epoch": 1.2636572302983933, + "grad_norm": 2.1433392764607047, + "learning_rate": 6.306637741838766e-06, + "loss": 0.4074, + "step": 8258 + }, + { + "epoch": 1.2638102524866106, + "grad_norm": 1.9780751620760415, + "learning_rate": 6.304334890082238e-06, + "loss": 0.347, + "step": 8259 + }, + { + "epoch": 1.2639632746748277, + "grad_norm": 1.8717079822783924, + "learning_rate": 6.302032265290963e-06, + "loss": 0.3133, + "step": 8260 + }, + { + "epoch": 1.2641162968630453, + "grad_norm": 1.991064644715254, + "learning_rate": 6.299729867606352e-06, + "loss": 0.4103, + "step": 8261 + }, + { + "epoch": 1.2642693190512624, + "grad_norm": 2.0474793196900722, + "learning_rate": 6.297427697169809e-06, + "loss": 0.3109, + "step": 8262 + }, + { + "epoch": 1.2644223412394797, + "grad_norm": 2.0837694344823885, + "learning_rate": 6.295125754122713e-06, + "loss": 0.3487, + "step": 8263 + }, + { + "epoch": 1.264575363427697, + "grad_norm": 2.0638674344607497, + "learning_rate": 6.292824038606437e-06, + "loss": 0.3289, + "step": 8264 + }, + { + "epoch": 1.2647283856159144, + "grad_norm": 2.322091446527354, + "learning_rate": 6.290522550762343e-06, + "loss": 0.4061, + "step": 8265 + }, + { + "epoch": 1.2648814078041317, + "grad_norm": 2.4768178687633124, + "learning_rate": 6.288221290731769e-06, + "loss": 0.4325, + "step": 8266 + }, + { + "epoch": 1.2650344299923488, + "grad_norm": 2.193177524623937, + "learning_rate": 6.285920258656043e-06, + "loss": 0.4204, + "step": 8267 + }, + { + "epoch": 1.2651874521805662, + "grad_norm": 2.428941265384786, + "learning_rate": 6.283619454676489e-06, + "loss": 0.4369, + "step": 8268 + }, + { + "epoch": 1.2653404743687835, + "grad_norm": 2.3254280713571265, + "learning_rate": 6.281318878934397e-06, + "loss": 0.3982, + "step": 8269 + }, + { + "epoch": 1.2654934965570008, + "grad_norm": 2.272008199503931, + "learning_rate": 6.279018531571064e-06, + "loss": 0.4191, + "step": 8270 + }, + { + "epoch": 1.2656465187452182, + "grad_norm": 2.305652085924085, + "learning_rate": 6.27671841272776e-06, + "loss": 0.3955, + "step": 8271 + }, + { + "epoch": 1.2657995409334353, + "grad_norm": 1.9353755654140936, + "learning_rate": 6.27441852254574e-06, + "loss": 0.3637, + "step": 8272 + }, + { + "epoch": 1.2659525631216526, + "grad_norm": 1.8828491506877458, + "learning_rate": 6.2721188611662575e-06, + "loss": 0.2892, + "step": 8273 + }, + { + "epoch": 1.26610558530987, + "grad_norm": 2.1776025532535623, + "learning_rate": 6.269819428730539e-06, + "loss": 0.4541, + "step": 8274 + }, + { + "epoch": 1.2662586074980873, + "grad_norm": 1.999490178432348, + "learning_rate": 6.267520225379797e-06, + "loss": 0.3681, + "step": 8275 + }, + { + "epoch": 1.2664116296863046, + "grad_norm": 2.253935070804977, + "learning_rate": 6.265221251255245e-06, + "loss": 0.3812, + "step": 8276 + }, + { + "epoch": 1.2665646518745217, + "grad_norm": 2.002987514155831, + "learning_rate": 6.262922506498063e-06, + "loss": 0.3807, + "step": 8277 + }, + { + "epoch": 1.266717674062739, + "grad_norm": 2.0505934288097167, + "learning_rate": 6.26062399124943e-06, + "loss": 0.3383, + "step": 8278 + }, + { + "epoch": 
1.2668706962509564, + "grad_norm": 2.0558883228802864, + "learning_rate": 6.25832570565051e-06, + "loss": 0.3349, + "step": 8279 + }, + { + "epoch": 1.2670237184391737, + "grad_norm": 2.1637451051227834, + "learning_rate": 6.25602764984244e-06, + "loss": 0.3656, + "step": 8280 + }, + { + "epoch": 1.267176740627391, + "grad_norm": 2.101792748438887, + "learning_rate": 6.2537298239663615e-06, + "loss": 0.3983, + "step": 8281 + }, + { + "epoch": 1.2673297628156082, + "grad_norm": 1.9893459953979338, + "learning_rate": 6.251432228163392e-06, + "loss": 0.3411, + "step": 8282 + }, + { + "epoch": 1.2674827850038255, + "grad_norm": 2.2123349417488734, + "learning_rate": 6.249134862574631e-06, + "loss": 0.4035, + "step": 8283 + }, + { + "epoch": 1.2676358071920428, + "grad_norm": 1.9567029487218803, + "learning_rate": 6.2468377273411715e-06, + "loss": 0.3217, + "step": 8284 + }, + { + "epoch": 1.2677888293802602, + "grad_norm": 2.25711180116385, + "learning_rate": 6.244540822604088e-06, + "loss": 0.3533, + "step": 8285 + }, + { + "epoch": 1.2679418515684775, + "grad_norm": 2.1290431639737477, + "learning_rate": 6.24224414850445e-06, + "loss": 0.4025, + "step": 8286 + }, + { + "epoch": 1.2680948737566946, + "grad_norm": 2.4206966590733434, + "learning_rate": 6.239947705183296e-06, + "loss": 0.4232, + "step": 8287 + }, + { + "epoch": 1.268247895944912, + "grad_norm": 2.2201454182364677, + "learning_rate": 6.23765149278166e-06, + "loss": 0.3474, + "step": 8288 + }, + { + "epoch": 1.2684009181331293, + "grad_norm": 2.0819815759252935, + "learning_rate": 6.2353555114405695e-06, + "loss": 0.3625, + "step": 8289 + }, + { + "epoch": 1.2685539403213466, + "grad_norm": 2.6327659066634244, + "learning_rate": 6.233059761301022e-06, + "loss": 0.3831, + "step": 8290 + }, + { + "epoch": 1.268706962509564, + "grad_norm": 2.358386566791711, + "learning_rate": 6.2307642425040085e-06, + "loss": 0.4258, + "step": 8291 + }, + { + "epoch": 1.268859984697781, + "grad_norm": 2.406902397025713, + "learning_rate": 6.228468955190512e-06, + "loss": 0.4622, + "step": 8292 + }, + { + "epoch": 1.2690130068859986, + "grad_norm": 2.1879106838578375, + "learning_rate": 6.2261738995014885e-06, + "loss": 0.4255, + "step": 8293 + }, + { + "epoch": 1.2691660290742157, + "grad_norm": 2.0277613512524058, + "learning_rate": 6.22387907557789e-06, + "loss": 0.4056, + "step": 8294 + }, + { + "epoch": 1.269319051262433, + "grad_norm": 2.172511714493054, + "learning_rate": 6.221584483560652e-06, + "loss": 0.3701, + "step": 8295 + }, + { + "epoch": 1.2694720734506504, + "grad_norm": 1.9385020661538073, + "learning_rate": 6.219290123590689e-06, + "loss": 0.3819, + "step": 8296 + }, + { + "epoch": 1.2696250956388677, + "grad_norm": 2.0184288428167183, + "learning_rate": 6.21699599580891e-06, + "loss": 0.3895, + "step": 8297 + }, + { + "epoch": 1.269778117827085, + "grad_norm": 2.275096189344347, + "learning_rate": 6.214702100356205e-06, + "loss": 0.3442, + "step": 8298 + }, + { + "epoch": 1.2699311400153022, + "grad_norm": 2.6762797079326064, + "learning_rate": 6.212408437373456e-06, + "loss": 0.3961, + "step": 8299 + }, + { + "epoch": 1.2700841622035195, + "grad_norm": 2.2741365286187025, + "learning_rate": 6.21011500700152e-06, + "loss": 0.4131, + "step": 8300 + }, + { + "epoch": 1.2702371843917368, + "grad_norm": 1.9190830243481853, + "learning_rate": 6.207821809381247e-06, + "loss": 0.2969, + "step": 8301 + }, + { + "epoch": 1.2703902065799542, + "grad_norm": 2.2401792695423497, + "learning_rate": 6.205528844653474e-06, + "loss": 0.3656, + 
"step": 8302 + }, + { + "epoch": 1.2705432287681715, + "grad_norm": 1.9741605060912868, + "learning_rate": 6.2032361129590165e-06, + "loss": 0.3255, + "step": 8303 + }, + { + "epoch": 1.2706962509563886, + "grad_norm": 1.9776819900089568, + "learning_rate": 6.200943614438681e-06, + "loss": 0.3396, + "step": 8304 + }, + { + "epoch": 1.270849273144606, + "grad_norm": 2.4727351802146096, + "learning_rate": 6.1986513492332645e-06, + "loss": 0.4541, + "step": 8305 + }, + { + "epoch": 1.2710022953328233, + "grad_norm": 1.762131739082361, + "learning_rate": 6.196359317483536e-06, + "loss": 0.2889, + "step": 8306 + }, + { + "epoch": 1.2711553175210406, + "grad_norm": 2.077140126012181, + "learning_rate": 6.1940675193302625e-06, + "loss": 0.3449, + "step": 8307 + }, + { + "epoch": 1.271308339709258, + "grad_norm": 1.966671711976856, + "learning_rate": 6.191775954914195e-06, + "loss": 0.3855, + "step": 8308 + }, + { + "epoch": 1.271461361897475, + "grad_norm": 1.9752626408364398, + "learning_rate": 6.189484624376059e-06, + "loss": 0.321, + "step": 8309 + }, + { + "epoch": 1.2716143840856924, + "grad_norm": 2.1434276190951, + "learning_rate": 6.187193527856581e-06, + "loss": 0.4265, + "step": 8310 + }, + { + "epoch": 1.2717674062739097, + "grad_norm": 2.145563444426424, + "learning_rate": 6.184902665496466e-06, + "loss": 0.34, + "step": 8311 + }, + { + "epoch": 1.271920428462127, + "grad_norm": 1.9283577771018996, + "learning_rate": 6.182612037436399e-06, + "loss": 0.3911, + "step": 8312 + }, + { + "epoch": 1.2720734506503444, + "grad_norm": 2.3122756860692784, + "learning_rate": 6.180321643817062e-06, + "loss": 0.3537, + "step": 8313 + }, + { + "epoch": 1.2722264728385615, + "grad_norm": 2.164697408819195, + "learning_rate": 6.178031484779115e-06, + "loss": 0.353, + "step": 8314 + }, + { + "epoch": 1.2723794950267788, + "grad_norm": 2.0017443136895654, + "learning_rate": 6.1757415604632084e-06, + "loss": 0.3517, + "step": 8315 + }, + { + "epoch": 1.2725325172149962, + "grad_norm": 2.4725242851848463, + "learning_rate": 6.173451871009974e-06, + "loss": 0.3811, + "step": 8316 + }, + { + "epoch": 1.2726855394032135, + "grad_norm": 1.9614402340989665, + "learning_rate": 6.1711624165600255e-06, + "loss": 0.3094, + "step": 8317 + }, + { + "epoch": 1.2728385615914308, + "grad_norm": 2.4254988450390393, + "learning_rate": 6.168873197253975e-06, + "loss": 0.3652, + "step": 8318 + }, + { + "epoch": 1.272991583779648, + "grad_norm": 2.3981313138203686, + "learning_rate": 6.1665842132324095e-06, + "loss": 0.4186, + "step": 8319 + }, + { + "epoch": 1.2731446059678653, + "grad_norm": 2.1409247382783057, + "learning_rate": 6.164295464635899e-06, + "loss": 0.3752, + "step": 8320 + }, + { + "epoch": 1.2732976281560826, + "grad_norm": 2.1396974459941616, + "learning_rate": 6.162006951605015e-06, + "loss": 0.3566, + "step": 8321 + }, + { + "epoch": 1.2734506503443, + "grad_norm": 1.9810680981670772, + "learning_rate": 6.159718674280294e-06, + "loss": 0.3446, + "step": 8322 + }, + { + "epoch": 1.2736036725325173, + "grad_norm": 2.049489820508499, + "learning_rate": 6.157430632802275e-06, + "loss": 0.3551, + "step": 8323 + }, + { + "epoch": 1.2737566947207344, + "grad_norm": 2.1250524285136674, + "learning_rate": 6.1551428273114745e-06, + "loss": 0.3756, + "step": 8324 + }, + { + "epoch": 1.2739097169089517, + "grad_norm": 2.0211598009609637, + "learning_rate": 6.15285525794839e-06, + "loss": 0.3541, + "step": 8325 + }, + { + "epoch": 1.274062739097169, + "grad_norm": 2.1857253921429525, + "learning_rate": 
6.150567924853516e-06, + "loss": 0.4467, + "step": 8326 + }, + { + "epoch": 1.2742157612853864, + "grad_norm": 2.4665073319611563, + "learning_rate": 6.1482808281673275e-06, + "loss": 0.4236, + "step": 8327 + }, + { + "epoch": 1.2743687834736037, + "grad_norm": 2.0913011820808998, + "learning_rate": 6.1459939680302774e-06, + "loss": 0.3339, + "step": 8328 + }, + { + "epoch": 1.2745218056618208, + "grad_norm": 1.952019493038919, + "learning_rate": 6.143707344582817e-06, + "loss": 0.3398, + "step": 8329 + }, + { + "epoch": 1.2746748278500384, + "grad_norm": 2.2720660090661906, + "learning_rate": 6.141420957965372e-06, + "loss": 0.4193, + "step": 8330 + }, + { + "epoch": 1.2748278500382555, + "grad_norm": 1.9846585102425593, + "learning_rate": 6.1391348083183635e-06, + "loss": 0.2835, + "step": 8331 + }, + { + "epoch": 1.2749808722264728, + "grad_norm": 2.2036433802674287, + "learning_rate": 6.13684889578219e-06, + "loss": 0.3277, + "step": 8332 + }, + { + "epoch": 1.2751338944146902, + "grad_norm": 1.9806487568109303, + "learning_rate": 6.134563220497236e-06, + "loss": 0.2871, + "step": 8333 + }, + { + "epoch": 1.2752869166029075, + "grad_norm": 1.9513522399246421, + "learning_rate": 6.132277782603879e-06, + "loss": 0.3365, + "step": 8334 + }, + { + "epoch": 1.2754399387911248, + "grad_norm": 2.165274496595406, + "learning_rate": 6.129992582242472e-06, + "loss": 0.3678, + "step": 8335 + }, + { + "epoch": 1.275592960979342, + "grad_norm": 1.9969987826577444, + "learning_rate": 6.127707619553361e-06, + "loss": 0.4157, + "step": 8336 + }, + { + "epoch": 1.2757459831675593, + "grad_norm": 2.265249391828096, + "learning_rate": 6.125422894676876e-06, + "loss": 0.394, + "step": 8337 + }, + { + "epoch": 1.2758990053557766, + "grad_norm": 2.116122460738184, + "learning_rate": 6.123138407753324e-06, + "loss": 0.3288, + "step": 8338 + }, + { + "epoch": 1.276052027543994, + "grad_norm": 1.8821468642653791, + "learning_rate": 6.120854158923011e-06, + "loss": 0.3307, + "step": 8339 + }, + { + "epoch": 1.2762050497322113, + "grad_norm": 1.9513607257134291, + "learning_rate": 6.118570148326222e-06, + "loss": 0.2738, + "step": 8340 + }, + { + "epoch": 1.2763580719204284, + "grad_norm": 1.9936208113589038, + "learning_rate": 6.11628637610322e-06, + "loss": 0.3224, + "step": 8341 + }, + { + "epoch": 1.2765110941086457, + "grad_norm": 2.3496749593098785, + "learning_rate": 6.114002842394267e-06, + "loss": 0.5041, + "step": 8342 + }, + { + "epoch": 1.276664116296863, + "grad_norm": 2.0713929832468616, + "learning_rate": 6.1117195473395985e-06, + "loss": 0.3629, + "step": 8343 + }, + { + "epoch": 1.2768171384850804, + "grad_norm": 2.2931273716137355, + "learning_rate": 6.109436491079449e-06, + "loss": 0.401, + "step": 8344 + }, + { + "epoch": 1.2769701606732977, + "grad_norm": 1.9863701102849063, + "learning_rate": 6.1071536737540225e-06, + "loss": 0.3444, + "step": 8345 + }, + { + "epoch": 1.2771231828615148, + "grad_norm": 1.7369545129822166, + "learning_rate": 6.104871095503515e-06, + "loss": 0.3197, + "step": 8346 + }, + { + "epoch": 1.2772762050497322, + "grad_norm": 2.077382461800277, + "learning_rate": 6.102588756468115e-06, + "loss": 0.382, + "step": 8347 + }, + { + "epoch": 1.2774292272379495, + "grad_norm": 2.494916291650205, + "learning_rate": 6.100306656787986e-06, + "loss": 0.3721, + "step": 8348 + }, + { + "epoch": 1.2775822494261668, + "grad_norm": 1.9543398821885236, + "learning_rate": 6.098024796603277e-06, + "loss": 0.374, + "step": 8349 + }, + { + "epoch": 1.2777352716143842, + "grad_norm": 
2.0822556782993784, + "learning_rate": 6.095743176054134e-06, + "loss": 0.3969, + "step": 8350 + }, + { + "epoch": 1.2778882938026013, + "grad_norm": 2.0266313057284635, + "learning_rate": 6.093461795280673e-06, + "loss": 0.3629, + "step": 8351 + }, + { + "epoch": 1.2780413159908186, + "grad_norm": 2.2597584533701247, + "learning_rate": 6.091180654423008e-06, + "loss": 0.3956, + "step": 8352 + }, + { + "epoch": 1.278194338179036, + "grad_norm": 2.156520273496613, + "learning_rate": 6.0888997536212314e-06, + "loss": 0.3296, + "step": 8353 + }, + { + "epoch": 1.2783473603672533, + "grad_norm": 2.0934949462261416, + "learning_rate": 6.086619093015416e-06, + "loss": 0.3407, + "step": 8354 + }, + { + "epoch": 1.2785003825554706, + "grad_norm": 2.169281922759286, + "learning_rate": 6.0843386727456355e-06, + "loss": 0.3667, + "step": 8355 + }, + { + "epoch": 1.2786534047436877, + "grad_norm": 2.22918601294946, + "learning_rate": 6.0820584929519375e-06, + "loss": 0.3654, + "step": 8356 + }, + { + "epoch": 1.278806426931905, + "grad_norm": 2.6905374498040087, + "learning_rate": 6.079778553774347e-06, + "loss": 0.4124, + "step": 8357 + }, + { + "epoch": 1.2789594491201224, + "grad_norm": 2.2477897591567015, + "learning_rate": 6.077498855352899e-06, + "loss": 0.4542, + "step": 8358 + }, + { + "epoch": 1.2791124713083397, + "grad_norm": 2.2985892832882864, + "learning_rate": 6.075219397827585e-06, + "loss": 0.3577, + "step": 8359 + }, + { + "epoch": 1.279265493496557, + "grad_norm": 2.1801673874314336, + "learning_rate": 6.072940181338406e-06, + "loss": 0.4154, + "step": 8360 + }, + { + "epoch": 1.2794185156847742, + "grad_norm": 2.069130573415728, + "learning_rate": 6.070661206025334e-06, + "loss": 0.3956, + "step": 8361 + }, + { + "epoch": 1.2795715378729917, + "grad_norm": 2.0048130565920155, + "learning_rate": 6.068382472028325e-06, + "loss": 0.3135, + "step": 8362 + }, + { + "epoch": 1.2797245600612088, + "grad_norm": 1.88601860046552, + "learning_rate": 6.066103979487332e-06, + "loss": 0.3595, + "step": 8363 + }, + { + "epoch": 1.2798775822494262, + "grad_norm": 2.308966911833846, + "learning_rate": 6.0638257285422845e-06, + "loss": 0.3559, + "step": 8364 + }, + { + "epoch": 1.2800306044376435, + "grad_norm": 1.9954425438063566, + "learning_rate": 6.061547719333092e-06, + "loss": 0.4064, + "step": 8365 + }, + { + "epoch": 1.2801836266258608, + "grad_norm": 2.2874589176202775, + "learning_rate": 6.059269951999665e-06, + "loss": 0.3284, + "step": 8366 + }, + { + "epoch": 1.2803366488140782, + "grad_norm": 2.281622447058958, + "learning_rate": 6.0569924266818844e-06, + "loss": 0.4108, + "step": 8367 + }, + { + "epoch": 1.2804896710022953, + "grad_norm": 2.206075216573966, + "learning_rate": 6.054715143519627e-06, + "loss": 0.366, + "step": 8368 + }, + { + "epoch": 1.2806426931905126, + "grad_norm": 2.2776572125639842, + "learning_rate": 6.052438102652746e-06, + "loss": 0.4196, + "step": 8369 + }, + { + "epoch": 1.28079571537873, + "grad_norm": 1.7869163264626549, + "learning_rate": 6.05016130422108e-06, + "loss": 0.3237, + "step": 8370 + }, + { + "epoch": 1.2809487375669473, + "grad_norm": 2.1341648244510423, + "learning_rate": 6.047884748364467e-06, + "loss": 0.3737, + "step": 8371 + }, + { + "epoch": 1.2811017597551646, + "grad_norm": 2.18049711268148, + "learning_rate": 6.0456084352227065e-06, + "loss": 0.409, + "step": 8372 + }, + { + "epoch": 1.2812547819433817, + "grad_norm": 2.1175669065339093, + "learning_rate": 6.043332364935603e-06, + "loss": 0.3558, + "step": 8373 + }, + { + "epoch": 
1.281407804131599, + "grad_norm": 2.173177324049422, + "learning_rate": 6.041056537642942e-06, + "loss": 0.336, + "step": 8374 + }, + { + "epoch": 1.2815608263198164, + "grad_norm": 2.0800240094069697, + "learning_rate": 6.0387809534844795e-06, + "loss": 0.3442, + "step": 8375 + }, + { + "epoch": 1.2817138485080337, + "grad_norm": 1.906890683579724, + "learning_rate": 6.03650561259998e-06, + "loss": 0.3571, + "step": 8376 + }, + { + "epoch": 1.281866870696251, + "grad_norm": 1.989073502947194, + "learning_rate": 6.0342305151291755e-06, + "loss": 0.3221, + "step": 8377 + }, + { + "epoch": 1.2820198928844682, + "grad_norm": 1.816191193612736, + "learning_rate": 6.031955661211788e-06, + "loss": 0.3345, + "step": 8378 + }, + { + "epoch": 1.2821729150726855, + "grad_norm": 2.0046077691510362, + "learning_rate": 6.029681050987526e-06, + "loss": 0.3253, + "step": 8379 + }, + { + "epoch": 1.2823259372609028, + "grad_norm": 1.998886441065847, + "learning_rate": 6.027406684596082e-06, + "loss": 0.358, + "step": 8380 + }, + { + "epoch": 1.2824789594491202, + "grad_norm": 2.301187711651734, + "learning_rate": 6.025132562177139e-06, + "loss": 0.3304, + "step": 8381 + }, + { + "epoch": 1.2826319816373375, + "grad_norm": 2.1476577961675627, + "learning_rate": 6.022858683870351e-06, + "loss": 0.3894, + "step": 8382 + }, + { + "epoch": 1.2827850038255546, + "grad_norm": 2.245599970159706, + "learning_rate": 6.020585049815368e-06, + "loss": 0.402, + "step": 8383 + }, + { + "epoch": 1.282938026013772, + "grad_norm": 1.8652376913870043, + "learning_rate": 6.01831166015183e-06, + "loss": 0.3539, + "step": 8384 + }, + { + "epoch": 1.2830910482019893, + "grad_norm": 2.091989916493354, + "learning_rate": 6.016038515019347e-06, + "loss": 0.3117, + "step": 8385 + }, + { + "epoch": 1.2832440703902066, + "grad_norm": 2.2039189929629255, + "learning_rate": 6.013765614557522e-06, + "loss": 0.4243, + "step": 8386 + }, + { + "epoch": 1.283397092578424, + "grad_norm": 2.2239495888325886, + "learning_rate": 6.011492958905949e-06, + "loss": 0.3114, + "step": 8387 + }, + { + "epoch": 1.283550114766641, + "grad_norm": 2.1535282910124747, + "learning_rate": 6.009220548204193e-06, + "loss": 0.383, + "step": 8388 + }, + { + "epoch": 1.2837031369548584, + "grad_norm": 1.8290585796153886, + "learning_rate": 6.006948382591816e-06, + "loss": 0.2985, + "step": 8389 + }, + { + "epoch": 1.2838561591430757, + "grad_norm": 2.023290060036757, + "learning_rate": 6.004676462208363e-06, + "loss": 0.3981, + "step": 8390 + }, + { + "epoch": 1.284009181331293, + "grad_norm": 1.914516702960148, + "learning_rate": 6.002404787193352e-06, + "loss": 0.2963, + "step": 8391 + }, + { + "epoch": 1.2841622035195104, + "grad_norm": 2.1996232859634914, + "learning_rate": 6.000133357686306e-06, + "loss": 0.3343, + "step": 8392 + }, + { + "epoch": 1.2843152257077275, + "grad_norm": 2.190271452100412, + "learning_rate": 5.99786217382672e-06, + "loss": 0.3885, + "step": 8393 + }, + { + "epoch": 1.284468247895945, + "grad_norm": 2.3640070841460195, + "learning_rate": 5.995591235754069e-06, + "loss": 0.4181, + "step": 8394 + }, + { + "epoch": 1.2846212700841622, + "grad_norm": 2.0388498245605, + "learning_rate": 5.993320543607828e-06, + "loss": 0.3608, + "step": 8395 + }, + { + "epoch": 1.2847742922723795, + "grad_norm": 2.2357999558982247, + "learning_rate": 5.9910500975274456e-06, + "loss": 0.3957, + "step": 8396 + }, + { + "epoch": 1.2849273144605968, + "grad_norm": 2.041737865466858, + "learning_rate": 5.988779897652364e-06, + "loss": 0.4002, + "step": 8397 
+ }, + { + "epoch": 1.2850803366488142, + "grad_norm": 2.239729576172599, + "learning_rate": 5.986509944121996e-06, + "loss": 0.3372, + "step": 8398 + }, + { + "epoch": 1.2852333588370315, + "grad_norm": 2.2708251624696763, + "learning_rate": 5.984240237075754e-06, + "loss": 0.3929, + "step": 8399 + }, + { + "epoch": 1.2853863810252486, + "grad_norm": 1.9576468807793188, + "learning_rate": 5.981970776653032e-06, + "loss": 0.3, + "step": 8400 + }, + { + "epoch": 1.285539403213466, + "grad_norm": 2.348142708232122, + "learning_rate": 5.979701562993199e-06, + "loss": 0.4128, + "step": 8401 + }, + { + "epoch": 1.2856924254016833, + "grad_norm": 2.225493045946411, + "learning_rate": 5.977432596235623e-06, + "loss": 0.4226, + "step": 8402 + }, + { + "epoch": 1.2858454475899006, + "grad_norm": 2.0902073882347283, + "learning_rate": 5.9751638765196494e-06, + "loss": 0.3488, + "step": 8403 + }, + { + "epoch": 1.285998469778118, + "grad_norm": 2.0223018514828786, + "learning_rate": 5.972895403984603e-06, + "loss": 0.3408, + "step": 8404 + }, + { + "epoch": 1.286151491966335, + "grad_norm": 2.171673961595489, + "learning_rate": 5.970627178769806e-06, + "loss": 0.4067, + "step": 8405 + }, + { + "epoch": 1.2863045141545524, + "grad_norm": 2.216297180724609, + "learning_rate": 5.96835920101456e-06, + "loss": 0.3865, + "step": 8406 + }, + { + "epoch": 1.2864575363427697, + "grad_norm": 2.224480053286395, + "learning_rate": 5.966091470858142e-06, + "loss": 0.3895, + "step": 8407 + }, + { + "epoch": 1.286610558530987, + "grad_norm": 2.0159160285134994, + "learning_rate": 5.96382398843983e-06, + "loss": 0.3411, + "step": 8408 + }, + { + "epoch": 1.2867635807192044, + "grad_norm": 1.9803037406134543, + "learning_rate": 5.961556753898874e-06, + "loss": 0.3131, + "step": 8409 + }, + { + "epoch": 1.2869166029074215, + "grad_norm": 1.9246640134321054, + "learning_rate": 5.959289767374521e-06, + "loss": 0.2991, + "step": 8410 + }, + { + "epoch": 1.2870696250956388, + "grad_norm": 2.29223275943616, + "learning_rate": 5.957023029005988e-06, + "loss": 0.3997, + "step": 8411 + }, + { + "epoch": 1.2872226472838562, + "grad_norm": 2.0759714800081297, + "learning_rate": 5.9547565389324865e-06, + "loss": 0.3184, + "step": 8412 + }, + { + "epoch": 1.2873756694720735, + "grad_norm": 2.345956476444365, + "learning_rate": 5.952490297293214e-06, + "loss": 0.406, + "step": 8413 + }, + { + "epoch": 1.2875286916602908, + "grad_norm": 2.1234583404676997, + "learning_rate": 5.950224304227345e-06, + "loss": 0.3872, + "step": 8414 + }, + { + "epoch": 1.287681713848508, + "grad_norm": 2.3341118330349926, + "learning_rate": 5.947958559874041e-06, + "loss": 0.3964, + "step": 8415 + }, + { + "epoch": 1.2878347360367253, + "grad_norm": 2.346952170027564, + "learning_rate": 5.945693064372457e-06, + "loss": 0.3331, + "step": 8416 + }, + { + "epoch": 1.2879877582249426, + "grad_norm": 2.0502283032629234, + "learning_rate": 5.94342781786172e-06, + "loss": 0.3677, + "step": 8417 + }, + { + "epoch": 1.28814078041316, + "grad_norm": 2.3096833762404616, + "learning_rate": 5.941162820480952e-06, + "loss": 0.3422, + "step": 8418 + }, + { + "epoch": 1.2882938026013773, + "grad_norm": 2.138229007243028, + "learning_rate": 5.9388980723692545e-06, + "loss": 0.3992, + "step": 8419 + }, + { + "epoch": 1.2884468247895944, + "grad_norm": 2.262793774066718, + "learning_rate": 5.936633573665711e-06, + "loss": 0.3245, + "step": 8420 + }, + { + "epoch": 1.2885998469778117, + "grad_norm": 1.9477409894816895, + "learning_rate": 5.934369324509397e-06, + "loss": 
0.2831, + "step": 8421 + }, + { + "epoch": 1.288752869166029, + "grad_norm": 2.2281573276173394, + "learning_rate": 5.93210532503937e-06, + "loss": 0.3967, + "step": 8422 + }, + { + "epoch": 1.2889058913542464, + "grad_norm": 2.22880743175084, + "learning_rate": 5.929841575394663e-06, + "loss": 0.3822, + "step": 8423 + }, + { + "epoch": 1.2890589135424637, + "grad_norm": 1.965550070449436, + "learning_rate": 5.9275780757143105e-06, + "loss": 0.3124, + "step": 8424 + }, + { + "epoch": 1.2892119357306808, + "grad_norm": 2.027513364688824, + "learning_rate": 5.9253148261373185e-06, + "loss": 0.3242, + "step": 8425 + }, + { + "epoch": 1.2893649579188982, + "grad_norm": 2.2444218795153486, + "learning_rate": 5.923051826802686e-06, + "loss": 0.3393, + "step": 8426 + }, + { + "epoch": 1.2895179801071155, + "grad_norm": 2.4148438662955822, + "learning_rate": 5.92078907784939e-06, + "loss": 0.3669, + "step": 8427 + }, + { + "epoch": 1.2896710022953328, + "grad_norm": 2.2021012890604483, + "learning_rate": 5.9185265794163905e-06, + "loss": 0.3546, + "step": 8428 + }, + { + "epoch": 1.2898240244835502, + "grad_norm": 2.1678300062988622, + "learning_rate": 5.916264331642645e-06, + "loss": 0.3763, + "step": 8429 + }, + { + "epoch": 1.2899770466717673, + "grad_norm": 2.070445823425188, + "learning_rate": 5.9140023346670815e-06, + "loss": 0.3678, + "step": 8430 + }, + { + "epoch": 1.2901300688599848, + "grad_norm": 2.2804935664834383, + "learning_rate": 5.911740588628616e-06, + "loss": 0.379, + "step": 8431 + }, + { + "epoch": 1.290283091048202, + "grad_norm": 2.0648217974475305, + "learning_rate": 5.9094790936661594e-06, + "loss": 0.3711, + "step": 8432 + }, + { + "epoch": 1.2904361132364193, + "grad_norm": 2.133264133751307, + "learning_rate": 5.907217849918588e-06, + "loss": 0.4078, + "step": 8433 + }, + { + "epoch": 1.2905891354246366, + "grad_norm": 2.0061436737092135, + "learning_rate": 5.904956857524784e-06, + "loss": 0.4, + "step": 8434 + }, + { + "epoch": 1.290742157612854, + "grad_norm": 1.9897296088214493, + "learning_rate": 5.902696116623599e-06, + "loss": 0.3381, + "step": 8435 + }, + { + "epoch": 1.2908951798010713, + "grad_norm": 1.9672303748095752, + "learning_rate": 5.900435627353868e-06, + "loss": 0.3526, + "step": 8436 + }, + { + "epoch": 1.2910482019892884, + "grad_norm": 2.3082399010119192, + "learning_rate": 5.898175389854431e-06, + "loss": 0.4296, + "step": 8437 + }, + { + "epoch": 1.2912012241775057, + "grad_norm": 2.043754552160689, + "learning_rate": 5.8959154042640855e-06, + "loss": 0.3501, + "step": 8438 + }, + { + "epoch": 1.291354246365723, + "grad_norm": 2.0111371417155053, + "learning_rate": 5.893655670721632e-06, + "loss": 0.3295, + "step": 8439 + }, + { + "epoch": 1.2915072685539404, + "grad_norm": 2.1845344064148677, + "learning_rate": 5.891396189365849e-06, + "loss": 0.3611, + "step": 8440 + }, + { + "epoch": 1.2916602907421577, + "grad_norm": 2.3597698023948865, + "learning_rate": 5.889136960335496e-06, + "loss": 0.3903, + "step": 8441 + }, + { + "epoch": 1.2918133129303748, + "grad_norm": 2.2960601544175567, + "learning_rate": 5.886877983769324e-06, + "loss": 0.3677, + "step": 8442 + }, + { + "epoch": 1.2919663351185922, + "grad_norm": 2.4107761408220028, + "learning_rate": 5.884619259806069e-06, + "loss": 0.357, + "step": 8443 + }, + { + "epoch": 1.2921193573068095, + "grad_norm": 2.211778674971672, + "learning_rate": 5.882360788584441e-06, + "loss": 0.3633, + "step": 8444 + }, + { + "epoch": 1.2922723794950268, + "grad_norm": 2.210377762764795, + "learning_rate": 
5.880102570243147e-06, + "loss": 0.4351, + "step": 8445 + }, + { + "epoch": 1.2924254016832442, + "grad_norm": 2.0353077772274246, + "learning_rate": 5.877844604920869e-06, + "loss": 0.3138, + "step": 8446 + }, + { + "epoch": 1.2925784238714613, + "grad_norm": 2.1796447244885417, + "learning_rate": 5.875586892756284e-06, + "loss": 0.4046, + "step": 8447 + }, + { + "epoch": 1.2927314460596786, + "grad_norm": 2.1991555410920003, + "learning_rate": 5.873329433888042e-06, + "loss": 0.3785, + "step": 8448 + }, + { + "epoch": 1.292884468247896, + "grad_norm": 2.214367249698276, + "learning_rate": 5.87107222845478e-06, + "loss": 0.3908, + "step": 8449 + }, + { + "epoch": 1.2930374904361133, + "grad_norm": 2.261557704663619, + "learning_rate": 5.8688152765951274e-06, + "loss": 0.4244, + "step": 8450 + }, + { + "epoch": 1.2931905126243306, + "grad_norm": 2.099925902912094, + "learning_rate": 5.86655857844769e-06, + "loss": 0.359, + "step": 8451 + }, + { + "epoch": 1.2933435348125477, + "grad_norm": 2.5580065217380406, + "learning_rate": 5.8643021341510576e-06, + "loss": 0.3966, + "step": 8452 + }, + { + "epoch": 1.293496557000765, + "grad_norm": 2.334103559025696, + "learning_rate": 5.8620459438438145e-06, + "loss": 0.3454, + "step": 8453 + }, + { + "epoch": 1.2936495791889824, + "grad_norm": 2.208197675578791, + "learning_rate": 5.8597900076645135e-06, + "loss": 0.4068, + "step": 8454 + }, + { + "epoch": 1.2938026013771997, + "grad_norm": 2.158637856805305, + "learning_rate": 5.857534325751706e-06, + "loss": 0.4008, + "step": 8455 + }, + { + "epoch": 1.293955623565417, + "grad_norm": 2.076159021580232, + "learning_rate": 5.8552788982439234e-06, + "loss": 0.3684, + "step": 8456 + }, + { + "epoch": 1.2941086457536342, + "grad_norm": 2.2118198370758497, + "learning_rate": 5.853023725279675e-06, + "loss": 0.4319, + "step": 8457 + }, + { + "epoch": 1.2942616679418515, + "grad_norm": 2.0053899150292107, + "learning_rate": 5.850768806997465e-06, + "loss": 0.3374, + "step": 8458 + }, + { + "epoch": 1.2944146901300688, + "grad_norm": 2.1800900300923005, + "learning_rate": 5.848514143535774e-06, + "loss": 0.4282, + "step": 8459 + }, + { + "epoch": 1.2945677123182862, + "grad_norm": 2.2040001586963505, + "learning_rate": 5.846259735033068e-06, + "loss": 0.3899, + "step": 8460 + }, + { + "epoch": 1.2947207345065035, + "grad_norm": 2.14628589047775, + "learning_rate": 5.844005581627803e-06, + "loss": 0.3827, + "step": 8461 + }, + { + "epoch": 1.2948737566947206, + "grad_norm": 1.955815144083365, + "learning_rate": 5.84175168345841e-06, + "loss": 0.2907, + "step": 8462 + }, + { + "epoch": 1.2950267788829382, + "grad_norm": 2.2990986634509496, + "learning_rate": 5.8394980406633185e-06, + "loss": 0.3976, + "step": 8463 + }, + { + "epoch": 1.2951798010711553, + "grad_norm": 2.3551470293805488, + "learning_rate": 5.837244653380929e-06, + "loss": 0.4031, + "step": 8464 + }, + { + "epoch": 1.2953328232593726, + "grad_norm": 2.0455913786984032, + "learning_rate": 5.834991521749623e-06, + "loss": 0.3854, + "step": 8465 + }, + { + "epoch": 1.29548584544759, + "grad_norm": 2.295691500876195, + "learning_rate": 5.832738645907786e-06, + "loss": 0.4205, + "step": 8466 + }, + { + "epoch": 1.2956388676358073, + "grad_norm": 2.0712979508795573, + "learning_rate": 5.830486025993775e-06, + "loss": 0.3819, + "step": 8467 + }, + { + "epoch": 1.2957918898240246, + "grad_norm": 2.1216828024914287, + "learning_rate": 5.82823366214592e-06, + "loss": 0.3732, + "step": 8468 + }, + { + "epoch": 1.2959449120122417, + "grad_norm": 
1.850364780767729, + "learning_rate": 5.82598155450256e-06, + "loss": 0.3449, + "step": 8469 + }, + { + "epoch": 1.296097934200459, + "grad_norm": 2.0162996634036245, + "learning_rate": 5.823729703202001e-06, + "loss": 0.3521, + "step": 8470 + }, + { + "epoch": 1.2962509563886764, + "grad_norm": 1.8674820045225895, + "learning_rate": 5.82147810838254e-06, + "loss": 0.3005, + "step": 8471 + }, + { + "epoch": 1.2964039785768937, + "grad_norm": 2.190393768245424, + "learning_rate": 5.819226770182453e-06, + "loss": 0.4191, + "step": 8472 + }, + { + "epoch": 1.296557000765111, + "grad_norm": 2.494076811857044, + "learning_rate": 5.816975688740007e-06, + "loss": 0.4447, + "step": 8473 + }, + { + "epoch": 1.2967100229533282, + "grad_norm": 2.113249050025513, + "learning_rate": 5.814724864193449e-06, + "loss": 0.3346, + "step": 8474 + }, + { + "epoch": 1.2968630451415455, + "grad_norm": 2.3310835110243144, + "learning_rate": 5.8124742966810075e-06, + "loss": 0.5078, + "step": 8475 + }, + { + "epoch": 1.2970160673297628, + "grad_norm": 2.083303137693278, + "learning_rate": 5.810223986340901e-06, + "loss": 0.3528, + "step": 8476 + }, + { + "epoch": 1.2971690895179802, + "grad_norm": 2.081771653631222, + "learning_rate": 5.807973933311332e-06, + "loss": 0.3542, + "step": 8477 + }, + { + "epoch": 1.2973221117061975, + "grad_norm": 2.2464382632521436, + "learning_rate": 5.805724137730482e-06, + "loss": 0.3513, + "step": 8478 + }, + { + "epoch": 1.2974751338944146, + "grad_norm": 2.0902690420170305, + "learning_rate": 5.803474599736521e-06, + "loss": 0.334, + "step": 8479 + }, + { + "epoch": 1.297628156082632, + "grad_norm": 1.977981981638016, + "learning_rate": 5.801225319467602e-06, + "loss": 0.35, + "step": 8480 + }, + { + "epoch": 1.2977811782708493, + "grad_norm": 2.6637236988962694, + "learning_rate": 5.798976297061861e-06, + "loss": 0.4043, + "step": 8481 + }, + { + "epoch": 1.2979342004590666, + "grad_norm": 2.2270672736281747, + "learning_rate": 5.79672753265742e-06, + "loss": 0.3494, + "step": 8482 + }, + { + "epoch": 1.298087222647284, + "grad_norm": 2.0626813380311, + "learning_rate": 5.794479026392381e-06, + "loss": 0.3174, + "step": 8483 + }, + { + "epoch": 1.298240244835501, + "grad_norm": 2.392483617852613, + "learning_rate": 5.792230778404846e-06, + "loss": 0.3716, + "step": 8484 + }, + { + "epoch": 1.2983932670237184, + "grad_norm": 1.9548439126657664, + "learning_rate": 5.789982788832875e-06, + "loss": 0.3328, + "step": 8485 + }, + { + "epoch": 1.2985462892119357, + "grad_norm": 2.2712771203487097, + "learning_rate": 5.7877350578145265e-06, + "loss": 0.4683, + "step": 8486 + }, + { + "epoch": 1.298699311400153, + "grad_norm": 2.1349081647901396, + "learning_rate": 5.785487585487855e-06, + "loss": 0.3355, + "step": 8487 + }, + { + "epoch": 1.2988523335883704, + "grad_norm": 2.0144446923386585, + "learning_rate": 5.7832403719908735e-06, + "loss": 0.3535, + "step": 8488 + }, + { + "epoch": 1.2990053557765875, + "grad_norm": 2.323913427366677, + "learning_rate": 5.7809934174615935e-06, + "loss": 0.3846, + "step": 8489 + }, + { + "epoch": 1.2991583779648048, + "grad_norm": 2.2065277565739647, + "learning_rate": 5.77874672203802e-06, + "loss": 0.405, + "step": 8490 + }, + { + "epoch": 1.2993114001530222, + "grad_norm": 2.0701072367879694, + "learning_rate": 5.776500285858115e-06, + "loss": 0.3591, + "step": 8491 + }, + { + "epoch": 1.2994644223412395, + "grad_norm": 2.026467391159173, + "learning_rate": 5.774254109059856e-06, + "loss": 0.3801, + "step": 8492 + }, + { + "epoch": 
1.2996174445294568, + "grad_norm": 2.03636713492077, + "learning_rate": 5.772008191781187e-06, + "loss": 0.4068, + "step": 8493 + }, + { + "epoch": 1.299770466717674, + "grad_norm": 2.235728964125411, + "learning_rate": 5.769762534160026e-06, + "loss": 0.4005, + "step": 8494 + }, + { + "epoch": 1.2999234889058915, + "grad_norm": 2.2226179990971597, + "learning_rate": 5.7675171363343e-06, + "loss": 0.3848, + "step": 8495 + }, + { + "epoch": 1.3000765110941086, + "grad_norm": 1.8548276674323887, + "learning_rate": 5.765271998441909e-06, + "loss": 0.2832, + "step": 8496 + }, + { + "epoch": 1.300229533282326, + "grad_norm": 2.058654357219985, + "learning_rate": 5.763027120620723e-06, + "loss": 0.3759, + "step": 8497 + }, + { + "epoch": 1.3003825554705433, + "grad_norm": 2.1669192089657128, + "learning_rate": 5.760782503008622e-06, + "loss": 0.3865, + "step": 8498 + }, + { + "epoch": 1.3005355776587606, + "grad_norm": 2.2800499466539854, + "learning_rate": 5.758538145743448e-06, + "loss": 0.4721, + "step": 8499 + }, + { + "epoch": 1.300688599846978, + "grad_norm": 1.8692911785721809, + "learning_rate": 5.7562940489630424e-06, + "loss": 0.3094, + "step": 8500 + }, + { + "epoch": 1.300841622035195, + "grad_norm": 2.3481643066715803, + "learning_rate": 5.75405021280522e-06, + "loss": 0.3394, + "step": 8501 + }, + { + "epoch": 1.3009946442234124, + "grad_norm": 2.3364862272013154, + "learning_rate": 5.751806637407783e-06, + "loss": 0.4305, + "step": 8502 + }, + { + "epoch": 1.3011476664116297, + "grad_norm": 2.106806897595338, + "learning_rate": 5.7495633229085205e-06, + "loss": 0.3885, + "step": 8503 + }, + { + "epoch": 1.301300688599847, + "grad_norm": 2.131244287327404, + "learning_rate": 5.747320269445203e-06, + "loss": 0.3757, + "step": 8504 + }, + { + "epoch": 1.3014537107880644, + "grad_norm": 1.9915545641442498, + "learning_rate": 5.745077477155582e-06, + "loss": 0.3304, + "step": 8505 + }, + { + "epoch": 1.3016067329762815, + "grad_norm": 1.932676633650167, + "learning_rate": 5.7428349461774e-06, + "loss": 0.3865, + "step": 8506 + }, + { + "epoch": 1.3017597551644988, + "grad_norm": 2.3799151262160496, + "learning_rate": 5.740592676648377e-06, + "loss": 0.366, + "step": 8507 + }, + { + "epoch": 1.3019127773527162, + "grad_norm": 2.394625708639643, + "learning_rate": 5.7383506687062215e-06, + "loss": 0.4086, + "step": 8508 + }, + { + "epoch": 1.3020657995409335, + "grad_norm": 1.9055085009689208, + "learning_rate": 5.736108922488624e-06, + "loss": 0.3348, + "step": 8509 + }, + { + "epoch": 1.3022188217291508, + "grad_norm": 2.053303277029518, + "learning_rate": 5.733867438133256e-06, + "loss": 0.3045, + "step": 8510 + }, + { + "epoch": 1.302371843917368, + "grad_norm": 2.169386364248647, + "learning_rate": 5.731626215777779e-06, + "loss": 0.3458, + "step": 8511 + }, + { + "epoch": 1.3025248661055853, + "grad_norm": 2.127208555931068, + "learning_rate": 5.72938525555983e-06, + "loss": 0.3355, + "step": 8512 + }, + { + "epoch": 1.3026778882938026, + "grad_norm": 2.0541456236284463, + "learning_rate": 5.727144557617047e-06, + "loss": 0.4235, + "step": 8513 + }, + { + "epoch": 1.30283091048202, + "grad_norm": 2.1306174022427053, + "learning_rate": 5.724904122087028e-06, + "loss": 0.3497, + "step": 8514 + }, + { + "epoch": 1.3029839326702373, + "grad_norm": 1.9520284290277785, + "learning_rate": 5.722663949107368e-06, + "loss": 0.4199, + "step": 8515 + }, + { + "epoch": 1.3031369548584544, + "grad_norm": 2.319002474956079, + "learning_rate": 5.720424038815655e-06, + "loss": 0.3666, + "step": 
8516 + }, + { + "epoch": 1.3032899770466717, + "grad_norm": 2.193066933790192, + "learning_rate": 5.71818439134944e-06, + "loss": 0.3682, + "step": 8517 + }, + { + "epoch": 1.303442999234889, + "grad_norm": 2.064281725208092, + "learning_rate": 5.715945006846269e-06, + "loss": 0.3089, + "step": 8518 + }, + { + "epoch": 1.3035960214231064, + "grad_norm": 2.165584682981705, + "learning_rate": 5.713705885443681e-06, + "loss": 0.4026, + "step": 8519 + }, + { + "epoch": 1.3037490436113237, + "grad_norm": 2.1493436478964374, + "learning_rate": 5.711467027279174e-06, + "loss": 0.342, + "step": 8520 + }, + { + "epoch": 1.3039020657995408, + "grad_norm": 1.7198630036611893, + "learning_rate": 5.709228432490259e-06, + "loss": 0.3119, + "step": 8521 + }, + { + "epoch": 1.3040550879877582, + "grad_norm": 2.053429899418811, + "learning_rate": 5.7069901012144155e-06, + "loss": 0.3461, + "step": 8522 + }, + { + "epoch": 1.3042081101759755, + "grad_norm": 2.191895371424377, + "learning_rate": 5.704752033589096e-06, + "loss": 0.3762, + "step": 8523 + }, + { + "epoch": 1.3043611323641928, + "grad_norm": 2.176273855972572, + "learning_rate": 5.702514229751761e-06, + "loss": 0.3912, + "step": 8524 + }, + { + "epoch": 1.3045141545524102, + "grad_norm": 2.1364346441788316, + "learning_rate": 5.70027668983984e-06, + "loss": 0.3702, + "step": 8525 + }, + { + "epoch": 1.3046671767406273, + "grad_norm": 2.1394636414100354, + "learning_rate": 5.698039413990748e-06, + "loss": 0.3959, + "step": 8526 + }, + { + "epoch": 1.3048201989288448, + "grad_norm": 2.0727055307523643, + "learning_rate": 5.6958024023418834e-06, + "loss": 0.3547, + "step": 8527 + }, + { + "epoch": 1.304973221117062, + "grad_norm": 2.044965766447203, + "learning_rate": 5.6935656550306326e-06, + "loss": 0.3701, + "step": 8528 + }, + { + "epoch": 1.3051262433052793, + "grad_norm": 2.259607204648534, + "learning_rate": 5.691329172194362e-06, + "loss": 0.3761, + "step": 8529 + }, + { + "epoch": 1.3052792654934966, + "grad_norm": 2.1227844121777326, + "learning_rate": 5.689092953970421e-06, + "loss": 0.4002, + "step": 8530 + }, + { + "epoch": 1.3054322876817137, + "grad_norm": 2.2885565622270008, + "learning_rate": 5.6868570004961474e-06, + "loss": 0.3782, + "step": 8531 + }, + { + "epoch": 1.3055853098699313, + "grad_norm": 2.2684685389350303, + "learning_rate": 5.684621311908859e-06, + "loss": 0.3547, + "step": 8532 + }, + { + "epoch": 1.3057383320581484, + "grad_norm": 2.2468869954835675, + "learning_rate": 5.682385888345856e-06, + "loss": 0.315, + "step": 8533 + }, + { + "epoch": 1.3058913542463657, + "grad_norm": 2.2885821903709296, + "learning_rate": 5.680150729944427e-06, + "loss": 0.3492, + "step": 8534 + }, + { + "epoch": 1.306044376434583, + "grad_norm": 2.2283745517249582, + "learning_rate": 5.677915836841842e-06, + "loss": 0.4227, + "step": 8535 + }, + { + "epoch": 1.3061973986228004, + "grad_norm": 2.10231696253404, + "learning_rate": 5.675681209175349e-06, + "loss": 0.3635, + "step": 8536 + }, + { + "epoch": 1.3063504208110177, + "grad_norm": 2.233717020395182, + "learning_rate": 5.673446847082197e-06, + "loss": 0.4036, + "step": 8537 + }, + { + "epoch": 1.3065034429992348, + "grad_norm": 1.9310872009083184, + "learning_rate": 5.671212750699597e-06, + "loss": 0.2954, + "step": 8538 + }, + { + "epoch": 1.3066564651874522, + "grad_norm": 2.030845760878608, + "learning_rate": 5.6689789201647505e-06, + "loss": 0.3745, + "step": 8539 + }, + { + "epoch": 1.3068094873756695, + "grad_norm": 1.8189427337342248, + "learning_rate": 
5.66674535561486e-06, + "loss": 0.2948, + "step": 8540 + }, + { + "epoch": 1.3069625095638868, + "grad_norm": 2.0469441672632027, + "learning_rate": 5.664512057187081e-06, + "loss": 0.3323, + "step": 8541 + }, + { + "epoch": 1.3071155317521042, + "grad_norm": 2.0924553585486403, + "learning_rate": 5.662279025018582e-06, + "loss": 0.407, + "step": 8542 + }, + { + "epoch": 1.3072685539403213, + "grad_norm": 2.3896033125336005, + "learning_rate": 5.660046259246501e-06, + "loss": 0.4156, + "step": 8543 + }, + { + "epoch": 1.3074215761285386, + "grad_norm": 2.1963052778718253, + "learning_rate": 5.65781376000795e-06, + "loss": 0.3667, + "step": 8544 + }, + { + "epoch": 1.307574598316756, + "grad_norm": 2.2404897511730404, + "learning_rate": 5.6555815274400464e-06, + "loss": 0.3412, + "step": 8545 + }, + { + "epoch": 1.3077276205049733, + "grad_norm": 1.976985343614817, + "learning_rate": 5.65334956167988e-06, + "loss": 0.3942, + "step": 8546 + }, + { + "epoch": 1.3078806426931906, + "grad_norm": 2.260212423328555, + "learning_rate": 5.651117862864512e-06, + "loss": 0.3259, + "step": 8547 + }, + { + "epoch": 1.3080336648814077, + "grad_norm": 2.164028887318763, + "learning_rate": 5.6488864311310155e-06, + "loss": 0.3614, + "step": 8548 + }, + { + "epoch": 1.308186687069625, + "grad_norm": 2.30626799094876, + "learning_rate": 5.646655266616423e-06, + "loss": 0.4431, + "step": 8549 + }, + { + "epoch": 1.3083397092578424, + "grad_norm": 1.9692214406859057, + "learning_rate": 5.6444243694577615e-06, + "loss": 0.4161, + "step": 8550 + }, + { + "epoch": 1.3084927314460597, + "grad_norm": 2.1853722471880594, + "learning_rate": 5.642193739792039e-06, + "loss": 0.3402, + "step": 8551 + }, + { + "epoch": 1.308645753634277, + "grad_norm": 2.0970827282400992, + "learning_rate": 5.639963377756247e-06, + "loss": 0.4279, + "step": 8552 + }, + { + "epoch": 1.3087987758224942, + "grad_norm": 2.177870812573683, + "learning_rate": 5.63773328348736e-06, + "loss": 0.3626, + "step": 8553 + }, + { + "epoch": 1.3089517980107115, + "grad_norm": 2.591283101525571, + "learning_rate": 5.635503457122338e-06, + "loss": 0.4168, + "step": 8554 + }, + { + "epoch": 1.3091048201989288, + "grad_norm": 1.9694916240067648, + "learning_rate": 5.633273898798122e-06, + "loss": 0.3234, + "step": 8555 + }, + { + "epoch": 1.3092578423871462, + "grad_norm": 2.290182102724067, + "learning_rate": 5.631044608651639e-06, + "loss": 0.3949, + "step": 8556 + }, + { + "epoch": 1.3094108645753635, + "grad_norm": 2.276801072822145, + "learning_rate": 5.628815586819797e-06, + "loss": 0.3722, + "step": 8557 + }, + { + "epoch": 1.3095638867635806, + "grad_norm": 2.1922935542275392, + "learning_rate": 5.626586833439491e-06, + "loss": 0.2876, + "step": 8558 + }, + { + "epoch": 1.309716908951798, + "grad_norm": 2.125655685633201, + "learning_rate": 5.6243583486475965e-06, + "loss": 0.3184, + "step": 8559 + }, + { + "epoch": 1.3098699311400153, + "grad_norm": 2.290520817699174, + "learning_rate": 5.622130132580973e-06, + "loss": 0.4141, + "step": 8560 + }, + { + "epoch": 1.3100229533282326, + "grad_norm": 2.162194717394431, + "learning_rate": 5.619902185376465e-06, + "loss": 0.4075, + "step": 8561 + }, + { + "epoch": 1.31017597551645, + "grad_norm": 2.300845062414474, + "learning_rate": 5.617674507170899e-06, + "loss": 0.3332, + "step": 8562 + }, + { + "epoch": 1.310328997704667, + "grad_norm": 2.2539854405846516, + "learning_rate": 5.615447098101083e-06, + "loss": 0.3666, + "step": 8563 + }, + { + "epoch": 1.3104820198928846, + "grad_norm": 
2.0871045714190015, + "learning_rate": 5.613219958303815e-06, + "loss": 0.3525, + "step": 8564 + }, + { + "epoch": 1.3106350420811017, + "grad_norm": 2.2476746690566896, + "learning_rate": 5.6109930879158656e-06, + "loss": 0.3941, + "step": 8565 + }, + { + "epoch": 1.310788064269319, + "grad_norm": 2.111502301320883, + "learning_rate": 5.60876648707401e-06, + "loss": 0.3222, + "step": 8566 + }, + { + "epoch": 1.3109410864575364, + "grad_norm": 2.217927916834475, + "learning_rate": 5.606540155914978e-06, + "loss": 0.3817, + "step": 8567 + }, + { + "epoch": 1.3110941086457537, + "grad_norm": 2.1948887182227312, + "learning_rate": 5.604314094575498e-06, + "loss": 0.3711, + "step": 8568 + }, + { + "epoch": 1.311247130833971, + "grad_norm": 2.376746522845046, + "learning_rate": 5.602088303192295e-06, + "loss": 0.4375, + "step": 8569 + }, + { + "epoch": 1.3114001530221882, + "grad_norm": 2.0035516123023993, + "learning_rate": 5.599862781902049e-06, + "loss": 0.3363, + "step": 8570 + }, + { + "epoch": 1.3115531752104055, + "grad_norm": 2.0738010508442755, + "learning_rate": 5.59763753084144e-06, + "loss": 0.3137, + "step": 8571 + }, + { + "epoch": 1.3117061973986228, + "grad_norm": 2.0745210501379217, + "learning_rate": 5.595412550147139e-06, + "loss": 0.3249, + "step": 8572 + }, + { + "epoch": 1.3118592195868402, + "grad_norm": 2.267526336468913, + "learning_rate": 5.593187839955776e-06, + "loss": 0.3783, + "step": 8573 + }, + { + "epoch": 1.3120122417750575, + "grad_norm": 2.3694203084709935, + "learning_rate": 5.590963400403993e-06, + "loss": 0.3513, + "step": 8574 + }, + { + "epoch": 1.3121652639632746, + "grad_norm": 2.012157483197845, + "learning_rate": 5.588739231628397e-06, + "loss": 0.2914, + "step": 8575 + }, + { + "epoch": 1.312318286151492, + "grad_norm": 2.0659451699838502, + "learning_rate": 5.586515333765577e-06, + "loss": 0.3701, + "step": 8576 + }, + { + "epoch": 1.3124713083397093, + "grad_norm": 1.921731703819784, + "learning_rate": 5.584291706952119e-06, + "loss": 0.2673, + "step": 8577 + }, + { + "epoch": 1.3126243305279266, + "grad_norm": 2.1435100630609893, + "learning_rate": 5.58206835132458e-06, + "loss": 0.3065, + "step": 8578 + }, + { + "epoch": 1.312777352716144, + "grad_norm": 2.178374984629396, + "learning_rate": 5.579845267019508e-06, + "loss": 0.3687, + "step": 8579 + }, + { + "epoch": 1.312930374904361, + "grad_norm": 2.4781271076321207, + "learning_rate": 5.577622454173429e-06, + "loss": 0.4722, + "step": 8580 + }, + { + "epoch": 1.3130833970925784, + "grad_norm": 2.085509746046495, + "learning_rate": 5.5753999129228565e-06, + "loss": 0.3132, + "step": 8581 + }, + { + "epoch": 1.3132364192807957, + "grad_norm": 1.9262893935758099, + "learning_rate": 5.5731776434042835e-06, + "loss": 0.3986, + "step": 8582 + }, + { + "epoch": 1.313389441469013, + "grad_norm": 1.9838375146009155, + "learning_rate": 5.57095564575419e-06, + "loss": 0.2953, + "step": 8583 + }, + { + "epoch": 1.3135424636572304, + "grad_norm": 2.10472370597914, + "learning_rate": 5.568733920109038e-06, + "loss": 0.3327, + "step": 8584 + }, + { + "epoch": 1.3136954858454475, + "grad_norm": 1.9445119541733116, + "learning_rate": 5.5665124666052695e-06, + "loss": 0.3065, + "step": 8585 + }, + { + "epoch": 1.3138485080336648, + "grad_norm": 2.603442112632636, + "learning_rate": 5.564291285379314e-06, + "loss": 0.3505, + "step": 8586 + }, + { + "epoch": 1.3140015302218822, + "grad_norm": 2.0445384664789885, + "learning_rate": 5.562070376567585e-06, + "loss": 0.3704, + "step": 8587 + }, + { + "epoch": 
1.3141545524100995, + "grad_norm": 2.1157344605778827, + "learning_rate": 5.559849740306474e-06, + "loss": 0.3614, + "step": 8588 + }, + { + "epoch": 1.3143075745983168, + "grad_norm": 2.13504060924164, + "learning_rate": 5.5576293767323606e-06, + "loss": 0.3657, + "step": 8589 + }, + { + "epoch": 1.314460596786534, + "grad_norm": 2.289321737350508, + "learning_rate": 5.555409285981605e-06, + "loss": 0.3704, + "step": 8590 + }, + { + "epoch": 1.3146136189747513, + "grad_norm": 2.1213736767351055, + "learning_rate": 5.553189468190554e-06, + "loss": 0.3293, + "step": 8591 + }, + { + "epoch": 1.3147666411629686, + "grad_norm": 2.2923070600542075, + "learning_rate": 5.550969923495533e-06, + "loss": 0.3757, + "step": 8592 + }, + { + "epoch": 1.314919663351186, + "grad_norm": 1.9832544875286664, + "learning_rate": 5.548750652032855e-06, + "loss": 0.2989, + "step": 8593 + }, + { + "epoch": 1.3150726855394033, + "grad_norm": 2.1527441947882116, + "learning_rate": 5.546531653938807e-06, + "loss": 0.3381, + "step": 8594 + }, + { + "epoch": 1.3152257077276204, + "grad_norm": 2.212591146712386, + "learning_rate": 5.5443129293496825e-06, + "loss": 0.3609, + "step": 8595 + }, + { + "epoch": 1.315378729915838, + "grad_norm": 2.2950267556502757, + "learning_rate": 5.542094478401725e-06, + "loss": 0.4125, + "step": 8596 + }, + { + "epoch": 1.315531752104055, + "grad_norm": 2.0985948379512593, + "learning_rate": 5.539876301231182e-06, + "loss": 0.3706, + "step": 8597 + }, + { + "epoch": 1.3156847742922724, + "grad_norm": 2.167223206670299, + "learning_rate": 5.537658397974293e-06, + "loss": 0.3895, + "step": 8598 + }, + { + "epoch": 1.3158377964804897, + "grad_norm": 2.4820509629216074, + "learning_rate": 5.535440768767253e-06, + "loss": 0.3807, + "step": 8599 + }, + { + "epoch": 1.315990818668707, + "grad_norm": 2.2159112092687896, + "learning_rate": 5.533223413746256e-06, + "loss": 0.3477, + "step": 8600 + }, + { + "epoch": 1.3161438408569244, + "grad_norm": 2.2364380030208326, + "learning_rate": 5.531006333047493e-06, + "loss": 0.3696, + "step": 8601 + }, + { + "epoch": 1.3162968630451415, + "grad_norm": 1.829042244844607, + "learning_rate": 5.528789526807105e-06, + "loss": 0.2988, + "step": 8602 + }, + { + "epoch": 1.3164498852333588, + "grad_norm": 2.061590745235742, + "learning_rate": 5.526572995161248e-06, + "loss": 0.3311, + "step": 8603 + }, + { + "epoch": 1.3166029074215762, + "grad_norm": 2.133150535391964, + "learning_rate": 5.524356738246048e-06, + "loss": 0.3052, + "step": 8604 + }, + { + "epoch": 1.3167559296097935, + "grad_norm": 2.0168388184124106, + "learning_rate": 5.5221407561976e-06, + "loss": 0.3656, + "step": 8605 + }, + { + "epoch": 1.3169089517980108, + "grad_norm": 2.2695540004967096, + "learning_rate": 5.519925049152011e-06, + "loss": 0.3959, + "step": 8606 + }, + { + "epoch": 1.317061973986228, + "grad_norm": 2.305770870691185, + "learning_rate": 5.51770961724535e-06, + "loss": 0.4459, + "step": 8607 + }, + { + "epoch": 1.3172149961744453, + "grad_norm": 2.2371126516169797, + "learning_rate": 5.515494460613678e-06, + "loss": 0.3856, + "step": 8608 + }, + { + "epoch": 1.3173680183626626, + "grad_norm": 2.2476766777179322, + "learning_rate": 5.513279579393034e-06, + "loss": 0.3237, + "step": 8609 + }, + { + "epoch": 1.31752104055088, + "grad_norm": 2.010718411909031, + "learning_rate": 5.511064973719442e-06, + "loss": 0.4544, + "step": 8610 + }, + { + "epoch": 1.3176740627390973, + "grad_norm": 1.8265150494873308, + "learning_rate": 5.508850643728911e-06, + "loss": 0.2927, + 
"step": 8611 + }, + { + "epoch": 1.3178270849273144, + "grad_norm": 2.1922469791566797, + "learning_rate": 5.506636589557433e-06, + "loss": 0.3636, + "step": 8612 + }, + { + "epoch": 1.3179801071155317, + "grad_norm": 1.9522370911195468, + "learning_rate": 5.504422811340977e-06, + "loss": 0.345, + "step": 8613 + }, + { + "epoch": 1.318133129303749, + "grad_norm": 1.8985775896312533, + "learning_rate": 5.502209309215505e-06, + "loss": 0.3437, + "step": 8614 + }, + { + "epoch": 1.3182861514919664, + "grad_norm": 2.531908097820924, + "learning_rate": 5.499996083316952e-06, + "loss": 0.3961, + "step": 8615 + }, + { + "epoch": 1.3184391736801837, + "grad_norm": 2.300599284977995, + "learning_rate": 5.49778313378125e-06, + "loss": 0.3706, + "step": 8616 + }, + { + "epoch": 1.3185921958684008, + "grad_norm": 1.9660351647994214, + "learning_rate": 5.495570460744295e-06, + "loss": 0.3349, + "step": 8617 + }, + { + "epoch": 1.3187452180566182, + "grad_norm": 2.320828623988891, + "learning_rate": 5.4933580643419744e-06, + "loss": 0.3809, + "step": 8618 + }, + { + "epoch": 1.3188982402448355, + "grad_norm": 2.2384342613355557, + "learning_rate": 5.491145944710174e-06, + "loss": 0.3401, + "step": 8619 + }, + { + "epoch": 1.3190512624330528, + "grad_norm": 1.8784193029515464, + "learning_rate": 5.488934101984737e-06, + "loss": 0.3359, + "step": 8620 + }, + { + "epoch": 1.3192042846212702, + "grad_norm": 2.22885101285974, + "learning_rate": 5.486722536301499e-06, + "loss": 0.3814, + "step": 8621 + }, + { + "epoch": 1.3193573068094873, + "grad_norm": 2.518679488884114, + "learning_rate": 5.4845112477962936e-06, + "loss": 0.4485, + "step": 8622 + }, + { + "epoch": 1.3195103289977046, + "grad_norm": 2.2918568209225616, + "learning_rate": 5.482300236604909e-06, + "loss": 0.3204, + "step": 8623 + }, + { + "epoch": 1.319663351185922, + "grad_norm": 2.483464519209941, + "learning_rate": 5.4800895028631466e-06, + "loss": 0.3592, + "step": 8624 + }, + { + "epoch": 1.3198163733741393, + "grad_norm": 2.2598966067125104, + "learning_rate": 5.477879046706771e-06, + "loss": 0.373, + "step": 8625 + }, + { + "epoch": 1.3199693955623566, + "grad_norm": 2.162251025832532, + "learning_rate": 5.475668868271527e-06, + "loss": 0.3023, + "step": 8626 + }, + { + "epoch": 1.3201224177505737, + "grad_norm": 2.055830955710422, + "learning_rate": 5.473458967693162e-06, + "loss": 0.3665, + "step": 8627 + }, + { + "epoch": 1.3202754399387913, + "grad_norm": 2.3676870900519495, + "learning_rate": 5.471249345107392e-06, + "loss": 0.3866, + "step": 8628 + }, + { + "epoch": 1.3204284621270084, + "grad_norm": 2.2301256287820377, + "learning_rate": 5.469040000649909e-06, + "loss": 0.45, + "step": 8629 + }, + { + "epoch": 1.3205814843152257, + "grad_norm": 2.250482208887682, + "learning_rate": 5.466830934456409e-06, + "loss": 0.2956, + "step": 8630 + }, + { + "epoch": 1.320734506503443, + "grad_norm": 1.9602055867205517, + "learning_rate": 5.464622146662555e-06, + "loss": 0.3221, + "step": 8631 + }, + { + "epoch": 1.3208875286916604, + "grad_norm": 1.991138809691683, + "learning_rate": 5.462413637403997e-06, + "loss": 0.3425, + "step": 8632 + }, + { + "epoch": 1.3210405508798777, + "grad_norm": 2.265706180431464, + "learning_rate": 5.4602054068163705e-06, + "loss": 0.3718, + "step": 8633 + }, + { + "epoch": 1.3211935730680948, + "grad_norm": 2.437289055001441, + "learning_rate": 5.457997455035286e-06, + "loss": 0.3938, + "step": 8634 + }, + { + "epoch": 1.3213465952563122, + "grad_norm": 2.093157026239112, + "learning_rate": 
5.45578978219635e-06, + "loss": 0.3366, + "step": 8635 + }, + { + "epoch": 1.3214996174445295, + "grad_norm": 2.113182376884977, + "learning_rate": 5.453582388435139e-06, + "loss": 0.3187, + "step": 8636 + }, + { + "epoch": 1.3216526396327468, + "grad_norm": 2.3073790380831802, + "learning_rate": 5.451375273887219e-06, + "loss": 0.3507, + "step": 8637 + }, + { + "epoch": 1.3218056618209642, + "grad_norm": 2.0134841418393075, + "learning_rate": 5.449168438688138e-06, + "loss": 0.3791, + "step": 8638 + }, + { + "epoch": 1.3219586840091813, + "grad_norm": 2.1274435392661935, + "learning_rate": 5.446961882973427e-06, + "loss": 0.3967, + "step": 8639 + }, + { + "epoch": 1.3221117061973986, + "grad_norm": 2.2861859533824056, + "learning_rate": 5.444755606878598e-06, + "loss": 0.3898, + "step": 8640 + }, + { + "epoch": 1.322264728385616, + "grad_norm": 2.4796974235976386, + "learning_rate": 5.442549610539146e-06, + "loss": 0.428, + "step": 8641 + }, + { + "epoch": 1.3224177505738333, + "grad_norm": 2.338041372087975, + "learning_rate": 5.440343894090552e-06, + "loss": 0.3597, + "step": 8642 + }, + { + "epoch": 1.3225707727620506, + "grad_norm": 2.2580701341379497, + "learning_rate": 5.438138457668277e-06, + "loss": 0.4203, + "step": 8643 + }, + { + "epoch": 1.3227237949502677, + "grad_norm": 2.1502655499210204, + "learning_rate": 5.435933301407765e-06, + "loss": 0.2846, + "step": 8644 + }, + { + "epoch": 1.322876817138485, + "grad_norm": 2.3728101853670616, + "learning_rate": 5.433728425444444e-06, + "loss": 0.3727, + "step": 8645 + }, + { + "epoch": 1.3230298393267024, + "grad_norm": 2.45813606253095, + "learning_rate": 5.431523829913725e-06, + "loss": 0.4305, + "step": 8646 + }, + { + "epoch": 1.3231828615149197, + "grad_norm": 1.9190440302219542, + "learning_rate": 5.4293195149509945e-06, + "loss": 0.3219, + "step": 8647 + }, + { + "epoch": 1.323335883703137, + "grad_norm": 2.0409914653263064, + "learning_rate": 5.427115480691642e-06, + "loss": 0.33, + "step": 8648 + }, + { + "epoch": 1.3234889058913542, + "grad_norm": 2.264838190331384, + "learning_rate": 5.424911727271012e-06, + "loss": 0.3573, + "step": 8649 + }, + { + "epoch": 1.3236419280795715, + "grad_norm": 2.1410706263548462, + "learning_rate": 5.422708254824447e-06, + "loss": 0.3522, + "step": 8650 + }, + { + "epoch": 1.3237949502677888, + "grad_norm": 2.4540891731226884, + "learning_rate": 5.420505063487282e-06, + "loss": 0.4584, + "step": 8651 + }, + { + "epoch": 1.3239479724560062, + "grad_norm": 2.248809644345026, + "learning_rate": 5.418302153394809e-06, + "loss": 0.2998, + "step": 8652 + }, + { + "epoch": 1.3241009946442235, + "grad_norm": 2.0834229656833516, + "learning_rate": 5.4160995246823275e-06, + "loss": 0.3484, + "step": 8653 + }, + { + "epoch": 1.3242540168324406, + "grad_norm": 2.017276455572411, + "learning_rate": 5.413897177485112e-06, + "loss": 0.3551, + "step": 8654 + }, + { + "epoch": 1.324407039020658, + "grad_norm": 2.1939725955162674, + "learning_rate": 5.411695111938402e-06, + "loss": 0.3737, + "step": 8655 + }, + { + "epoch": 1.3245600612088753, + "grad_norm": 2.5776750204395222, + "learning_rate": 5.409493328177451e-06, + "loss": 0.3604, + "step": 8656 + }, + { + "epoch": 1.3247130833970926, + "grad_norm": 2.1881840047583956, + "learning_rate": 5.407291826337475e-06, + "loss": 0.3046, + "step": 8657 + }, + { + "epoch": 1.32486610558531, + "grad_norm": 2.282370930865902, + "learning_rate": 5.405090606553667e-06, + "loss": 0.3554, + "step": 8658 + }, + { + "epoch": 1.325019127773527, + "grad_norm": 
2.366658092343715, + "learning_rate": 5.402889668961225e-06, + "loss": 0.4061, + "step": 8659 + }, + { + "epoch": 1.3251721499617444, + "grad_norm": 2.232569419388663, + "learning_rate": 5.400689013695311e-06, + "loss": 0.3978, + "step": 8660 + }, + { + "epoch": 1.3253251721499617, + "grad_norm": 2.129469851628214, + "learning_rate": 5.398488640891079e-06, + "loss": 0.3334, + "step": 8661 + }, + { + "epoch": 1.325478194338179, + "grad_norm": 2.4309004407524863, + "learning_rate": 5.3962885506836594e-06, + "loss": 0.415, + "step": 8662 + }, + { + "epoch": 1.3256312165263964, + "grad_norm": 2.0085523161120835, + "learning_rate": 5.39408874320817e-06, + "loss": 0.3572, + "step": 8663 + }, + { + "epoch": 1.3257842387146135, + "grad_norm": 2.117619652748608, + "learning_rate": 5.391889218599711e-06, + "loss": 0.3793, + "step": 8664 + }, + { + "epoch": 1.325937260902831, + "grad_norm": 2.451783913784382, + "learning_rate": 5.38968997699336e-06, + "loss": 0.4311, + "step": 8665 + }, + { + "epoch": 1.3260902830910481, + "grad_norm": 2.1107048901080963, + "learning_rate": 5.387491018524184e-06, + "loss": 0.3542, + "step": 8666 + }, + { + "epoch": 1.3262433052792655, + "grad_norm": 2.220269332344632, + "learning_rate": 5.385292343327229e-06, + "loss": 0.3762, + "step": 8667 + }, + { + "epoch": 1.3263963274674828, + "grad_norm": 2.3319220579245328, + "learning_rate": 5.383093951537524e-06, + "loss": 0.4135, + "step": 8668 + }, + { + "epoch": 1.3265493496557001, + "grad_norm": 2.1507183519468236, + "learning_rate": 5.380895843290079e-06, + "loss": 0.3431, + "step": 8669 + }, + { + "epoch": 1.3267023718439175, + "grad_norm": 2.010715555568859, + "learning_rate": 5.3786980187198925e-06, + "loss": 0.3584, + "step": 8670 + }, + { + "epoch": 1.3268553940321346, + "grad_norm": 2.335760014272626, + "learning_rate": 5.376500477961939e-06, + "loss": 0.3866, + "step": 8671 + }, + { + "epoch": 1.327008416220352, + "grad_norm": 2.3548091082451705, + "learning_rate": 5.374303221151178e-06, + "loss": 0.4641, + "step": 8672 + }, + { + "epoch": 1.3271614384085693, + "grad_norm": 2.077535323157751, + "learning_rate": 5.372106248422551e-06, + "loss": 0.3462, + "step": 8673 + }, + { + "epoch": 1.3273144605967866, + "grad_norm": 2.150316112354047, + "learning_rate": 5.369909559910985e-06, + "loss": 0.3316, + "step": 8674 + }, + { + "epoch": 1.327467482785004, + "grad_norm": 2.0408943761398035, + "learning_rate": 5.367713155751386e-06, + "loss": 0.2938, + "step": 8675 + }, + { + "epoch": 1.327620504973221, + "grad_norm": 2.0607758587739786, + "learning_rate": 5.36551703607864e-06, + "loss": 0.3099, + "step": 8676 + }, + { + "epoch": 1.3277735271614384, + "grad_norm": 2.173805194803525, + "learning_rate": 5.3633212010276295e-06, + "loss": 0.3144, + "step": 8677 + }, + { + "epoch": 1.3279265493496557, + "grad_norm": 2.1294183601816563, + "learning_rate": 5.3611256507332e-06, + "loss": 0.3466, + "step": 8678 + }, + { + "epoch": 1.328079571537873, + "grad_norm": 2.0559929103742736, + "learning_rate": 5.358930385330188e-06, + "loss": 0.3814, + "step": 8679 + }, + { + "epoch": 1.3282325937260904, + "grad_norm": 2.164937780009964, + "learning_rate": 5.356735404953424e-06, + "loss": 0.336, + "step": 8680 + }, + { + "epoch": 1.3283856159143075, + "grad_norm": 2.0680556793090505, + "learning_rate": 5.354540709737696e-06, + "loss": 0.3517, + "step": 8681 + }, + { + "epoch": 1.3285386381025248, + "grad_norm": 2.2848427066316033, + "learning_rate": 5.352346299817799e-06, + "loss": 0.3799, + "step": 8682 + }, + { + "epoch": 
1.3286916602907421, + "grad_norm": 1.862547574716012, + "learning_rate": 5.350152175328502e-06, + "loss": 0.3432, + "step": 8683 + }, + { + "epoch": 1.3288446824789595, + "grad_norm": 2.1285348894501404, + "learning_rate": 5.347958336404544e-06, + "loss": 0.3473, + "step": 8684 + }, + { + "epoch": 1.3289977046671768, + "grad_norm": 2.15008899321762, + "learning_rate": 5.345764783180665e-06, + "loss": 0.356, + "step": 8685 + }, + { + "epoch": 1.329150726855394, + "grad_norm": 2.3855541790397847, + "learning_rate": 5.343571515791584e-06, + "loss": 0.4541, + "step": 8686 + }, + { + "epoch": 1.3293037490436113, + "grad_norm": 2.0715480983178383, + "learning_rate": 5.3413785343719835e-06, + "loss": 0.3749, + "step": 8687 + }, + { + "epoch": 1.3294567712318286, + "grad_norm": 2.2500551363113463, + "learning_rate": 5.339185839056555e-06, + "loss": 0.3332, + "step": 8688 + }, + { + "epoch": 1.329609793420046, + "grad_norm": 2.391261775337038, + "learning_rate": 5.336993429979958e-06, + "loss": 0.4095, + "step": 8689 + }, + { + "epoch": 1.3297628156082633, + "grad_norm": 2.1836965124937944, + "learning_rate": 5.3348013072768365e-06, + "loss": 0.3513, + "step": 8690 + }, + { + "epoch": 1.3299158377964804, + "grad_norm": 2.3000977897605535, + "learning_rate": 5.332609471081818e-06, + "loss": 0.4085, + "step": 8691 + }, + { + "epoch": 1.3300688599846977, + "grad_norm": 1.9898087771318698, + "learning_rate": 5.330417921529509e-06, + "loss": 0.3931, + "step": 8692 + }, + { + "epoch": 1.330221882172915, + "grad_norm": 2.323604812423909, + "learning_rate": 5.328226658754503e-06, + "loss": 0.3886, + "step": 8693 + }, + { + "epoch": 1.3303749043611324, + "grad_norm": 1.9935416913349375, + "learning_rate": 5.326035682891375e-06, + "loss": 0.2802, + "step": 8694 + }, + { + "epoch": 1.3305279265493497, + "grad_norm": 2.0321753727504985, + "learning_rate": 5.3238449940746805e-06, + "loss": 0.3618, + "step": 8695 + }, + { + "epoch": 1.3306809487375668, + "grad_norm": 2.543698718100647, + "learning_rate": 5.321654592438958e-06, + "loss": 0.3664, + "step": 8696 + }, + { + "epoch": 1.3308339709257844, + "grad_norm": 2.321863025318773, + "learning_rate": 5.319464478118723e-06, + "loss": 0.3786, + "step": 8697 + }, + { + "epoch": 1.3309869931140015, + "grad_norm": 2.1807540364309705, + "learning_rate": 5.317274651248495e-06, + "loss": 0.3745, + "step": 8698 + }, + { + "epoch": 1.3311400153022188, + "grad_norm": 1.9410874369549784, + "learning_rate": 5.3150851119627445e-06, + "loss": 0.2741, + "step": 8699 + }, + { + "epoch": 1.3312930374904361, + "grad_norm": 2.2986422088732272, + "learning_rate": 5.3128958603959415e-06, + "loss": 0.3781, + "step": 8700 + }, + { + "epoch": 1.3314460596786535, + "grad_norm": 1.8028486611526706, + "learning_rate": 5.310706896682547e-06, + "loss": 0.3409, + "step": 8701 + }, + { + "epoch": 1.3315990818668708, + "grad_norm": 1.9787947600298408, + "learning_rate": 5.308518220956983e-06, + "loss": 0.2612, + "step": 8702 + }, + { + "epoch": 1.331752104055088, + "grad_norm": 1.9880430933707562, + "learning_rate": 5.306329833353664e-06, + "loss": 0.3417, + "step": 8703 + }, + { + "epoch": 1.3319051262433053, + "grad_norm": 2.2725659984139157, + "learning_rate": 5.3041417340070005e-06, + "loss": 0.3574, + "step": 8704 + }, + { + "epoch": 1.3320581484315226, + "grad_norm": 2.1827794106466465, + "learning_rate": 5.301953923051354e-06, + "loss": 0.4218, + "step": 8705 + }, + { + "epoch": 1.33221117061974, + "grad_norm": 2.2996389614314876, + "learning_rate": 5.2997664006211e-06, + "loss": 0.3736, 
+ "step": 8706 + }, + { + "epoch": 1.3323641928079573, + "grad_norm": 2.250697167924539, + "learning_rate": 5.297579166850584e-06, + "loss": 0.3826, + "step": 8707 + }, + { + "epoch": 1.3325172149961744, + "grad_norm": 2.0961534983528147, + "learning_rate": 5.295392221874118e-06, + "loss": 0.4577, + "step": 8708 + }, + { + "epoch": 1.3326702371843917, + "grad_norm": 2.178073703599122, + "learning_rate": 5.293205565826025e-06, + "loss": 0.3955, + "step": 8709 + }, + { + "epoch": 1.332823259372609, + "grad_norm": 2.582183841226845, + "learning_rate": 5.2910191988405966e-06, + "loss": 0.429, + "step": 8710 + }, + { + "epoch": 1.3329762815608264, + "grad_norm": 2.6280302076259265, + "learning_rate": 5.28883312105209e-06, + "loss": 0.4491, + "step": 8711 + }, + { + "epoch": 1.3331293037490437, + "grad_norm": 2.4395766994785677, + "learning_rate": 5.286647332594779e-06, + "loss": 0.4561, + "step": 8712 + }, + { + "epoch": 1.3332823259372608, + "grad_norm": 2.257750469790304, + "learning_rate": 5.284461833602892e-06, + "loss": 0.3722, + "step": 8713 + }, + { + "epoch": 1.3334353481254781, + "grad_norm": 2.4393338162992078, + "learning_rate": 5.2822766242106505e-06, + "loss": 0.3524, + "step": 8714 + }, + { + "epoch": 1.3335883703136955, + "grad_norm": 2.0470819190945777, + "learning_rate": 5.280091704552257e-06, + "loss": 0.3516, + "step": 8715 + }, + { + "epoch": 1.3337413925019128, + "grad_norm": 1.8879596223256987, + "learning_rate": 5.2779070747618985e-06, + "loss": 0.3157, + "step": 8716 + }, + { + "epoch": 1.3338944146901301, + "grad_norm": 2.1578376231813072, + "learning_rate": 5.275722734973739e-06, + "loss": 0.3593, + "step": 8717 + }, + { + "epoch": 1.3340474368783473, + "grad_norm": 2.3849493596618565, + "learning_rate": 5.273538685321926e-06, + "loss": 0.3537, + "step": 8718 + }, + { + "epoch": 1.3342004590665646, + "grad_norm": 2.0641129751115175, + "learning_rate": 5.2713549259405936e-06, + "loss": 0.3433, + "step": 8719 + }, + { + "epoch": 1.334353481254782, + "grad_norm": 2.0207189663188747, + "learning_rate": 5.269171456963853e-06, + "loss": 0.504, + "step": 8720 + }, + { + "epoch": 1.3345065034429993, + "grad_norm": 1.9309222928308578, + "learning_rate": 5.266988278525802e-06, + "loss": 0.3243, + "step": 8721 + }, + { + "epoch": 1.3346595256312166, + "grad_norm": 2.417786904966103, + "learning_rate": 5.264805390760515e-06, + "loss": 0.3591, + "step": 8722 + }, + { + "epoch": 1.3348125478194337, + "grad_norm": 2.0559938066081114, + "learning_rate": 5.262622793802055e-06, + "loss": 0.325, + "step": 8723 + }, + { + "epoch": 1.334965570007651, + "grad_norm": 2.154604476772168, + "learning_rate": 5.26044048778446e-06, + "loss": 0.3335, + "step": 8724 + }, + { + "epoch": 1.3351185921958684, + "grad_norm": 2.0975545420310717, + "learning_rate": 5.258258472841755e-06, + "loss": 0.4099, + "step": 8725 + }, + { + "epoch": 1.3352716143840857, + "grad_norm": 1.898966736601991, + "learning_rate": 5.256076749107946e-06, + "loss": 0.3336, + "step": 8726 + }, + { + "epoch": 1.335424636572303, + "grad_norm": 2.126852100718994, + "learning_rate": 5.253895316717028e-06, + "loss": 0.3528, + "step": 8727 + }, + { + "epoch": 1.3355776587605201, + "grad_norm": 2.0082127986354785, + "learning_rate": 5.251714175802962e-06, + "loss": 0.349, + "step": 8728 + }, + { + "epoch": 1.3357306809487377, + "grad_norm": 2.424781858285472, + "learning_rate": 5.249533326499701e-06, + "loss": 0.3659, + "step": 8729 + }, + { + "epoch": 1.3358837031369548, + "grad_norm": 2.3398283834939924, + "learning_rate": 
5.247352768941189e-06, + "loss": 0.3618, + "step": 8730 + }, + { + "epoch": 1.3360367253251721, + "grad_norm": 2.0472105115040256, + "learning_rate": 5.245172503261331e-06, + "loss": 0.3646, + "step": 8731 + }, + { + "epoch": 1.3361897475133895, + "grad_norm": 2.223173368949995, + "learning_rate": 5.242992529594028e-06, + "loss": 0.3406, + "step": 8732 + }, + { + "epoch": 1.3363427697016068, + "grad_norm": 2.12524462674687, + "learning_rate": 5.240812848073172e-06, + "loss": 0.3169, + "step": 8733 + }, + { + "epoch": 1.3364957918898241, + "grad_norm": 2.285717102884861, + "learning_rate": 5.238633458832607e-06, + "loss": 0.3333, + "step": 8734 + }, + { + "epoch": 1.3366488140780413, + "grad_norm": 2.1594353097887504, + "learning_rate": 5.2364543620061935e-06, + "loss": 0.3743, + "step": 8735 + }, + { + "epoch": 1.3368018362662586, + "grad_norm": 2.1247874494512238, + "learning_rate": 5.234275557727756e-06, + "loss": 0.407, + "step": 8736 + }, + { + "epoch": 1.336954858454476, + "grad_norm": 2.2331924409935633, + "learning_rate": 5.232097046131093e-06, + "loss": 0.3355, + "step": 8737 + }, + { + "epoch": 1.3371078806426933, + "grad_norm": 2.035105727000153, + "learning_rate": 5.2299188273500045e-06, + "loss": 0.3884, + "step": 8738 + }, + { + "epoch": 1.3372609028309106, + "grad_norm": 2.419555581257721, + "learning_rate": 5.227740901518268e-06, + "loss": 0.3536, + "step": 8739 + }, + { + "epoch": 1.3374139250191277, + "grad_norm": 2.3728108043849385, + "learning_rate": 5.225563268769622e-06, + "loss": 0.3485, + "step": 8740 + }, + { + "epoch": 1.337566947207345, + "grad_norm": 1.9635099062925558, + "learning_rate": 5.223385929237819e-06, + "loss": 0.3223, + "step": 8741 + }, + { + "epoch": 1.3377199693955624, + "grad_norm": 2.0871070571533488, + "learning_rate": 5.221208883056571e-06, + "loss": 0.3663, + "step": 8742 + }, + { + "epoch": 1.3378729915837797, + "grad_norm": 2.1517294117700234, + "learning_rate": 5.2190321303595805e-06, + "loss": 0.3299, + "step": 8743 + }, + { + "epoch": 1.338026013771997, + "grad_norm": 2.2349571562423094, + "learning_rate": 5.2168556712805295e-06, + "loss": 0.3746, + "step": 8744 + }, + { + "epoch": 1.3381790359602141, + "grad_norm": 2.1111718913928983, + "learning_rate": 5.214679505953084e-06, + "loss": 0.355, + "step": 8745 + }, + { + "epoch": 1.3383320581484315, + "grad_norm": 2.071140771626689, + "learning_rate": 5.212503634510891e-06, + "loss": 0.319, + "step": 8746 + }, + { + "epoch": 1.3384850803366488, + "grad_norm": 2.1976321183634453, + "learning_rate": 5.210328057087579e-06, + "loss": 0.327, + "step": 8747 + }, + { + "epoch": 1.3386381025248661, + "grad_norm": 2.3923131141546423, + "learning_rate": 5.208152773816757e-06, + "loss": 0.4028, + "step": 8748 + }, + { + "epoch": 1.3387911247130835, + "grad_norm": 2.283782792526242, + "learning_rate": 5.205977784832019e-06, + "loss": 0.3377, + "step": 8749 + }, + { + "epoch": 1.3389441469013006, + "grad_norm": 2.101349490871268, + "learning_rate": 5.203803090266941e-06, + "loss": 0.281, + "step": 8750 + }, + { + "epoch": 1.339097169089518, + "grad_norm": 2.0532807907074253, + "learning_rate": 5.201628690255076e-06, + "loss": 0.3238, + "step": 8751 + }, + { + "epoch": 1.3392501912777353, + "grad_norm": 2.362591506710098, + "learning_rate": 5.199454584929966e-06, + "loss": 0.4185, + "step": 8752 + }, + { + "epoch": 1.3394032134659526, + "grad_norm": 1.8928761243937846, + "learning_rate": 5.197280774425129e-06, + "loss": 0.3364, + "step": 8753 + }, + { + "epoch": 1.33955623565417, + "grad_norm": 
2.198285759440423, + "learning_rate": 5.195107258874068e-06, + "loss": 0.3444, + "step": 8754 + }, + { + "epoch": 1.339709257842387, + "grad_norm": 2.189265677914583, + "learning_rate": 5.1929340384102645e-06, + "loss": 0.3567, + "step": 8755 + }, + { + "epoch": 1.3398622800306044, + "grad_norm": 1.9479819167709145, + "learning_rate": 5.1907611131671935e-06, + "loss": 0.3587, + "step": 8756 + }, + { + "epoch": 1.3400153022188217, + "grad_norm": 2.002771514837011, + "learning_rate": 5.188588483278293e-06, + "loss": 0.3138, + "step": 8757 + }, + { + "epoch": 1.340168324407039, + "grad_norm": 2.510943872707632, + "learning_rate": 5.186416148876991e-06, + "loss": 0.3849, + "step": 8758 + }, + { + "epoch": 1.3403213465952564, + "grad_norm": 1.9571166172475, + "learning_rate": 5.184244110096715e-06, + "loss": 0.3232, + "step": 8759 + }, + { + "epoch": 1.3404743687834735, + "grad_norm": 1.9050058458205525, + "learning_rate": 5.182072367070841e-06, + "loss": 0.3442, + "step": 8760 + }, + { + "epoch": 1.3406273909716908, + "grad_norm": 2.4189439891658333, + "learning_rate": 5.1799009199327465e-06, + "loss": 0.4003, + "step": 8761 + }, + { + "epoch": 1.3407804131599081, + "grad_norm": 2.0763029109194515, + "learning_rate": 5.177729768815802e-06, + "loss": 0.3211, + "step": 8762 + }, + { + "epoch": 1.3409334353481255, + "grad_norm": 2.116101575691697, + "learning_rate": 5.175558913853329e-06, + "loss": 0.3508, + "step": 8763 + }, + { + "epoch": 1.3410864575363428, + "grad_norm": 1.7887941597743418, + "learning_rate": 5.17338835517866e-06, + "loss": 0.3165, + "step": 8764 + }, + { + "epoch": 1.34123947972456, + "grad_norm": 2.2457710693606088, + "learning_rate": 5.171218092925099e-06, + "loss": 0.3803, + "step": 8765 + }, + { + "epoch": 1.3413925019127775, + "grad_norm": 2.0683172511693626, + "learning_rate": 5.169048127225915e-06, + "loss": 0.3428, + "step": 8766 + }, + { + "epoch": 1.3415455241009946, + "grad_norm": 2.1635182390813905, + "learning_rate": 5.166878458214391e-06, + "loss": 0.3602, + "step": 8767 + }, + { + "epoch": 1.341698546289212, + "grad_norm": 2.050582046768065, + "learning_rate": 5.16470908602377e-06, + "loss": 0.3622, + "step": 8768 + }, + { + "epoch": 1.3418515684774293, + "grad_norm": 2.026326185696356, + "learning_rate": 5.162540010787273e-06, + "loss": 0.3138, + "step": 8769 + }, + { + "epoch": 1.3420045906656466, + "grad_norm": 2.0367330156221497, + "learning_rate": 5.160371232638122e-06, + "loss": 0.303, + "step": 8770 + }, + { + "epoch": 1.342157612853864, + "grad_norm": 2.1169309129042215, + "learning_rate": 5.1582027517095065e-06, + "loss": 0.3154, + "step": 8771 + }, + { + "epoch": 1.342310635042081, + "grad_norm": 2.2530862240372493, + "learning_rate": 5.156034568134601e-06, + "loss": 0.3594, + "step": 8772 + }, + { + "epoch": 1.3424636572302984, + "grad_norm": 2.040678051046319, + "learning_rate": 5.153866682046562e-06, + "loss": 0.3179, + "step": 8773 + }, + { + "epoch": 1.3426166794185157, + "grad_norm": 1.9484942018819191, + "learning_rate": 5.151699093578528e-06, + "loss": 0.348, + "step": 8774 + }, + { + "epoch": 1.342769701606733, + "grad_norm": 2.49836830449986, + "learning_rate": 5.149531802863621e-06, + "loss": 0.4379, + "step": 8775 + }, + { + "epoch": 1.3429227237949504, + "grad_norm": 2.272462907141933, + "learning_rate": 5.14736481003494e-06, + "loss": 0.3124, + "step": 8776 + }, + { + "epoch": 1.3430757459831675, + "grad_norm": 2.0838929898866057, + "learning_rate": 5.1451981152255695e-06, + "loss": 0.3627, + "step": 8777 + }, + { + "epoch": 
1.3432287681713848, + "grad_norm": 2.0977369640414794, + "learning_rate": 5.143031718568575e-06, + "loss": 0.463, + "step": 8778 + }, + { + "epoch": 1.3433817903596021, + "grad_norm": 2.074263154199314, + "learning_rate": 5.140865620197e-06, + "loss": 0.3553, + "step": 8779 + }, + { + "epoch": 1.3435348125478195, + "grad_norm": 2.3498251424537955, + "learning_rate": 5.138699820243882e-06, + "loss": 0.4366, + "step": 8780 + }, + { + "epoch": 1.3436878347360368, + "grad_norm": 2.132767275315044, + "learning_rate": 5.1365343188422235e-06, + "loss": 0.3064, + "step": 8781 + }, + { + "epoch": 1.343840856924254, + "grad_norm": 2.0608549554257265, + "learning_rate": 5.134369116125015e-06, + "loss": 0.3042, + "step": 8782 + }, + { + "epoch": 1.3439938791124713, + "grad_norm": 2.0378925886824732, + "learning_rate": 5.13220421222524e-06, + "loss": 0.3163, + "step": 8783 + }, + { + "epoch": 1.3441469013006886, + "grad_norm": 2.2278406207902672, + "learning_rate": 5.130039607275844e-06, + "loss": 0.3278, + "step": 8784 + }, + { + "epoch": 1.344299923488906, + "grad_norm": 2.4507146833070106, + "learning_rate": 5.127875301409764e-06, + "loss": 0.3432, + "step": 8785 + }, + { + "epoch": 1.3444529456771233, + "grad_norm": 2.048076636441864, + "learning_rate": 5.125711294759927e-06, + "loss": 0.2924, + "step": 8786 + }, + { + "epoch": 1.3446059678653404, + "grad_norm": 2.2006190582319496, + "learning_rate": 5.123547587459221e-06, + "loss": 0.3902, + "step": 8787 + }, + { + "epoch": 1.3447589900535577, + "grad_norm": 2.1200415996383155, + "learning_rate": 5.121384179640539e-06, + "loss": 0.4043, + "step": 8788 + }, + { + "epoch": 1.344912012241775, + "grad_norm": 1.9167752593186782, + "learning_rate": 5.119221071436744e-06, + "loss": 0.3091, + "step": 8789 + }, + { + "epoch": 1.3450650344299924, + "grad_norm": 1.7729997894919598, + "learning_rate": 5.117058262980668e-06, + "loss": 0.2764, + "step": 8790 + }, + { + "epoch": 1.3452180566182097, + "grad_norm": 2.277714016209161, + "learning_rate": 5.114895754405151e-06, + "loss": 0.3934, + "step": 8791 + }, + { + "epoch": 1.3453710788064268, + "grad_norm": 2.04919853608827, + "learning_rate": 5.112733545842996e-06, + "loss": 0.3216, + "step": 8792 + }, + { + "epoch": 1.3455241009946441, + "grad_norm": 2.063088383531425, + "learning_rate": 5.110571637426992e-06, + "loss": 0.3876, + "step": 8793 + }, + { + "epoch": 1.3456771231828615, + "grad_norm": 2.3164550683532608, + "learning_rate": 5.108410029289912e-06, + "loss": 0.368, + "step": 8794 + }, + { + "epoch": 1.3458301453710788, + "grad_norm": 1.9412845305314341, + "learning_rate": 5.106248721564509e-06, + "loss": 0.3319, + "step": 8795 + }, + { + "epoch": 1.3459831675592961, + "grad_norm": 2.0398505288619684, + "learning_rate": 5.1040877143835154e-06, + "loss": 0.3048, + "step": 8796 + }, + { + "epoch": 1.3461361897475133, + "grad_norm": 1.9405074387765255, + "learning_rate": 5.1019270078796476e-06, + "loss": 0.335, + "step": 8797 + }, + { + "epoch": 1.3462892119357308, + "grad_norm": 3.0361304228685215, + "learning_rate": 5.099766602185605e-06, + "loss": 0.3535, + "step": 8798 + }, + { + "epoch": 1.346442234123948, + "grad_norm": 2.0142600613872514, + "learning_rate": 5.097606497434064e-06, + "loss": 0.3425, + "step": 8799 + }, + { + "epoch": 1.3465952563121653, + "grad_norm": 2.250187533982632, + "learning_rate": 5.095446693757688e-06, + "loss": 0.37, + "step": 8800 + }, + { + "epoch": 1.3467482785003826, + "grad_norm": 2.2286570301304756, + "learning_rate": 5.093287191289116e-06, + "loss": 0.4101, + 
"step": 8801 + }, + { + "epoch": 1.3469013006886, + "grad_norm": 2.0538342836101133, + "learning_rate": 5.091127990160973e-06, + "loss": 0.3505, + "step": 8802 + }, + { + "epoch": 1.3470543228768173, + "grad_norm": 2.2048180157371773, + "learning_rate": 5.088969090505864e-06, + "loss": 0.3738, + "step": 8803 + }, + { + "epoch": 1.3472073450650344, + "grad_norm": 1.901363181601386, + "learning_rate": 5.086810492456375e-06, + "loss": 0.294, + "step": 8804 + }, + { + "epoch": 1.3473603672532517, + "grad_norm": 2.07977293833161, + "learning_rate": 5.084652196145074e-06, + "loss": 0.3492, + "step": 8805 + }, + { + "epoch": 1.347513389441469, + "grad_norm": 2.2163815188002967, + "learning_rate": 5.082494201704511e-06, + "loss": 0.2793, + "step": 8806 + }, + { + "epoch": 1.3476664116296864, + "grad_norm": 2.412112333946498, + "learning_rate": 5.0803365092672165e-06, + "loss": 0.3754, + "step": 8807 + }, + { + "epoch": 1.3478194338179037, + "grad_norm": 1.7349318109113925, + "learning_rate": 5.078179118965698e-06, + "loss": 0.2788, + "step": 8808 + }, + { + "epoch": 1.3479724560061208, + "grad_norm": 1.9627355364540608, + "learning_rate": 5.076022030932461e-06, + "loss": 0.3577, + "step": 8809 + }, + { + "epoch": 1.3481254781943381, + "grad_norm": 1.7114120202932948, + "learning_rate": 5.0738652452999715e-06, + "loss": 0.3282, + "step": 8810 + }, + { + "epoch": 1.3482785003825555, + "grad_norm": 2.2111716488185675, + "learning_rate": 5.0717087622006844e-06, + "loss": 0.2957, + "step": 8811 + }, + { + "epoch": 1.3484315225707728, + "grad_norm": 1.9715870922408556, + "learning_rate": 5.06955258176705e-06, + "loss": 0.3563, + "step": 8812 + }, + { + "epoch": 1.3485845447589901, + "grad_norm": 2.060344670247845, + "learning_rate": 5.0673967041314755e-06, + "loss": 0.2971, + "step": 8813 + }, + { + "epoch": 1.3487375669472073, + "grad_norm": 2.13181718352457, + "learning_rate": 5.0652411294263615e-06, + "loss": 0.3509, + "step": 8814 + }, + { + "epoch": 1.3488905891354246, + "grad_norm": 2.3479993718903294, + "learning_rate": 5.063085857784102e-06, + "loss": 0.3717, + "step": 8815 + }, + { + "epoch": 1.349043611323642, + "grad_norm": 2.4033272572666613, + "learning_rate": 5.060930889337047e-06, + "loss": 0.3488, + "step": 8816 + }, + { + "epoch": 1.3491966335118593, + "grad_norm": 2.062225137838282, + "learning_rate": 5.05877622421755e-06, + "loss": 0.3394, + "step": 8817 + }, + { + "epoch": 1.3493496557000766, + "grad_norm": 2.1399741737163778, + "learning_rate": 5.05662186255794e-06, + "loss": 0.346, + "step": 8818 + }, + { + "epoch": 1.3495026778882937, + "grad_norm": 2.044146746709054, + "learning_rate": 5.054467804490513e-06, + "loss": 0.3237, + "step": 8819 + }, + { + "epoch": 1.349655700076511, + "grad_norm": 1.9006878517739, + "learning_rate": 5.052314050147567e-06, + "loss": 0.3543, + "step": 8820 + }, + { + "epoch": 1.3498087222647284, + "grad_norm": 1.928889322164962, + "learning_rate": 5.050160599661374e-06, + "loss": 0.349, + "step": 8821 + }, + { + "epoch": 1.3499617444529457, + "grad_norm": 2.402232435601587, + "learning_rate": 5.048007453164178e-06, + "loss": 0.3668, + "step": 8822 + }, + { + "epoch": 1.350114766641163, + "grad_norm": 2.4871412589958264, + "learning_rate": 5.04585461078822e-06, + "loss": 0.3781, + "step": 8823 + }, + { + "epoch": 1.3502677888293801, + "grad_norm": 2.106796208369143, + "learning_rate": 5.043702072665711e-06, + "loss": 0.3279, + "step": 8824 + }, + { + "epoch": 1.3504208110175975, + "grad_norm": 2.365383560450284, + "learning_rate": 5.041549838928845e-06, 
+ "loss": 0.4045, + "step": 8825 + }, + { + "epoch": 1.3505738332058148, + "grad_norm": 2.451762479109529, + "learning_rate": 5.039397909709802e-06, + "loss": 0.3966, + "step": 8826 + }, + { + "epoch": 1.3507268553940321, + "grad_norm": 1.8946555567238423, + "learning_rate": 5.037246285140739e-06, + "loss": 0.2784, + "step": 8827 + }, + { + "epoch": 1.3508798775822495, + "grad_norm": 2.370556858656519, + "learning_rate": 5.035094965353796e-06, + "loss": 0.3991, + "step": 8828 + }, + { + "epoch": 1.3510328997704666, + "grad_norm": 1.9665823637058073, + "learning_rate": 5.032943950481094e-06, + "loss": 0.3155, + "step": 8829 + }, + { + "epoch": 1.3511859219586841, + "grad_norm": 2.1502421747517393, + "learning_rate": 5.0307932406547355e-06, + "loss": 0.3399, + "step": 8830 + }, + { + "epoch": 1.3513389441469013, + "grad_norm": 1.9684009046771507, + "learning_rate": 5.028642836006803e-06, + "loss": 0.305, + "step": 8831 + }, + { + "epoch": 1.3514919663351186, + "grad_norm": 2.3712114531183466, + "learning_rate": 5.0264927366693635e-06, + "loss": 0.3648, + "step": 8832 + }, + { + "epoch": 1.351644988523336, + "grad_norm": 2.1471009526278237, + "learning_rate": 5.0243429427744605e-06, + "loss": 0.3872, + "step": 8833 + }, + { + "epoch": 1.3517980107115533, + "grad_norm": 1.8739089991135973, + "learning_rate": 5.0221934544541225e-06, + "loss": 0.2828, + "step": 8834 + }, + { + "epoch": 1.3519510328997706, + "grad_norm": 2.4082764083999817, + "learning_rate": 5.020044271840358e-06, + "loss": 0.3817, + "step": 8835 + }, + { + "epoch": 1.3521040550879877, + "grad_norm": 2.155006057515205, + "learning_rate": 5.017895395065156e-06, + "loss": 0.2847, + "step": 8836 + }, + { + "epoch": 1.352257077276205, + "grad_norm": 2.1032880782979957, + "learning_rate": 5.0157468242604835e-06, + "loss": 0.3277, + "step": 8837 + }, + { + "epoch": 1.3524100994644224, + "grad_norm": 2.0750944376718614, + "learning_rate": 5.013598559558306e-06, + "loss": 0.3113, + "step": 8838 + }, + { + "epoch": 1.3525631216526397, + "grad_norm": 2.1837175276655416, + "learning_rate": 5.011450601090544e-06, + "loss": 0.3165, + "step": 8839 + }, + { + "epoch": 1.352716143840857, + "grad_norm": 2.101554636022105, + "learning_rate": 5.009302948989111e-06, + "loss": 0.3534, + "step": 8840 + }, + { + "epoch": 1.3528691660290741, + "grad_norm": 2.259872323501473, + "learning_rate": 5.007155603385916e-06, + "loss": 0.3817, + "step": 8841 + }, + { + "epoch": 1.3530221882172915, + "grad_norm": 2.1371804613305536, + "learning_rate": 5.005008564412823e-06, + "loss": 0.366, + "step": 8842 + }, + { + "epoch": 1.3531752104055088, + "grad_norm": 1.783351213021392, + "learning_rate": 5.002861832201691e-06, + "loss": 0.2375, + "step": 8843 + }, + { + "epoch": 1.3533282325937261, + "grad_norm": 2.4210398863460836, + "learning_rate": 5.0007154068843714e-06, + "loss": 0.4451, + "step": 8844 + }, + { + "epoch": 1.3534812547819435, + "grad_norm": 2.109957799601956, + "learning_rate": 4.9985692885926675e-06, + "loss": 0.3213, + "step": 8845 + }, + { + "epoch": 1.3536342769701606, + "grad_norm": 2.201835379953216, + "learning_rate": 4.996423477458393e-06, + "loss": 0.3522, + "step": 8846 + }, + { + "epoch": 1.353787299158378, + "grad_norm": 2.2332666443797913, + "learning_rate": 4.994277973613331e-06, + "loss": 0.3499, + "step": 8847 + }, + { + "epoch": 1.3539403213465953, + "grad_norm": 2.0828069249606247, + "learning_rate": 4.992132777189234e-06, + "loss": 0.3706, + "step": 8848 + }, + { + "epoch": 1.3540933435348126, + "grad_norm": 2.067829402167768, + 
"learning_rate": 4.9899878883178565e-06, + "loss": 0.4055, + "step": 8849 + }, + { + "epoch": 1.35424636572303, + "grad_norm": 2.234378046868571, + "learning_rate": 4.987843307130923e-06, + "loss": 0.318, + "step": 8850 + }, + { + "epoch": 1.354399387911247, + "grad_norm": 2.2611687807693874, + "learning_rate": 4.98569903376014e-06, + "loss": 0.3289, + "step": 8851 + }, + { + "epoch": 1.3545524100994644, + "grad_norm": 2.178487669182021, + "learning_rate": 4.983555068337194e-06, + "loss": 0.3294, + "step": 8852 + }, + { + "epoch": 1.3547054322876817, + "grad_norm": 2.223404581120469, + "learning_rate": 4.981411410993756e-06, + "loss": 0.3945, + "step": 8853 + }, + { + "epoch": 1.354858454475899, + "grad_norm": 1.9874431190206212, + "learning_rate": 4.979268061861477e-06, + "loss": 0.3015, + "step": 8854 + }, + { + "epoch": 1.3550114766641164, + "grad_norm": 2.1072934649507506, + "learning_rate": 4.977125021071988e-06, + "loss": 0.3378, + "step": 8855 + }, + { + "epoch": 1.3551644988523335, + "grad_norm": 2.341148964981265, + "learning_rate": 4.9749822887569e-06, + "loss": 0.3951, + "step": 8856 + }, + { + "epoch": 1.3553175210405508, + "grad_norm": 2.208386708307971, + "learning_rate": 4.9728398650478075e-06, + "loss": 0.3591, + "step": 8857 + }, + { + "epoch": 1.3554705432287681, + "grad_norm": 2.1163674385472375, + "learning_rate": 4.970697750076282e-06, + "loss": 0.3211, + "step": 8858 + }, + { + "epoch": 1.3556235654169855, + "grad_norm": 1.960454860325222, + "learning_rate": 4.96855594397389e-06, + "loss": 0.304, + "step": 8859 + }, + { + "epoch": 1.3557765876052028, + "grad_norm": 2.192707172753196, + "learning_rate": 4.9664144468721565e-06, + "loss": 0.3023, + "step": 8860 + }, + { + "epoch": 1.35592960979342, + "grad_norm": 2.0167491244911315, + "learning_rate": 4.9642732589025986e-06, + "loss": 0.2747, + "step": 8861 + }, + { + "epoch": 1.3560826319816373, + "grad_norm": 2.2142639849416836, + "learning_rate": 4.962132380196729e-06, + "loss": 0.3612, + "step": 8862 + }, + { + "epoch": 1.3562356541698546, + "grad_norm": 2.2930940087859524, + "learning_rate": 4.959991810886013e-06, + "loss": 0.3922, + "step": 8863 + }, + { + "epoch": 1.356388676358072, + "grad_norm": 2.0111407859260892, + "learning_rate": 4.957851551101915e-06, + "loss": 0.2614, + "step": 8864 + }, + { + "epoch": 1.3565416985462893, + "grad_norm": 2.2532932543779105, + "learning_rate": 4.955711600975883e-06, + "loss": 0.3617, + "step": 8865 + }, + { + "epoch": 1.3566947207345064, + "grad_norm": 2.3137796151354393, + "learning_rate": 4.95357196063933e-06, + "loss": 0.3794, + "step": 8866 + }, + { + "epoch": 1.356847742922724, + "grad_norm": 2.261134143687513, + "learning_rate": 4.9514326302236655e-06, + "loss": 0.3776, + "step": 8867 + }, + { + "epoch": 1.357000765110941, + "grad_norm": 2.071670886754076, + "learning_rate": 4.94929360986028e-06, + "loss": 0.3288, + "step": 8868 + }, + { + "epoch": 1.3571537872991584, + "grad_norm": 2.1015927260956824, + "learning_rate": 4.947154899680523e-06, + "loss": 0.3423, + "step": 8869 + }, + { + "epoch": 1.3573068094873757, + "grad_norm": 2.140147633865237, + "learning_rate": 4.9450164998157545e-06, + "loss": 0.339, + "step": 8870 + }, + { + "epoch": 1.357459831675593, + "grad_norm": 2.014679294585402, + "learning_rate": 4.9428784103973025e-06, + "loss": 0.3075, + "step": 8871 + }, + { + "epoch": 1.3576128538638104, + "grad_norm": 2.0835625273825706, + "learning_rate": 4.940740631556462e-06, + "loss": 0.3462, + "step": 8872 + }, + { + "epoch": 1.3577658760520275, + "grad_norm": 
2.182089133357909, + "learning_rate": 4.9386031634245365e-06, + "loss": 0.3325, + "step": 8873 + }, + { + "epoch": 1.3579188982402448, + "grad_norm": 2.0004273732212234, + "learning_rate": 4.936466006132791e-06, + "loss": 0.3194, + "step": 8874 + }, + { + "epoch": 1.3580719204284621, + "grad_norm": 2.1668334947043952, + "learning_rate": 4.934329159812476e-06, + "loss": 0.3887, + "step": 8875 + }, + { + "epoch": 1.3582249426166795, + "grad_norm": 2.2245033231188946, + "learning_rate": 4.932192624594824e-06, + "loss": 0.3336, + "step": 8876 + }, + { + "epoch": 1.3583779648048968, + "grad_norm": 2.4973226364697925, + "learning_rate": 4.9300564006110486e-06, + "loss": 0.4022, + "step": 8877 + }, + { + "epoch": 1.358530986993114, + "grad_norm": 2.308538977320974, + "learning_rate": 4.9279204879923425e-06, + "loss": 0.3662, + "step": 8878 + }, + { + "epoch": 1.3586840091813313, + "grad_norm": 2.2287780866163462, + "learning_rate": 4.925784886869883e-06, + "loss": 0.354, + "step": 8879 + }, + { + "epoch": 1.3588370313695486, + "grad_norm": 2.26226499220998, + "learning_rate": 4.923649597374823e-06, + "loss": 0.3948, + "step": 8880 + }, + { + "epoch": 1.358990053557766, + "grad_norm": 2.397765277999804, + "learning_rate": 4.9215146196383e-06, + "loss": 0.3911, + "step": 8881 + }, + { + "epoch": 1.3591430757459833, + "grad_norm": 2.095255008366767, + "learning_rate": 4.919379953791432e-06, + "loss": 0.334, + "step": 8882 + }, + { + "epoch": 1.3592960979342004, + "grad_norm": 2.3511037115621978, + "learning_rate": 4.917245599965317e-06, + "loss": 0.3601, + "step": 8883 + }, + { + "epoch": 1.3594491201224177, + "grad_norm": 2.20916949291957, + "learning_rate": 4.9151115582910324e-06, + "loss": 0.357, + "step": 8884 + }, + { + "epoch": 1.359602142310635, + "grad_norm": 2.2283270514852997, + "learning_rate": 4.912977828899639e-06, + "loss": 0.3641, + "step": 8885 + }, + { + "epoch": 1.3597551644988524, + "grad_norm": 1.944639349996175, + "learning_rate": 4.910844411922179e-06, + "loss": 0.4083, + "step": 8886 + }, + { + "epoch": 1.3599081866870697, + "grad_norm": 2.0157061926550712, + "learning_rate": 4.908711307489671e-06, + "loss": 0.3363, + "step": 8887 + }, + { + "epoch": 1.3600612088752868, + "grad_norm": 1.9664016887617008, + "learning_rate": 4.90657851573312e-06, + "loss": 0.3352, + "step": 8888 + }, + { + "epoch": 1.3602142310635041, + "grad_norm": 2.1423036098364703, + "learning_rate": 4.904446036783508e-06, + "loss": 0.3169, + "step": 8889 + }, + { + "epoch": 1.3603672532517215, + "grad_norm": 2.1307687150076404, + "learning_rate": 4.902313870771795e-06, + "loss": 0.3136, + "step": 8890 + }, + { + "epoch": 1.3605202754399388, + "grad_norm": 2.037116383219786, + "learning_rate": 4.9001820178289365e-06, + "loss": 0.3409, + "step": 8891 + }, + { + "epoch": 1.3606732976281561, + "grad_norm": 2.04431583800202, + "learning_rate": 4.898050478085846e-06, + "loss": 0.3112, + "step": 8892 + }, + { + "epoch": 1.3608263198163733, + "grad_norm": 2.1739991034642325, + "learning_rate": 4.895919251673432e-06, + "loss": 0.339, + "step": 8893 + }, + { + "epoch": 1.3609793420045906, + "grad_norm": 1.9316321394254719, + "learning_rate": 4.893788338722591e-06, + "loss": 0.3547, + "step": 8894 + }, + { + "epoch": 1.361132364192808, + "grad_norm": 1.8223272300386113, + "learning_rate": 4.891657739364177e-06, + "loss": 0.3305, + "step": 8895 + }, + { + "epoch": 1.3612853863810253, + "grad_norm": 1.873250474892585, + "learning_rate": 4.88952745372905e-06, + "loss": 0.3082, + "step": 8896 + }, + { + "epoch": 
1.3614384085692426, + "grad_norm": 2.205921503536888, + "learning_rate": 4.887397481948036e-06, + "loss": 0.3402, + "step": 8897 + }, + { + "epoch": 1.3615914307574597, + "grad_norm": 2.5016859094591055, + "learning_rate": 4.885267824151937e-06, + "loss": 0.3385, + "step": 8898 + }, + { + "epoch": 1.3617444529456773, + "grad_norm": 2.085347385042572, + "learning_rate": 4.883138480471553e-06, + "loss": 0.3988, + "step": 8899 + }, + { + "epoch": 1.3618974751338944, + "grad_norm": 2.2051244044589557, + "learning_rate": 4.881009451037656e-06, + "loss": 0.3533, + "step": 8900 + }, + { + "epoch": 1.3620504973221117, + "grad_norm": 2.1863458371965683, + "learning_rate": 4.878880735980986e-06, + "loss": 0.3619, + "step": 8901 + }, + { + "epoch": 1.362203519510329, + "grad_norm": 2.271622039133187, + "learning_rate": 4.876752335432288e-06, + "loss": 0.3355, + "step": 8902 + }, + { + "epoch": 1.3623565416985464, + "grad_norm": 2.132277396146395, + "learning_rate": 4.874624249522273e-06, + "loss": 0.3664, + "step": 8903 + }, + { + "epoch": 1.3625095638867637, + "grad_norm": 1.9125739210979713, + "learning_rate": 4.872496478381632e-06, + "loss": 0.2759, + "step": 8904 + }, + { + "epoch": 1.3626625860749808, + "grad_norm": 2.2241444950777916, + "learning_rate": 4.870369022141042e-06, + "loss": 0.3585, + "step": 8905 + }, + { + "epoch": 1.3628156082631981, + "grad_norm": 2.1451735908210527, + "learning_rate": 4.868241880931157e-06, + "loss": 0.3512, + "step": 8906 + }, + { + "epoch": 1.3629686304514155, + "grad_norm": 1.9142230746245907, + "learning_rate": 4.866115054882613e-06, + "loss": 0.328, + "step": 8907 + }, + { + "epoch": 1.3631216526396328, + "grad_norm": 2.368547829079924, + "learning_rate": 4.863988544126028e-06, + "loss": 0.3551, + "step": 8908 + }, + { + "epoch": 1.3632746748278501, + "grad_norm": 2.04742148952542, + "learning_rate": 4.861862348791999e-06, + "loss": 0.3481, + "step": 8909 + }, + { + "epoch": 1.3634276970160673, + "grad_norm": 2.1096845873170342, + "learning_rate": 4.859736469011104e-06, + "loss": 0.3453, + "step": 8910 + }, + { + "epoch": 1.3635807192042846, + "grad_norm": 1.9345016136926325, + "learning_rate": 4.8576109049139e-06, + "loss": 0.3191, + "step": 8911 + }, + { + "epoch": 1.363733741392502, + "grad_norm": 2.064452971674517, + "learning_rate": 4.8554856566309286e-06, + "loss": 0.3252, + "step": 8912 + }, + { + "epoch": 1.3638867635807193, + "grad_norm": 2.164800719738148, + "learning_rate": 4.853360724292707e-06, + "loss": 0.3702, + "step": 8913 + }, + { + "epoch": 1.3640397857689366, + "grad_norm": 2.0398684616518272, + "learning_rate": 4.851236108029739e-06, + "loss": 0.2936, + "step": 8914 + }, + { + "epoch": 1.3641928079571537, + "grad_norm": 2.3919411510047968, + "learning_rate": 4.849111807972502e-06, + "loss": 0.4252, + "step": 8915 + }, + { + "epoch": 1.364345830145371, + "grad_norm": 2.317113758644946, + "learning_rate": 4.84698782425146e-06, + "loss": 0.3417, + "step": 8916 + }, + { + "epoch": 1.3644988523335884, + "grad_norm": 1.813125371501577, + "learning_rate": 4.844864156997054e-06, + "loss": 0.2696, + "step": 8917 + }, + { + "epoch": 1.3646518745218057, + "grad_norm": 2.030357912038102, + "learning_rate": 4.842740806339709e-06, + "loss": 0.3526, + "step": 8918 + }, + { + "epoch": 1.364804896710023, + "grad_norm": 1.9203863871860456, + "learning_rate": 4.84061777240982e-06, + "loss": 0.3105, + "step": 8919 + }, + { + "epoch": 1.3649579188982401, + "grad_norm": 2.4389896556711, + "learning_rate": 4.8384950553377865e-06, + "loss": 0.4105, + "step": 
8920 + }, + { + "epoch": 1.3651109410864575, + "grad_norm": 2.156091012678082, + "learning_rate": 4.8363726552539595e-06, + "loss": 0.357, + "step": 8921 + }, + { + "epoch": 1.3652639632746748, + "grad_norm": 2.2964416250356208, + "learning_rate": 4.8342505722886835e-06, + "loss": 0.4063, + "step": 8922 + }, + { + "epoch": 1.3654169854628921, + "grad_norm": 2.103168718904348, + "learning_rate": 4.832128806572296e-06, + "loss": 0.3337, + "step": 8923 + }, + { + "epoch": 1.3655700076511095, + "grad_norm": 1.8160059643896378, + "learning_rate": 4.8300073582350924e-06, + "loss": 0.3028, + "step": 8924 + }, + { + "epoch": 1.3657230298393266, + "grad_norm": 2.2057911487268944, + "learning_rate": 4.827886227407358e-06, + "loss": 0.3627, + "step": 8925 + }, + { + "epoch": 1.365876052027544, + "grad_norm": 2.1394622208954064, + "learning_rate": 4.825765414219371e-06, + "loss": 0.3153, + "step": 8926 + }, + { + "epoch": 1.3660290742157613, + "grad_norm": 1.9790746960660583, + "learning_rate": 4.823644918801363e-06, + "loss": 0.2966, + "step": 8927 + }, + { + "epoch": 1.3661820964039786, + "grad_norm": 2.1585848700459462, + "learning_rate": 4.821524741283577e-06, + "loss": 0.3889, + "step": 8928 + }, + { + "epoch": 1.366335118592196, + "grad_norm": 1.9951219879291964, + "learning_rate": 4.819404881796217e-06, + "loss": 0.3879, + "step": 8929 + }, + { + "epoch": 1.366488140780413, + "grad_norm": 2.0365099355278033, + "learning_rate": 4.817285340469462e-06, + "loss": 0.3067, + "step": 8930 + }, + { + "epoch": 1.3666411629686306, + "grad_norm": 2.386301644489069, + "learning_rate": 4.815166117433493e-06, + "loss": 0.3994, + "step": 8931 + }, + { + "epoch": 1.3667941851568477, + "grad_norm": 2.237602958997356, + "learning_rate": 4.813047212818457e-06, + "loss": 0.4376, + "step": 8932 + }, + { + "epoch": 1.366947207345065, + "grad_norm": 2.2080395359941116, + "learning_rate": 4.810928626754482e-06, + "loss": 0.3744, + "step": 8933 + }, + { + "epoch": 1.3671002295332824, + "grad_norm": 2.213416724594083, + "learning_rate": 4.808810359371681e-06, + "loss": 0.3219, + "step": 8934 + }, + { + "epoch": 1.3672532517214997, + "grad_norm": 2.2757376646140717, + "learning_rate": 4.806692410800142e-06, + "loss": 0.3351, + "step": 8935 + }, + { + "epoch": 1.367406273909717, + "grad_norm": 2.275814186271426, + "learning_rate": 4.804574781169941e-06, + "loss": 0.3565, + "step": 8936 + }, + { + "epoch": 1.3675592960979341, + "grad_norm": 2.00049387430229, + "learning_rate": 4.802457470611125e-06, + "loss": 0.3045, + "step": 8937 + }, + { + "epoch": 1.3677123182861515, + "grad_norm": 2.0273766683214838, + "learning_rate": 4.800340479253729e-06, + "loss": 0.3273, + "step": 8938 + }, + { + "epoch": 1.3678653404743688, + "grad_norm": 2.100353583194502, + "learning_rate": 4.798223807227767e-06, + "loss": 0.3112, + "step": 8939 + }, + { + "epoch": 1.3680183626625861, + "grad_norm": 2.173515984420499, + "learning_rate": 4.796107454663225e-06, + "loss": 0.3939, + "step": 8940 + }, + { + "epoch": 1.3681713848508035, + "grad_norm": 2.2645920139454248, + "learning_rate": 4.79399142169009e-06, + "loss": 0.3357, + "step": 8941 + }, + { + "epoch": 1.3683244070390206, + "grad_norm": 2.1304360217955156, + "learning_rate": 4.791875708438304e-06, + "loss": 0.3591, + "step": 8942 + }, + { + "epoch": 1.368477429227238, + "grad_norm": 2.016668890017917, + "learning_rate": 4.789760315037801e-06, + "loss": 0.3164, + "step": 8943 + }, + { + "epoch": 1.3686304514154553, + "grad_norm": 2.102649703832163, + "learning_rate": 
4.7876452416185075e-06, + "loss": 0.3938, + "step": 8944 + }, + { + "epoch": 1.3687834736036726, + "grad_norm": 2.3476389450020188, + "learning_rate": 4.785530488310307e-06, + "loss": 0.4252, + "step": 8945 + }, + { + "epoch": 1.36893649579189, + "grad_norm": 1.8083889949743437, + "learning_rate": 4.783416055243074e-06, + "loss": 0.2638, + "step": 8946 + }, + { + "epoch": 1.369089517980107, + "grad_norm": 1.6592216434472842, + "learning_rate": 4.781301942546677e-06, + "loss": 0.2525, + "step": 8947 + }, + { + "epoch": 1.3692425401683244, + "grad_norm": 2.2613655519561515, + "learning_rate": 4.779188150350934e-06, + "loss": 0.4171, + "step": 8948 + }, + { + "epoch": 1.3693955623565417, + "grad_norm": 2.0986112105450214, + "learning_rate": 4.777074678785676e-06, + "loss": 0.3432, + "step": 8949 + }, + { + "epoch": 1.369548584544759, + "grad_norm": 2.16417414807789, + "learning_rate": 4.774961527980697e-06, + "loss": 0.3598, + "step": 8950 + }, + { + "epoch": 1.3697016067329764, + "grad_norm": 2.5301061654646255, + "learning_rate": 4.772848698065764e-06, + "loss": 0.4318, + "step": 8951 + }, + { + "epoch": 1.3698546289211935, + "grad_norm": 1.9819120670022627, + "learning_rate": 4.770736189170645e-06, + "loss": 0.3328, + "step": 8952 + }, + { + "epoch": 1.3700076511094108, + "grad_norm": 2.079685492030615, + "learning_rate": 4.768624001425077e-06, + "loss": 0.4102, + "step": 8953 + }, + { + "epoch": 1.3701606732976281, + "grad_norm": 2.0838078795974835, + "learning_rate": 4.766512134958767e-06, + "loss": 0.3124, + "step": 8954 + }, + { + "epoch": 1.3703136954858455, + "grad_norm": 1.9182397991758027, + "learning_rate": 4.764400589901424e-06, + "loss": 0.3356, + "step": 8955 + }, + { + "epoch": 1.3704667176740628, + "grad_norm": 2.0187379873078153, + "learning_rate": 4.762289366382723e-06, + "loss": 0.3335, + "step": 8956 + }, + { + "epoch": 1.37061973986228, + "grad_norm": 2.392520798804102, + "learning_rate": 4.7601784645323225e-06, + "loss": 0.4488, + "step": 8957 + }, + { + "epoch": 1.3707727620504973, + "grad_norm": 2.2128964626438536, + "learning_rate": 4.75806788447986e-06, + "loss": 0.3515, + "step": 8958 + }, + { + "epoch": 1.3709257842387146, + "grad_norm": 2.1733235228362147, + "learning_rate": 4.755957626354954e-06, + "loss": 0.3589, + "step": 8959 + }, + { + "epoch": 1.371078806426932, + "grad_norm": 2.0313878062789326, + "learning_rate": 4.753847690287207e-06, + "loss": 0.2957, + "step": 8960 + }, + { + "epoch": 1.3712318286151493, + "grad_norm": 2.1544020326434588, + "learning_rate": 4.751738076406196e-06, + "loss": 0.345, + "step": 8961 + }, + { + "epoch": 1.3713848508033664, + "grad_norm": 2.1167948771347382, + "learning_rate": 4.74962878484148e-06, + "loss": 0.4013, + "step": 8962 + }, + { + "epoch": 1.371537872991584, + "grad_norm": 2.2116358842619195, + "learning_rate": 4.747519815722601e-06, + "loss": 0.347, + "step": 8963 + }, + { + "epoch": 1.371690895179801, + "grad_norm": 2.2257642473680743, + "learning_rate": 4.7454111691790785e-06, + "loss": 0.4065, + "step": 8964 + }, + { + "epoch": 1.3718439173680184, + "grad_norm": 2.373332905259864, + "learning_rate": 4.743302845340411e-06, + "loss": 0.3881, + "step": 8965 + }, + { + "epoch": 1.3719969395562357, + "grad_norm": 1.9971503672491702, + "learning_rate": 4.74119484433608e-06, + "loss": 0.3295, + "step": 8966 + }, + { + "epoch": 1.372149961744453, + "grad_norm": 2.164576400339109, + "learning_rate": 4.739087166295546e-06, + "loss": 0.368, + "step": 8967 + }, + { + "epoch": 1.3723029839326704, + "grad_norm": 
2.1304626352278735, + "learning_rate": 4.73697981134825e-06, + "loss": 0.3502, + "step": 8968 + }, + { + "epoch": 1.3724560061208875, + "grad_norm": 1.9351491350127181, + "learning_rate": 4.734872779623611e-06, + "loss": 0.3074, + "step": 8969 + }, + { + "epoch": 1.3726090283091048, + "grad_norm": 1.8804547668574492, + "learning_rate": 4.732766071251037e-06, + "loss": 0.3111, + "step": 8970 + }, + { + "epoch": 1.3727620504973221, + "grad_norm": 2.2549524395110825, + "learning_rate": 4.730659686359901e-06, + "loss": 0.3449, + "step": 8971 + }, + { + "epoch": 1.3729150726855395, + "grad_norm": 2.242806963979162, + "learning_rate": 4.728553625079564e-06, + "loss": 0.3164, + "step": 8972 + }, + { + "epoch": 1.3730680948737568, + "grad_norm": 2.11930174434817, + "learning_rate": 4.726447887539378e-06, + "loss": 0.3364, + "step": 8973 + }, + { + "epoch": 1.373221117061974, + "grad_norm": 2.470435497406207, + "learning_rate": 4.724342473868655e-06, + "loss": 0.4222, + "step": 8974 + }, + { + "epoch": 1.3733741392501913, + "grad_norm": 2.3682646592639736, + "learning_rate": 4.722237384196694e-06, + "loss": 0.3675, + "step": 8975 + }, + { + "epoch": 1.3735271614384086, + "grad_norm": 2.3977878266611667, + "learning_rate": 4.72013261865279e-06, + "loss": 0.3816, + "step": 8976 + }, + { + "epoch": 1.373680183626626, + "grad_norm": 1.8686907034229359, + "learning_rate": 4.71802817736619e-06, + "loss": 0.2997, + "step": 8977 + }, + { + "epoch": 1.3738332058148433, + "grad_norm": 2.14486465113667, + "learning_rate": 4.715924060466145e-06, + "loss": 0.3721, + "step": 8978 + }, + { + "epoch": 1.3739862280030604, + "grad_norm": 2.2985453789295627, + "learning_rate": 4.713820268081879e-06, + "loss": 0.3437, + "step": 8979 + }, + { + "epoch": 1.3741392501912777, + "grad_norm": 2.227704431604356, + "learning_rate": 4.711716800342584e-06, + "loss": 0.3468, + "step": 8980 + }, + { + "epoch": 1.374292272379495, + "grad_norm": 2.204147290592917, + "learning_rate": 4.709613657377449e-06, + "loss": 0.3204, + "step": 8981 + }, + { + "epoch": 1.3744452945677124, + "grad_norm": 2.1549027912943712, + "learning_rate": 4.707510839315642e-06, + "loss": 0.3232, + "step": 8982 + }, + { + "epoch": 1.3745983167559297, + "grad_norm": 1.9921345110290458, + "learning_rate": 4.705408346286291e-06, + "loss": 0.3144, + "step": 8983 + }, + { + "epoch": 1.3747513389441468, + "grad_norm": 2.0591857889441956, + "learning_rate": 4.703306178418529e-06, + "loss": 0.3305, + "step": 8984 + }, + { + "epoch": 1.3749043611323641, + "grad_norm": 2.094890599785986, + "learning_rate": 4.701204335841455e-06, + "loss": 0.2791, + "step": 8985 + }, + { + "epoch": 1.3750573833205815, + "grad_norm": 2.1299628438008753, + "learning_rate": 4.699102818684151e-06, + "loss": 0.2865, + "step": 8986 + }, + { + "epoch": 1.3752104055087988, + "grad_norm": 2.378452431658748, + "learning_rate": 4.6970016270756826e-06, + "loss": 0.4039, + "step": 8987 + }, + { + "epoch": 1.3753634276970161, + "grad_norm": 1.941247460390366, + "learning_rate": 4.694900761145088e-06, + "loss": 0.2924, + "step": 8988 + }, + { + "epoch": 1.3755164498852332, + "grad_norm": 2.0735387694712673, + "learning_rate": 4.692800221021392e-06, + "loss": 0.283, + "step": 8989 + }, + { + "epoch": 1.3756694720734506, + "grad_norm": 2.1748933607293783, + "learning_rate": 4.690700006833595e-06, + "loss": 0.3652, + "step": 8990 + }, + { + "epoch": 1.375822494261668, + "grad_norm": 2.2313085481512647, + "learning_rate": 4.688600118710682e-06, + "loss": 0.3407, + "step": 8991 + }, + { + "epoch": 
1.3759755164498852, + "grad_norm": 2.125396345210953, + "learning_rate": 4.686500556781614e-06, + "loss": 0.3484, + "step": 8992 + }, + { + "epoch": 1.3761285386381026, + "grad_norm": 2.442831053015856, + "learning_rate": 4.684401321175333e-06, + "loss": 0.3715, + "step": 8993 + }, + { + "epoch": 1.3762815608263197, + "grad_norm": 2.264403792995592, + "learning_rate": 4.6823024120207615e-06, + "loss": 0.3353, + "step": 8994 + }, + { + "epoch": 1.376434583014537, + "grad_norm": 2.153718862608068, + "learning_rate": 4.680203829446802e-06, + "loss": 0.3349, + "step": 8995 + }, + { + "epoch": 1.3765876052027544, + "grad_norm": 2.2678638376517317, + "learning_rate": 4.678105573582337e-06, + "loss": 0.3347, + "step": 8996 + }, + { + "epoch": 1.3767406273909717, + "grad_norm": 2.143262059852469, + "learning_rate": 4.676007644556226e-06, + "loss": 0.3476, + "step": 8997 + }, + { + "epoch": 1.376893649579189, + "grad_norm": 1.8505114454183178, + "learning_rate": 4.673910042497312e-06, + "loss": 0.3635, + "step": 8998 + }, + { + "epoch": 1.3770466717674061, + "grad_norm": 1.9620249089509953, + "learning_rate": 4.671812767534424e-06, + "loss": 0.2853, + "step": 8999 + }, + { + "epoch": 1.3771996939556237, + "grad_norm": 2.2096240025262563, + "learning_rate": 4.669715819796354e-06, + "loss": 0.3826, + "step": 9000 + }, + { + "epoch": 1.3773527161438408, + "grad_norm": 2.0471221789313048, + "learning_rate": 4.667619199411886e-06, + "loss": 0.3303, + "step": 9001 + }, + { + "epoch": 1.3775057383320581, + "grad_norm": 2.219398951091194, + "learning_rate": 4.665522906509789e-06, + "loss": 0.3383, + "step": 9002 + }, + { + "epoch": 1.3776587605202755, + "grad_norm": 2.1239169778671663, + "learning_rate": 4.663426941218796e-06, + "loss": 0.3643, + "step": 9003 + }, + { + "epoch": 1.3778117827084928, + "grad_norm": 2.2161074087918413, + "learning_rate": 4.661331303667627e-06, + "loss": 0.3544, + "step": 9004 + }, + { + "epoch": 1.3779648048967101, + "grad_norm": 1.8727061163511225, + "learning_rate": 4.659235993984995e-06, + "loss": 0.2673, + "step": 9005 + }, + { + "epoch": 1.3781178270849272, + "grad_norm": 1.9336210558087923, + "learning_rate": 4.657141012299567e-06, + "loss": 0.3544, + "step": 9006 + }, + { + "epoch": 1.3782708492731446, + "grad_norm": 2.126956772127913, + "learning_rate": 4.655046358740014e-06, + "loss": 0.4919, + "step": 9007 + }, + { + "epoch": 1.378423871461362, + "grad_norm": 2.0412566209981478, + "learning_rate": 4.652952033434978e-06, + "loss": 0.3247, + "step": 9008 + }, + { + "epoch": 1.3785768936495792, + "grad_norm": 1.9837314117934364, + "learning_rate": 4.650858036513067e-06, + "loss": 0.3384, + "step": 9009 + }, + { + "epoch": 1.3787299158377966, + "grad_norm": 2.0790714883936134, + "learning_rate": 4.648764368102894e-06, + "loss": 0.2951, + "step": 9010 + }, + { + "epoch": 1.3788829380260137, + "grad_norm": 2.0441032696122616, + "learning_rate": 4.646671028333039e-06, + "loss": 0.358, + "step": 9011 + }, + { + "epoch": 1.379035960214231, + "grad_norm": 2.026421631547, + "learning_rate": 4.644578017332052e-06, + "loss": 0.4144, + "step": 9012 + }, + { + "epoch": 1.3791889824024484, + "grad_norm": 2.2121542332323854, + "learning_rate": 4.642485335228483e-06, + "loss": 0.3838, + "step": 9013 + }, + { + "epoch": 1.3793420045906657, + "grad_norm": 2.218639370811378, + "learning_rate": 4.640392982150846e-06, + "loss": 0.3431, + "step": 9014 + }, + { + "epoch": 1.379495026778883, + "grad_norm": 2.037830779490401, + "learning_rate": 4.638300958227645e-06, + "loss": 0.3522, + 
"step": 9015 + }, + { + "epoch": 1.3796480489671001, + "grad_norm": 2.410335657240026, + "learning_rate": 4.636209263587358e-06, + "loss": 0.3976, + "step": 9016 + }, + { + "epoch": 1.3798010711553175, + "grad_norm": 2.1241097054830167, + "learning_rate": 4.634117898358441e-06, + "loss": 0.3065, + "step": 9017 + }, + { + "epoch": 1.3799540933435348, + "grad_norm": 2.343825620509157, + "learning_rate": 4.632026862669337e-06, + "loss": 0.3899, + "step": 9018 + }, + { + "epoch": 1.3801071155317521, + "grad_norm": 1.9525031937885948, + "learning_rate": 4.629936156648463e-06, + "loss": 0.2961, + "step": 9019 + }, + { + "epoch": 1.3802601377199695, + "grad_norm": 2.134029971911187, + "learning_rate": 4.627845780424217e-06, + "loss": 0.3464, + "step": 9020 + }, + { + "epoch": 1.3804131599081866, + "grad_norm": 1.9946961858753256, + "learning_rate": 4.625755734124977e-06, + "loss": 0.3302, + "step": 9021 + }, + { + "epoch": 1.380566182096404, + "grad_norm": 1.9751054972586108, + "learning_rate": 4.623666017879098e-06, + "loss": 0.2879, + "step": 9022 + }, + { + "epoch": 1.3807192042846212, + "grad_norm": 2.1883779520669115, + "learning_rate": 4.62157663181493e-06, + "loss": 0.2725, + "step": 9023 + }, + { + "epoch": 1.3808722264728386, + "grad_norm": 1.8160688719204534, + "learning_rate": 4.619487576060777e-06, + "loss": 0.2921, + "step": 9024 + }, + { + "epoch": 1.381025248661056, + "grad_norm": 2.04738295569202, + "learning_rate": 4.6173988507449366e-06, + "loss": 0.3446, + "step": 9025 + }, + { + "epoch": 1.381178270849273, + "grad_norm": 1.970933426371676, + "learning_rate": 4.615310455995697e-06, + "loss": 0.2965, + "step": 9026 + }, + { + "epoch": 1.3813312930374904, + "grad_norm": 2.265268014430741, + "learning_rate": 4.613222391941304e-06, + "loss": 0.2872, + "step": 9027 + }, + { + "epoch": 1.3814843152257077, + "grad_norm": 2.139073356126362, + "learning_rate": 4.611134658709992e-06, + "loss": 0.3425, + "step": 9028 + }, + { + "epoch": 1.381637337413925, + "grad_norm": 2.07292711463194, + "learning_rate": 4.609047256429992e-06, + "loss": 0.3747, + "step": 9029 + }, + { + "epoch": 1.3817903596021424, + "grad_norm": 2.268761758492247, + "learning_rate": 4.60696018522948e-06, + "loss": 0.38, + "step": 9030 + }, + { + "epoch": 1.3819433817903595, + "grad_norm": 2.2366077257417785, + "learning_rate": 4.604873445236645e-06, + "loss": 0.3249, + "step": 9031 + }, + { + "epoch": 1.382096403978577, + "grad_norm": 2.013369317159582, + "learning_rate": 4.602787036579643e-06, + "loss": 0.3385, + "step": 9032 + }, + { + "epoch": 1.3822494261667941, + "grad_norm": 2.0345532889136653, + "learning_rate": 4.600700959386592e-06, + "loss": 0.2601, + "step": 9033 + }, + { + "epoch": 1.3824024483550115, + "grad_norm": 2.1497689903647235, + "learning_rate": 4.598615213785624e-06, + "loss": 0.3804, + "step": 9034 + }, + { + "epoch": 1.3825554705432288, + "grad_norm": 2.249995107110444, + "learning_rate": 4.596529799904825e-06, + "loss": 0.3559, + "step": 9035 + }, + { + "epoch": 1.3827084927314461, + "grad_norm": 1.8239006623904237, + "learning_rate": 4.594444717872269e-06, + "loss": 0.2621, + "step": 9036 + }, + { + "epoch": 1.3828615149196635, + "grad_norm": 2.023991229226579, + "learning_rate": 4.592359967816012e-06, + "loss": 0.3317, + "step": 9037 + }, + { + "epoch": 1.3830145371078806, + "grad_norm": 2.0859580119335326, + "learning_rate": 4.590275549864081e-06, + "loss": 0.3042, + "step": 9038 + }, + { + "epoch": 1.383167559296098, + "grad_norm": 2.376028679244968, + "learning_rate": 
4.5881914641444936e-06, + "loss": 0.3887, + "step": 9039 + }, + { + "epoch": 1.3833205814843152, + "grad_norm": 2.0554311647854164, + "learning_rate": 4.586107710785238e-06, + "loss": 0.3451, + "step": 9040 + }, + { + "epoch": 1.3834736036725326, + "grad_norm": 1.7523628481913938, + "learning_rate": 4.584024289914288e-06, + "loss": 0.3022, + "step": 9041 + }, + { + "epoch": 1.38362662586075, + "grad_norm": 1.9390495654961886, + "learning_rate": 4.581941201659593e-06, + "loss": 0.3678, + "step": 9042 + }, + { + "epoch": 1.383779648048967, + "grad_norm": 2.237567627656652, + "learning_rate": 4.579858446149086e-06, + "loss": 0.4361, + "step": 9043 + }, + { + "epoch": 1.3839326702371844, + "grad_norm": 2.3021361434486405, + "learning_rate": 4.577776023510674e-06, + "loss": 0.3286, + "step": 9044 + }, + { + "epoch": 1.3840856924254017, + "grad_norm": 2.3763182088117545, + "learning_rate": 4.575693933872248e-06, + "loss": 0.3913, + "step": 9045 + }, + { + "epoch": 1.384238714613619, + "grad_norm": 1.9626988253783606, + "learning_rate": 4.573612177361679e-06, + "loss": 0.3182, + "step": 9046 + }, + { + "epoch": 1.3843917368018364, + "grad_norm": 2.173280085060481, + "learning_rate": 4.571530754106813e-06, + "loss": 0.3433, + "step": 9047 + }, + { + "epoch": 1.3845447589900535, + "grad_norm": 1.942616106562521, + "learning_rate": 4.5694496642354815e-06, + "loss": 0.2648, + "step": 9048 + }, + { + "epoch": 1.3846977811782708, + "grad_norm": 2.1229756870525462, + "learning_rate": 4.567368907875489e-06, + "loss": 0.3388, + "step": 9049 + }, + { + "epoch": 1.3848508033664881, + "grad_norm": 1.9885131487226595, + "learning_rate": 4.565288485154624e-06, + "loss": 0.3119, + "step": 9050 + }, + { + "epoch": 1.3850038255547055, + "grad_norm": 1.9222230959456768, + "learning_rate": 4.563208396200651e-06, + "loss": 0.2911, + "step": 9051 + }, + { + "epoch": 1.3851568477429228, + "grad_norm": 2.158206506225906, + "learning_rate": 4.561128641141327e-06, + "loss": 0.3206, + "step": 9052 + }, + { + "epoch": 1.38530986993114, + "grad_norm": 2.19345897921204, + "learning_rate": 4.559049220104366e-06, + "loss": 0.3805, + "step": 9053 + }, + { + "epoch": 1.3854628921193572, + "grad_norm": 2.1317407824996044, + "learning_rate": 4.5569701332174746e-06, + "loss": 0.3375, + "step": 9054 + }, + { + "epoch": 1.3856159143075746, + "grad_norm": 2.3366870170548233, + "learning_rate": 4.554891380608346e-06, + "loss": 0.3741, + "step": 9055 + }, + { + "epoch": 1.385768936495792, + "grad_norm": 2.2540854792574363, + "learning_rate": 4.552812962404637e-06, + "loss": 0.3169, + "step": 9056 + }, + { + "epoch": 1.3859219586840092, + "grad_norm": 2.099517950545955, + "learning_rate": 4.550734878733989e-06, + "loss": 0.3271, + "step": 9057 + }, + { + "epoch": 1.3860749808722264, + "grad_norm": 1.9069516040024563, + "learning_rate": 4.548657129724038e-06, + "loss": 0.2895, + "step": 9058 + }, + { + "epoch": 1.3862280030604437, + "grad_norm": 2.161070641348649, + "learning_rate": 4.54657971550237e-06, + "loss": 0.3233, + "step": 9059 + }, + { + "epoch": 1.386381025248661, + "grad_norm": 2.216026925034713, + "learning_rate": 4.54450263619658e-06, + "loss": 0.3162, + "step": 9060 + }, + { + "epoch": 1.3865340474368784, + "grad_norm": 2.1840129464952094, + "learning_rate": 4.542425891934229e-06, + "loss": 0.3682, + "step": 9061 + }, + { + "epoch": 1.3866870696250957, + "grad_norm": 1.884220281115278, + "learning_rate": 4.540349482842846e-06, + "loss": 0.2823, + "step": 9062 + }, + { + "epoch": 1.3868400918133128, + "grad_norm": 
1.7360218886526524, + "learning_rate": 4.538273409049964e-06, + "loss": 0.2798, + "step": 9063 + }, + { + "epoch": 1.3869931140015304, + "grad_norm": 2.3949601964182103, + "learning_rate": 4.536197670683081e-06, + "loss": 0.3681, + "step": 9064 + }, + { + "epoch": 1.3871461361897475, + "grad_norm": 2.0561106971906646, + "learning_rate": 4.534122267869667e-06, + "loss": 0.3101, + "step": 9065 + }, + { + "epoch": 1.3872991583779648, + "grad_norm": 2.3957391340285383, + "learning_rate": 4.532047200737191e-06, + "loss": 0.4326, + "step": 9066 + }, + { + "epoch": 1.3874521805661821, + "grad_norm": 1.9010857779535084, + "learning_rate": 4.529972469413087e-06, + "loss": 0.3132, + "step": 9067 + }, + { + "epoch": 1.3876052027543995, + "grad_norm": 2.02976579219007, + "learning_rate": 4.527898074024772e-06, + "loss": 0.358, + "step": 9068 + }, + { + "epoch": 1.3877582249426168, + "grad_norm": 2.29210851094873, + "learning_rate": 4.525824014699643e-06, + "loss": 0.3679, + "step": 9069 + }, + { + "epoch": 1.387911247130834, + "grad_norm": 1.9044989375358745, + "learning_rate": 4.523750291565078e-06, + "loss": 0.3186, + "step": 9070 + }, + { + "epoch": 1.3880642693190512, + "grad_norm": 1.9671211186121362, + "learning_rate": 4.52167690474843e-06, + "loss": 0.313, + "step": 9071 + }, + { + "epoch": 1.3882172915072686, + "grad_norm": 2.1843101523101747, + "learning_rate": 4.5196038543770335e-06, + "loss": 0.3622, + "step": 9072 + }, + { + "epoch": 1.388370313695486, + "grad_norm": 1.8997165102498639, + "learning_rate": 4.517531140578205e-06, + "loss": 0.3651, + "step": 9073 + }, + { + "epoch": 1.3885233358837032, + "grad_norm": 2.11857325406712, + "learning_rate": 4.515458763479237e-06, + "loss": 0.4028, + "step": 9074 + }, + { + "epoch": 1.3886763580719204, + "grad_norm": 2.0344891913610037, + "learning_rate": 4.5133867232074e-06, + "loss": 0.2762, + "step": 9075 + }, + { + "epoch": 1.3888293802601377, + "grad_norm": 1.8699787605804676, + "learning_rate": 4.51131501988995e-06, + "loss": 0.2799, + "step": 9076 + }, + { + "epoch": 1.388982402448355, + "grad_norm": 1.9247216761568173, + "learning_rate": 4.509243653654116e-06, + "loss": 0.231, + "step": 9077 + }, + { + "epoch": 1.3891354246365724, + "grad_norm": 1.8389597441884304, + "learning_rate": 4.507172624627109e-06, + "loss": 0.2892, + "step": 9078 + }, + { + "epoch": 1.3892884468247897, + "grad_norm": 2.1670682348965054, + "learning_rate": 4.505101932936119e-06, + "loss": 0.3743, + "step": 9079 + }, + { + "epoch": 1.3894414690130068, + "grad_norm": 2.04514272674738, + "learning_rate": 4.5030315787083126e-06, + "loss": 0.3327, + "step": 9080 + }, + { + "epoch": 1.3895944912012241, + "grad_norm": 2.3185780179520767, + "learning_rate": 4.5009615620708466e-06, + "loss": 0.4264, + "step": 9081 + }, + { + "epoch": 1.3897475133894415, + "grad_norm": 1.994748159454086, + "learning_rate": 4.498891883150842e-06, + "loss": 0.3528, + "step": 9082 + }, + { + "epoch": 1.3899005355776588, + "grad_norm": 2.2945232098348107, + "learning_rate": 4.496822542075403e-06, + "loss": 0.3135, + "step": 9083 + }, + { + "epoch": 1.3900535577658761, + "grad_norm": 2.19865510079339, + "learning_rate": 4.494753538971627e-06, + "loss": 0.3625, + "step": 9084 + }, + { + "epoch": 1.3902065799540932, + "grad_norm": 1.9457973161006021, + "learning_rate": 4.4926848739665695e-06, + "loss": 0.3176, + "step": 9085 + }, + { + "epoch": 1.3903596021423106, + "grad_norm": 2.2547412719369184, + "learning_rate": 4.490616547187275e-06, + "loss": 0.3324, + "step": 9086 + }, + { + "epoch": 
1.390512624330528, + "grad_norm": 2.331565110391067, + "learning_rate": 4.488548558760778e-06, + "loss": 0.3423, + "step": 9087 + }, + { + "epoch": 1.3906656465187452, + "grad_norm": 2.0880204887768783, + "learning_rate": 4.486480908814068e-06, + "loss": 0.3643, + "step": 9088 + }, + { + "epoch": 1.3908186687069626, + "grad_norm": 2.1681981316566596, + "learning_rate": 4.4844135974741384e-06, + "loss": 0.356, + "step": 9089 + }, + { + "epoch": 1.3909716908951797, + "grad_norm": 2.149384143224433, + "learning_rate": 4.48234662486795e-06, + "loss": 0.3372, + "step": 9090 + }, + { + "epoch": 1.391124713083397, + "grad_norm": 2.1414181789883484, + "learning_rate": 4.480279991122434e-06, + "loss": 0.3571, + "step": 9091 + }, + { + "epoch": 1.3912777352716144, + "grad_norm": 1.985829509300672, + "learning_rate": 4.478213696364519e-06, + "loss": 0.2794, + "step": 9092 + }, + { + "epoch": 1.3914307574598317, + "grad_norm": 2.155329738496481, + "learning_rate": 4.476147740721108e-06, + "loss": 0.3552, + "step": 9093 + }, + { + "epoch": 1.391583779648049, + "grad_norm": 2.0895352644773655, + "learning_rate": 4.4740821243190655e-06, + "loss": 0.3342, + "step": 9094 + }, + { + "epoch": 1.3917368018362661, + "grad_norm": 2.031285782158425, + "learning_rate": 4.472016847285261e-06, + "loss": 0.3187, + "step": 9095 + }, + { + "epoch": 1.3918898240244835, + "grad_norm": 2.179929418116259, + "learning_rate": 4.469951909746527e-06, + "loss": 0.2988, + "step": 9096 + }, + { + "epoch": 1.3920428462127008, + "grad_norm": 2.2347385172890992, + "learning_rate": 4.467887311829681e-06, + "loss": 0.3867, + "step": 9097 + }, + { + "epoch": 1.3921958684009181, + "grad_norm": 2.03613306275373, + "learning_rate": 4.465823053661515e-06, + "loss": 0.4267, + "step": 9098 + }, + { + "epoch": 1.3923488905891355, + "grad_norm": 2.362451409078388, + "learning_rate": 4.463759135368807e-06, + "loss": 0.3879, + "step": 9099 + }, + { + "epoch": 1.3925019127773526, + "grad_norm": 2.3193830575924856, + "learning_rate": 4.461695557078307e-06, + "loss": 0.3888, + "step": 9100 + }, + { + "epoch": 1.3926549349655701, + "grad_norm": 2.0010960754941793, + "learning_rate": 4.45963231891675e-06, + "loss": 0.3607, + "step": 9101 + }, + { + "epoch": 1.3928079571537872, + "grad_norm": 1.956606094360901, + "learning_rate": 4.457569421010846e-06, + "loss": 0.2785, + "step": 9102 + }, + { + "epoch": 1.3929609793420046, + "grad_norm": 1.659279994173601, + "learning_rate": 4.455506863487285e-06, + "loss": 0.2374, + "step": 9103 + }, + { + "epoch": 1.393114001530222, + "grad_norm": 2.005492432335445, + "learning_rate": 4.453444646472736e-06, + "loss": 0.3199, + "step": 9104 + }, + { + "epoch": 1.3932670237184392, + "grad_norm": 2.291401472114738, + "learning_rate": 4.451382770093855e-06, + "loss": 0.3307, + "step": 9105 + }, + { + "epoch": 1.3934200459066566, + "grad_norm": 2.2285881389564364, + "learning_rate": 4.44932123447726e-06, + "loss": 0.3942, + "step": 9106 + }, + { + "epoch": 1.3935730680948737, + "grad_norm": 2.5366833904976667, + "learning_rate": 4.447260039749562e-06, + "loss": 0.4648, + "step": 9107 + }, + { + "epoch": 1.393726090283091, + "grad_norm": 2.1735252628773116, + "learning_rate": 4.445199186037352e-06, + "loss": 0.357, + "step": 9108 + }, + { + "epoch": 1.3938791124713084, + "grad_norm": 1.949213191530467, + "learning_rate": 4.443138673467183e-06, + "loss": 0.2779, + "step": 9109 + }, + { + "epoch": 1.3940321346595257, + "grad_norm": 2.0589593708206877, + "learning_rate": 4.441078502165613e-06, + "loss": 0.3583, + "step": 
9110 + }, + { + "epoch": 1.394185156847743, + "grad_norm": 2.0886474227263174, + "learning_rate": 4.43901867225916e-06, + "loss": 0.3245, + "step": 9111 + }, + { + "epoch": 1.3943381790359601, + "grad_norm": 2.2564496042284574, + "learning_rate": 4.4369591838743175e-06, + "loss": 0.4147, + "step": 9112 + }, + { + "epoch": 1.3944912012241775, + "grad_norm": 2.224803250753932, + "learning_rate": 4.434900037137579e-06, + "loss": 0.3826, + "step": 9113 + }, + { + "epoch": 1.3946442234123948, + "grad_norm": 1.9444166418014062, + "learning_rate": 4.432841232175404e-06, + "loss": 0.3276, + "step": 9114 + }, + { + "epoch": 1.3947972456006121, + "grad_norm": 1.9814704822594493, + "learning_rate": 4.43078276911422e-06, + "loss": 0.3291, + "step": 9115 + }, + { + "epoch": 1.3949502677888295, + "grad_norm": 2.202246242241708, + "learning_rate": 4.428724648080457e-06, + "loss": 0.3486, + "step": 9116 + }, + { + "epoch": 1.3951032899770466, + "grad_norm": 2.1543385539147315, + "learning_rate": 4.426666869200507e-06, + "loss": 0.3825, + "step": 9117 + }, + { + "epoch": 1.395256312165264, + "grad_norm": 1.9988288994094405, + "learning_rate": 4.424609432600749e-06, + "loss": 0.3201, + "step": 9118 + }, + { + "epoch": 1.3954093343534812, + "grad_norm": 2.339767082170093, + "learning_rate": 4.422552338407537e-06, + "loss": 0.355, + "step": 9119 + }, + { + "epoch": 1.3955623565416986, + "grad_norm": 2.377307692928964, + "learning_rate": 4.4204955867472035e-06, + "loss": 0.392, + "step": 9120 + }, + { + "epoch": 1.395715378729916, + "grad_norm": 1.8196435714068888, + "learning_rate": 4.418439177746064e-06, + "loss": 0.3027, + "step": 9121 + }, + { + "epoch": 1.395868400918133, + "grad_norm": 2.2604499545369934, + "learning_rate": 4.41638311153041e-06, + "loss": 0.3455, + "step": 9122 + }, + { + "epoch": 1.3960214231063504, + "grad_norm": 2.033493361083444, + "learning_rate": 4.414327388226511e-06, + "loss": 0.3, + "step": 9123 + }, + { + "epoch": 1.3961744452945677, + "grad_norm": 2.089188809076427, + "learning_rate": 4.412272007960621e-06, + "loss": 0.3484, + "step": 9124 + }, + { + "epoch": 1.396327467482785, + "grad_norm": 2.0463071223170473, + "learning_rate": 4.4102169708589635e-06, + "loss": 0.3231, + "step": 9125 + }, + { + "epoch": 1.3964804896710024, + "grad_norm": 2.088456068827583, + "learning_rate": 4.4081622770477505e-06, + "loss": 0.411, + "step": 9126 + }, + { + "epoch": 1.3966335118592195, + "grad_norm": 2.1862769519850844, + "learning_rate": 4.4061079266531685e-06, + "loss": 0.3502, + "step": 9127 + }, + { + "epoch": 1.3967865340474368, + "grad_norm": 2.45723775464347, + "learning_rate": 4.404053919801381e-06, + "loss": 0.3739, + "step": 9128 + }, + { + "epoch": 1.3969395562356541, + "grad_norm": 2.25312171783395, + "learning_rate": 4.4020002566185336e-06, + "loss": 0.3956, + "step": 9129 + }, + { + "epoch": 1.3970925784238715, + "grad_norm": 2.303669832441205, + "learning_rate": 4.39994693723075e-06, + "loss": 0.3633, + "step": 9130 + }, + { + "epoch": 1.3972456006120888, + "grad_norm": 2.158839638357098, + "learning_rate": 4.3978939617641324e-06, + "loss": 0.3417, + "step": 9131 + }, + { + "epoch": 1.397398622800306, + "grad_norm": 2.1069280338103487, + "learning_rate": 4.395841330344762e-06, + "loss": 0.3423, + "step": 9132 + }, + { + "epoch": 1.3975516449885235, + "grad_norm": 1.982236354173458, + "learning_rate": 4.393789043098697e-06, + "loss": 0.346, + "step": 9133 + }, + { + "epoch": 1.3977046671767406, + "grad_norm": 2.192432973522562, + "learning_rate": 4.391737100151984e-06, + 
"loss": 0.4102, + "step": 9134 + }, + { + "epoch": 1.397857689364958, + "grad_norm": 2.2270581821455844, + "learning_rate": 4.3896855016306324e-06, + "loss": 0.4185, + "step": 9135 + }, + { + "epoch": 1.3980107115531752, + "grad_norm": 2.299865181546426, + "learning_rate": 4.387634247660638e-06, + "loss": 0.4115, + "step": 9136 + }, + { + "epoch": 1.3981637337413926, + "grad_norm": 2.474822061012731, + "learning_rate": 4.385583338367988e-06, + "loss": 0.3599, + "step": 9137 + }, + { + "epoch": 1.39831675592961, + "grad_norm": 1.9942991712556746, + "learning_rate": 4.38353277387862e-06, + "loss": 0.3323, + "step": 9138 + }, + { + "epoch": 1.398469778117827, + "grad_norm": 1.9557470552238374, + "learning_rate": 4.381482554318481e-06, + "loss": 0.2981, + "step": 9139 + }, + { + "epoch": 1.3986228003060444, + "grad_norm": 1.9394714605145627, + "learning_rate": 4.379432679813482e-06, + "loss": 0.3279, + "step": 9140 + }, + { + "epoch": 1.3987758224942617, + "grad_norm": 2.227238700846652, + "learning_rate": 4.377383150489503e-06, + "loss": 0.3151, + "step": 9141 + }, + { + "epoch": 1.398928844682479, + "grad_norm": 1.9792871389602236, + "learning_rate": 4.375333966472423e-06, + "loss": 0.2772, + "step": 9142 + }, + { + "epoch": 1.3990818668706964, + "grad_norm": 1.9119147272515082, + "learning_rate": 4.373285127888093e-06, + "loss": 0.3299, + "step": 9143 + }, + { + "epoch": 1.3992348890589135, + "grad_norm": 2.0573878948922553, + "learning_rate": 4.3712366348623256e-06, + "loss": 0.3293, + "step": 9144 + }, + { + "epoch": 1.3993879112471308, + "grad_norm": 2.012802138222529, + "learning_rate": 4.369188487520942e-06, + "loss": 0.3336, + "step": 9145 + }, + { + "epoch": 1.3995409334353481, + "grad_norm": 2.168983085038054, + "learning_rate": 4.36714068598972e-06, + "loss": 0.4029, + "step": 9146 + }, + { + "epoch": 1.3996939556235655, + "grad_norm": 2.1144699722556517, + "learning_rate": 4.365093230394425e-06, + "loss": 0.337, + "step": 9147 + }, + { + "epoch": 1.3998469778117828, + "grad_norm": 2.0987987285199305, + "learning_rate": 4.363046120860799e-06, + "loss": 0.3245, + "step": 9148 + }, + { + "epoch": 1.4, + "grad_norm": 2.138558119857808, + "learning_rate": 4.360999357514562e-06, + "loss": 0.3627, + "step": 9149 + }, + { + "epoch": 1.4001530221882172, + "grad_norm": 2.1546083763172135, + "learning_rate": 4.358952940481414e-06, + "loss": 0.3407, + "step": 9150 + }, + { + "epoch": 1.4003060443764346, + "grad_norm": 2.13815010724717, + "learning_rate": 4.356906869887034e-06, + "loss": 0.3007, + "step": 9151 + }, + { + "epoch": 1.400459066564652, + "grad_norm": 2.084680308601072, + "learning_rate": 4.354861145857079e-06, + "loss": 0.3639, + "step": 9152 + }, + { + "epoch": 1.4006120887528692, + "grad_norm": 1.9804950825280374, + "learning_rate": 4.352815768517185e-06, + "loss": 0.2538, + "step": 9153 + }, + { + "epoch": 1.4007651109410864, + "grad_norm": 1.9922835195123851, + "learning_rate": 4.350770737992967e-06, + "loss": 0.2989, + "step": 9154 + }, + { + "epoch": 1.4009181331293037, + "grad_norm": 2.0487239910505006, + "learning_rate": 4.3487260544100176e-06, + "loss": 0.3304, + "step": 9155 + }, + { + "epoch": 1.401071155317521, + "grad_norm": 2.0799090584250557, + "learning_rate": 4.346681717893909e-06, + "loss": 0.3208, + "step": 9156 + }, + { + "epoch": 1.4012241775057384, + "grad_norm": 2.3668579542580246, + "learning_rate": 4.3446377285701924e-06, + "loss": 0.3714, + "step": 9157 + }, + { + "epoch": 1.4013771996939557, + "grad_norm": 2.3259095379886516, + "learning_rate": 
4.342594086564396e-06, + "loss": 0.3701, + "step": 9158 + }, + { + "epoch": 1.4015302218821728, + "grad_norm": 2.2272276688925636, + "learning_rate": 4.340550792002029e-06, + "loss": 0.3585, + "step": 9159 + }, + { + "epoch": 1.4016832440703901, + "grad_norm": 2.041804323945471, + "learning_rate": 4.338507845008578e-06, + "loss": 0.3692, + "step": 9160 + }, + { + "epoch": 1.4018362662586075, + "grad_norm": 2.2361335197569723, + "learning_rate": 4.3364652457095095e-06, + "loss": 0.3295, + "step": 9161 + }, + { + "epoch": 1.4019892884468248, + "grad_norm": 2.1316794949736573, + "learning_rate": 4.3344229942302605e-06, + "loss": 0.3014, + "step": 9162 + }, + { + "epoch": 1.4021423106350421, + "grad_norm": 2.06167169272297, + "learning_rate": 4.332381090696267e-06, + "loss": 0.341, + "step": 9163 + }, + { + "epoch": 1.4022953328232592, + "grad_norm": 1.9514353960688686, + "learning_rate": 4.3303395352329195e-06, + "loss": 0.2991, + "step": 9164 + }, + { + "epoch": 1.4024483550114768, + "grad_norm": 2.2184375708560595, + "learning_rate": 4.3282983279655965e-06, + "loss": 0.2893, + "step": 9165 + }, + { + "epoch": 1.402601377199694, + "grad_norm": 2.0421161222765765, + "learning_rate": 4.326257469019668e-06, + "loss": 0.2997, + "step": 9166 + }, + { + "epoch": 1.4027543993879112, + "grad_norm": 1.9957994860733428, + "learning_rate": 4.324216958520462e-06, + "loss": 0.3234, + "step": 9167 + }, + { + "epoch": 1.4029074215761286, + "grad_norm": 2.4207732562865116, + "learning_rate": 4.3221767965932915e-06, + "loss": 0.344, + "step": 9168 + }, + { + "epoch": 1.403060443764346, + "grad_norm": 1.8290840140032734, + "learning_rate": 4.320136983363463e-06, + "loss": 0.2701, + "step": 9169 + }, + { + "epoch": 1.4032134659525632, + "grad_norm": 2.079210865586468, + "learning_rate": 4.318097518956236e-06, + "loss": 0.3439, + "step": 9170 + }, + { + "epoch": 1.4033664881407804, + "grad_norm": 2.3346282697773617, + "learning_rate": 4.31605840349687e-06, + "loss": 0.3681, + "step": 9171 + }, + { + "epoch": 1.4035195103289977, + "grad_norm": 2.3745803397537566, + "learning_rate": 4.314019637110598e-06, + "loss": 0.4203, + "step": 9172 + }, + { + "epoch": 1.403672532517215, + "grad_norm": 2.2195262607467954, + "learning_rate": 4.311981219922616e-06, + "loss": 0.3696, + "step": 9173 + }, + { + "epoch": 1.4038255547054324, + "grad_norm": 2.449637868284868, + "learning_rate": 4.309943152058122e-06, + "loss": 0.4486, + "step": 9174 + }, + { + "epoch": 1.4039785768936497, + "grad_norm": 2.3354398337723348, + "learning_rate": 4.30790543364228e-06, + "loss": 0.413, + "step": 9175 + }, + { + "epoch": 1.4041315990818668, + "grad_norm": 2.036115746289367, + "learning_rate": 4.305868064800233e-06, + "loss": 0.3874, + "step": 9176 + }, + { + "epoch": 1.4042846212700841, + "grad_norm": 2.0707373753851543, + "learning_rate": 4.303831045657102e-06, + "loss": 0.297, + "step": 9177 + }, + { + "epoch": 1.4044376434583015, + "grad_norm": 2.2471500234006294, + "learning_rate": 4.301794376337991e-06, + "loss": 0.3593, + "step": 9178 + }, + { + "epoch": 1.4045906656465188, + "grad_norm": 2.130251359667783, + "learning_rate": 4.29975805696798e-06, + "loss": 0.3498, + "step": 9179 + }, + { + "epoch": 1.4047436878347361, + "grad_norm": 2.135027547460175, + "learning_rate": 4.297722087672125e-06, + "loss": 0.2999, + "step": 9180 + }, + { + "epoch": 1.4048967100229532, + "grad_norm": 2.021138639473653, + "learning_rate": 4.295686468575464e-06, + "loss": 0.2895, + "step": 9181 + }, + { + "epoch": 1.4050497322111706, + "grad_norm": 
2.0694117569321455, + "learning_rate": 4.2936511998030116e-06, + "loss": 0.3878, + "step": 9182 + }, + { + "epoch": 1.405202754399388, + "grad_norm": 2.098084448075512, + "learning_rate": 4.29161628147976e-06, + "loss": 0.3577, + "step": 9183 + }, + { + "epoch": 1.4053557765876052, + "grad_norm": 2.300338937806773, + "learning_rate": 4.289581713730691e-06, + "loss": 0.3517, + "step": 9184 + }, + { + "epoch": 1.4055087987758226, + "grad_norm": 2.2472611334731547, + "learning_rate": 4.287547496680744e-06, + "loss": 0.3293, + "step": 9185 + }, + { + "epoch": 1.4056618209640397, + "grad_norm": 1.9924191518899794, + "learning_rate": 4.2855136304548495e-06, + "loss": 0.3571, + "step": 9186 + }, + { + "epoch": 1.405814843152257, + "grad_norm": 2.5741502175394353, + "learning_rate": 4.283480115177925e-06, + "loss": 0.3318, + "step": 9187 + }, + { + "epoch": 1.4059678653404744, + "grad_norm": 2.2106988688252307, + "learning_rate": 4.2814469509748465e-06, + "loss": 0.3486, + "step": 9188 + }, + { + "epoch": 1.4061208875286917, + "grad_norm": 2.037048253663022, + "learning_rate": 4.279414137970478e-06, + "loss": 0.3917, + "step": 9189 + }, + { + "epoch": 1.406273909716909, + "grad_norm": 2.367760531391993, + "learning_rate": 4.277381676289673e-06, + "loss": 0.4048, + "step": 9190 + }, + { + "epoch": 1.4064269319051261, + "grad_norm": 2.3062782712082472, + "learning_rate": 4.27534956605724e-06, + "loss": 0.3714, + "step": 9191 + }, + { + "epoch": 1.4065799540933435, + "grad_norm": 2.023102891670905, + "learning_rate": 4.273317807397989e-06, + "loss": 0.3212, + "step": 9192 + }, + { + "epoch": 1.4067329762815608, + "grad_norm": 2.2230748607200814, + "learning_rate": 4.271286400436697e-06, + "loss": 0.3564, + "step": 9193 + }, + { + "epoch": 1.4068859984697781, + "grad_norm": 1.8733281634689514, + "learning_rate": 4.269255345298111e-06, + "loss": 0.336, + "step": 9194 + }, + { + "epoch": 1.4070390206579955, + "grad_norm": 2.0370688321086527, + "learning_rate": 4.267224642106977e-06, + "loss": 0.3371, + "step": 9195 + }, + { + "epoch": 1.4071920428462126, + "grad_norm": 2.140419683830952, + "learning_rate": 4.265194290988008e-06, + "loss": 0.3348, + "step": 9196 + }, + { + "epoch": 1.40734506503443, + "grad_norm": 2.0255097904919013, + "learning_rate": 4.2631642920658845e-06, + "loss": 0.3399, + "step": 9197 + }, + { + "epoch": 1.4074980872226472, + "grad_norm": 2.6557100376557865, + "learning_rate": 4.261134645465288e-06, + "loss": 0.3875, + "step": 9198 + }, + { + "epoch": 1.4076511094108646, + "grad_norm": 2.3843033837943812, + "learning_rate": 4.259105351310864e-06, + "loss": 0.3838, + "step": 9199 + }, + { + "epoch": 1.407804131599082, + "grad_norm": 2.2177489481232744, + "learning_rate": 4.2570764097272385e-06, + "loss": 0.3138, + "step": 9200 + }, + { + "epoch": 1.407957153787299, + "grad_norm": 2.1430301544228767, + "learning_rate": 4.255047820839018e-06, + "loss": 0.3316, + "step": 9201 + }, + { + "epoch": 1.4081101759755166, + "grad_norm": 2.03948756178882, + "learning_rate": 4.253019584770784e-06, + "loss": 0.3338, + "step": 9202 + }, + { + "epoch": 1.4082631981637337, + "grad_norm": 2.7064411052587674, + "learning_rate": 4.2509917016471e-06, + "loss": 0.3696, + "step": 9203 + }, + { + "epoch": 1.408416220351951, + "grad_norm": 2.157120376788771, + "learning_rate": 4.248964171592506e-06, + "loss": 0.3714, + "step": 9204 + }, + { + "epoch": 1.4085692425401684, + "grad_norm": 1.9402279140569512, + "learning_rate": 4.246936994731521e-06, + "loss": 0.3287, + "step": 9205 + }, + { + "epoch": 
1.4087222647283857, + "grad_norm": 2.0005560931211925, + "learning_rate": 4.2449101711886405e-06, + "loss": 0.265, + "step": 9206 + }, + { + "epoch": 1.408875286916603, + "grad_norm": 2.0099989536188745, + "learning_rate": 4.242883701088342e-06, + "loss": 0.3342, + "step": 9207 + }, + { + "epoch": 1.4090283091048201, + "grad_norm": 1.89190018721622, + "learning_rate": 4.240857584555075e-06, + "loss": 0.3328, + "step": 9208 + }, + { + "epoch": 1.4091813312930375, + "grad_norm": 2.094044689048361, + "learning_rate": 4.2388318217132755e-06, + "loss": 0.3219, + "step": 9209 + }, + { + "epoch": 1.4093343534812548, + "grad_norm": 2.1431376339884576, + "learning_rate": 4.23680641268735e-06, + "loss": 0.3437, + "step": 9210 + }, + { + "epoch": 1.4094873756694721, + "grad_norm": 2.1750133312455393, + "learning_rate": 4.2347813576016896e-06, + "loss": 0.3898, + "step": 9211 + }, + { + "epoch": 1.4096403978576895, + "grad_norm": 2.026617738848952, + "learning_rate": 4.232756656580655e-06, + "loss": 0.3322, + "step": 9212 + }, + { + "epoch": 1.4097934200459066, + "grad_norm": 2.2552918578481944, + "learning_rate": 4.2307323097486045e-06, + "loss": 0.3443, + "step": 9213 + }, + { + "epoch": 1.409946442234124, + "grad_norm": 2.237233568584304, + "learning_rate": 4.228708317229849e-06, + "loss": 0.3377, + "step": 9214 + }, + { + "epoch": 1.4100994644223412, + "grad_norm": 2.1424246978573374, + "learning_rate": 4.226684679148687e-06, + "loss": 0.3334, + "step": 9215 + }, + { + "epoch": 1.4102524866105586, + "grad_norm": 2.0460843887698483, + "learning_rate": 4.2246613956294135e-06, + "loss": 0.3062, + "step": 9216 + }, + { + "epoch": 1.410405508798776, + "grad_norm": 2.144472320092002, + "learning_rate": 4.222638466796272e-06, + "loss": 0.2989, + "step": 9217 + }, + { + "epoch": 1.410558530986993, + "grad_norm": 1.9545756311695621, + "learning_rate": 4.220615892773501e-06, + "loss": 0.2725, + "step": 9218 + }, + { + "epoch": 1.4107115531752104, + "grad_norm": 2.2589372985105705, + "learning_rate": 4.218593673685324e-06, + "loss": 0.3786, + "step": 9219 + }, + { + "epoch": 1.4108645753634277, + "grad_norm": 2.010967215877556, + "learning_rate": 4.2165718096559196e-06, + "loss": 0.2905, + "step": 9220 + }, + { + "epoch": 1.411017597551645, + "grad_norm": 2.0807598973462884, + "learning_rate": 4.214550300809468e-06, + "loss": 0.3706, + "step": 9221 + }, + { + "epoch": 1.4111706197398624, + "grad_norm": 2.365584366602693, + "learning_rate": 4.212529147270119e-06, + "loss": 0.397, + "step": 9222 + }, + { + "epoch": 1.4113236419280795, + "grad_norm": 2.510130358885097, + "learning_rate": 4.210508349161989e-06, + "loss": 0.415, + "step": 9223 + }, + { + "epoch": 1.4114766641162968, + "grad_norm": 2.2722053771676265, + "learning_rate": 4.208487906609193e-06, + "loss": 0.3437, + "step": 9224 + }, + { + "epoch": 1.4116296863045141, + "grad_norm": 2.2739261112065448, + "learning_rate": 4.2064678197358155e-06, + "loss": 0.3325, + "step": 9225 + }, + { + "epoch": 1.4117827084927315, + "grad_norm": 2.0286388443546968, + "learning_rate": 4.204448088665906e-06, + "loss": 0.2738, + "step": 9226 + }, + { + "epoch": 1.4119357306809488, + "grad_norm": 2.021410749339718, + "learning_rate": 4.2024287135235156e-06, + "loss": 0.3109, + "step": 9227 + }, + { + "epoch": 1.412088752869166, + "grad_norm": 2.1979983118021713, + "learning_rate": 4.200409694432658e-06, + "loss": 0.3539, + "step": 9228 + }, + { + "epoch": 1.4122417750573832, + "grad_norm": 2.0107755272944243, + "learning_rate": 4.198391031517328e-06, + "loss": 0.3057, + 
"step": 9229 + }, + { + "epoch": 1.4123947972456006, + "grad_norm": 1.9654965383083711, + "learning_rate": 4.196372724901502e-06, + "loss": 0.3215, + "step": 9230 + }, + { + "epoch": 1.412547819433818, + "grad_norm": 2.0359147985124344, + "learning_rate": 4.1943547747091306e-06, + "loss": 0.2846, + "step": 9231 + }, + { + "epoch": 1.4127008416220352, + "grad_norm": 2.098622048300682, + "learning_rate": 4.192337181064145e-06, + "loss": 0.3234, + "step": 9232 + }, + { + "epoch": 1.4128538638102524, + "grad_norm": 2.4112177721252963, + "learning_rate": 4.190319944090452e-06, + "loss": 0.4037, + "step": 9233 + }, + { + "epoch": 1.41300688599847, + "grad_norm": 2.1986701033338933, + "learning_rate": 4.1883030639119385e-06, + "loss": 0.3305, + "step": 9234 + }, + { + "epoch": 1.413159908186687, + "grad_norm": 2.1304605397689422, + "learning_rate": 4.18628654065247e-06, + "loss": 0.3071, + "step": 9235 + }, + { + "epoch": 1.4133129303749044, + "grad_norm": 2.5068322091767747, + "learning_rate": 4.1842703744358885e-06, + "loss": 0.4024, + "step": 9236 + }, + { + "epoch": 1.4134659525631217, + "grad_norm": 1.8202902088615422, + "learning_rate": 4.182254565386015e-06, + "loss": 0.2486, + "step": 9237 + }, + { + "epoch": 1.413618974751339, + "grad_norm": 1.968323001514379, + "learning_rate": 4.180239113626646e-06, + "loss": 0.3794, + "step": 9238 + }, + { + "epoch": 1.4137719969395564, + "grad_norm": 2.0430665390514577, + "learning_rate": 4.1782240192815614e-06, + "loss": 0.3269, + "step": 9239 + }, + { + "epoch": 1.4139250191277735, + "grad_norm": 2.2109860232560603, + "learning_rate": 4.176209282474515e-06, + "loss": 0.319, + "step": 9240 + }, + { + "epoch": 1.4140780413159908, + "grad_norm": 2.0509503774215148, + "learning_rate": 4.174194903329239e-06, + "loss": 0.33, + "step": 9241 + }, + { + "epoch": 1.4142310635042081, + "grad_norm": 2.07717813707438, + "learning_rate": 4.1721808819694445e-06, + "loss": 0.2624, + "step": 9242 + }, + { + "epoch": 1.4143840856924255, + "grad_norm": 2.294446630754429, + "learning_rate": 4.17016721851882e-06, + "loss": 0.3071, + "step": 9243 + }, + { + "epoch": 1.4145371078806428, + "grad_norm": 2.5392494654134987, + "learning_rate": 4.168153913101029e-06, + "loss": 0.3624, + "step": 9244 + }, + { + "epoch": 1.41469013006886, + "grad_norm": 1.887829520244782, + "learning_rate": 4.16614096583973e-06, + "loss": 0.3119, + "step": 9245 + }, + { + "epoch": 1.4148431522570772, + "grad_norm": 2.5210457622741456, + "learning_rate": 4.1641283768585315e-06, + "loss": 0.3963, + "step": 9246 + }, + { + "epoch": 1.4149961744452946, + "grad_norm": 2.0711867206232166, + "learning_rate": 4.162116146281036e-06, + "loss": 0.296, + "step": 9247 + }, + { + "epoch": 1.415149196633512, + "grad_norm": 2.062152923881803, + "learning_rate": 4.160104274230833e-06, + "loss": 0.3765, + "step": 9248 + }, + { + "epoch": 1.4153022188217292, + "grad_norm": 2.0768357136167945, + "learning_rate": 4.1580927608314645e-06, + "loss": 0.4489, + "step": 9249 + }, + { + "epoch": 1.4154552410099464, + "grad_norm": 2.096843231214301, + "learning_rate": 4.156081606206478e-06, + "loss": 0.3397, + "step": 9250 + }, + { + "epoch": 1.4156082631981637, + "grad_norm": 1.947566872119692, + "learning_rate": 4.154070810479385e-06, + "loss": 0.316, + "step": 9251 + }, + { + "epoch": 1.415761285386381, + "grad_norm": 2.180262409466973, + "learning_rate": 4.152060373773665e-06, + "loss": 0.3205, + "step": 9252 + }, + { + "epoch": 1.4159143075745984, + "grad_norm": 2.19393243995704, + "learning_rate": 
4.150050296212799e-06, + "loss": 0.328, + "step": 9253 + }, + { + "epoch": 1.4160673297628157, + "grad_norm": 2.2667999392118077, + "learning_rate": 4.148040577920233e-06, + "loss": 0.3422, + "step": 9254 + }, + { + "epoch": 1.4162203519510328, + "grad_norm": 2.3500757721659347, + "learning_rate": 4.146031219019381e-06, + "loss": 0.3223, + "step": 9255 + }, + { + "epoch": 1.4163733741392501, + "grad_norm": 2.038336086469945, + "learning_rate": 4.144022219633656e-06, + "loss": 0.3369, + "step": 9256 + }, + { + "epoch": 1.4165263963274675, + "grad_norm": 2.47373157191876, + "learning_rate": 4.142013579886435e-06, + "loss": 0.3522, + "step": 9257 + }, + { + "epoch": 1.4166794185156848, + "grad_norm": 2.006527184254753, + "learning_rate": 4.140005299901076e-06, + "loss": 0.3342, + "step": 9258 + }, + { + "epoch": 1.4168324407039021, + "grad_norm": 2.0188062531388566, + "learning_rate": 4.137997379800916e-06, + "loss": 0.2992, + "step": 9259 + }, + { + "epoch": 1.4169854628921192, + "grad_norm": 2.1837388368801305, + "learning_rate": 4.135989819709271e-06, + "loss": 0.3306, + "step": 9260 + }, + { + "epoch": 1.4171384850803366, + "grad_norm": 2.0237473797483907, + "learning_rate": 4.1339826197494305e-06, + "loss": 0.3127, + "step": 9261 + }, + { + "epoch": 1.417291507268554, + "grad_norm": 2.056314469069344, + "learning_rate": 4.131975780044665e-06, + "loss": 0.3071, + "step": 9262 + }, + { + "epoch": 1.4174445294567712, + "grad_norm": 2.196217556752408, + "learning_rate": 4.129969300718223e-06, + "loss": 0.3395, + "step": 9263 + }, + { + "epoch": 1.4175975516449886, + "grad_norm": 2.1032234644707835, + "learning_rate": 4.1279631818933295e-06, + "loss": 0.2709, + "step": 9264 + }, + { + "epoch": 1.4177505738332057, + "grad_norm": 2.4031479684877817, + "learning_rate": 4.125957423693186e-06, + "loss": 0.42, + "step": 9265 + }, + { + "epoch": 1.4179035960214232, + "grad_norm": 2.0786897004206732, + "learning_rate": 4.123952026240982e-06, + "loss": 0.3923, + "step": 9266 + }, + { + "epoch": 1.4180566182096404, + "grad_norm": 2.251632349890319, + "learning_rate": 4.121946989659869e-06, + "loss": 0.3852, + "step": 9267 + }, + { + "epoch": 1.4182096403978577, + "grad_norm": 2.555790821467786, + "learning_rate": 4.119942314072982e-06, + "loss": 0.411, + "step": 9268 + }, + { + "epoch": 1.418362662586075, + "grad_norm": 2.349511897533703, + "learning_rate": 4.117937999603448e-06, + "loss": 0.3667, + "step": 9269 + }, + { + "epoch": 1.4185156847742924, + "grad_norm": 2.4392205660352904, + "learning_rate": 4.115934046374348e-06, + "loss": 0.316, + "step": 9270 + }, + { + "epoch": 1.4186687069625097, + "grad_norm": 2.148913506260671, + "learning_rate": 4.1139304545087545e-06, + "loss": 0.3347, + "step": 9271 + }, + { + "epoch": 1.4188217291507268, + "grad_norm": 2.0878267481393658, + "learning_rate": 4.111927224129724e-06, + "loss": 0.342, + "step": 9272 + }, + { + "epoch": 1.4189747513389441, + "grad_norm": 2.161297083283504, + "learning_rate": 4.109924355360271e-06, + "loss": 0.3463, + "step": 9273 + }, + { + "epoch": 1.4191277735271615, + "grad_norm": 1.976150388866175, + "learning_rate": 4.107921848323409e-06, + "loss": 0.3227, + "step": 9274 + }, + { + "epoch": 1.4192807957153788, + "grad_norm": 2.19615636726198, + "learning_rate": 4.1059197031421185e-06, + "loss": 0.4099, + "step": 9275 + }, + { + "epoch": 1.4194338179035961, + "grad_norm": 1.9991519943828997, + "learning_rate": 4.1039179199393495e-06, + "loss": 0.2936, + "step": 9276 + }, + { + "epoch": 1.4195868400918132, + "grad_norm": 
2.1666577229545427, + "learning_rate": 4.101916498838052e-06, + "loss": 0.3918, + "step": 9277 + }, + { + "epoch": 1.4197398622800306, + "grad_norm": 2.0607595100678964, + "learning_rate": 4.099915439961135e-06, + "loss": 0.2556, + "step": 9278 + }, + { + "epoch": 1.419892884468248, + "grad_norm": 2.2451488681433407, + "learning_rate": 4.097914743431491e-06, + "loss": 0.3761, + "step": 9279 + }, + { + "epoch": 1.4200459066564652, + "grad_norm": 2.2674171815366364, + "learning_rate": 4.095914409371994e-06, + "loss": 0.3863, + "step": 9280 + }, + { + "epoch": 1.4201989288446826, + "grad_norm": 2.1163719464876447, + "learning_rate": 4.093914437905489e-06, + "loss": 0.3283, + "step": 9281 + }, + { + "epoch": 1.4203519510328997, + "grad_norm": 2.193899121026817, + "learning_rate": 4.091914829154801e-06, + "loss": 0.3804, + "step": 9282 + }, + { + "epoch": 1.420504973221117, + "grad_norm": 2.1351207017567355, + "learning_rate": 4.089915583242738e-06, + "loss": 0.3554, + "step": 9283 + }, + { + "epoch": 1.4206579954093344, + "grad_norm": 2.151449603371385, + "learning_rate": 4.087916700292079e-06, + "loss": 0.3385, + "step": 9284 + }, + { + "epoch": 1.4208110175975517, + "grad_norm": 2.2234211243154087, + "learning_rate": 4.0859181804255845e-06, + "loss": 0.3383, + "step": 9285 + }, + { + "epoch": 1.420964039785769, + "grad_norm": 2.2507111139185745, + "learning_rate": 4.08392002376599e-06, + "loss": 0.3811, + "step": 9286 + }, + { + "epoch": 1.4211170619739861, + "grad_norm": 1.973399321101313, + "learning_rate": 4.08192223043601e-06, + "loss": 0.2961, + "step": 9287 + }, + { + "epoch": 1.4212700841622035, + "grad_norm": 2.133709019598902, + "learning_rate": 4.079924800558338e-06, + "loss": 0.3478, + "step": 9288 + }, + { + "epoch": 1.4214231063504208, + "grad_norm": 2.158294322387067, + "learning_rate": 4.077927734255643e-06, + "loss": 0.3751, + "step": 9289 + }, + { + "epoch": 1.4215761285386381, + "grad_norm": 1.9920504708122626, + "learning_rate": 4.075931031650574e-06, + "loss": 0.3415, + "step": 9290 + }, + { + "epoch": 1.4217291507268555, + "grad_norm": 2.20512398485947, + "learning_rate": 4.073934692865755e-06, + "loss": 0.3244, + "step": 9291 + }, + { + "epoch": 1.4218821729150726, + "grad_norm": 2.347120560371027, + "learning_rate": 4.07193871802379e-06, + "loss": 0.3721, + "step": 9292 + }, + { + "epoch": 1.42203519510329, + "grad_norm": 2.067692285877767, + "learning_rate": 4.069943107247259e-06, + "loss": 0.315, + "step": 9293 + }, + { + "epoch": 1.4221882172915072, + "grad_norm": 2.0118785839217943, + "learning_rate": 4.0679478606587155e-06, + "loss": 0.318, + "step": 9294 + }, + { + "epoch": 1.4223412394797246, + "grad_norm": 2.1728967714428493, + "learning_rate": 4.065952978380708e-06, + "loss": 0.3713, + "step": 9295 + }, + { + "epoch": 1.422494261667942, + "grad_norm": 2.311326910138758, + "learning_rate": 4.063958460535738e-06, + "loss": 0.3365, + "step": 9296 + }, + { + "epoch": 1.422647283856159, + "grad_norm": 2.1496506605822905, + "learning_rate": 4.061964307246299e-06, + "loss": 0.3167, + "step": 9297 + }, + { + "epoch": 1.4228003060443766, + "grad_norm": 2.143129053849377, + "learning_rate": 4.059970518634867e-06, + "loss": 0.3739, + "step": 9298 + }, + { + "epoch": 1.4229533282325937, + "grad_norm": 2.3506052011866156, + "learning_rate": 4.05797709482388e-06, + "loss": 0.3054, + "step": 9299 + }, + { + "epoch": 1.423106350420811, + "grad_norm": 2.0868900431403006, + "learning_rate": 4.055984035935762e-06, + "loss": 0.3172, + "step": 9300 + }, + { + "epoch": 
1.4232593726090284, + "grad_norm": 2.064767658180076, + "learning_rate": 4.0539913420929235e-06, + "loss": 0.2893, + "step": 9301 + }, + { + "epoch": 1.4234123947972457, + "grad_norm": 2.1290755477464978, + "learning_rate": 4.051999013417731e-06, + "loss": 0.3727, + "step": 9302 + }, + { + "epoch": 1.423565416985463, + "grad_norm": 1.8483480553704854, + "learning_rate": 4.050007050032552e-06, + "loss": 0.2864, + "step": 9303 + }, + { + "epoch": 1.4237184391736801, + "grad_norm": 2.0023364387746447, + "learning_rate": 4.048015452059719e-06, + "loss": 0.3626, + "step": 9304 + }, + { + "epoch": 1.4238714613618975, + "grad_norm": 1.8518706736246955, + "learning_rate": 4.046024219621535e-06, + "loss": 0.2707, + "step": 9305 + }, + { + "epoch": 1.4240244835501148, + "grad_norm": 1.9748324071119407, + "learning_rate": 4.044033352840299e-06, + "loss": 0.3093, + "step": 9306 + }, + { + "epoch": 1.4241775057383321, + "grad_norm": 1.9481838626006736, + "learning_rate": 4.042042851838278e-06, + "loss": 0.2265, + "step": 9307 + }, + { + "epoch": 1.4243305279265495, + "grad_norm": 2.210466757271439, + "learning_rate": 4.040052716737707e-06, + "loss": 0.3629, + "step": 9308 + }, + { + "epoch": 1.4244835501147666, + "grad_norm": 2.0610090670428973, + "learning_rate": 4.038062947660817e-06, + "loss": 0.3107, + "step": 9309 + }, + { + "epoch": 1.424636572302984, + "grad_norm": 2.149631892573452, + "learning_rate": 4.0360735447298035e-06, + "loss": 0.3422, + "step": 9310 + }, + { + "epoch": 1.4247895944912012, + "grad_norm": 1.990278427506984, + "learning_rate": 4.034084508066846e-06, + "loss": 0.3334, + "step": 9311 + }, + { + "epoch": 1.4249426166794186, + "grad_norm": 2.2489829280017135, + "learning_rate": 4.0320958377940976e-06, + "loss": 0.3528, + "step": 9312 + }, + { + "epoch": 1.425095638867636, + "grad_norm": 2.3206819559913052, + "learning_rate": 4.03010753403369e-06, + "loss": 0.3797, + "step": 9313 + }, + { + "epoch": 1.425248661055853, + "grad_norm": 2.0766681221229097, + "learning_rate": 4.028119596907734e-06, + "loss": 0.3592, + "step": 9314 + }, + { + "epoch": 1.4254016832440703, + "grad_norm": 2.0848585897926974, + "learning_rate": 4.026132026538315e-06, + "loss": 0.3219, + "step": 9315 + }, + { + "epoch": 1.4255547054322877, + "grad_norm": 2.3234590571082636, + "learning_rate": 4.024144823047498e-06, + "loss": 0.3569, + "step": 9316 + }, + { + "epoch": 1.425707727620505, + "grad_norm": 2.00027416391328, + "learning_rate": 4.022157986557325e-06, + "loss": 0.3802, + "step": 9317 + }, + { + "epoch": 1.4258607498087223, + "grad_norm": 2.0622902572976707, + "learning_rate": 4.020171517189816e-06, + "loss": 0.3386, + "step": 9318 + }, + { + "epoch": 1.4260137719969395, + "grad_norm": 1.8924055155330906, + "learning_rate": 4.0181854150669665e-06, + "loss": 0.3028, + "step": 9319 + }, + { + "epoch": 1.4261667941851568, + "grad_norm": 2.576666626932052, + "learning_rate": 4.016199680310753e-06, + "loss": 0.3428, + "step": 9320 + }, + { + "epoch": 1.4263198163733741, + "grad_norm": 2.248274794714831, + "learning_rate": 4.014214313043124e-06, + "loss": 0.4452, + "step": 9321 + }, + { + "epoch": 1.4264728385615915, + "grad_norm": 1.8804937826661376, + "learning_rate": 4.012229313386013e-06, + "loss": 0.308, + "step": 9322 + }, + { + "epoch": 1.4266258607498088, + "grad_norm": 2.0792187323412, + "learning_rate": 4.010244681461319e-06, + "loss": 0.3295, + "step": 9323 + }, + { + "epoch": 1.426778882938026, + "grad_norm": 2.050108419893287, + "learning_rate": 4.008260417390938e-06, + "loss": 0.3332, + 
"step": 9324 + }, + { + "epoch": 1.4269319051262432, + "grad_norm": 2.073816356773294, + "learning_rate": 4.0062765212967215e-06, + "loss": 0.3315, + "step": 9325 + }, + { + "epoch": 1.4270849273144606, + "grad_norm": 2.05419996109476, + "learning_rate": 4.004292993300508e-06, + "loss": 0.3372, + "step": 9326 + }, + { + "epoch": 1.427237949502678, + "grad_norm": 2.4100490477444887, + "learning_rate": 4.002309833524124e-06, + "loss": 0.3862, + "step": 9327 + }, + { + "epoch": 1.4273909716908952, + "grad_norm": 1.9825638628012534, + "learning_rate": 4.000327042089353e-06, + "loss": 0.3253, + "step": 9328 + }, + { + "epoch": 1.4275439938791123, + "grad_norm": 2.2351531713408934, + "learning_rate": 3.998344619117965e-06, + "loss": 0.3989, + "step": 9329 + }, + { + "epoch": 1.4276970160673297, + "grad_norm": 2.0848182917817617, + "learning_rate": 3.996362564731721e-06, + "loss": 0.3643, + "step": 9330 + }, + { + "epoch": 1.427850038255547, + "grad_norm": 2.1695084103350344, + "learning_rate": 3.99438087905233e-06, + "loss": 0.3361, + "step": 9331 + }, + { + "epoch": 1.4280030604437643, + "grad_norm": 2.384032138139611, + "learning_rate": 3.992399562201507e-06, + "loss": 0.3408, + "step": 9332 + }, + { + "epoch": 1.4281560826319817, + "grad_norm": 1.984735482035343, + "learning_rate": 3.990418614300931e-06, + "loss": 0.2962, + "step": 9333 + }, + { + "epoch": 1.4283091048201988, + "grad_norm": 1.8606884139982514, + "learning_rate": 3.988438035472253e-06, + "loss": 0.3065, + "step": 9334 + }, + { + "epoch": 1.4284621270084163, + "grad_norm": 2.0350449158318558, + "learning_rate": 3.986457825837115e-06, + "loss": 0.2656, + "step": 9335 + }, + { + "epoch": 1.4286151491966335, + "grad_norm": 2.2668034428389268, + "learning_rate": 3.98447798551713e-06, + "loss": 0.3188, + "step": 9336 + }, + { + "epoch": 1.4287681713848508, + "grad_norm": 2.507306255993113, + "learning_rate": 3.982498514633879e-06, + "loss": 0.4137, + "step": 9337 + }, + { + "epoch": 1.4289211935730681, + "grad_norm": 2.2283287973014194, + "learning_rate": 3.980519413308938e-06, + "loss": 0.3417, + "step": 9338 + }, + { + "epoch": 1.4290742157612855, + "grad_norm": 2.0412354072050025, + "learning_rate": 3.9785406816638474e-06, + "loss": 0.371, + "step": 9339 + }, + { + "epoch": 1.4292272379495028, + "grad_norm": 2.4175597698942464, + "learning_rate": 3.97656231982013e-06, + "loss": 0.4123, + "step": 9340 + }, + { + "epoch": 1.42938026013772, + "grad_norm": 2.1106879107993812, + "learning_rate": 3.9745843278992835e-06, + "loss": 0.3441, + "step": 9341 + }, + { + "epoch": 1.4295332823259372, + "grad_norm": 2.178146614681262, + "learning_rate": 3.9726067060227855e-06, + "loss": 0.3439, + "step": 9342 + }, + { + "epoch": 1.4296863045141546, + "grad_norm": 1.9155241157979555, + "learning_rate": 3.97062945431209e-06, + "loss": 0.2897, + "step": 9343 + }, + { + "epoch": 1.429839326702372, + "grad_norm": 2.1661492829484477, + "learning_rate": 3.968652572888626e-06, + "loss": 0.3392, + "step": 9344 + }, + { + "epoch": 1.4299923488905892, + "grad_norm": 2.1917874006953553, + "learning_rate": 3.966676061873802e-06, + "loss": 0.3808, + "step": 9345 + }, + { + "epoch": 1.4301453710788063, + "grad_norm": 2.1886007253205038, + "learning_rate": 3.9646999213890045e-06, + "loss": 0.3263, + "step": 9346 + }, + { + "epoch": 1.4302983932670237, + "grad_norm": 1.8647429617175988, + "learning_rate": 3.96272415155559e-06, + "loss": 0.3029, + "step": 9347 + }, + { + "epoch": 1.430451415455241, + "grad_norm": 2.145380663913235, + "learning_rate": 
3.960748752494912e-06, + "loss": 0.3374, + "step": 9348 + }, + { + "epoch": 1.4306044376434583, + "grad_norm": 1.8874504028970565, + "learning_rate": 3.9587737243282745e-06, + "loss": 0.3119, + "step": 9349 + }, + { + "epoch": 1.4307574598316757, + "grad_norm": 2.0694188207539903, + "learning_rate": 3.9567990671769715e-06, + "loss": 0.3817, + "step": 9350 + }, + { + "epoch": 1.4309104820198928, + "grad_norm": 1.9065080001772796, + "learning_rate": 3.954824781162288e-06, + "loss": 0.3096, + "step": 9351 + }, + { + "epoch": 1.4310635042081101, + "grad_norm": 2.121242007256196, + "learning_rate": 3.952850866405455e-06, + "loss": 0.3715, + "step": 9352 + }, + { + "epoch": 1.4312165263963275, + "grad_norm": 2.0622979357181963, + "learning_rate": 3.950877323027711e-06, + "loss": 0.3287, + "step": 9353 + }, + { + "epoch": 1.4313695485845448, + "grad_norm": 2.4372797945913227, + "learning_rate": 3.948904151150258e-06, + "loss": 0.4393, + "step": 9354 + }, + { + "epoch": 1.4315225707727621, + "grad_norm": 2.201072981321367, + "learning_rate": 3.946931350894267e-06, + "loss": 0.2913, + "step": 9355 + }, + { + "epoch": 1.4316755929609792, + "grad_norm": 2.1034205574833695, + "learning_rate": 3.944958922380904e-06, + "loss": 0.3524, + "step": 9356 + }, + { + "epoch": 1.4318286151491966, + "grad_norm": 2.172868895461568, + "learning_rate": 3.942986865731304e-06, + "loss": 0.3317, + "step": 9357 + }, + { + "epoch": 1.431981637337414, + "grad_norm": 2.0235574851664198, + "learning_rate": 3.94101518106657e-06, + "loss": 0.3337, + "step": 9358 + }, + { + "epoch": 1.4321346595256312, + "grad_norm": 1.9239559729267717, + "learning_rate": 3.939043868507801e-06, + "loss": 0.3044, + "step": 9359 + }, + { + "epoch": 1.4322876817138486, + "grad_norm": 2.116674414107678, + "learning_rate": 3.937072928176057e-06, + "loss": 0.3301, + "step": 9360 + }, + { + "epoch": 1.4324407039020657, + "grad_norm": 2.171452873241745, + "learning_rate": 3.935102360192382e-06, + "loss": 0.3639, + "step": 9361 + }, + { + "epoch": 1.432593726090283, + "grad_norm": 2.1857081150102005, + "learning_rate": 3.933132164677799e-06, + "loss": 0.3481, + "step": 9362 + }, + { + "epoch": 1.4327467482785003, + "grad_norm": 2.169013596341822, + "learning_rate": 3.931162341753302e-06, + "loss": 0.3509, + "step": 9363 + }, + { + "epoch": 1.4328997704667177, + "grad_norm": 1.896338149593098, + "learning_rate": 3.9291928915398685e-06, + "loss": 0.2785, + "step": 9364 + }, + { + "epoch": 1.433052792654935, + "grad_norm": 2.549158690084208, + "learning_rate": 3.927223814158447e-06, + "loss": 0.3658, + "step": 9365 + }, + { + "epoch": 1.4332058148431521, + "grad_norm": 2.0841384735924176, + "learning_rate": 3.925255109729969e-06, + "loss": 0.3119, + "step": 9366 + }, + { + "epoch": 1.4333588370313697, + "grad_norm": 1.9791111620268143, + "learning_rate": 3.923286778375339e-06, + "loss": 0.3146, + "step": 9367 + }, + { + "epoch": 1.4335118592195868, + "grad_norm": 2.2175978956874953, + "learning_rate": 3.92131882021544e-06, + "loss": 0.3854, + "step": 9368 + }, + { + "epoch": 1.4336648814078041, + "grad_norm": 1.9276236760923329, + "learning_rate": 3.919351235371133e-06, + "loss": 0.2764, + "step": 9369 + }, + { + "epoch": 1.4338179035960215, + "grad_norm": 2.4958786979979513, + "learning_rate": 3.917384023963254e-06, + "loss": 0.3887, + "step": 9370 + }, + { + "epoch": 1.4339709257842388, + "grad_norm": 1.8840779023709011, + "learning_rate": 3.9154171861126165e-06, + "loss": 0.2659, + "step": 9371 + }, + { + "epoch": 1.4341239479724561, + "grad_norm": 
1.818201570937649, + "learning_rate": 3.913450721940013e-06, + "loss": 0.2917, + "step": 9372 + }, + { + "epoch": 1.4342769701606732, + "grad_norm": 2.309820936836204, + "learning_rate": 3.9114846315662114e-06, + "loss": 0.3719, + "step": 9373 + }, + { + "epoch": 1.4344299923488906, + "grad_norm": 2.023699879434676, + "learning_rate": 3.909518915111957e-06, + "loss": 0.3303, + "step": 9374 + }, + { + "epoch": 1.434583014537108, + "grad_norm": 1.9069980001503113, + "learning_rate": 3.9075535726979715e-06, + "loss": 0.2974, + "step": 9375 + }, + { + "epoch": 1.4347360367253252, + "grad_norm": 2.170571525569376, + "learning_rate": 3.905588604444953e-06, + "loss": 0.3023, + "step": 9376 + }, + { + "epoch": 1.4348890589135426, + "grad_norm": 2.1143488858850663, + "learning_rate": 3.903624010473585e-06, + "loss": 0.3012, + "step": 9377 + }, + { + "epoch": 1.4350420811017597, + "grad_norm": 2.2298559527760418, + "learning_rate": 3.901659790904514e-06, + "loss": 0.359, + "step": 9378 + }, + { + "epoch": 1.435195103289977, + "grad_norm": 1.8232134485763651, + "learning_rate": 3.899695945858367e-06, + "loss": 0.3267, + "step": 9379 + }, + { + "epoch": 1.4353481254781943, + "grad_norm": 2.029838669991213, + "learning_rate": 3.897732475455764e-06, + "loss": 0.2858, + "step": 9380 + }, + { + "epoch": 1.4355011476664117, + "grad_norm": 1.7961842764250062, + "learning_rate": 3.895769379817279e-06, + "loss": 0.3031, + "step": 9381 + }, + { + "epoch": 1.435654169854629, + "grad_norm": 2.169262090657161, + "learning_rate": 3.893806659063473e-06, + "loss": 0.2784, + "step": 9382 + }, + { + "epoch": 1.4358071920428461, + "grad_norm": 1.9856810913149714, + "learning_rate": 3.891844313314893e-06, + "loss": 0.2901, + "step": 9383 + }, + { + "epoch": 1.4359602142310635, + "grad_norm": 2.2450828272982783, + "learning_rate": 3.889882342692043e-06, + "loss": 0.3681, + "step": 9384 + }, + { + "epoch": 1.4361132364192808, + "grad_norm": 2.304650971104986, + "learning_rate": 3.8879207473154245e-06, + "loss": 0.3499, + "step": 9385 + }, + { + "epoch": 1.4362662586074981, + "grad_norm": 2.057041817931834, + "learning_rate": 3.885959527305507e-06, + "loss": 0.3643, + "step": 9386 + }, + { + "epoch": 1.4364192807957155, + "grad_norm": 2.1277655643358195, + "learning_rate": 3.883998682782727e-06, + "loss": 0.394, + "step": 9387 + }, + { + "epoch": 1.4365723029839326, + "grad_norm": 2.073493486256418, + "learning_rate": 3.882038213867516e-06, + "loss": 0.3343, + "step": 9388 + }, + { + "epoch": 1.43672532517215, + "grad_norm": 2.300417499890103, + "learning_rate": 3.880078120680273e-06, + "loss": 0.4377, + "step": 9389 + }, + { + "epoch": 1.4368783473603672, + "grad_norm": 2.1283040533902344, + "learning_rate": 3.878118403341373e-06, + "loss": 0.3926, + "step": 9390 + }, + { + "epoch": 1.4370313695485846, + "grad_norm": 2.0678784247283066, + "learning_rate": 3.876159061971172e-06, + "loss": 0.3086, + "step": 9391 + }, + { + "epoch": 1.437184391736802, + "grad_norm": 2.5933724046485556, + "learning_rate": 3.874200096689999e-06, + "loss": 0.4141, + "step": 9392 + }, + { + "epoch": 1.437337413925019, + "grad_norm": 2.569529420688582, + "learning_rate": 3.872241507618162e-06, + "loss": 0.3729, + "step": 9393 + }, + { + "epoch": 1.4374904361132363, + "grad_norm": 2.424768703143181, + "learning_rate": 3.870283294875946e-06, + "loss": 0.3334, + "step": 9394 + }, + { + "epoch": 1.4376434583014537, + "grad_norm": 2.075534358433235, + "learning_rate": 3.868325458583613e-06, + "loss": 0.3691, + "step": 9395 + }, + { + "epoch": 
1.437796480489671, + "grad_norm": 2.035561827260354, + "learning_rate": 3.8663679988614004e-06, + "loss": 0.2953, + "step": 9396 + }, + { + "epoch": 1.4379495026778883, + "grad_norm": 2.5251761093520715, + "learning_rate": 3.864410915829523e-06, + "loss": 0.3889, + "step": 9397 + }, + { + "epoch": 1.4381025248661055, + "grad_norm": 2.4679716294913505, + "learning_rate": 3.862454209608175e-06, + "loss": 0.317, + "step": 9398 + }, + { + "epoch": 1.438255547054323, + "grad_norm": 2.148348350788539, + "learning_rate": 3.860497880317523e-06, + "loss": 0.3622, + "step": 9399 + }, + { + "epoch": 1.4384085692425401, + "grad_norm": 2.1987933936762842, + "learning_rate": 3.858541928077716e-06, + "loss": 0.3212, + "step": 9400 + }, + { + "epoch": 1.4385615914307575, + "grad_norm": 2.0618331098672638, + "learning_rate": 3.856586353008873e-06, + "loss": 0.3206, + "step": 9401 + }, + { + "epoch": 1.4387146136189748, + "grad_norm": 2.37921277084796, + "learning_rate": 3.854631155231096e-06, + "loss": 0.4022, + "step": 9402 + }, + { + "epoch": 1.4388676358071921, + "grad_norm": 2.0436503449931194, + "learning_rate": 3.85267633486446e-06, + "loss": 0.2922, + "step": 9403 + }, + { + "epoch": 1.4390206579954095, + "grad_norm": 1.8930267404115466, + "learning_rate": 3.850721892029019e-06, + "loss": 0.3687, + "step": 9404 + }, + { + "epoch": 1.4391736801836266, + "grad_norm": 2.01203672849637, + "learning_rate": 3.8487678268448005e-06, + "loss": 0.3022, + "step": 9405 + }, + { + "epoch": 1.439326702371844, + "grad_norm": 2.635657231491509, + "learning_rate": 3.846814139431819e-06, + "loss": 0.3954, + "step": 9406 + }, + { + "epoch": 1.4394797245600612, + "grad_norm": 1.7576099379374932, + "learning_rate": 3.844860829910048e-06, + "loss": 0.2292, + "step": 9407 + }, + { + "epoch": 1.4396327467482786, + "grad_norm": 2.0022483302447966, + "learning_rate": 3.842907898399452e-06, + "loss": 0.287, + "step": 9408 + }, + { + "epoch": 1.439785768936496, + "grad_norm": 2.3685326123050348, + "learning_rate": 3.840955345019973e-06, + "loss": 0.3695, + "step": 9409 + }, + { + "epoch": 1.439938791124713, + "grad_norm": 2.131612009218713, + "learning_rate": 3.839003169891519e-06, + "loss": 0.3578, + "step": 9410 + }, + { + "epoch": 1.4400918133129303, + "grad_norm": 2.3468392356985324, + "learning_rate": 3.8370513731339775e-06, + "loss": 0.3115, + "step": 9411 + }, + { + "epoch": 1.4402448355011477, + "grad_norm": 2.223056491521997, + "learning_rate": 3.835099954867228e-06, + "loss": 0.3995, + "step": 9412 + }, + { + "epoch": 1.440397857689365, + "grad_norm": 2.0760931406496095, + "learning_rate": 3.833148915211101e-06, + "loss": 0.2893, + "step": 9413 + }, + { + "epoch": 1.4405508798775823, + "grad_norm": 2.5262822068969313, + "learning_rate": 3.831198254285428e-06, + "loss": 0.4318, + "step": 9414 + }, + { + "epoch": 1.4407039020657995, + "grad_norm": 2.090177035772777, + "learning_rate": 3.829247972210005e-06, + "loss": 0.3353, + "step": 9415 + }, + { + "epoch": 1.4408569242540168, + "grad_norm": 2.2014983845962877, + "learning_rate": 3.827298069104598e-06, + "loss": 0.3364, + "step": 9416 + }, + { + "epoch": 1.4410099464422341, + "grad_norm": 2.12746696908804, + "learning_rate": 3.825348545088967e-06, + "loss": 0.3543, + "step": 9417 + }, + { + "epoch": 1.4411629686304515, + "grad_norm": 2.1360864766502017, + "learning_rate": 3.823399400282838e-06, + "loss": 0.3685, + "step": 9418 + }, + { + "epoch": 1.4413159908186688, + "grad_norm": 1.8667857752382644, + "learning_rate": 3.821450634805915e-06, + "loss": 0.2794, + 
"step": 9419 + }, + { + "epoch": 1.441469013006886, + "grad_norm": 2.4708104972068607, + "learning_rate": 3.819502248777878e-06, + "loss": 0.3114, + "step": 9420 + }, + { + "epoch": 1.4416220351951032, + "grad_norm": 2.0565915598013644, + "learning_rate": 3.8175542423183865e-06, + "loss": 0.3356, + "step": 9421 + }, + { + "epoch": 1.4417750573833206, + "grad_norm": 1.8670926466183604, + "learning_rate": 3.815606615547075e-06, + "loss": 0.3092, + "step": 9422 + }, + { + "epoch": 1.441928079571538, + "grad_norm": 2.108441622458775, + "learning_rate": 3.8136593685835545e-06, + "loss": 0.3217, + "step": 9423 + }, + { + "epoch": 1.4420811017597552, + "grad_norm": 2.2213785242745914, + "learning_rate": 3.811712501547413e-06, + "loss": 0.3894, + "step": 9424 + }, + { + "epoch": 1.4422341239479723, + "grad_norm": 2.2379539576928034, + "learning_rate": 3.809766014558216e-06, + "loss": 0.3155, + "step": 9425 + }, + { + "epoch": 1.4423871461361897, + "grad_norm": 2.1450936115838046, + "learning_rate": 3.8078199077355017e-06, + "loss": 0.3084, + "step": 9426 + }, + { + "epoch": 1.442540168324407, + "grad_norm": 1.9620617688500854, + "learning_rate": 3.8058741811987966e-06, + "loss": 0.3054, + "step": 9427 + }, + { + "epoch": 1.4426931905126243, + "grad_norm": 2.3851341339259178, + "learning_rate": 3.8039288350675862e-06, + "loss": 0.3356, + "step": 9428 + }, + { + "epoch": 1.4428462127008417, + "grad_norm": 2.002450463199621, + "learning_rate": 3.801983869461342e-06, + "loss": 0.3458, + "step": 9429 + }, + { + "epoch": 1.4429992348890588, + "grad_norm": 2.1660350030336653, + "learning_rate": 3.8000392844995227e-06, + "loss": 0.3357, + "step": 9430 + }, + { + "epoch": 1.4431522570772761, + "grad_norm": 1.869478686298557, + "learning_rate": 3.7980950803015417e-06, + "loss": 0.3066, + "step": 9431 + }, + { + "epoch": 1.4433052792654935, + "grad_norm": 2.3072334674826576, + "learning_rate": 3.7961512569867997e-06, + "loss": 0.3314, + "step": 9432 + }, + { + "epoch": 1.4434583014537108, + "grad_norm": 2.275252761892947, + "learning_rate": 3.7942078146746852e-06, + "loss": 0.3406, + "step": 9433 + }, + { + "epoch": 1.4436113236419281, + "grad_norm": 1.9311399642400455, + "learning_rate": 3.79226475348454e-06, + "loss": 0.2448, + "step": 9434 + }, + { + "epoch": 1.4437643458301452, + "grad_norm": 1.9937141861498646, + "learning_rate": 3.7903220735357037e-06, + "loss": 0.3383, + "step": 9435 + }, + { + "epoch": 1.4439173680183628, + "grad_norm": 2.2548689227421903, + "learning_rate": 3.788379774947486e-06, + "loss": 0.3373, + "step": 9436 + }, + { + "epoch": 1.44407039020658, + "grad_norm": 2.245991213469082, + "learning_rate": 3.786437857839158e-06, + "loss": 0.3213, + "step": 9437 + }, + { + "epoch": 1.4442234123947972, + "grad_norm": 2.1682580136836727, + "learning_rate": 3.7844963223299925e-06, + "loss": 0.3057, + "step": 9438 + }, + { + "epoch": 1.4443764345830146, + "grad_norm": 2.361384023733611, + "learning_rate": 3.782555168539227e-06, + "loss": 0.2986, + "step": 9439 + }, + { + "epoch": 1.444529456771232, + "grad_norm": 2.0796621089626828, + "learning_rate": 3.780614396586064e-06, + "loss": 0.3345, + "step": 9440 + }, + { + "epoch": 1.4446824789594492, + "grad_norm": 2.1994329703056628, + "learning_rate": 3.7786740065897055e-06, + "loss": 0.3354, + "step": 9441 + }, + { + "epoch": 1.4448355011476663, + "grad_norm": 1.8833177217194372, + "learning_rate": 3.776733998669314e-06, + "loss": 0.2956, + "step": 9442 + }, + { + "epoch": 1.4449885233358837, + "grad_norm": 2.3741820074807496, + 
"learning_rate": 3.774794372944033e-06, + "loss": 0.3962, + "step": 9443 + }, + { + "epoch": 1.445141545524101, + "grad_norm": 2.190022440966505, + "learning_rate": 3.772855129532982e-06, + "loss": 0.3202, + "step": 9444 + }, + { + "epoch": 1.4452945677123183, + "grad_norm": 2.0964969488525247, + "learning_rate": 3.7709162685552592e-06, + "loss": 0.333, + "step": 9445 + }, + { + "epoch": 1.4454475899005357, + "grad_norm": 2.028210724620882, + "learning_rate": 3.768977790129936e-06, + "loss": 0.2656, + "step": 9446 + }, + { + "epoch": 1.4456006120887528, + "grad_norm": 2.1346199022390726, + "learning_rate": 3.7670396943760635e-06, + "loss": 0.3149, + "step": 9447 + }, + { + "epoch": 1.4457536342769701, + "grad_norm": 2.042384274192098, + "learning_rate": 3.7651019814126656e-06, + "loss": 0.351, + "step": 9448 + }, + { + "epoch": 1.4459066564651875, + "grad_norm": 2.1653394714366656, + "learning_rate": 3.7631646513587473e-06, + "loss": 0.3592, + "step": 9449 + }, + { + "epoch": 1.4460596786534048, + "grad_norm": 1.9813776866303934, + "learning_rate": 3.7612277043332857e-06, + "loss": 0.2843, + "step": 9450 + }, + { + "epoch": 1.4462127008416221, + "grad_norm": 2.0487930782771206, + "learning_rate": 3.759291140455237e-06, + "loss": 0.2968, + "step": 9451 + }, + { + "epoch": 1.4463657230298392, + "grad_norm": 2.1717832715799537, + "learning_rate": 3.7573549598435334e-06, + "loss": 0.3326, + "step": 9452 + }, + { + "epoch": 1.4465187452180566, + "grad_norm": 2.469544616284677, + "learning_rate": 3.755419162617082e-06, + "loss": 0.3397, + "step": 9453 + }, + { + "epoch": 1.446671767406274, + "grad_norm": 2.3789326927343257, + "learning_rate": 3.7534837488947705e-06, + "loss": 0.3217, + "step": 9454 + }, + { + "epoch": 1.4468247895944912, + "grad_norm": 2.0716322076496927, + "learning_rate": 3.7515487187954536e-06, + "loss": 0.3123, + "step": 9455 + }, + { + "epoch": 1.4469778117827086, + "grad_norm": 2.0332000399276637, + "learning_rate": 3.7496140724379815e-06, + "loss": 0.3046, + "step": 9456 + }, + { + "epoch": 1.4471308339709257, + "grad_norm": 1.999069011734315, + "learning_rate": 3.7476798099411583e-06, + "loss": 0.3494, + "step": 9457 + }, + { + "epoch": 1.447283856159143, + "grad_norm": 2.2347715534841033, + "learning_rate": 3.745745931423772e-06, + "loss": 0.3115, + "step": 9458 + }, + { + "epoch": 1.4474368783473603, + "grad_norm": 2.0316516810035026, + "learning_rate": 3.7438124370046025e-06, + "loss": 0.3059, + "step": 9459 + }, + { + "epoch": 1.4475899005355777, + "grad_norm": 1.8778006090816908, + "learning_rate": 3.7418793268023823e-06, + "loss": 0.2661, + "step": 9460 + }, + { + "epoch": 1.447742922723795, + "grad_norm": 2.087926127627727, + "learning_rate": 3.739946600935831e-06, + "loss": 0.3777, + "step": 9461 + }, + { + "epoch": 1.4478959449120121, + "grad_norm": 1.910762098685788, + "learning_rate": 3.738014259523656e-06, + "loss": 0.3092, + "step": 9462 + }, + { + "epoch": 1.4480489671002295, + "grad_norm": 2.1419651512255604, + "learning_rate": 3.736082302684514e-06, + "loss": 0.3237, + "step": 9463 + }, + { + "epoch": 1.4482019892884468, + "grad_norm": 1.9853049413866304, + "learning_rate": 3.734150730537067e-06, + "loss": 0.3298, + "step": 9464 + }, + { + "epoch": 1.4483550114766641, + "grad_norm": 2.169481095354853, + "learning_rate": 3.7322195431999386e-06, + "loss": 0.3032, + "step": 9465 + }, + { + "epoch": 1.4485080336648815, + "grad_norm": 2.3886882013046042, + "learning_rate": 3.730288740791721e-06, + "loss": 0.3152, + "step": 9466 + }, + { + "epoch": 
1.4486610558530986, + "grad_norm": 2.1726594327249535, + "learning_rate": 3.728358323431003e-06, + "loss": 0.3891, + "step": 9467 + }, + { + "epoch": 1.4488140780413161, + "grad_norm": 2.3278280144910983, + "learning_rate": 3.726428291236338e-06, + "loss": 0.3213, + "step": 9468 + }, + { + "epoch": 1.4489671002295332, + "grad_norm": 2.0213566163252144, + "learning_rate": 3.7244986443262464e-06, + "loss": 0.2588, + "step": 9469 + }, + { + "epoch": 1.4491201224177506, + "grad_norm": 2.123312387822712, + "learning_rate": 3.7225693828192476e-06, + "loss": 0.364, + "step": 9470 + }, + { + "epoch": 1.449273144605968, + "grad_norm": 1.95586400386957, + "learning_rate": 3.7206405068338203e-06, + "loss": 0.3514, + "step": 9471 + }, + { + "epoch": 1.4494261667941852, + "grad_norm": 1.8020489683421461, + "learning_rate": 3.718712016488425e-06, + "loss": 0.2893, + "step": 9472 + }, + { + "epoch": 1.4495791889824026, + "grad_norm": 2.1640656820411133, + "learning_rate": 3.716783911901496e-06, + "loss": 0.3673, + "step": 9473 + }, + { + "epoch": 1.4497322111706197, + "grad_norm": 2.0896268775804034, + "learning_rate": 3.7148561931914472e-06, + "loss": 0.325, + "step": 9474 + }, + { + "epoch": 1.449885233358837, + "grad_norm": 2.1222460335419298, + "learning_rate": 3.712928860476668e-06, + "loss": 0.3339, + "step": 9475 + }, + { + "epoch": 1.4500382555470543, + "grad_norm": 2.3419143356584082, + "learning_rate": 3.7110019138755227e-06, + "loss": 0.3304, + "step": 9476 + }, + { + "epoch": 1.4501912777352717, + "grad_norm": 1.9960251059645366, + "learning_rate": 3.7090753535063517e-06, + "loss": 0.3324, + "step": 9477 + }, + { + "epoch": 1.450344299923489, + "grad_norm": 2.0769593423066866, + "learning_rate": 3.7071491794874736e-06, + "loss": 0.2987, + "step": 9478 + }, + { + "epoch": 1.4504973221117061, + "grad_norm": 2.232353678787646, + "learning_rate": 3.7052233919371826e-06, + "loss": 0.3502, + "step": 9479 + }, + { + "epoch": 1.4506503442999235, + "grad_norm": 2.2035358060269954, + "learning_rate": 3.703297990973749e-06, + "loss": 0.3136, + "step": 9480 + }, + { + "epoch": 1.4508033664881408, + "grad_norm": 2.2382851331924285, + "learning_rate": 3.7013729767154172e-06, + "loss": 0.3332, + "step": 9481 + }, + { + "epoch": 1.4509563886763581, + "grad_norm": 2.2981953616018767, + "learning_rate": 3.6994483492804124e-06, + "loss": 0.3646, + "step": 9482 + }, + { + "epoch": 1.4511094108645755, + "grad_norm": 2.1714996604088523, + "learning_rate": 3.6975241087869317e-06, + "loss": 0.3799, + "step": 9483 + }, + { + "epoch": 1.4512624330527926, + "grad_norm": 2.2662738415618726, + "learning_rate": 3.6956002553531513e-06, + "loss": 0.3537, + "step": 9484 + }, + { + "epoch": 1.45141545524101, + "grad_norm": 2.149039370046887, + "learning_rate": 3.6936767890972215e-06, + "loss": 0.3257, + "step": 9485 + }, + { + "epoch": 1.4515684774292272, + "grad_norm": 1.9961110412792133, + "learning_rate": 3.6917537101372714e-06, + "loss": 0.2954, + "step": 9486 + }, + { + "epoch": 1.4517214996174446, + "grad_norm": 2.0931555997712676, + "learning_rate": 3.6898310185914e-06, + "loss": 0.318, + "step": 9487 + }, + { + "epoch": 1.451874521805662, + "grad_norm": 2.483815223080329, + "learning_rate": 3.687908714577698e-06, + "loss": 0.3547, + "step": 9488 + }, + { + "epoch": 1.452027543993879, + "grad_norm": 2.0723286404767487, + "learning_rate": 3.6859867982142126e-06, + "loss": 0.2933, + "step": 9489 + }, + { + "epoch": 1.4521805661820963, + "grad_norm": 2.1016415401998407, + "learning_rate": 3.6840652696189737e-06, + "loss": 
0.3124, + "step": 9490 + }, + { + "epoch": 1.4523335883703137, + "grad_norm": 2.363286862435652, + "learning_rate": 3.6821441289100025e-06, + "loss": 0.3849, + "step": 9491 + }, + { + "epoch": 1.452486610558531, + "grad_norm": 1.9554556428601573, + "learning_rate": 3.6802233762052687e-06, + "loss": 0.2859, + "step": 9492 + }, + { + "epoch": 1.4526396327467483, + "grad_norm": 2.2228505660039666, + "learning_rate": 3.6783030116227437e-06, + "loss": 0.3699, + "step": 9493 + }, + { + "epoch": 1.4527926549349655, + "grad_norm": 2.064645992589437, + "learning_rate": 3.676383035280364e-06, + "loss": 0.3022, + "step": 9494 + }, + { + "epoch": 1.4529456771231828, + "grad_norm": 2.0773468735735694, + "learning_rate": 3.674463447296035e-06, + "loss": 0.2924, + "step": 9495 + }, + { + "epoch": 1.4530986993114001, + "grad_norm": 2.433049644238986, + "learning_rate": 3.672544247787654e-06, + "loss": 0.3803, + "step": 9496 + }, + { + "epoch": 1.4532517214996175, + "grad_norm": 1.9816445100523168, + "learning_rate": 3.6706254368730877e-06, + "loss": 0.3232, + "step": 9497 + }, + { + "epoch": 1.4534047436878348, + "grad_norm": 2.6411701926591324, + "learning_rate": 3.6687070146701665e-06, + "loss": 0.3543, + "step": 9498 + }, + { + "epoch": 1.453557765876052, + "grad_norm": 2.2355609159511505, + "learning_rate": 3.6667889812967194e-06, + "loss": 0.3351, + "step": 9499 + }, + { + "epoch": 1.4537107880642695, + "grad_norm": 2.371135827805477, + "learning_rate": 3.664871336870537e-06, + "loss": 0.3725, + "step": 9500 + }, + { + "epoch": 1.4538638102524866, + "grad_norm": 1.9464656212441733, + "learning_rate": 3.662954081509388e-06, + "loss": 0.3134, + "step": 9501 + }, + { + "epoch": 1.454016832440704, + "grad_norm": 2.5268455510244294, + "learning_rate": 3.6610372153310202e-06, + "loss": 0.355, + "step": 9502 + }, + { + "epoch": 1.4541698546289212, + "grad_norm": 2.0024853844314277, + "learning_rate": 3.6591207384531557e-06, + "loss": 0.2974, + "step": 9503 + }, + { + "epoch": 1.4543228768171386, + "grad_norm": 2.352329885285306, + "learning_rate": 3.657204650993491e-06, + "loss": 0.3534, + "step": 9504 + }, + { + "epoch": 1.454475899005356, + "grad_norm": 2.1327778506979604, + "learning_rate": 3.6552889530697024e-06, + "loss": 0.3452, + "step": 9505 + }, + { + "epoch": 1.454628921193573, + "grad_norm": 1.9937031650471355, + "learning_rate": 3.6533736447994404e-06, + "loss": 0.2642, + "step": 9506 + }, + { + "epoch": 1.4547819433817903, + "grad_norm": 2.178908960404664, + "learning_rate": 3.651458726300331e-06, + "loss": 0.3035, + "step": 9507 + }, + { + "epoch": 1.4549349655700077, + "grad_norm": 1.9255218183461211, + "learning_rate": 3.649544197689973e-06, + "loss": 0.2867, + "step": 9508 + }, + { + "epoch": 1.455087987758225, + "grad_norm": 2.154114129270153, + "learning_rate": 3.6476300590859547e-06, + "loss": 0.2427, + "step": 9509 + }, + { + "epoch": 1.4552410099464423, + "grad_norm": 2.2499540356570407, + "learning_rate": 3.6457163106058236e-06, + "loss": 0.3877, + "step": 9510 + }, + { + "epoch": 1.4553940321346595, + "grad_norm": 2.223283595049156, + "learning_rate": 3.643802952367107e-06, + "loss": 0.3417, + "step": 9511 + }, + { + "epoch": 1.4555470543228768, + "grad_norm": 2.92579443127091, + "learning_rate": 3.6418899844873246e-06, + "loss": 0.3641, + "step": 9512 + }, + { + "epoch": 1.4557000765110941, + "grad_norm": 2.2939770131216632, + "learning_rate": 3.639977407083948e-06, + "loss": 0.2643, + "step": 9513 + }, + { + "epoch": 1.4558530986993115, + "grad_norm": 2.201385659868088, + 
"learning_rate": 3.638065220274435e-06, + "loss": 0.3657, + "step": 9514 + }, + { + "epoch": 1.4560061208875288, + "grad_norm": 2.522618790905064, + "learning_rate": 3.636153424176232e-06, + "loss": 0.3216, + "step": 9515 + }, + { + "epoch": 1.456159143075746, + "grad_norm": 2.041065696565262, + "learning_rate": 3.634242018906736e-06, + "loss": 0.3, + "step": 9516 + }, + { + "epoch": 1.4563121652639632, + "grad_norm": 1.9566790868626256, + "learning_rate": 3.632331004583345e-06, + "loss": 0.3274, + "step": 9517 + }, + { + "epoch": 1.4564651874521806, + "grad_norm": 2.1886566680108515, + "learning_rate": 3.6304203813234198e-06, + "loss": 0.357, + "step": 9518 + }, + { + "epoch": 1.456618209640398, + "grad_norm": 2.1183396413555466, + "learning_rate": 3.6285101492442897e-06, + "loss": 0.3473, + "step": 9519 + }, + { + "epoch": 1.4567712318286152, + "grad_norm": 2.2295734686348543, + "learning_rate": 3.6266003084632804e-06, + "loss": 0.2975, + "step": 9520 + }, + { + "epoch": 1.4569242540168323, + "grad_norm": 2.412345482780333, + "learning_rate": 3.6246908590976826e-06, + "loss": 0.3353, + "step": 9521 + }, + { + "epoch": 1.4570772762050497, + "grad_norm": 2.1028388857901392, + "learning_rate": 3.622781801264753e-06, + "loss": 0.3837, + "step": 9522 + }, + { + "epoch": 1.457230298393267, + "grad_norm": 2.2847187812274217, + "learning_rate": 3.6208731350817437e-06, + "loss": 0.3641, + "step": 9523 + }, + { + "epoch": 1.4573833205814843, + "grad_norm": 2.4648038833329204, + "learning_rate": 3.6189648606658712e-06, + "loss": 0.3452, + "step": 9524 + }, + { + "epoch": 1.4575363427697017, + "grad_norm": 1.9838995278832425, + "learning_rate": 3.617056978134329e-06, + "loss": 0.2456, + "step": 9525 + }, + { + "epoch": 1.4576893649579188, + "grad_norm": 2.623646002262567, + "learning_rate": 3.615149487604288e-06, + "loss": 0.3425, + "step": 9526 + }, + { + "epoch": 1.4578423871461361, + "grad_norm": 1.950155135421621, + "learning_rate": 3.6132423891928947e-06, + "loss": 0.2618, + "step": 9527 + }, + { + "epoch": 1.4579954093343535, + "grad_norm": 2.081492680781751, + "learning_rate": 3.611335683017272e-06, + "loss": 0.2437, + "step": 9528 + }, + { + "epoch": 1.4581484315225708, + "grad_norm": 2.2862166927483623, + "learning_rate": 3.609429369194516e-06, + "loss": 0.3857, + "step": 9529 + }, + { + "epoch": 1.4583014537107881, + "grad_norm": 1.9554078644755999, + "learning_rate": 3.6075234478417032e-06, + "loss": 0.2799, + "step": 9530 + }, + { + "epoch": 1.4584544758990052, + "grad_norm": 2.0393830421015435, + "learning_rate": 3.6056179190758833e-06, + "loss": 0.3213, + "step": 9531 + }, + { + "epoch": 1.4586074980872226, + "grad_norm": 1.8992585160790252, + "learning_rate": 3.6037127830140817e-06, + "loss": 0.2712, + "step": 9532 + }, + { + "epoch": 1.45876052027544, + "grad_norm": 2.1471319143065704, + "learning_rate": 3.6018080397733013e-06, + "loss": 0.3378, + "step": 9533 + }, + { + "epoch": 1.4589135424636572, + "grad_norm": 2.2031908095595756, + "learning_rate": 3.5999036894705185e-06, + "loss": 0.3217, + "step": 9534 + }, + { + "epoch": 1.4590665646518746, + "grad_norm": 2.3220186298678893, + "learning_rate": 3.5979997322226866e-06, + "loss": 0.3869, + "step": 9535 + }, + { + "epoch": 1.4592195868400917, + "grad_norm": 2.5306612080969524, + "learning_rate": 3.5960961681467364e-06, + "loss": 0.3477, + "step": 9536 + }, + { + "epoch": 1.4593726090283092, + "grad_norm": 2.073885257035563, + "learning_rate": 3.5941929973595703e-06, + "loss": 0.3113, + "step": 9537 + }, + { + "epoch": 
1.4595256312165263, + "grad_norm": 2.2510208071748097, + "learning_rate": 3.5922902199780775e-06, + "loss": 0.3027, + "step": 9538 + }, + { + "epoch": 1.4596786534047437, + "grad_norm": 2.170543165202409, + "learning_rate": 3.590387836119107e-06, + "loss": 0.3865, + "step": 9539 + }, + { + "epoch": 1.459831675592961, + "grad_norm": 1.9292932434270147, + "learning_rate": 3.58848584589949e-06, + "loss": 0.2739, + "step": 9540 + }, + { + "epoch": 1.4599846977811783, + "grad_norm": 1.8320076614211738, + "learning_rate": 3.586584249436046e-06, + "loss": 0.2785, + "step": 9541 + }, + { + "epoch": 1.4601377199693957, + "grad_norm": 2.1935291445909013, + "learning_rate": 3.5846830468455497e-06, + "loss": 0.3469, + "step": 9542 + }, + { + "epoch": 1.4602907421576128, + "grad_norm": 2.176716608400466, + "learning_rate": 3.58278223824476e-06, + "loss": 0.272, + "step": 9543 + }, + { + "epoch": 1.4604437643458301, + "grad_norm": 1.799210093073552, + "learning_rate": 3.580881823750425e-06, + "loss": 0.2937, + "step": 9544 + }, + { + "epoch": 1.4605967865340475, + "grad_norm": 1.9702394956780114, + "learning_rate": 3.578981803479241e-06, + "loss": 0.261, + "step": 9545 + }, + { + "epoch": 1.4607498087222648, + "grad_norm": 2.2698743825940917, + "learning_rate": 3.577082177547908e-06, + "loss": 0.3467, + "step": 9546 + }, + { + "epoch": 1.4609028309104821, + "grad_norm": 2.4331017290730057, + "learning_rate": 3.575182946073089e-06, + "loss": 0.3096, + "step": 9547 + }, + { + "epoch": 1.4610558530986992, + "grad_norm": 2.2378192392486937, + "learning_rate": 3.573284109171412e-06, + "loss": 0.2813, + "step": 9548 + }, + { + "epoch": 1.4612088752869166, + "grad_norm": 1.949646785682065, + "learning_rate": 3.5713856669595025e-06, + "loss": 0.2898, + "step": 9549 + }, + { + "epoch": 1.461361897475134, + "grad_norm": 2.3546898453130605, + "learning_rate": 3.5694876195539506e-06, + "loss": 0.3316, + "step": 9550 + }, + { + "epoch": 1.4615149196633512, + "grad_norm": 2.0028492455077735, + "learning_rate": 3.5675899670713156e-06, + "loss": 0.3081, + "step": 9551 + }, + { + "epoch": 1.4616679418515686, + "grad_norm": 2.3157113370587377, + "learning_rate": 3.565692709628146e-06, + "loss": 0.3438, + "step": 9552 + }, + { + "epoch": 1.4618209640397857, + "grad_norm": 2.0500920849676345, + "learning_rate": 3.5637958473409595e-06, + "loss": 0.283, + "step": 9553 + }, + { + "epoch": 1.461973986228003, + "grad_norm": 2.2982221861588386, + "learning_rate": 3.561899380326248e-06, + "loss": 0.3415, + "step": 9554 + }, + { + "epoch": 1.4621270084162203, + "grad_norm": 2.1105916046380777, + "learning_rate": 3.5600033087004814e-06, + "loss": 0.3392, + "step": 9555 + }, + { + "epoch": 1.4622800306044377, + "grad_norm": 2.106406268139071, + "learning_rate": 3.5581076325801045e-06, + "loss": 0.3018, + "step": 9556 + }, + { + "epoch": 1.462433052792655, + "grad_norm": 1.8577142920496017, + "learning_rate": 3.5562123520815395e-06, + "loss": 0.3208, + "step": 9557 + }, + { + "epoch": 1.4625860749808721, + "grad_norm": 1.9650333120954178, + "learning_rate": 3.554317467321182e-06, + "loss": 0.3118, + "step": 9558 + }, + { + "epoch": 1.4627390971690895, + "grad_norm": 1.9942111925567405, + "learning_rate": 3.552422978415405e-06, + "loss": 0.2956, + "step": 9559 + }, + { + "epoch": 1.4628921193573068, + "grad_norm": 1.8524074124963654, + "learning_rate": 3.5505288854805555e-06, + "loss": 0.2709, + "step": 9560 + }, + { + "epoch": 1.4630451415455241, + "grad_norm": 2.328829234349851, + "learning_rate": 3.548635188632957e-06, + "loss": 
0.3441, + "step": 9561 + }, + { + "epoch": 1.4631981637337415, + "grad_norm": 2.0595048605859057, + "learning_rate": 3.5467418879889104e-06, + "loss": 0.3298, + "step": 9562 + }, + { + "epoch": 1.4633511859219586, + "grad_norm": 2.1752269089426495, + "learning_rate": 3.5448489836646895e-06, + "loss": 0.351, + "step": 9563 + }, + { + "epoch": 1.463504208110176, + "grad_norm": 2.206069599112191, + "learning_rate": 3.5429564757765454e-06, + "loss": 0.3253, + "step": 9564 + }, + { + "epoch": 1.4636572302983932, + "grad_norm": 2.2034438785230352, + "learning_rate": 3.541064364440704e-06, + "loss": 0.3659, + "step": 9565 + }, + { + "epoch": 1.4638102524866106, + "grad_norm": 2.2927363798245346, + "learning_rate": 3.5391726497733637e-06, + "loss": 0.4403, + "step": 9566 + }, + { + "epoch": 1.463963274674828, + "grad_norm": 2.1978108874987576, + "learning_rate": 3.537281331890713e-06, + "loss": 0.3383, + "step": 9567 + }, + { + "epoch": 1.464116296863045, + "grad_norm": 2.44448450236065, + "learning_rate": 3.5353904109088945e-06, + "loss": 0.304, + "step": 9568 + }, + { + "epoch": 1.4642693190512626, + "grad_norm": 2.137051347698189, + "learning_rate": 3.533499886944037e-06, + "loss": 0.3344, + "step": 9569 + }, + { + "epoch": 1.4644223412394797, + "grad_norm": 2.2505696018802697, + "learning_rate": 3.531609760112257e-06, + "loss": 0.4358, + "step": 9570 + }, + { + "epoch": 1.464575363427697, + "grad_norm": 2.051597472463943, + "learning_rate": 3.5297200305296208e-06, + "loss": 0.2754, + "step": 9571 + }, + { + "epoch": 1.4647283856159143, + "grad_norm": 2.1406649536647047, + "learning_rate": 3.527830698312187e-06, + "loss": 0.3756, + "step": 9572 + }, + { + "epoch": 1.4648814078041317, + "grad_norm": 2.0506454652157005, + "learning_rate": 3.525941763575995e-06, + "loss": 0.3418, + "step": 9573 + }, + { + "epoch": 1.465034429992349, + "grad_norm": 2.1326475184422082, + "learning_rate": 3.5240532264370396e-06, + "loss": 0.2633, + "step": 9574 + }, + { + "epoch": 1.4651874521805661, + "grad_norm": 2.1494489636744185, + "learning_rate": 3.5221650870113134e-06, + "loss": 0.3263, + "step": 9575 + }, + { + "epoch": 1.4653404743687835, + "grad_norm": 2.2083023526559318, + "learning_rate": 3.5202773454147733e-06, + "loss": 0.3563, + "step": 9576 + }, + { + "epoch": 1.4654934965570008, + "grad_norm": 2.1455256554573956, + "learning_rate": 3.5183900017633445e-06, + "loss": 0.3259, + "step": 9577 + }, + { + "epoch": 1.4656465187452181, + "grad_norm": 2.0692219151315685, + "learning_rate": 3.516503056172944e-06, + "loss": 0.3062, + "step": 9578 + }, + { + "epoch": 1.4657995409334355, + "grad_norm": 2.435498607808429, + "learning_rate": 3.5146165087594586e-06, + "loss": 0.3979, + "step": 9579 + }, + { + "epoch": 1.4659525631216526, + "grad_norm": 2.251816963015846, + "learning_rate": 3.5127303596387386e-06, + "loss": 0.3213, + "step": 9580 + }, + { + "epoch": 1.46610558530987, + "grad_norm": 2.1677463950149294, + "learning_rate": 3.510844608926627e-06, + "loss": 0.3407, + "step": 9581 + }, + { + "epoch": 1.4662586074980872, + "grad_norm": 2.4337802808014017, + "learning_rate": 3.5089592567389352e-06, + "loss": 0.3395, + "step": 9582 + }, + { + "epoch": 1.4664116296863046, + "grad_norm": 2.178833688485482, + "learning_rate": 3.507074303191448e-06, + "loss": 0.3467, + "step": 9583 + }, + { + "epoch": 1.466564651874522, + "grad_norm": 2.079719791026871, + "learning_rate": 3.5051897483999274e-06, + "loss": 0.3201, + "step": 9584 + }, + { + "epoch": 1.466717674062739, + "grad_norm": 2.318754978550932, + 
"learning_rate": 3.503305592480113e-06, + "loss": 0.31, + "step": 9585 + }, + { + "epoch": 1.4668706962509563, + "grad_norm": 2.2671430905226684, + "learning_rate": 3.501421835547718e-06, + "loss": 0.3336, + "step": 9586 + }, + { + "epoch": 1.4670237184391737, + "grad_norm": 1.9679858647231845, + "learning_rate": 3.4995384777184295e-06, + "loss": 0.3267, + "step": 9587 + }, + { + "epoch": 1.467176740627391, + "grad_norm": 2.059412945703858, + "learning_rate": 3.4976555191079142e-06, + "loss": 0.2965, + "step": 9588 + }, + { + "epoch": 1.4673297628156083, + "grad_norm": 1.8621298863000937, + "learning_rate": 3.4957729598318104e-06, + "loss": 0.3184, + "step": 9589 + }, + { + "epoch": 1.4674827850038255, + "grad_norm": 2.0029242916594794, + "learning_rate": 3.493890800005729e-06, + "loss": 0.3288, + "step": 9590 + }, + { + "epoch": 1.4676358071920428, + "grad_norm": 2.104395772468994, + "learning_rate": 3.4920090397452743e-06, + "loss": 0.2823, + "step": 9591 + }, + { + "epoch": 1.4677888293802601, + "grad_norm": 1.8928052562757283, + "learning_rate": 3.490127679166e-06, + "loss": 0.2928, + "step": 9592 + }, + { + "epoch": 1.4679418515684775, + "grad_norm": 2.0663466283826906, + "learning_rate": 3.4882467183834478e-06, + "loss": 0.274, + "step": 9593 + }, + { + "epoch": 1.4680948737566948, + "grad_norm": 1.8060456661072481, + "learning_rate": 3.486366157513146e-06, + "loss": 0.3044, + "step": 9594 + }, + { + "epoch": 1.468247895944912, + "grad_norm": 1.9513142978452787, + "learning_rate": 3.4844859966705726e-06, + "loss": 0.2943, + "step": 9595 + }, + { + "epoch": 1.4684009181331292, + "grad_norm": 2.2091242439116665, + "learning_rate": 3.4826062359712065e-06, + "loss": 0.299, + "step": 9596 + }, + { + "epoch": 1.4685539403213466, + "grad_norm": 2.041058385323502, + "learning_rate": 3.4807268755304914e-06, + "loss": 0.2695, + "step": 9597 + }, + { + "epoch": 1.468706962509564, + "grad_norm": 2.1822736478283944, + "learning_rate": 3.4788479154638356e-06, + "loss": 0.357, + "step": 9598 + }, + { + "epoch": 1.4688599846977812, + "grad_norm": 2.1286379228362606, + "learning_rate": 3.4769693558866434e-06, + "loss": 0.3365, + "step": 9599 + }, + { + "epoch": 1.4690130068859983, + "grad_norm": 2.127037295930163, + "learning_rate": 3.475091196914284e-06, + "loss": 0.3397, + "step": 9600 + }, + { + "epoch": 1.469166029074216, + "grad_norm": 2.1368346349315583, + "learning_rate": 3.473213438662094e-06, + "loss": 0.3509, + "step": 9601 + }, + { + "epoch": 1.469319051262433, + "grad_norm": 2.079146890204384, + "learning_rate": 3.4713360812454033e-06, + "loss": 0.3229, + "step": 9602 + }, + { + "epoch": 1.4694720734506503, + "grad_norm": 2.142254720047488, + "learning_rate": 3.4694591247795028e-06, + "loss": 0.3195, + "step": 9603 + }, + { + "epoch": 1.4696250956388677, + "grad_norm": 2.0262041871453778, + "learning_rate": 3.4675825693796662e-06, + "loss": 0.3646, + "step": 9604 + }, + { + "epoch": 1.469778117827085, + "grad_norm": 1.9264466580480444, + "learning_rate": 3.4657064151611385e-06, + "loss": 0.3186, + "step": 9605 + }, + { + "epoch": 1.4699311400153023, + "grad_norm": 2.0276181691278503, + "learning_rate": 3.4638306622391426e-06, + "loss": 0.3137, + "step": 9606 + }, + { + "epoch": 1.4700841622035195, + "grad_norm": 2.534691854587415, + "learning_rate": 3.461955310728875e-06, + "loss": 0.4236, + "step": 9607 + }, + { + "epoch": 1.4702371843917368, + "grad_norm": 1.9148967223518105, + "learning_rate": 3.4600803607455093e-06, + "loss": 0.3288, + "step": 9608 + }, + { + "epoch": 
1.4703902065799541, + "grad_norm": 2.2857281715004416, + "learning_rate": 3.4582058124041915e-06, + "loss": 0.3491, + "step": 9609 + }, + { + "epoch": 1.4705432287681715, + "grad_norm": 2.1665463187107314, + "learning_rate": 3.4563316658200484e-06, + "loss": 0.3467, + "step": 9610 + }, + { + "epoch": 1.4706962509563888, + "grad_norm": 2.143549381240846, + "learning_rate": 3.4544579211081753e-06, + "loss": 0.3849, + "step": 9611 + }, + { + "epoch": 1.470849273144606, + "grad_norm": 1.8653919338791964, + "learning_rate": 3.4525845783836474e-06, + "loss": 0.238, + "step": 9612 + }, + { + "epoch": 1.4710022953328232, + "grad_norm": 2.1729156855165352, + "learning_rate": 3.4507116377615157e-06, + "loss": 0.3141, + "step": 9613 + }, + { + "epoch": 1.4711553175210406, + "grad_norm": 2.3265866813135965, + "learning_rate": 3.448839099356802e-06, + "loss": 0.3338, + "step": 9614 + }, + { + "epoch": 1.471308339709258, + "grad_norm": 1.854469823617688, + "learning_rate": 3.4469669632845073e-06, + "loss": 0.3019, + "step": 9615 + }, + { + "epoch": 1.4714613618974752, + "grad_norm": 1.9374657496486478, + "learning_rate": 3.4450952296596072e-06, + "loss": 0.2676, + "step": 9616 + }, + { + "epoch": 1.4716143840856923, + "grad_norm": 1.9448743453259358, + "learning_rate": 3.4432238985970523e-06, + "loss": 0.2811, + "step": 9617 + }, + { + "epoch": 1.4717674062739097, + "grad_norm": 2.1803253590810434, + "learning_rate": 3.441352970211769e-06, + "loss": 0.3335, + "step": 9618 + }, + { + "epoch": 1.471920428462127, + "grad_norm": 2.1774294836502586, + "learning_rate": 3.4394824446186527e-06, + "loss": 0.3215, + "step": 9619 + }, + { + "epoch": 1.4720734506503443, + "grad_norm": 2.1656602648557874, + "learning_rate": 3.437612321932591e-06, + "loss": 0.3493, + "step": 9620 + }, + { + "epoch": 1.4722264728385617, + "grad_norm": 2.111235629798471, + "learning_rate": 3.4357426022684257e-06, + "loss": 0.2601, + "step": 9621 + }, + { + "epoch": 1.4723794950267788, + "grad_norm": 1.912232160522444, + "learning_rate": 3.433873285740984e-06, + "loss": 0.3647, + "step": 9622 + }, + { + "epoch": 1.4725325172149961, + "grad_norm": 2.31825086042197, + "learning_rate": 3.4320043724650763e-06, + "loss": 0.3651, + "step": 9623 + }, + { + "epoch": 1.4726855394032135, + "grad_norm": 1.7970660470580329, + "learning_rate": 3.4301358625554713e-06, + "loss": 0.2483, + "step": 9624 + }, + { + "epoch": 1.4728385615914308, + "grad_norm": 2.2998009808220927, + "learning_rate": 3.4282677561269217e-06, + "loss": 0.2858, + "step": 9625 + }, + { + "epoch": 1.4729915837796481, + "grad_norm": 2.1552214382883514, + "learning_rate": 3.4264000532941644e-06, + "loss": 0.3032, + "step": 9626 + }, + { + "epoch": 1.4731446059678652, + "grad_norm": 2.1964411418119654, + "learning_rate": 3.424532754171889e-06, + "loss": 0.3218, + "step": 9627 + }, + { + "epoch": 1.4732976281560826, + "grad_norm": 2.0569648465887544, + "learning_rate": 3.422665858874784e-06, + "loss": 0.3243, + "step": 9628 + }, + { + "epoch": 1.4734506503443, + "grad_norm": 2.167669952320896, + "learning_rate": 3.4207993675175023e-06, + "loss": 0.3219, + "step": 9629 + }, + { + "epoch": 1.4736036725325172, + "grad_norm": 1.9886377425796664, + "learning_rate": 3.4189332802146623e-06, + "loss": 0.3845, + "step": 9630 + }, + { + "epoch": 1.4737566947207346, + "grad_norm": 2.3674925318781863, + "learning_rate": 3.4170675970808797e-06, + "loss": 0.3497, + "step": 9631 + }, + { + "epoch": 1.4739097169089517, + "grad_norm": 2.0213186273832533, + "learning_rate": 3.415202318230727e-06, + 
"loss": 0.3284, + "step": 9632 + }, + { + "epoch": 1.4740627390971692, + "grad_norm": 1.9433332846288336, + "learning_rate": 3.4133374437787604e-06, + "loss": 0.3333, + "step": 9633 + }, + { + "epoch": 1.4742157612853863, + "grad_norm": 1.8829935353968115, + "learning_rate": 3.411472973839509e-06, + "loss": 0.2995, + "step": 9634 + }, + { + "epoch": 1.4743687834736037, + "grad_norm": 2.199541576399467, + "learning_rate": 3.4096089085274763e-06, + "loss": 0.387, + "step": 9635 + }, + { + "epoch": 1.474521805661821, + "grad_norm": 2.320246022684784, + "learning_rate": 3.4077452479571425e-06, + "loss": 0.3588, + "step": 9636 + }, + { + "epoch": 1.4746748278500381, + "grad_norm": 2.2086550938995453, + "learning_rate": 3.405881992242962e-06, + "loss": 0.396, + "step": 9637 + }, + { + "epoch": 1.4748278500382557, + "grad_norm": 2.5880420208227575, + "learning_rate": 3.404019141499364e-06, + "loss": 0.3122, + "step": 9638 + }, + { + "epoch": 1.4749808722264728, + "grad_norm": 2.14820259118592, + "learning_rate": 3.4021566958407547e-06, + "loss": 0.3345, + "step": 9639 + }, + { + "epoch": 1.4751338944146901, + "grad_norm": 2.291144556279864, + "learning_rate": 3.4002946553815143e-06, + "loss": 0.4546, + "step": 9640 + }, + { + "epoch": 1.4752869166029074, + "grad_norm": 2.2789618485242107, + "learning_rate": 3.398433020235996e-06, + "loss": 0.2864, + "step": 9641 + }, + { + "epoch": 1.4754399387911248, + "grad_norm": 2.16718251307733, + "learning_rate": 3.3965717905185326e-06, + "loss": 0.3261, + "step": 9642 + }, + { + "epoch": 1.4755929609793421, + "grad_norm": 1.8345178592877107, + "learning_rate": 3.3947109663434274e-06, + "loss": 0.2828, + "step": 9643 + }, + { + "epoch": 1.4757459831675592, + "grad_norm": 2.0816444823106215, + "learning_rate": 3.392850547824962e-06, + "loss": 0.3154, + "step": 9644 + }, + { + "epoch": 1.4758990053557766, + "grad_norm": 2.2321250664297914, + "learning_rate": 3.390990535077392e-06, + "loss": 0.3723, + "step": 9645 + }, + { + "epoch": 1.476052027543994, + "grad_norm": 1.9008291069198198, + "learning_rate": 3.3891309282149476e-06, + "loss": 0.3035, + "step": 9646 + }, + { + "epoch": 1.4762050497322112, + "grad_norm": 2.0403337195080007, + "learning_rate": 3.3872717273518352e-06, + "loss": 0.3225, + "step": 9647 + }, + { + "epoch": 1.4763580719204286, + "grad_norm": 2.3313497369392864, + "learning_rate": 3.3854129326022312e-06, + "loss": 0.3249, + "step": 9648 + }, + { + "epoch": 1.4765110941086457, + "grad_norm": 1.9367820769137207, + "learning_rate": 3.383554544080303e-06, + "loss": 0.3423, + "step": 9649 + }, + { + "epoch": 1.476664116296863, + "grad_norm": 1.9343688897956826, + "learning_rate": 3.3816965619001697e-06, + "loss": 0.2844, + "step": 9650 + }, + { + "epoch": 1.4768171384850803, + "grad_norm": 2.0350599708053183, + "learning_rate": 3.3798389861759385e-06, + "loss": 0.2567, + "step": 9651 + }, + { + "epoch": 1.4769701606732977, + "grad_norm": 2.0884917699535226, + "learning_rate": 3.3779818170216994e-06, + "loss": 0.3647, + "step": 9652 + }, + { + "epoch": 1.477123182861515, + "grad_norm": 2.053173963511413, + "learning_rate": 3.3761250545515e-06, + "loss": 0.3367, + "step": 9653 + }, + { + "epoch": 1.4772762050497321, + "grad_norm": 2.406740971692602, + "learning_rate": 3.3742686988793703e-06, + "loss": 0.3357, + "step": 9654 + }, + { + "epoch": 1.4774292272379494, + "grad_norm": 2.2395311160784273, + "learning_rate": 3.372412750119326e-06, + "loss": 0.3188, + "step": 9655 + }, + { + "epoch": 1.4775822494261668, + "grad_norm": 2.090272851729783, + 
"learning_rate": 3.3705572083853354e-06, + "loss": 0.3041, + "step": 9656 + }, + { + "epoch": 1.4777352716143841, + "grad_norm": 1.9825871720189625, + "learning_rate": 3.3687020737913645e-06, + "loss": 0.2794, + "step": 9657 + }, + { + "epoch": 1.4778882938026014, + "grad_norm": 1.799646543850758, + "learning_rate": 3.366847346451345e-06, + "loss": 0.3137, + "step": 9658 + }, + { + "epoch": 1.4780413159908186, + "grad_norm": 2.1457698517236365, + "learning_rate": 3.364993026479172e-06, + "loss": 0.3418, + "step": 9659 + }, + { + "epoch": 1.478194338179036, + "grad_norm": 1.881613328637775, + "learning_rate": 3.363139113988736e-06, + "loss": 0.3176, + "step": 9660 + }, + { + "epoch": 1.4783473603672532, + "grad_norm": 2.0964508147709364, + "learning_rate": 3.361285609093895e-06, + "loss": 0.368, + "step": 9661 + }, + { + "epoch": 1.4785003825554706, + "grad_norm": 2.3956986412823214, + "learning_rate": 3.359432511908468e-06, + "loss": 0.4279, + "step": 9662 + }, + { + "epoch": 1.478653404743688, + "grad_norm": 2.177333571774147, + "learning_rate": 3.357579822546273e-06, + "loss": 0.3332, + "step": 9663 + }, + { + "epoch": 1.478806426931905, + "grad_norm": 1.920267606292228, + "learning_rate": 3.3557275411210857e-06, + "loss": 0.3456, + "step": 9664 + }, + { + "epoch": 1.4789594491201223, + "grad_norm": 2.13065516579347, + "learning_rate": 3.353875667746662e-06, + "loss": 0.2951, + "step": 9665 + }, + { + "epoch": 1.4791124713083397, + "grad_norm": 2.1802357900868103, + "learning_rate": 3.3520242025367345e-06, + "loss": 0.3092, + "step": 9666 + }, + { + "epoch": 1.479265493496557, + "grad_norm": 2.2588888010046184, + "learning_rate": 3.350173145605007e-06, + "loss": 0.3029, + "step": 9667 + }, + { + "epoch": 1.4794185156847743, + "grad_norm": 2.149844137638874, + "learning_rate": 3.3483224970651618e-06, + "loss": 0.3287, + "step": 9668 + }, + { + "epoch": 1.4795715378729914, + "grad_norm": 1.9285140452203176, + "learning_rate": 3.3464722570308495e-06, + "loss": 0.2797, + "step": 9669 + }, + { + "epoch": 1.479724560061209, + "grad_norm": 2.174794412352257, + "learning_rate": 3.3446224256157113e-06, + "loss": 0.3432, + "step": 9670 + }, + { + "epoch": 1.4798775822494261, + "grad_norm": 1.9492744650336518, + "learning_rate": 3.3427730029333427e-06, + "loss": 0.3068, + "step": 9671 + }, + { + "epoch": 1.4800306044376434, + "grad_norm": 2.2467264250993737, + "learning_rate": 3.3409239890973234e-06, + "loss": 0.3348, + "step": 9672 + }, + { + "epoch": 1.4801836266258608, + "grad_norm": 2.148025214316827, + "learning_rate": 3.3390753842212196e-06, + "loss": 0.3306, + "step": 9673 + }, + { + "epoch": 1.4803366488140781, + "grad_norm": 2.3871375437801916, + "learning_rate": 3.3372271884185503e-06, + "loss": 0.3524, + "step": 9674 + }, + { + "epoch": 1.4804896710022954, + "grad_norm": 2.281354552877399, + "learning_rate": 3.3353794018028216e-06, + "loss": 0.339, + "step": 9675 + }, + { + "epoch": 1.4806426931905126, + "grad_norm": 2.16698236232759, + "learning_rate": 3.3335320244875226e-06, + "loss": 0.3163, + "step": 9676 + }, + { + "epoch": 1.48079571537873, + "grad_norm": 1.9526029274383037, + "learning_rate": 3.3316850565860937e-06, + "loss": 0.2826, + "step": 9677 + }, + { + "epoch": 1.4809487375669472, + "grad_norm": 2.420557240605018, + "learning_rate": 3.3298384982119767e-06, + "loss": 0.3786, + "step": 9678 + }, + { + "epoch": 1.4811017597551646, + "grad_norm": 2.28465035245862, + "learning_rate": 3.327992349478574e-06, + "loss": 0.3461, + "step": 9679 + }, + { + "epoch": 1.481254781943382, + 
"grad_norm": 2.0005123469497588, + "learning_rate": 3.3261466104992557e-06, + "loss": 0.2779, + "step": 9680 + }, + { + "epoch": 1.481407804131599, + "grad_norm": 2.264288271376078, + "learning_rate": 3.3243012813873854e-06, + "loss": 0.3615, + "step": 9681 + }, + { + "epoch": 1.4815608263198163, + "grad_norm": 2.1961574735260805, + "learning_rate": 3.322456362256292e-06, + "loss": 0.3248, + "step": 9682 + }, + { + "epoch": 1.4817138485080337, + "grad_norm": 1.987011464688957, + "learning_rate": 3.32061185321927e-06, + "loss": 0.33, + "step": 9683 + }, + { + "epoch": 1.481866870696251, + "grad_norm": 1.9619117786824731, + "learning_rate": 3.318767754389607e-06, + "loss": 0.3812, + "step": 9684 + }, + { + "epoch": 1.4820198928844683, + "grad_norm": 2.075830782082472, + "learning_rate": 3.3169240658805533e-06, + "loss": 0.2877, + "step": 9685 + }, + { + "epoch": 1.4821729150726854, + "grad_norm": 2.0458494055910927, + "learning_rate": 3.3150807878053383e-06, + "loss": 0.3171, + "step": 9686 + }, + { + "epoch": 1.4823259372609028, + "grad_norm": 2.493267414344924, + "learning_rate": 3.3132379202771613e-06, + "loss": 0.3898, + "step": 9687 + }, + { + "epoch": 1.4824789594491201, + "grad_norm": 2.275969640971838, + "learning_rate": 3.3113954634092037e-06, + "loss": 0.3653, + "step": 9688 + }, + { + "epoch": 1.4826319816373374, + "grad_norm": 2.1293537358603514, + "learning_rate": 3.309553417314617e-06, + "loss": 0.3176, + "step": 9689 + }, + { + "epoch": 1.4827850038255548, + "grad_norm": 2.131470108037654, + "learning_rate": 3.3077117821065274e-06, + "loss": 0.2997, + "step": 9690 + }, + { + "epoch": 1.482938026013772, + "grad_norm": 2.2836857699911315, + "learning_rate": 3.305870557898039e-06, + "loss": 0.3644, + "step": 9691 + }, + { + "epoch": 1.4830910482019892, + "grad_norm": 1.8703424111131515, + "learning_rate": 3.3040297448022253e-06, + "loss": 0.2544, + "step": 9692 + }, + { + "epoch": 1.4832440703902066, + "grad_norm": 1.8819671745260147, + "learning_rate": 3.3021893429321407e-06, + "loss": 0.2973, + "step": 9693 + }, + { + "epoch": 1.483397092578424, + "grad_norm": 2.2023836981052343, + "learning_rate": 3.300349352400811e-06, + "loss": 0.2955, + "step": 9694 + }, + { + "epoch": 1.4835501147666412, + "grad_norm": 1.8542863847355404, + "learning_rate": 3.2985097733212356e-06, + "loss": 0.259, + "step": 9695 + }, + { + "epoch": 1.4837031369548583, + "grad_norm": 2.201198857571715, + "learning_rate": 3.2966706058063924e-06, + "loss": 0.3678, + "step": 9696 + }, + { + "epoch": 1.4838561591430757, + "grad_norm": 2.0005225343509174, + "learning_rate": 3.294831849969231e-06, + "loss": 0.2636, + "step": 9697 + }, + { + "epoch": 1.484009181331293, + "grad_norm": 2.0741880232412813, + "learning_rate": 3.292993505922676e-06, + "loss": 0.3257, + "step": 9698 + }, + { + "epoch": 1.4841622035195103, + "grad_norm": 2.1015544039730294, + "learning_rate": 3.2911555737796274e-06, + "loss": 0.3469, + "step": 9699 + }, + { + "epoch": 1.4843152257077277, + "grad_norm": 2.154984766214943, + "learning_rate": 3.28931805365296e-06, + "loss": 0.3341, + "step": 9700 + }, + { + "epoch": 1.4844682478959448, + "grad_norm": 2.00350612128123, + "learning_rate": 3.2874809456555214e-06, + "loss": 0.3539, + "step": 9701 + }, + { + "epoch": 1.4846212700841623, + "grad_norm": 2.2878236503785114, + "learning_rate": 3.285644249900143e-06, + "loss": 0.3498, + "step": 9702 + }, + { + "epoch": 1.4847742922723794, + "grad_norm": 2.0944591858421227, + "learning_rate": 3.283807966499615e-06, + "loss": 0.311, + "step": 9703 + }, 
+ { + "epoch": 1.4849273144605968, + "grad_norm": 2.117959473650116, + "learning_rate": 3.281972095566709e-06, + "loss": 0.3289, + "step": 9704 + }, + { + "epoch": 1.4850803366488141, + "grad_norm": 2.1069078550225284, + "learning_rate": 3.2801366372141854e-06, + "loss": 0.2824, + "step": 9705 + }, + { + "epoch": 1.4852333588370314, + "grad_norm": 1.7445064623021305, + "learning_rate": 3.278301591554753e-06, + "loss": 0.2507, + "step": 9706 + }, + { + "epoch": 1.4853863810252488, + "grad_norm": 2.1744862309060826, + "learning_rate": 3.2764669587011176e-06, + "loss": 0.2917, + "step": 9707 + }, + { + "epoch": 1.485539403213466, + "grad_norm": 2.099115560590431, + "learning_rate": 3.2746327387659528e-06, + "loss": 0.3, + "step": 9708 + }, + { + "epoch": 1.4856924254016832, + "grad_norm": 2.383542556858228, + "learning_rate": 3.272798931861895e-06, + "loss": 0.3499, + "step": 9709 + }, + { + "epoch": 1.4858454475899006, + "grad_norm": 1.958523894698532, + "learning_rate": 3.2709655381015747e-06, + "loss": 0.3295, + "step": 9710 + }, + { + "epoch": 1.485998469778118, + "grad_norm": 2.316274078181258, + "learning_rate": 3.2691325575975873e-06, + "loss": 0.2929, + "step": 9711 + }, + { + "epoch": 1.4861514919663352, + "grad_norm": 2.285956647889531, + "learning_rate": 3.267299990462496e-06, + "loss": 0.3307, + "step": 9712 + }, + { + "epoch": 1.4863045141545523, + "grad_norm": 2.21087747854537, + "learning_rate": 3.265467836808852e-06, + "loss": 0.3439, + "step": 9713 + }, + { + "epoch": 1.4864575363427697, + "grad_norm": 2.512942424866177, + "learning_rate": 3.2636360967491753e-06, + "loss": 0.3233, + "step": 9714 + }, + { + "epoch": 1.486610558530987, + "grad_norm": 2.2420113787509695, + "learning_rate": 3.2618047703959587e-06, + "loss": 0.3966, + "step": 9715 + }, + { + "epoch": 1.4867635807192043, + "grad_norm": 1.827313128956776, + "learning_rate": 3.25997385786167e-06, + "loss": 0.3345, + "step": 9716 + }, + { + "epoch": 1.4869166029074217, + "grad_norm": 2.540856908588475, + "learning_rate": 3.258143359258755e-06, + "loss": 0.3662, + "step": 9717 + }, + { + "epoch": 1.4870696250956388, + "grad_norm": 2.0729164941270626, + "learning_rate": 3.2563132746996295e-06, + "loss": 0.2955, + "step": 9718 + }, + { + "epoch": 1.4872226472838561, + "grad_norm": 2.2431616508070684, + "learning_rate": 3.2544836042966887e-06, + "loss": 0.2798, + "step": 9719 + }, + { + "epoch": 1.4873756694720734, + "grad_norm": 2.388865441696811, + "learning_rate": 3.2526543481622972e-06, + "loss": 0.3584, + "step": 9720 + }, + { + "epoch": 1.4875286916602908, + "grad_norm": 2.3876493187240446, + "learning_rate": 3.250825506408798e-06, + "loss": 0.3654, + "step": 9721 + }, + { + "epoch": 1.4876817138485081, + "grad_norm": 1.8569930412454958, + "learning_rate": 3.248997079148508e-06, + "loss": 0.3371, + "step": 9722 + }, + { + "epoch": 1.4878347360367252, + "grad_norm": 1.9617608209841546, + "learning_rate": 3.247169066493717e-06, + "loss": 0.3279, + "step": 9723 + }, + { + "epoch": 1.4879877582249426, + "grad_norm": 2.1004602635403073, + "learning_rate": 3.245341468556691e-06, + "loss": 0.3163, + "step": 9724 + }, + { + "epoch": 1.48814078041316, + "grad_norm": 1.8281973503764155, + "learning_rate": 3.2435142854496695e-06, + "loss": 0.3414, + "step": 9725 + }, + { + "epoch": 1.4882938026013772, + "grad_norm": 2.0119727311462468, + "learning_rate": 3.241687517284867e-06, + "loss": 0.3131, + "step": 9726 + }, + { + "epoch": 1.4884468247895946, + "grad_norm": 2.0671398705003354, + "learning_rate": 3.239861164174474e-06, + 
"loss": 0.3292, + "step": 9727 + }, + { + "epoch": 1.4885998469778117, + "grad_norm": 2.183939371421867, + "learning_rate": 3.2380352262306516e-06, + "loss": 0.3413, + "step": 9728 + }, + { + "epoch": 1.488752869166029, + "grad_norm": 2.6176496586308264, + "learning_rate": 3.2362097035655395e-06, + "loss": 0.3849, + "step": 9729 + }, + { + "epoch": 1.4889058913542463, + "grad_norm": 1.984006435842004, + "learning_rate": 3.234384596291246e-06, + "loss": 0.2825, + "step": 9730 + }, + { + "epoch": 1.4890589135424637, + "grad_norm": 2.218983095645015, + "learning_rate": 3.2325599045198686e-06, + "loss": 0.3254, + "step": 9731 + }, + { + "epoch": 1.489211935730681, + "grad_norm": 2.1406761509420753, + "learning_rate": 3.230735628363457e-06, + "loss": 0.2822, + "step": 9732 + }, + { + "epoch": 1.4893649579188981, + "grad_norm": 2.156889368226719, + "learning_rate": 3.228911767934051e-06, + "loss": 0.3079, + "step": 9733 + }, + { + "epoch": 1.4895179801071157, + "grad_norm": 2.2627592273788806, + "learning_rate": 3.2270883233436668e-06, + "loss": 0.3561, + "step": 9734 + }, + { + "epoch": 1.4896710022953328, + "grad_norm": 2.1268555796252, + "learning_rate": 3.225265294704278e-06, + "loss": 0.2414, + "step": 9735 + }, + { + "epoch": 1.4898240244835501, + "grad_norm": 2.5191995631578443, + "learning_rate": 3.2234426821278553e-06, + "loss": 0.4362, + "step": 9736 + }, + { + "epoch": 1.4899770466717674, + "grad_norm": 2.1702180434659444, + "learning_rate": 3.221620485726329e-06, + "loss": 0.36, + "step": 9737 + }, + { + "epoch": 1.4901300688599848, + "grad_norm": 2.2050301008986017, + "learning_rate": 3.219798705611601e-06, + "loss": 0.3054, + "step": 9738 + }, + { + "epoch": 1.4902830910482021, + "grad_norm": 2.2089552013113334, + "learning_rate": 3.2179773418955605e-06, + "loss": 0.3696, + "step": 9739 + }, + { + "epoch": 1.4904361132364192, + "grad_norm": 1.9418105664383982, + "learning_rate": 3.2161563946900665e-06, + "loss": 0.2662, + "step": 9740 + }, + { + "epoch": 1.4905891354246366, + "grad_norm": 2.1010010388460523, + "learning_rate": 3.2143358641069412e-06, + "loss": 0.294, + "step": 9741 + }, + { + "epoch": 1.490742157612854, + "grad_norm": 2.4393911103033235, + "learning_rate": 3.2125157502579985e-06, + "loss": 0.3728, + "step": 9742 + }, + { + "epoch": 1.4908951798010712, + "grad_norm": 2.1391415962137, + "learning_rate": 3.2106960532550158e-06, + "loss": 0.298, + "step": 9743 + }, + { + "epoch": 1.4910482019892886, + "grad_norm": 2.0613432267115925, + "learning_rate": 3.2088767732097493e-06, + "loss": 0.2871, + "step": 9744 + }, + { + "epoch": 1.4912012241775057, + "grad_norm": 1.974687288819572, + "learning_rate": 3.2070579102339274e-06, + "loss": 0.3126, + "step": 9745 + }, + { + "epoch": 1.491354246365723, + "grad_norm": 2.0887558994869884, + "learning_rate": 3.2052394644392525e-06, + "loss": 0.3445, + "step": 9746 + }, + { + "epoch": 1.4915072685539403, + "grad_norm": 2.186477735593589, + "learning_rate": 3.2034214359374038e-06, + "loss": 0.3372, + "step": 9747 + }, + { + "epoch": 1.4916602907421577, + "grad_norm": 2.3663298192862454, + "learning_rate": 3.2016038248400338e-06, + "loss": 0.3463, + "step": 9748 + }, + { + "epoch": 1.491813312930375, + "grad_norm": 1.9550108079123667, + "learning_rate": 3.199786631258768e-06, + "loss": 0.3061, + "step": 9749 + }, + { + "epoch": 1.4919663351185921, + "grad_norm": 2.179613795100359, + "learning_rate": 3.1979698553052065e-06, + "loss": 0.3472, + "step": 9750 + }, + { + "epoch": 1.4921193573068094, + "grad_norm": 2.2746548727858054, + 
"learning_rate": 3.1961534970909237e-06, + "loss": 0.4066, + "step": 9751 + }, + { + "epoch": 1.4922723794950268, + "grad_norm": 1.939192701965579, + "learning_rate": 3.194337556727478e-06, + "loss": 0.2677, + "step": 9752 + }, + { + "epoch": 1.4924254016832441, + "grad_norm": 2.264172372199918, + "learning_rate": 3.192522034326383e-06, + "loss": 0.2917, + "step": 9753 + }, + { + "epoch": 1.4925784238714614, + "grad_norm": 2.302377291578821, + "learning_rate": 3.1907069299991387e-06, + "loss": 0.3661, + "step": 9754 + }, + { + "epoch": 1.4927314460596786, + "grad_norm": 1.8882191488728923, + "learning_rate": 3.1888922438572257e-06, + "loss": 0.2901, + "step": 9755 + }, + { + "epoch": 1.492884468247896, + "grad_norm": 2.1078551552877935, + "learning_rate": 3.1870779760120818e-06, + "loss": 0.3591, + "step": 9756 + }, + { + "epoch": 1.4930374904361132, + "grad_norm": 2.277226366949766, + "learning_rate": 3.1852641265751294e-06, + "loss": 0.3308, + "step": 9757 + }, + { + "epoch": 1.4931905126243306, + "grad_norm": 2.0104743672396794, + "learning_rate": 3.1834506956577716e-06, + "loss": 0.2591, + "step": 9758 + }, + { + "epoch": 1.493343534812548, + "grad_norm": 2.018241900306442, + "learning_rate": 3.181637683371367e-06, + "loss": 0.2783, + "step": 9759 + }, + { + "epoch": 1.493496557000765, + "grad_norm": 2.220811704079774, + "learning_rate": 3.17982508982727e-06, + "loss": 0.3125, + "step": 9760 + }, + { + "epoch": 1.4936495791889823, + "grad_norm": 2.0207958924684144, + "learning_rate": 3.1780129151367965e-06, + "loss": 0.3498, + "step": 9761 + }, + { + "epoch": 1.4938026013771997, + "grad_norm": 1.9336176704001233, + "learning_rate": 3.176201159411232e-06, + "loss": 0.2944, + "step": 9762 + }, + { + "epoch": 1.493955623565417, + "grad_norm": 2.0518057739899347, + "learning_rate": 3.174389822761853e-06, + "loss": 0.3147, + "step": 9763 + }, + { + "epoch": 1.4941086457536343, + "grad_norm": 2.0524245884405903, + "learning_rate": 3.172578905299899e-06, + "loss": 0.3182, + "step": 9764 + }, + { + "epoch": 1.4942616679418514, + "grad_norm": 2.403753805984202, + "learning_rate": 3.1707684071365786e-06, + "loss": 0.3841, + "step": 9765 + }, + { + "epoch": 1.4944146901300688, + "grad_norm": 2.057508983016502, + "learning_rate": 3.168958328383088e-06, + "loss": 0.3566, + "step": 9766 + }, + { + "epoch": 1.4945677123182861, + "grad_norm": 1.9234820653276592, + "learning_rate": 3.1671486691505906e-06, + "loss": 0.2878, + "step": 9767 + }, + { + "epoch": 1.4947207345065034, + "grad_norm": 2.063565746243901, + "learning_rate": 3.1653394295502226e-06, + "loss": 0.3536, + "step": 9768 + }, + { + "epoch": 1.4948737566947208, + "grad_norm": 2.19824145243771, + "learning_rate": 3.1635306096930985e-06, + "loss": 0.3115, + "step": 9769 + }, + { + "epoch": 1.495026778882938, + "grad_norm": 2.3743711638225085, + "learning_rate": 3.1617222096903045e-06, + "loss": 0.3565, + "step": 9770 + }, + { + "epoch": 1.4951798010711554, + "grad_norm": 2.041310825172398, + "learning_rate": 3.1599142296529005e-06, + "loss": 0.3498, + "step": 9771 + }, + { + "epoch": 1.4953328232593726, + "grad_norm": 2.103580031841215, + "learning_rate": 3.1581066696919216e-06, + "loss": 0.3418, + "step": 9772 + }, + { + "epoch": 1.49548584544759, + "grad_norm": 1.6728540200328423, + "learning_rate": 3.1562995299183786e-06, + "loss": 0.293, + "step": 9773 + }, + { + "epoch": 1.4956388676358072, + "grad_norm": 2.052812844688033, + "learning_rate": 3.1544928104432536e-06, + "loss": 0.3172, + "step": 9774 + }, + { + "epoch": 
1.4957918898240246, + "grad_norm": 1.99378419575446, + "learning_rate": 3.1526865113775063e-06, + "loss": 0.2531, + "step": 9775 + }, + { + "epoch": 1.495944912012242, + "grad_norm": 2.206302011302261, + "learning_rate": 3.1508806328320653e-06, + "loss": 0.3198, + "step": 9776 + }, + { + "epoch": 1.496097934200459, + "grad_norm": 2.375311640575206, + "learning_rate": 3.1490751749178394e-06, + "loss": 0.3092, + "step": 9777 + }, + { + "epoch": 1.4962509563886763, + "grad_norm": 2.3242990738348412, + "learning_rate": 3.1472701377457082e-06, + "loss": 0.3788, + "step": 9778 + }, + { + "epoch": 1.4964039785768937, + "grad_norm": 2.068015816110016, + "learning_rate": 3.145465521426525e-06, + "loss": 0.3278, + "step": 9779 + }, + { + "epoch": 1.496557000765111, + "grad_norm": 2.089112948601707, + "learning_rate": 3.1436613260711166e-06, + "loss": 0.2761, + "step": 9780 + }, + { + "epoch": 1.4967100229533283, + "grad_norm": 2.3392197952388565, + "learning_rate": 3.141857551790295e-06, + "loss": 0.3837, + "step": 9781 + }, + { + "epoch": 1.4968630451415454, + "grad_norm": 2.4626352152836493, + "learning_rate": 3.1400541986948274e-06, + "loss": 0.3356, + "step": 9782 + }, + { + "epoch": 1.4970160673297628, + "grad_norm": 1.943327658989901, + "learning_rate": 3.1382512668954635e-06, + "loss": 0.2664, + "step": 9783 + }, + { + "epoch": 1.4971690895179801, + "grad_norm": 2.0015017364893657, + "learning_rate": 3.13644875650294e-06, + "loss": 0.277, + "step": 9784 + }, + { + "epoch": 1.4973221117061974, + "grad_norm": 1.991863536015547, + "learning_rate": 3.1346466676279453e-06, + "loss": 0.3068, + "step": 9785 + }, + { + "epoch": 1.4974751338944148, + "grad_norm": 2.1289913120304527, + "learning_rate": 3.1328450003811526e-06, + "loss": 0.2519, + "step": 9786 + }, + { + "epoch": 1.497628156082632, + "grad_norm": 2.4980243023068014, + "learning_rate": 3.131043754873219e-06, + "loss": 0.3528, + "step": 9787 + }, + { + "epoch": 1.4977811782708492, + "grad_norm": 2.2767558515508273, + "learning_rate": 3.1292429312147542e-06, + "loss": 0.368, + "step": 9788 + }, + { + "epoch": 1.4979342004590666, + "grad_norm": 2.16350370343632, + "learning_rate": 3.127442529516362e-06, + "loss": 0.4144, + "step": 9789 + }, + { + "epoch": 1.498087222647284, + "grad_norm": 2.3882333717742847, + "learning_rate": 3.1256425498886132e-06, + "loss": 0.3766, + "step": 9790 + }, + { + "epoch": 1.4982402448355012, + "grad_norm": 2.1894069009771053, + "learning_rate": 3.123842992442042e-06, + "loss": 0.356, + "step": 9791 + }, + { + "epoch": 1.4983932670237183, + "grad_norm": 2.1408227407508496, + "learning_rate": 3.122043857287176e-06, + "loss": 0.2882, + "step": 9792 + }, + { + "epoch": 1.4985462892119357, + "grad_norm": 1.9419697702027892, + "learning_rate": 3.120245144534505e-06, + "loss": 0.2727, + "step": 9793 + }, + { + "epoch": 1.498699311400153, + "grad_norm": 2.0701350644107066, + "learning_rate": 3.118446854294488e-06, + "loss": 0.2933, + "step": 9794 + }, + { + "epoch": 1.4988523335883703, + "grad_norm": 1.7478356478182149, + "learning_rate": 3.116648986677574e-06, + "loss": 0.2833, + "step": 9795 + }, + { + "epoch": 1.4990053557765877, + "grad_norm": 2.0429077290330424, + "learning_rate": 3.114851541794173e-06, + "loss": 0.2941, + "step": 9796 + }, + { + "epoch": 1.4991583779648048, + "grad_norm": 2.2981563657573347, + "learning_rate": 3.1130545197546735e-06, + "loss": 0.3765, + "step": 9797 + }, + { + "epoch": 1.4993114001530221, + "grad_norm": 2.1883567916548214, + "learning_rate": 3.111257920669438e-06, + "loss": 
0.3353, + "step": 9798 + }, + { + "epoch": 1.4994644223412394, + "grad_norm": 2.0688655353997576, + "learning_rate": 3.109461744648803e-06, + "loss": 0.2819, + "step": 9799 + }, + { + "epoch": 1.4996174445294568, + "grad_norm": 2.1705196804072133, + "learning_rate": 3.107665991803078e-06, + "loss": 0.368, + "step": 9800 + }, + { + "epoch": 1.4997704667176741, + "grad_norm": 2.2061746818166714, + "learning_rate": 3.1058706622425473e-06, + "loss": 0.3808, + "step": 9801 + }, + { + "epoch": 1.4999234889058912, + "grad_norm": 2.054827111884937, + "learning_rate": 3.1040757560774694e-06, + "loss": 0.3585, + "step": 9802 + }, + { + "epoch": 1.5000765110941088, + "grad_norm": 2.121121071144255, + "learning_rate": 3.1022812734180765e-06, + "loss": 0.274, + "step": 9803 + }, + { + "epoch": 1.5002295332823259, + "grad_norm": 2.080239488774426, + "learning_rate": 3.1004872143745734e-06, + "loss": 0.2902, + "step": 9804 + }, + { + "epoch": 1.5003825554705432, + "grad_norm": 2.049230990374885, + "learning_rate": 3.0986935790571427e-06, + "loss": 0.3381, + "step": 9805 + }, + { + "epoch": 1.5005355776587606, + "grad_norm": 2.2067647222870383, + "learning_rate": 3.0969003675759368e-06, + "loss": 0.3135, + "step": 9806 + }, + { + "epoch": 1.5006885998469777, + "grad_norm": 1.9066668807836955, + "learning_rate": 3.0951075800410847e-06, + "loss": 0.4619, + "step": 9807 + }, + { + "epoch": 1.5008416220351952, + "grad_norm": 2.2749545171097, + "learning_rate": 3.093315216562688e-06, + "loss": 0.3464, + "step": 9808 + }, + { + "epoch": 1.5009946442234123, + "grad_norm": 2.0411962491402984, + "learning_rate": 3.0915232772508196e-06, + "loss": 0.2835, + "step": 9809 + }, + { + "epoch": 1.5011476664116297, + "grad_norm": 2.191472037931345, + "learning_rate": 3.08973176221554e-06, + "loss": 0.2974, + "step": 9810 + }, + { + "epoch": 1.501300688599847, + "grad_norm": 2.1908312969139843, + "learning_rate": 3.0879406715668623e-06, + "loss": 0.3244, + "step": 9811 + }, + { + "epoch": 1.501453710788064, + "grad_norm": 2.136787196257136, + "learning_rate": 3.086150005414784e-06, + "loss": 0.3084, + "step": 9812 + }, + { + "epoch": 1.5016067329762817, + "grad_norm": 1.894927128405544, + "learning_rate": 3.084359763869289e-06, + "loss": 0.3001, + "step": 9813 + }, + { + "epoch": 1.5017597551644988, + "grad_norm": 2.1745865419199646, + "learning_rate": 3.0825699470403114e-06, + "loss": 0.3858, + "step": 9814 + }, + { + "epoch": 1.501912777352716, + "grad_norm": 1.9608382991991724, + "learning_rate": 3.080780555037771e-06, + "loss": 0.3078, + "step": 9815 + }, + { + "epoch": 1.5020657995409334, + "grad_norm": 2.1040554688231343, + "learning_rate": 3.0789915879715715e-06, + "loss": 0.3482, + "step": 9816 + }, + { + "epoch": 1.5022188217291508, + "grad_norm": 1.9905730959553891, + "learning_rate": 3.077203045951567e-06, + "loss": 0.277, + "step": 9817 + }, + { + "epoch": 1.502371843917368, + "grad_norm": 2.1890814408914747, + "learning_rate": 3.075414929087609e-06, + "loss": 0.2968, + "step": 9818 + }, + { + "epoch": 1.5025248661055852, + "grad_norm": 2.495997552639193, + "learning_rate": 3.0736272374895115e-06, + "loss": 0.2815, + "step": 9819 + }, + { + "epoch": 1.5026778882938026, + "grad_norm": 2.0086629122191604, + "learning_rate": 3.0718399712670566e-06, + "loss": 0.2857, + "step": 9820 + }, + { + "epoch": 1.5028309104820199, + "grad_norm": 2.2914428417288217, + "learning_rate": 3.0700531305300153e-06, + "loss": 0.3293, + "step": 9821 + }, + { + "epoch": 1.5029839326702372, + "grad_norm": 2.1358875597800973, + 
"learning_rate": 3.0682667153881228e-06, + "loss": 0.273, + "step": 9822 + }, + { + "epoch": 1.5031369548584546, + "grad_norm": 1.958831411297443, + "learning_rate": 3.0664807259510842e-06, + "loss": 0.3609, + "step": 9823 + }, + { + "epoch": 1.5032899770466717, + "grad_norm": 2.0320226227226743, + "learning_rate": 3.0646951623285904e-06, + "loss": 0.2767, + "step": 9824 + }, + { + "epoch": 1.5034429992348892, + "grad_norm": 2.2398279587072087, + "learning_rate": 3.062910024630298e-06, + "loss": 0.3316, + "step": 9825 + }, + { + "epoch": 1.5035960214231063, + "grad_norm": 2.0793764036363735, + "learning_rate": 3.061125312965838e-06, + "loss": 0.2947, + "step": 9826 + }, + { + "epoch": 1.5037490436113237, + "grad_norm": 2.350717021248169, + "learning_rate": 3.059341027444819e-06, + "loss": 0.3965, + "step": 9827 + }, + { + "epoch": 1.503902065799541, + "grad_norm": 2.0080218668074705, + "learning_rate": 3.057557168176819e-06, + "loss": 0.4766, + "step": 9828 + }, + { + "epoch": 1.504055087987758, + "grad_norm": 2.192980069727335, + "learning_rate": 3.0557737352713925e-06, + "loss": 0.3138, + "step": 9829 + }, + { + "epoch": 1.5042081101759757, + "grad_norm": 2.277627119535517, + "learning_rate": 3.0539907288380664e-06, + "loss": 0.271, + "step": 9830 + }, + { + "epoch": 1.5043611323641928, + "grad_norm": 2.060097566540218, + "learning_rate": 3.0522081489863433e-06, + "loss": 0.2744, + "step": 9831 + }, + { + "epoch": 1.50451415455241, + "grad_norm": 2.1953025152712855, + "learning_rate": 3.0504259958256966e-06, + "loss": 0.2951, + "step": 9832 + }, + { + "epoch": 1.5046671767406274, + "grad_norm": 2.0592054511449187, + "learning_rate": 3.0486442694655737e-06, + "loss": 0.3091, + "step": 9833 + }, + { + "epoch": 1.5048201989288446, + "grad_norm": 2.3291429300803124, + "learning_rate": 3.0468629700154072e-06, + "loss": 0.3384, + "step": 9834 + }, + { + "epoch": 1.504973221117062, + "grad_norm": 2.1766038192419943, + "learning_rate": 3.045082097584583e-06, + "loss": 0.3497, + "step": 9835 + }, + { + "epoch": 1.5051262433052792, + "grad_norm": 2.269668303049232, + "learning_rate": 3.043301652282471e-06, + "loss": 0.3362, + "step": 9836 + }, + { + "epoch": 1.5052792654934966, + "grad_norm": 2.488874865174472, + "learning_rate": 3.041521634218426e-06, + "loss": 0.3862, + "step": 9837 + }, + { + "epoch": 1.5054322876817139, + "grad_norm": 1.9458045410505225, + "learning_rate": 3.0397420435017565e-06, + "loss": 0.2328, + "step": 9838 + }, + { + "epoch": 1.505585309869931, + "grad_norm": 2.125888170369278, + "learning_rate": 3.0379628802417525e-06, + "loss": 0.2961, + "step": 9839 + }, + { + "epoch": 1.5057383320581486, + "grad_norm": 2.210814431359663, + "learning_rate": 3.0361841445476914e-06, + "loss": 0.3397, + "step": 9840 + }, + { + "epoch": 1.5058913542463657, + "grad_norm": 2.1884877256421937, + "learning_rate": 3.0344058365287977e-06, + "loss": 0.3376, + "step": 9841 + }, + { + "epoch": 1.506044376434583, + "grad_norm": 2.419550996918986, + "learning_rate": 3.032627956294294e-06, + "loss": 0.3823, + "step": 9842 + }, + { + "epoch": 1.5061973986228003, + "grad_norm": 2.299565953799042, + "learning_rate": 3.0308505039533675e-06, + "loss": 0.3659, + "step": 9843 + }, + { + "epoch": 1.5063504208110174, + "grad_norm": 2.213477786477071, + "learning_rate": 3.0290734796151687e-06, + "loss": 0.3586, + "step": 9844 + }, + { + "epoch": 1.506503442999235, + "grad_norm": 2.1247224716904185, + "learning_rate": 3.0272968833888407e-06, + "loss": 0.3304, + "step": 9845 + }, + { + "epoch": 
1.506656465187452, + "grad_norm": 2.1254476906607307, + "learning_rate": 3.0255207153834886e-06, + "loss": 0.2704, + "step": 9846 + }, + { + "epoch": 1.5068094873756694, + "grad_norm": 2.436520978872859, + "learning_rate": 3.023744975708194e-06, + "loss": 0.3755, + "step": 9847 + }, + { + "epoch": 1.5069625095638868, + "grad_norm": 2.2069707007141095, + "learning_rate": 3.021969664472012e-06, + "loss": 0.2866, + "step": 9848 + }, + { + "epoch": 1.507115531752104, + "grad_norm": 1.9306592406380063, + "learning_rate": 3.0201947817839704e-06, + "loss": 0.3086, + "step": 9849 + }, + { + "epoch": 1.5072685539403214, + "grad_norm": 2.26216940279369, + "learning_rate": 3.0184203277530723e-06, + "loss": 0.38, + "step": 9850 + }, + { + "epoch": 1.5074215761285386, + "grad_norm": 2.0832013755845, + "learning_rate": 3.0166463024882943e-06, + "loss": 0.3438, + "step": 9851 + }, + { + "epoch": 1.5075745983167559, + "grad_norm": 2.2765780863751166, + "learning_rate": 3.014872706098586e-06, + "loss": 0.3067, + "step": 9852 + }, + { + "epoch": 1.5077276205049732, + "grad_norm": 2.5699472822824223, + "learning_rate": 3.0130995386928706e-06, + "loss": 0.3503, + "step": 9853 + }, + { + "epoch": 1.5078806426931906, + "grad_norm": 2.005607019002461, + "learning_rate": 3.0113268003800456e-06, + "loss": 0.2863, + "step": 9854 + }, + { + "epoch": 1.5080336648814079, + "grad_norm": 1.9684914601471286, + "learning_rate": 3.0095544912689822e-06, + "loss": 0.2837, + "step": 9855 + }, + { + "epoch": 1.508186687069625, + "grad_norm": 1.7614254445638609, + "learning_rate": 3.007782611468524e-06, + "loss": 0.3154, + "step": 9856 + }, + { + "epoch": 1.5083397092578426, + "grad_norm": 2.0535358999385505, + "learning_rate": 3.0060111610874886e-06, + "loss": 0.2978, + "step": 9857 + }, + { + "epoch": 1.5084927314460597, + "grad_norm": 2.2628065523811456, + "learning_rate": 3.0042401402346687e-06, + "loss": 0.3182, + "step": 9858 + }, + { + "epoch": 1.508645753634277, + "grad_norm": 1.9545007110492567, + "learning_rate": 3.0024695490188296e-06, + "loss": 0.3188, + "step": 9859 + }, + { + "epoch": 1.5087987758224943, + "grad_norm": 2.1848377979596627, + "learning_rate": 3.00069938754871e-06, + "loss": 0.3695, + "step": 9860 + }, + { + "epoch": 1.5089517980107114, + "grad_norm": 2.1635874874215633, + "learning_rate": 2.9989296559330215e-06, + "loss": 0.3139, + "step": 9861 + }, + { + "epoch": 1.509104820198929, + "grad_norm": 2.338481731423221, + "learning_rate": 2.9971603542804495e-06, + "loss": 0.3425, + "step": 9862 + }, + { + "epoch": 1.509257842387146, + "grad_norm": 2.2282324761992554, + "learning_rate": 2.995391482699661e-06, + "loss": 0.3796, + "step": 9863 + }, + { + "epoch": 1.5094108645753634, + "grad_norm": 2.276627350922109, + "learning_rate": 2.9936230412992805e-06, + "loss": 0.2884, + "step": 9864 + }, + { + "epoch": 1.5095638867635808, + "grad_norm": 1.9414960546579652, + "learning_rate": 2.9918550301879145e-06, + "loss": 0.336, + "step": 9865 + }, + { + "epoch": 1.5097169089517979, + "grad_norm": 2.1301779835444377, + "learning_rate": 2.990087449474154e-06, + "loss": 0.3303, + "step": 9866 + }, + { + "epoch": 1.5098699311400154, + "grad_norm": 1.9945124667754917, + "learning_rate": 2.9883202992665438e-06, + "loss": 0.2838, + "step": 9867 + }, + { + "epoch": 1.5100229533282326, + "grad_norm": 2.2782006257588474, + "learning_rate": 2.986553579673609e-06, + "loss": 0.3139, + "step": 9868 + }, + { + "epoch": 1.5101759755164499, + "grad_norm": 1.9001158252077175, + "learning_rate": 2.984787290803863e-06, + "loss": 
0.279, + "step": 9869 + }, + { + "epoch": 1.5103289977046672, + "grad_norm": 1.9890668141893428, + "learning_rate": 2.983021432765767e-06, + "loss": 0.3103, + "step": 9870 + }, + { + "epoch": 1.5104820198928843, + "grad_norm": 1.9261758653878254, + "learning_rate": 2.9812560056677785e-06, + "loss": 0.2946, + "step": 9871 + }, + { + "epoch": 1.5106350420811019, + "grad_norm": 2.2334108039049054, + "learning_rate": 2.9794910096183183e-06, + "loss": 0.3224, + "step": 9872 + }, + { + "epoch": 1.510788064269319, + "grad_norm": 2.1237274490949276, + "learning_rate": 2.9777264447257748e-06, + "loss": 0.3449, + "step": 9873 + }, + { + "epoch": 1.5109410864575363, + "grad_norm": 2.1770374643161667, + "learning_rate": 2.9759623110985236e-06, + "loss": 0.3317, + "step": 9874 + }, + { + "epoch": 1.5110941086457537, + "grad_norm": 2.4761340756648473, + "learning_rate": 2.974198608844906e-06, + "loss": 0.3783, + "step": 9875 + }, + { + "epoch": 1.5112471308339708, + "grad_norm": 2.249465172824092, + "learning_rate": 2.9724353380732364e-06, + "loss": 0.4108, + "step": 9876 + }, + { + "epoch": 1.5114001530221883, + "grad_norm": 2.1262704705662143, + "learning_rate": 2.9706724988918043e-06, + "loss": 0.2787, + "step": 9877 + }, + { + "epoch": 1.5115531752104054, + "grad_norm": 2.608599424875475, + "learning_rate": 2.968910091408873e-06, + "loss": 0.3657, + "step": 9878 + }, + { + "epoch": 1.5117061973986228, + "grad_norm": 2.0937507208713497, + "learning_rate": 2.9671481157326785e-06, + "loss": 0.273, + "step": 9879 + }, + { + "epoch": 1.51185921958684, + "grad_norm": 1.946594115741587, + "learning_rate": 2.9653865719714326e-06, + "loss": 0.2656, + "step": 9880 + }, + { + "epoch": 1.5120122417750572, + "grad_norm": 1.962879276420753, + "learning_rate": 2.9636254602333147e-06, + "loss": 0.2833, + "step": 9881 + }, + { + "epoch": 1.5121652639632748, + "grad_norm": 2.6116433052824033, + "learning_rate": 2.9618647806264856e-06, + "loss": 0.315, + "step": 9882 + }, + { + "epoch": 1.5123182861514919, + "grad_norm": 2.350676279516989, + "learning_rate": 2.9601045332590727e-06, + "loss": 0.3421, + "step": 9883 + }, + { + "epoch": 1.5124713083397092, + "grad_norm": 1.989790085557811, + "learning_rate": 2.9583447182391813e-06, + "loss": 0.3153, + "step": 9884 + }, + { + "epoch": 1.5126243305279266, + "grad_norm": 1.7572196885042126, + "learning_rate": 2.9565853356748863e-06, + "loss": 0.2197, + "step": 9885 + }, + { + "epoch": 1.5127773527161439, + "grad_norm": 2.297276054650986, + "learning_rate": 2.954826385674241e-06, + "loss": 0.3149, + "step": 9886 + }, + { + "epoch": 1.5129303749043612, + "grad_norm": 2.0684649483413184, + "learning_rate": 2.9530678683452685e-06, + "loss": 0.2807, + "step": 9887 + }, + { + "epoch": 1.5130833970925783, + "grad_norm": 2.083082465518758, + "learning_rate": 2.951309783795965e-06, + "loss": 0.3097, + "step": 9888 + }, + { + "epoch": 1.5132364192807957, + "grad_norm": 2.232352667798138, + "learning_rate": 2.9495521321343035e-06, + "loss": 0.2664, + "step": 9889 + }, + { + "epoch": 1.513389441469013, + "grad_norm": 2.0746473923615874, + "learning_rate": 2.947794913468226e-06, + "loss": 0.3053, + "step": 9890 + }, + { + "epoch": 1.5135424636572303, + "grad_norm": 2.248018803212016, + "learning_rate": 2.9460381279056482e-06, + "loss": 0.285, + "step": 9891 + }, + { + "epoch": 1.5136954858454477, + "grad_norm": 2.091935846460022, + "learning_rate": 2.9442817755544707e-06, + "loss": 0.2858, + "step": 9892 + }, + { + "epoch": 1.5138485080336648, + "grad_norm": 1.9566609105598456, + 
"learning_rate": 2.942525856522549e-06, + "loss": 0.3297, + "step": 9893 + }, + { + "epoch": 1.5140015302218823, + "grad_norm": 2.137727077428057, + "learning_rate": 2.9407703709177193e-06, + "loss": 0.2879, + "step": 9894 + }, + { + "epoch": 1.5141545524100994, + "grad_norm": 2.2093864848262124, + "learning_rate": 2.939015318847804e-06, + "loss": 0.2882, + "step": 9895 + }, + { + "epoch": 1.5143075745983168, + "grad_norm": 2.095268692569443, + "learning_rate": 2.9372607004205777e-06, + "loss": 0.3523, + "step": 9896 + }, + { + "epoch": 1.514460596786534, + "grad_norm": 2.4814418845151143, + "learning_rate": 2.935506515743798e-06, + "loss": 0.3139, + "step": 9897 + }, + { + "epoch": 1.5146136189747512, + "grad_norm": 1.987799327564212, + "learning_rate": 2.933752764925205e-06, + "loss": 0.2749, + "step": 9898 + }, + { + "epoch": 1.5147666411629688, + "grad_norm": 2.2456331156220917, + "learning_rate": 2.931999448072492e-06, + "loss": 0.3214, + "step": 9899 + }, + { + "epoch": 1.5149196633511859, + "grad_norm": 2.0249520005224007, + "learning_rate": 2.9302465652933476e-06, + "loss": 0.2876, + "step": 9900 + }, + { + "epoch": 1.5150726855394032, + "grad_norm": 1.9927759375396923, + "learning_rate": 2.9284941166954207e-06, + "loss": 0.2752, + "step": 9901 + }, + { + "epoch": 1.5152257077276206, + "grad_norm": 1.8998708876323396, + "learning_rate": 2.9267421023863274e-06, + "loss": 0.3092, + "step": 9902 + }, + { + "epoch": 1.5153787299158377, + "grad_norm": 2.1212028678752803, + "learning_rate": 2.924990522473676e-06, + "loss": 0.3021, + "step": 9903 + }, + { + "epoch": 1.5155317521040552, + "grad_norm": 1.9764806188380462, + "learning_rate": 2.923239377065038e-06, + "loss": 0.2792, + "step": 9904 + }, + { + "epoch": 1.5156847742922723, + "grad_norm": 1.919797871945186, + "learning_rate": 2.9214886662679467e-06, + "loss": 0.2539, + "step": 9905 + }, + { + "epoch": 1.5158377964804897, + "grad_norm": 1.7930166385110369, + "learning_rate": 2.9197383901899314e-06, + "loss": 0.27, + "step": 9906 + }, + { + "epoch": 1.515990818668707, + "grad_norm": 1.9817109015167513, + "learning_rate": 2.9179885489384797e-06, + "loss": 0.2365, + "step": 9907 + }, + { + "epoch": 1.516143840856924, + "grad_norm": 2.177398279153643, + "learning_rate": 2.916239142621057e-06, + "loss": 0.2703, + "step": 9908 + }, + { + "epoch": 1.5162968630451417, + "grad_norm": 2.1794166380623454, + "learning_rate": 2.914490171345099e-06, + "loss": 0.3407, + "step": 9909 + }, + { + "epoch": 1.5164498852333588, + "grad_norm": 2.40714069176058, + "learning_rate": 2.9127416352180195e-06, + "loss": 0.3685, + "step": 9910 + }, + { + "epoch": 1.516602907421576, + "grad_norm": 1.8834924452301767, + "learning_rate": 2.9109935343472007e-06, + "loss": 0.3248, + "step": 9911 + }, + { + "epoch": 1.5167559296097934, + "grad_norm": 2.1304268074488304, + "learning_rate": 2.9092458688399983e-06, + "loss": 0.2673, + "step": 9912 + }, + { + "epoch": 1.5169089517980106, + "grad_norm": 2.299973146784591, + "learning_rate": 2.907498638803753e-06, + "loss": 0.3088, + "step": 9913 + }, + { + "epoch": 1.517061973986228, + "grad_norm": 2.0890436295813135, + "learning_rate": 2.905751844345759e-06, + "loss": 0.2982, + "step": 9914 + }, + { + "epoch": 1.5172149961744452, + "grad_norm": 2.243008873416047, + "learning_rate": 2.9040054855732934e-06, + "loss": 0.3665, + "step": 9915 + }, + { + "epoch": 1.5173680183626626, + "grad_norm": 2.231300366616912, + "learning_rate": 2.902259562593618e-06, + "loss": 0.3656, + "step": 9916 + }, + { + "epoch": 
1.5175210405508799, + "grad_norm": 2.314260763210238, + "learning_rate": 2.900514075513945e-06, + "loss": 0.3373, + "step": 9917 + }, + { + "epoch": 1.5176740627390972, + "grad_norm": 2.2686857788300676, + "learning_rate": 2.8987690244414735e-06, + "loss": 0.3182, + "step": 9918 + }, + { + "epoch": 1.5178270849273146, + "grad_norm": 2.0332689531706256, + "learning_rate": 2.8970244094833834e-06, + "loss": 0.2877, + "step": 9919 + }, + { + "epoch": 1.5179801071155317, + "grad_norm": 2.332225199392639, + "learning_rate": 2.895280230746804e-06, + "loss": 0.3246, + "step": 9920 + }, + { + "epoch": 1.518133129303749, + "grad_norm": 2.213902332927608, + "learning_rate": 2.893536488338863e-06, + "loss": 0.2677, + "step": 9921 + }, + { + "epoch": 1.5182861514919663, + "grad_norm": 2.058077346563243, + "learning_rate": 2.89179318236665e-06, + "loss": 0.3128, + "step": 9922 + }, + { + "epoch": 1.5184391736801837, + "grad_norm": 2.1206458507799684, + "learning_rate": 2.890050312937218e-06, + "loss": 0.2985, + "step": 9923 + }, + { + "epoch": 1.518592195868401, + "grad_norm": 2.5255851317820666, + "learning_rate": 2.8883078801576148e-06, + "loss": 0.3492, + "step": 9924 + }, + { + "epoch": 1.518745218056618, + "grad_norm": 2.239478346754188, + "learning_rate": 2.886565884134849e-06, + "loss": 0.3186, + "step": 9925 + }, + { + "epoch": 1.5188982402448357, + "grad_norm": 2.1589701070470655, + "learning_rate": 2.8848243249758934e-06, + "loss": 0.2861, + "step": 9926 + }, + { + "epoch": 1.5190512624330528, + "grad_norm": 2.038957431232218, + "learning_rate": 2.8830832027877134e-06, + "loss": 0.3916, + "step": 9927 + }, + { + "epoch": 1.51920428462127, + "grad_norm": 2.290410983887635, + "learning_rate": 2.8813425176772357e-06, + "loss": 0.3217, + "step": 9928 + }, + { + "epoch": 1.5193573068094874, + "grad_norm": 2.1377659989683018, + "learning_rate": 2.8796022697513627e-06, + "loss": 0.3644, + "step": 9929 + }, + { + "epoch": 1.5195103289977046, + "grad_norm": 2.1557974796471977, + "learning_rate": 2.87786245911697e-06, + "loss": 0.3259, + "step": 9930 + }, + { + "epoch": 1.519663351185922, + "grad_norm": 2.197537522420924, + "learning_rate": 2.876123085880904e-06, + "loss": 0.3123, + "step": 9931 + }, + { + "epoch": 1.5198163733741392, + "grad_norm": 1.8849797423696186, + "learning_rate": 2.874384150149989e-06, + "loss": 0.2838, + "step": 9932 + }, + { + "epoch": 1.5199693955623566, + "grad_norm": 2.355866028332641, + "learning_rate": 2.87264565203102e-06, + "loss": 0.3106, + "step": 9933 + }, + { + "epoch": 1.5201224177505739, + "grad_norm": 2.4298439369368086, + "learning_rate": 2.8709075916307626e-06, + "loss": 0.3564, + "step": 9934 + }, + { + "epoch": 1.520275439938791, + "grad_norm": 2.138768183921702, + "learning_rate": 2.8691699690559594e-06, + "loss": 0.3101, + "step": 9935 + }, + { + "epoch": 1.5204284621270086, + "grad_norm": 2.1084569658813064, + "learning_rate": 2.8674327844133243e-06, + "loss": 0.3014, + "step": 9936 + }, + { + "epoch": 1.5205814843152257, + "grad_norm": 1.894295714318006, + "learning_rate": 2.8656960378095444e-06, + "loss": 0.2854, + "step": 9937 + }, + { + "epoch": 1.520734506503443, + "grad_norm": 2.328776255293014, + "learning_rate": 2.863959729351281e-06, + "loss": 0.3072, + "step": 9938 + }, + { + "epoch": 1.5208875286916603, + "grad_norm": 2.4823991572727184, + "learning_rate": 2.8622238591451668e-06, + "loss": 0.3271, + "step": 9939 + }, + { + "epoch": 1.5210405508798774, + "grad_norm": 2.054904895081333, + "learning_rate": 2.8604884272978083e-06, + "loss": 0.2628, 
+ "step": 9940 + }, + { + "epoch": 1.521193573068095, + "grad_norm": 2.0507676713357705, + "learning_rate": 2.8587534339157854e-06, + "loss": 0.3012, + "step": 9941 + }, + { + "epoch": 1.521346595256312, + "grad_norm": 2.174337355448324, + "learning_rate": 2.8570188791056507e-06, + "loss": 0.3171, + "step": 9942 + }, + { + "epoch": 1.5214996174445294, + "grad_norm": 1.8669974265756857, + "learning_rate": 2.855284762973931e-06, + "loss": 0.2611, + "step": 9943 + }, + { + "epoch": 1.5216526396327468, + "grad_norm": 2.063747456601812, + "learning_rate": 2.8535510856271207e-06, + "loss": 0.3354, + "step": 9944 + }, + { + "epoch": 1.5218056618209639, + "grad_norm": 2.4098971885429474, + "learning_rate": 2.8518178471717008e-06, + "loss": 0.3642, + "step": 9945 + }, + { + "epoch": 1.5219586840091814, + "grad_norm": 2.3625660024538857, + "learning_rate": 2.8500850477141086e-06, + "loss": 0.3025, + "step": 9946 + }, + { + "epoch": 1.5221117061973986, + "grad_norm": 2.3558171194167774, + "learning_rate": 2.8483526873607603e-06, + "loss": 0.3086, + "step": 9947 + }, + { + "epoch": 1.5222647283856159, + "grad_norm": 2.101587584902661, + "learning_rate": 2.8466207662180578e-06, + "loss": 0.2378, + "step": 9948 + }, + { + "epoch": 1.5224177505738332, + "grad_norm": 2.147038880991046, + "learning_rate": 2.8448892843923513e-06, + "loss": 0.3382, + "step": 9949 + }, + { + "epoch": 1.5225707727620506, + "grad_norm": 2.1708700692575142, + "learning_rate": 2.8431582419899894e-06, + "loss": 0.3196, + "step": 9950 + }, + { + "epoch": 1.5227237949502679, + "grad_norm": 2.2319621602439548, + "learning_rate": 2.841427639117279e-06, + "loss": 0.2803, + "step": 9951 + }, + { + "epoch": 1.522876817138485, + "grad_norm": 1.840557601791769, + "learning_rate": 2.839697475880496e-06, + "loss": 0.2953, + "step": 9952 + }, + { + "epoch": 1.5230298393267023, + "grad_norm": 1.9537839576314175, + "learning_rate": 2.837967752385906e-06, + "loss": 0.3168, + "step": 9953 + }, + { + "epoch": 1.5231828615149197, + "grad_norm": 1.9900787991732978, + "learning_rate": 2.836238468739737e-06, + "loss": 0.3043, + "step": 9954 + }, + { + "epoch": 1.523335883703137, + "grad_norm": 2.0738800146254412, + "learning_rate": 2.834509625048182e-06, + "loss": 0.2901, + "step": 9955 + }, + { + "epoch": 1.5234889058913543, + "grad_norm": 2.270382910441135, + "learning_rate": 2.8327812214174265e-06, + "loss": 0.301, + "step": 9956 + }, + { + "epoch": 1.5236419280795714, + "grad_norm": 1.9818118098041126, + "learning_rate": 2.831053257953613e-06, + "loss": 0.2977, + "step": 9957 + }, + { + "epoch": 1.523794950267789, + "grad_norm": 2.233450766833048, + "learning_rate": 2.8293257347628655e-06, + "loss": 0.3289, + "step": 9958 + }, + { + "epoch": 1.523947972456006, + "grad_norm": 2.298257535897543, + "learning_rate": 2.8275986519512753e-06, + "loss": 0.3549, + "step": 9959 + }, + { + "epoch": 1.5241009946442234, + "grad_norm": 2.2911297653055316, + "learning_rate": 2.8258720096249116e-06, + "loss": 0.3306, + "step": 9960 + }, + { + "epoch": 1.5242540168324408, + "grad_norm": 2.0928809609607075, + "learning_rate": 2.824145807889812e-06, + "loss": 0.3591, + "step": 9961 + }, + { + "epoch": 1.5244070390206579, + "grad_norm": 2.085267934794212, + "learning_rate": 2.8224200468519914e-06, + "loss": 0.2872, + "step": 9962 + }, + { + "epoch": 1.5245600612088754, + "grad_norm": 2.441486160019589, + "learning_rate": 2.8206947266174346e-06, + "loss": 0.3302, + "step": 9963 + }, + { + "epoch": 1.5247130833970926, + "grad_norm": 2.1910546788096297, + 
"learning_rate": 2.818969847292099e-06, + "loss": 0.2417, + "step": 9964 + }, + { + "epoch": 1.5248661055853099, + "grad_norm": 2.224279279401526, + "learning_rate": 2.817245408981919e-06, + "loss": 0.2423, + "step": 9965 + }, + { + "epoch": 1.5250191277735272, + "grad_norm": 2.1629445091645914, + "learning_rate": 2.8155214117927964e-06, + "loss": 0.3308, + "step": 9966 + }, + { + "epoch": 1.5251721499617443, + "grad_norm": 1.8555635506479633, + "learning_rate": 2.81379785583061e-06, + "loss": 0.2878, + "step": 9967 + }, + { + "epoch": 1.5253251721499619, + "grad_norm": 1.9671814004381067, + "learning_rate": 2.8120747412012096e-06, + "loss": 0.285, + "step": 9968 + }, + { + "epoch": 1.525478194338179, + "grad_norm": 2.16675648292726, + "learning_rate": 2.810352068010419e-06, + "loss": 0.3092, + "step": 9969 + }, + { + "epoch": 1.5256312165263963, + "grad_norm": 2.2905687791113416, + "learning_rate": 2.808629836364034e-06, + "loss": 0.3053, + "step": 9970 + }, + { + "epoch": 1.5257842387146137, + "grad_norm": 1.9946227786642385, + "learning_rate": 2.806908046367823e-06, + "loss": 0.2843, + "step": 9971 + }, + { + "epoch": 1.5259372609028308, + "grad_norm": 2.174909387644281, + "learning_rate": 2.8051866981275298e-06, + "loss": 0.3053, + "step": 9972 + }, + { + "epoch": 1.5260902830910483, + "grad_norm": 2.359615508272599, + "learning_rate": 2.803465791748864e-06, + "loss": 0.281, + "step": 9973 + }, + { + "epoch": 1.5262433052792654, + "grad_norm": 2.2931829206578898, + "learning_rate": 2.8017453273375227e-06, + "loss": 0.3119, + "step": 9974 + }, + { + "epoch": 1.5263963274674828, + "grad_norm": 2.2215604585576694, + "learning_rate": 2.8000253049991577e-06, + "loss": 0.3205, + "step": 9975 + }, + { + "epoch": 1.5265493496557, + "grad_norm": 2.576249260369154, + "learning_rate": 2.798305724839402e-06, + "loss": 0.351, + "step": 9976 + }, + { + "epoch": 1.5267023718439172, + "grad_norm": 2.59817822012346, + "learning_rate": 2.796586586963871e-06, + "loss": 0.3171, + "step": 9977 + }, + { + "epoch": 1.5268553940321348, + "grad_norm": 2.8692211062905124, + "learning_rate": 2.7948678914781347e-06, + "loss": 0.3653, + "step": 9978 + }, + { + "epoch": 1.5270084162203519, + "grad_norm": 2.1262724881376536, + "learning_rate": 2.7931496384877443e-06, + "loss": 0.2916, + "step": 9979 + }, + { + "epoch": 1.5271614384085692, + "grad_norm": 1.8299487915344057, + "learning_rate": 2.7914318280982346e-06, + "loss": 0.2591, + "step": 9980 + }, + { + "epoch": 1.5273144605967865, + "grad_norm": 2.3353736019494318, + "learning_rate": 2.7897144604150907e-06, + "loss": 0.3522, + "step": 9981 + }, + { + "epoch": 1.5274674827850037, + "grad_norm": 2.333490426475123, + "learning_rate": 2.7879975355437904e-06, + "loss": 0.3201, + "step": 9982 + }, + { + "epoch": 1.5276205049732212, + "grad_norm": 2.195731537268738, + "learning_rate": 2.786281053589779e-06, + "loss": 0.3418, + "step": 9983 + }, + { + "epoch": 1.5277735271614383, + "grad_norm": 2.755496101595831, + "learning_rate": 2.7845650146584626e-06, + "loss": 0.3516, + "step": 9984 + }, + { + "epoch": 1.5279265493496557, + "grad_norm": 2.3936080171100325, + "learning_rate": 2.782849418855238e-06, + "loss": 0.3365, + "step": 9985 + }, + { + "epoch": 1.528079571537873, + "grad_norm": 2.258254891109735, + "learning_rate": 2.7811342662854636e-06, + "loss": 0.3154, + "step": 9986 + }, + { + "epoch": 1.5282325937260903, + "grad_norm": 2.3318371299600336, + "learning_rate": 2.7794195570544745e-06, + "loss": 0.2555, + "step": 9987 + }, + { + "epoch": 1.5283856159143077, 
+ "grad_norm": 2.022711659963486, + "learning_rate": 2.7777052912675785e-06, + "loss": 0.325, + "step": 9988 + }, + { + "epoch": 1.5285386381025248, + "grad_norm": 2.2626567628611745, + "learning_rate": 2.7759914690300536e-06, + "loss": 0.3974, + "step": 9989 + }, + { + "epoch": 1.528691660290742, + "grad_norm": 2.236395178465297, + "learning_rate": 2.7742780904471536e-06, + "loss": 0.3225, + "step": 9990 + }, + { + "epoch": 1.5288446824789594, + "grad_norm": 2.183933947211389, + "learning_rate": 2.772565155624103e-06, + "loss": 0.3282, + "step": 9991 + }, + { + "epoch": 1.5289977046671768, + "grad_norm": 2.3450557515589407, + "learning_rate": 2.7708526646660993e-06, + "loss": 0.3569, + "step": 9992 + }, + { + "epoch": 1.529150726855394, + "grad_norm": 1.7888436151823426, + "learning_rate": 2.769140617678315e-06, + "loss": 0.2846, + "step": 9993 + }, + { + "epoch": 1.5293037490436112, + "grad_norm": 2.0381056570029945, + "learning_rate": 2.767429014765889e-06, + "loss": 0.3703, + "step": 9994 + }, + { + "epoch": 1.5294567712318288, + "grad_norm": 2.2819477215176645, + "learning_rate": 2.7657178560339483e-06, + "loss": 0.2884, + "step": 9995 + }, + { + "epoch": 1.5296097934200459, + "grad_norm": 2.3538836527548694, + "learning_rate": 2.7640071415875703e-06, + "loss": 0.2908, + "step": 9996 + }, + { + "epoch": 1.5297628156082632, + "grad_norm": 2.017525350368832, + "learning_rate": 2.762296871531819e-06, + "loss": 0.369, + "step": 9997 + }, + { + "epoch": 1.5299158377964805, + "grad_norm": 2.146410794652233, + "learning_rate": 2.7605870459717367e-06, + "loss": 0.3486, + "step": 9998 + }, + { + "epoch": 1.5300688599846977, + "grad_norm": 2.170593065314875, + "learning_rate": 2.7588776650123215e-06, + "loss": 0.312, + "step": 9999 + }, + { + "epoch": 1.5302218821729152, + "grad_norm": 2.277094271624863, + "learning_rate": 2.7571687287585524e-06, + "loss": 0.3413, + "step": 10000 + }, + { + "epoch": 1.5303749043611323, + "grad_norm": 2.468686944978311, + "learning_rate": 2.7554602373153938e-06, + "loss": 0.3502, + "step": 10001 + }, + { + "epoch": 1.5305279265493497, + "grad_norm": 2.378155880261049, + "learning_rate": 2.7537521907877553e-06, + "loss": 0.3338, + "step": 10002 + }, + { + "epoch": 1.530680948737567, + "grad_norm": 2.177448943408823, + "learning_rate": 2.7520445892805457e-06, + "loss": 0.3183, + "step": 10003 + }, + { + "epoch": 1.530833970925784, + "grad_norm": 2.3647380308511594, + "learning_rate": 2.7503374328986355e-06, + "loss": 0.359, + "step": 10004 + }, + { + "epoch": 1.5309869931140017, + "grad_norm": 2.137209652082463, + "learning_rate": 2.748630721746858e-06, + "loss": 0.324, + "step": 10005 + }, + { + "epoch": 1.5311400153022188, + "grad_norm": 2.421214324585907, + "learning_rate": 2.7469244559300377e-06, + "loss": 0.3415, + "step": 10006 + }, + { + "epoch": 1.531293037490436, + "grad_norm": 2.1578732014508675, + "learning_rate": 2.7452186355529642e-06, + "loss": 0.3266, + "step": 10007 + }, + { + "epoch": 1.5314460596786534, + "grad_norm": 2.110705447832477, + "learning_rate": 2.7435132607203896e-06, + "loss": 0.3343, + "step": 10008 + }, + { + "epoch": 1.5315990818668705, + "grad_norm": 2.1867109539775305, + "learning_rate": 2.7418083315370557e-06, + "loss": 0.2972, + "step": 10009 + }, + { + "epoch": 1.531752104055088, + "grad_norm": 2.1025768332337798, + "learning_rate": 2.740103848107667e-06, + "loss": 0.3167, + "step": 10010 + }, + { + "epoch": 1.5319051262433052, + "grad_norm": 1.956002678795125, + "learning_rate": 2.7383998105369014e-06, + "loss": 0.3239, + 
"step": 10011 + }, + { + "epoch": 1.5320581484315225, + "grad_norm": 2.234365363174236, + "learning_rate": 2.736696218929411e-06, + "loss": 0.3692, + "step": 10012 + }, + { + "epoch": 1.5322111706197399, + "grad_norm": 2.360613287989332, + "learning_rate": 2.73499307338982e-06, + "loss": 0.3553, + "step": 10013 + }, + { + "epoch": 1.532364192807957, + "grad_norm": 2.0106390616987166, + "learning_rate": 2.7332903740227256e-06, + "loss": 0.2996, + "step": 10014 + }, + { + "epoch": 1.5325172149961745, + "grad_norm": 1.772493142680713, + "learning_rate": 2.7315881209326967e-06, + "loss": 0.2977, + "step": 10015 + }, + { + "epoch": 1.5326702371843917, + "grad_norm": 2.069526077229527, + "learning_rate": 2.729886314224275e-06, + "loss": 0.3248, + "step": 10016 + }, + { + "epoch": 1.532823259372609, + "grad_norm": 2.1173816114427004, + "learning_rate": 2.7281849540019755e-06, + "loss": 0.2778, + "step": 10017 + }, + { + "epoch": 1.5329762815608263, + "grad_norm": 2.4353246203851686, + "learning_rate": 2.726484040370286e-06, + "loss": 0.3159, + "step": 10018 + }, + { + "epoch": 1.5331293037490437, + "grad_norm": 2.080059375980662, + "learning_rate": 2.724783573433666e-06, + "loss": 0.2722, + "step": 10019 + }, + { + "epoch": 1.533282325937261, + "grad_norm": 2.231678350736275, + "learning_rate": 2.7230835532965473e-06, + "loss": 0.2778, + "step": 10020 + }, + { + "epoch": 1.533435348125478, + "grad_norm": 2.077749797828933, + "learning_rate": 2.7213839800633346e-06, + "loss": 0.2966, + "step": 10021 + }, + { + "epoch": 1.5335883703136954, + "grad_norm": 2.3270159778273998, + "learning_rate": 2.719684853838406e-06, + "loss": 0.3374, + "step": 10022 + }, + { + "epoch": 1.5337413925019128, + "grad_norm": 2.210011146928926, + "learning_rate": 2.717986174726108e-06, + "loss": 0.3091, + "step": 10023 + }, + { + "epoch": 1.53389441469013, + "grad_norm": 2.2217583618277064, + "learning_rate": 2.716287942830773e-06, + "loss": 0.3315, + "step": 10024 + }, + { + "epoch": 1.5340474368783474, + "grad_norm": 2.041482697435787, + "learning_rate": 2.714590158256687e-06, + "loss": 0.2848, + "step": 10025 + }, + { + "epoch": 1.5342004590665645, + "grad_norm": 2.1043763796800947, + "learning_rate": 2.7128928211081153e-06, + "loss": 0.2653, + "step": 10026 + }, + { + "epoch": 1.534353481254782, + "grad_norm": 2.154308009610621, + "learning_rate": 2.71119593148931e-06, + "loss": 0.3251, + "step": 10027 + }, + { + "epoch": 1.5345065034429992, + "grad_norm": 2.1032935774833383, + "learning_rate": 2.7094994895044736e-06, + "loss": 0.3703, + "step": 10028 + }, + { + "epoch": 1.5346595256312165, + "grad_norm": 2.290845349239085, + "learning_rate": 2.7078034952577905e-06, + "loss": 0.3318, + "step": 10029 + }, + { + "epoch": 1.5348125478194339, + "grad_norm": 2.1385725936478908, + "learning_rate": 2.706107948853428e-06, + "loss": 0.3057, + "step": 10030 + }, + { + "epoch": 1.534965570007651, + "grad_norm": 2.129033724441874, + "learning_rate": 2.704412850395505e-06, + "loss": 0.2826, + "step": 10031 + }, + { + "epoch": 1.5351185921958685, + "grad_norm": 2.1770308821410462, + "learning_rate": 2.7027181999881326e-06, + "loss": 0.4557, + "step": 10032 + }, + { + "epoch": 1.5352716143840857, + "grad_norm": 1.7528694429046794, + "learning_rate": 2.701023997735385e-06, + "loss": 0.2957, + "step": 10033 + }, + { + "epoch": 1.535424636572303, + "grad_norm": 2.167855205306468, + "learning_rate": 2.6993302437413006e-06, + "loss": 0.3485, + "step": 10034 + }, + { + "epoch": 1.5355776587605203, + "grad_norm": 2.3501051359832354, + 
"learning_rate": 2.697636938109911e-06, + "loss": 0.3644, + "step": 10035 + }, + { + "epoch": 1.5357306809487374, + "grad_norm": 1.9241981996516704, + "learning_rate": 2.6959440809452055e-06, + "loss": 0.2791, + "step": 10036 + }, + { + "epoch": 1.535883703136955, + "grad_norm": 1.9695651915181258, + "learning_rate": 2.6942516723511424e-06, + "loss": 0.3818, + "step": 10037 + }, + { + "epoch": 1.536036725325172, + "grad_norm": 1.8640844680454627, + "learning_rate": 2.6925597124316673e-06, + "loss": 0.29, + "step": 10038 + }, + { + "epoch": 1.5361897475133894, + "grad_norm": 2.6503153964138466, + "learning_rate": 2.6908682012906874e-06, + "loss": 0.2854, + "step": 10039 + }, + { + "epoch": 1.5363427697016068, + "grad_norm": 2.1630247633251076, + "learning_rate": 2.689177139032084e-06, + "loss": 0.2942, + "step": 10040 + }, + { + "epoch": 1.5364957918898239, + "grad_norm": 2.031261244127211, + "learning_rate": 2.6874865257597126e-06, + "loss": 0.2848, + "step": 10041 + }, + { + "epoch": 1.5366488140780414, + "grad_norm": 1.9718912748795485, + "learning_rate": 2.685796361577402e-06, + "loss": 0.2793, + "step": 10042 + }, + { + "epoch": 1.5368018362662585, + "grad_norm": 2.306037088480719, + "learning_rate": 2.684106646588949e-06, + "loss": 0.32, + "step": 10043 + }, + { + "epoch": 1.5369548584544759, + "grad_norm": 2.270018705118447, + "learning_rate": 2.682417380898126e-06, + "loss": 0.397, + "step": 10044 + }, + { + "epoch": 1.5371078806426932, + "grad_norm": 1.9460779455142152, + "learning_rate": 2.680728564608679e-06, + "loss": 0.3066, + "step": 10045 + }, + { + "epoch": 1.5372609028309103, + "grad_norm": 2.040083304159909, + "learning_rate": 2.679040197824324e-06, + "loss": 0.3022, + "step": 10046 + }, + { + "epoch": 1.5374139250191279, + "grad_norm": 1.698927479185634, + "learning_rate": 2.6773522806487494e-06, + "loss": 0.2381, + "step": 10047 + }, + { + "epoch": 1.537566947207345, + "grad_norm": 2.159105375673701, + "learning_rate": 2.675664813185619e-06, + "loss": 0.3544, + "step": 10048 + }, + { + "epoch": 1.5377199693955623, + "grad_norm": 2.1797933921292647, + "learning_rate": 2.673977795538565e-06, + "loss": 0.3073, + "step": 10049 + }, + { + "epoch": 1.5378729915837797, + "grad_norm": 2.1427002258465944, + "learning_rate": 2.6722912278111936e-06, + "loss": 0.3055, + "step": 10050 + }, + { + "epoch": 1.538026013771997, + "grad_norm": 2.273707572216091, + "learning_rate": 2.670605110107084e-06, + "loss": 0.3434, + "step": 10051 + }, + { + "epoch": 1.5381790359602143, + "grad_norm": 2.0306393620915353, + "learning_rate": 2.668919442529785e-06, + "loss": 0.2714, + "step": 10052 + }, + { + "epoch": 1.5383320581484314, + "grad_norm": 2.1099140870250404, + "learning_rate": 2.6672342251828274e-06, + "loss": 0.2864, + "step": 10053 + }, + { + "epoch": 1.5384850803366488, + "grad_norm": 2.9079589709237963, + "learning_rate": 2.6655494581696994e-06, + "loss": 0.4068, + "step": 10054 + }, + { + "epoch": 1.538638102524866, + "grad_norm": 1.9689432719376831, + "learning_rate": 2.6638651415938697e-06, + "loss": 0.3411, + "step": 10055 + }, + { + "epoch": 1.5387911247130834, + "grad_norm": 2.114549094429897, + "learning_rate": 2.662181275558786e-06, + "loss": 0.307, + "step": 10056 + }, + { + "epoch": 1.5389441469013008, + "grad_norm": 2.057594249117251, + "learning_rate": 2.6604978601678523e-06, + "loss": 0.2666, + "step": 10057 + }, + { + "epoch": 1.5390971690895179, + "grad_norm": 2.232955568610074, + "learning_rate": 2.658814895524455e-06, + "loss": 0.3322, + "step": 10058 + }, + { + 
"epoch": 1.5392501912777354, + "grad_norm": 2.2654884542844154, + "learning_rate": 2.6571323817319593e-06, + "loss": 0.3371, + "step": 10059 + }, + { + "epoch": 1.5394032134659525, + "grad_norm": 2.4057706843267748, + "learning_rate": 2.6554503188936844e-06, + "loss": 0.3654, + "step": 10060 + }, + { + "epoch": 1.5395562356541699, + "grad_norm": 2.1902711910222266, + "learning_rate": 2.6537687071129404e-06, + "loss": 0.3098, + "step": 10061 + }, + { + "epoch": 1.5397092578423872, + "grad_norm": 2.073188025318028, + "learning_rate": 2.6520875464930006e-06, + "loss": 0.2854, + "step": 10062 + }, + { + "epoch": 1.5398622800306043, + "grad_norm": 2.0989366632201287, + "learning_rate": 2.650406837137104e-06, + "loss": 0.3067, + "step": 10063 + }, + { + "epoch": 1.5400153022188219, + "grad_norm": 2.2188949698155147, + "learning_rate": 2.6487265791484795e-06, + "loss": 0.2936, + "step": 10064 + }, + { + "epoch": 1.540168324407039, + "grad_norm": 2.263393577409095, + "learning_rate": 2.6470467726303163e-06, + "loss": 0.3334, + "step": 10065 + }, + { + "epoch": 1.5403213465952563, + "grad_norm": 2.1229052913443023, + "learning_rate": 2.6453674176857693e-06, + "loss": 0.3446, + "step": 10066 + }, + { + "epoch": 1.5404743687834737, + "grad_norm": 2.0820187078812995, + "learning_rate": 2.6436885144179848e-06, + "loss": 0.36, + "step": 10067 + }, + { + "epoch": 1.5406273909716908, + "grad_norm": 2.036822752359246, + "learning_rate": 2.6420100629300648e-06, + "loss": 0.3523, + "step": 10068 + }, + { + "epoch": 1.5407804131599083, + "grad_norm": 2.118531167022728, + "learning_rate": 2.640332063325093e-06, + "loss": 0.2695, + "step": 10069 + }, + { + "epoch": 1.5409334353481254, + "grad_norm": 2.1807415957671457, + "learning_rate": 2.6386545157061207e-06, + "loss": 0.3469, + "step": 10070 + }, + { + "epoch": 1.5410864575363428, + "grad_norm": 2.1707900822925947, + "learning_rate": 2.636977420176171e-06, + "loss": 0.2875, + "step": 10071 + }, + { + "epoch": 1.54123947972456, + "grad_norm": 2.4577498652524254, + "learning_rate": 2.635300776838243e-06, + "loss": 0.3673, + "step": 10072 + }, + { + "epoch": 1.5413925019127772, + "grad_norm": 2.0434551137883297, + "learning_rate": 2.633624585795305e-06, + "loss": 0.3203, + "step": 10073 + }, + { + "epoch": 1.5415455241009948, + "grad_norm": 1.7411301508901227, + "learning_rate": 2.6319488471502984e-06, + "loss": 0.2563, + "step": 10074 + }, + { + "epoch": 1.5416985462892119, + "grad_norm": 2.1478712253541206, + "learning_rate": 2.630273561006138e-06, + "loss": 0.295, + "step": 10075 + }, + { + "epoch": 1.5418515684774292, + "grad_norm": 1.90429011588242, + "learning_rate": 2.6285987274657045e-06, + "loss": 0.2753, + "step": 10076 + }, + { + "epoch": 1.5420045906656465, + "grad_norm": 2.1754047440005437, + "learning_rate": 2.6269243466318673e-06, + "loss": 0.3106, + "step": 10077 + }, + { + "epoch": 1.5421576128538637, + "grad_norm": 2.2461452419660524, + "learning_rate": 2.625250418607446e-06, + "loss": 0.3741, + "step": 10078 + }, + { + "epoch": 1.5423106350420812, + "grad_norm": 2.193766353742696, + "learning_rate": 2.623576943495244e-06, + "loss": 0.3146, + "step": 10079 + }, + { + "epoch": 1.5424636572302983, + "grad_norm": 1.7423098728249427, + "learning_rate": 2.6219039213980445e-06, + "loss": 0.2811, + "step": 10080 + }, + { + "epoch": 1.5426166794185157, + "grad_norm": 1.961978767244435, + "learning_rate": 2.620231352418585e-06, + "loss": 0.5452, + "step": 10081 + }, + { + "epoch": 1.542769701606733, + "grad_norm": 2.062082207863998, + "learning_rate": 
2.618559236659586e-06, + "loss": 0.3589, + "step": 10082 + }, + { + "epoch": 1.54292272379495, + "grad_norm": 2.0213839010709878, + "learning_rate": 2.6168875742237454e-06, + "loss": 0.288, + "step": 10083 + }, + { + "epoch": 1.5430757459831677, + "grad_norm": 2.0088016852578936, + "learning_rate": 2.6152163652137165e-06, + "loss": 0.2905, + "step": 10084 + }, + { + "epoch": 1.5432287681713848, + "grad_norm": 1.8323811164141244, + "learning_rate": 2.613545609732142e-06, + "loss": 0.3337, + "step": 10085 + }, + { + "epoch": 1.543381790359602, + "grad_norm": 2.0627168321060365, + "learning_rate": 2.6118753078816315e-06, + "loss": 0.3066, + "step": 10086 + }, + { + "epoch": 1.5435348125478194, + "grad_norm": 2.2910285838159834, + "learning_rate": 2.610205459764755e-06, + "loss": 0.3516, + "step": 10087 + }, + { + "epoch": 1.5436878347360368, + "grad_norm": 1.9985040435482564, + "learning_rate": 2.608536065484073e-06, + "loss": 0.3262, + "step": 10088 + }, + { + "epoch": 1.543840856924254, + "grad_norm": 2.2496865624069855, + "learning_rate": 2.606867125142107e-06, + "loss": 0.3033, + "step": 10089 + }, + { + "epoch": 1.5439938791124712, + "grad_norm": 2.241097444447969, + "learning_rate": 2.605198638841353e-06, + "loss": 0.3357, + "step": 10090 + }, + { + "epoch": 1.5441469013006885, + "grad_norm": 2.1992885647227625, + "learning_rate": 2.603530606684279e-06, + "loss": 0.3333, + "step": 10091 + }, + { + "epoch": 1.5442999234889059, + "grad_norm": 1.9012812969047344, + "learning_rate": 2.601863028773326e-06, + "loss": 0.2586, + "step": 10092 + }, + { + "epoch": 1.5444529456771232, + "grad_norm": 1.8123047465528561, + "learning_rate": 2.600195905210905e-06, + "loss": 0.2823, + "step": 10093 + }, + { + "epoch": 1.5446059678653405, + "grad_norm": 2.3152030446910787, + "learning_rate": 2.598529236099403e-06, + "loss": 0.3443, + "step": 10094 + }, + { + "epoch": 1.5447589900535577, + "grad_norm": 2.1743439640613484, + "learning_rate": 2.596863021541175e-06, + "loss": 0.3019, + "step": 10095 + }, + { + "epoch": 1.5449120122417752, + "grad_norm": 1.9172471293093292, + "learning_rate": 2.595197261638549e-06, + "loss": 0.2936, + "step": 10096 + }, + { + "epoch": 1.5450650344299923, + "grad_norm": 1.9253862639636048, + "learning_rate": 2.5935319564938275e-06, + "loss": 0.2985, + "step": 10097 + }, + { + "epoch": 1.5452180566182097, + "grad_norm": 1.9430570051191849, + "learning_rate": 2.5918671062092836e-06, + "loss": 0.2914, + "step": 10098 + }, + { + "epoch": 1.545371078806427, + "grad_norm": 2.2577123270161428, + "learning_rate": 2.590202710887161e-06, + "loss": 0.3272, + "step": 10099 + }, + { + "epoch": 1.545524100994644, + "grad_norm": 2.2632677012722895, + "learning_rate": 2.588538770629677e-06, + "loss": 0.2962, + "step": 10100 + }, + { + "epoch": 1.5456771231828617, + "grad_norm": 1.9504870317490723, + "learning_rate": 2.586875285539021e-06, + "loss": 0.2772, + "step": 10101 + }, + { + "epoch": 1.5458301453710788, + "grad_norm": 1.9342150897613446, + "learning_rate": 2.5852122557173542e-06, + "loss": 0.3467, + "step": 10102 + }, + { + "epoch": 1.545983167559296, + "grad_norm": 1.951851157913786, + "learning_rate": 2.5835496812668095e-06, + "loss": 0.2735, + "step": 10103 + }, + { + "epoch": 1.5461361897475134, + "grad_norm": 2.142026102977652, + "learning_rate": 2.581887562289491e-06, + "loss": 0.2972, + "step": 10104 + }, + { + "epoch": 1.5462892119357305, + "grad_norm": 2.193269441158662, + "learning_rate": 2.5802258988874762e-06, + "loss": 0.3203, + "step": 10105 + }, + { + "epoch": 
1.546442234123948, + "grad_norm": 1.8676996400343906, + "learning_rate": 2.5785646911628193e-06, + "loss": 0.3037, + "step": 10106 + }, + { + "epoch": 1.5465952563121652, + "grad_norm": 2.093923674004089, + "learning_rate": 2.5769039392175353e-06, + "loss": 0.3632, + "step": 10107 + }, + { + "epoch": 1.5467482785003825, + "grad_norm": 1.9006140368770028, + "learning_rate": 2.5752436431536174e-06, + "loss": 0.2753, + "step": 10108 + }, + { + "epoch": 1.5469013006885999, + "grad_norm": 2.2120676657834943, + "learning_rate": 2.573583803073039e-06, + "loss": 0.2726, + "step": 10109 + }, + { + "epoch": 1.547054322876817, + "grad_norm": 2.1076883294477424, + "learning_rate": 2.571924419077728e-06, + "loss": 0.3216, + "step": 10110 + }, + { + "epoch": 1.5472073450650345, + "grad_norm": 2.0232240951285716, + "learning_rate": 2.5702654912695945e-06, + "loss": 0.3267, + "step": 10111 + }, + { + "epoch": 1.5473603672532517, + "grad_norm": 2.373217544040392, + "learning_rate": 2.568607019750529e-06, + "loss": 0.3187, + "step": 10112 + }, + { + "epoch": 1.547513389441469, + "grad_norm": 2.182283013798894, + "learning_rate": 2.5669490046223713e-06, + "loss": 0.3073, + "step": 10113 + }, + { + "epoch": 1.5476664116296863, + "grad_norm": 1.9217437998228828, + "learning_rate": 2.5652914459869573e-06, + "loss": 0.2509, + "step": 10114 + }, + { + "epoch": 1.5478194338179034, + "grad_norm": 2.2608685850264014, + "learning_rate": 2.563634343946082e-06, + "loss": 0.3531, + "step": 10115 + }, + { + "epoch": 1.547972456006121, + "grad_norm": 2.32705780663519, + "learning_rate": 2.5619776986015077e-06, + "loss": 0.3757, + "step": 10116 + }, + { + "epoch": 1.548125478194338, + "grad_norm": 2.3217036817700647, + "learning_rate": 2.560321510054984e-06, + "loss": 0.3321, + "step": 10117 + }, + { + "epoch": 1.5482785003825554, + "grad_norm": 2.1098728752447795, + "learning_rate": 2.5586657784082236e-06, + "loss": 0.3464, + "step": 10118 + }, + { + "epoch": 1.5484315225707728, + "grad_norm": 2.1231402121495524, + "learning_rate": 2.5570105037629013e-06, + "loss": 0.2992, + "step": 10119 + }, + { + "epoch": 1.54858454475899, + "grad_norm": 2.303456688738964, + "learning_rate": 2.5553556862206852e-06, + "loss": 0.3267, + "step": 10120 + }, + { + "epoch": 1.5487375669472074, + "grad_norm": 2.12653714669132, + "learning_rate": 2.5537013258832e-06, + "loss": 0.2904, + "step": 10121 + }, + { + "epoch": 1.5488905891354245, + "grad_norm": 2.3782425204672273, + "learning_rate": 2.5520474228520454e-06, + "loss": 0.3159, + "step": 10122 + }, + { + "epoch": 1.5490436113236419, + "grad_norm": 2.0454415788829925, + "learning_rate": 2.5503939772287957e-06, + "loss": 0.283, + "step": 10123 + }, + { + "epoch": 1.5491966335118592, + "grad_norm": 2.184366989942858, + "learning_rate": 2.548740989114995e-06, + "loss": 0.3246, + "step": 10124 + }, + { + "epoch": 1.5493496557000765, + "grad_norm": 2.144316447092154, + "learning_rate": 2.5470884586121604e-06, + "loss": 0.3074, + "step": 10125 + }, + { + "epoch": 1.5495026778882939, + "grad_norm": 2.292841312921272, + "learning_rate": 2.5454363858217778e-06, + "loss": 0.291, + "step": 10126 + }, + { + "epoch": 1.549655700076511, + "grad_norm": 2.352491997320857, + "learning_rate": 2.543784770845311e-06, + "loss": 0.3496, + "step": 10127 + }, + { + "epoch": 1.5498087222647285, + "grad_norm": 2.204005708224258, + "learning_rate": 2.542133613784189e-06, + "loss": 0.3373, + "step": 10128 + }, + { + "epoch": 1.5499617444529457, + "grad_norm": 2.2452574362968023, + "learning_rate": 
2.540482914739818e-06, + "loss": 0.3437, + "step": 10129 + }, + { + "epoch": 1.550114766641163, + "grad_norm": 2.279324045464706, + "learning_rate": 2.5388326738135726e-06, + "loss": 0.3119, + "step": 10130 + }, + { + "epoch": 1.5502677888293803, + "grad_norm": 2.256376745605677, + "learning_rate": 2.537182891106801e-06, + "loss": 0.346, + "step": 10131 + }, + { + "epoch": 1.5504208110175974, + "grad_norm": 1.927680952087128, + "learning_rate": 2.5355335667208226e-06, + "loss": 0.2626, + "step": 10132 + }, + { + "epoch": 1.550573833205815, + "grad_norm": 1.925711586506596, + "learning_rate": 2.533884700756929e-06, + "loss": 0.2571, + "step": 10133 + }, + { + "epoch": 1.550726855394032, + "grad_norm": 1.9586609335259824, + "learning_rate": 2.5322362933163803e-06, + "loss": 0.2807, + "step": 10134 + }, + { + "epoch": 1.5508798775822494, + "grad_norm": 2.305373905796804, + "learning_rate": 2.5305883445004207e-06, + "loss": 0.333, + "step": 10135 + }, + { + "epoch": 1.5510328997704668, + "grad_norm": 1.9006313901845546, + "learning_rate": 2.5289408544102488e-06, + "loss": 0.2386, + "step": 10136 + }, + { + "epoch": 1.5511859219586839, + "grad_norm": 1.9406112813516276, + "learning_rate": 2.5272938231470433e-06, + "loss": 0.3152, + "step": 10137 + }, + { + "epoch": 1.5513389441469014, + "grad_norm": 2.021371847556138, + "learning_rate": 2.5256472508119633e-06, + "loss": 0.2632, + "step": 10138 + }, + { + "epoch": 1.5514919663351185, + "grad_norm": 1.9119227896899973, + "learning_rate": 2.5240011375061226e-06, + "loss": 0.3115, + "step": 10139 + }, + { + "epoch": 1.5516449885233359, + "grad_norm": 2.018861968257367, + "learning_rate": 2.5223554833306153e-06, + "loss": 0.316, + "step": 10140 + }, + { + "epoch": 1.5517980107115532, + "grad_norm": 2.133098819530022, + "learning_rate": 2.5207102883865163e-06, + "loss": 0.2879, + "step": 10141 + }, + { + "epoch": 1.5519510328997703, + "grad_norm": 1.928501157699594, + "learning_rate": 2.519065552774851e-06, + "loss": 0.219, + "step": 10142 + }, + { + "epoch": 1.5521040550879879, + "grad_norm": 2.0714889924024713, + "learning_rate": 2.5174212765966398e-06, + "loss": 0.3353, + "step": 10143 + }, + { + "epoch": 1.552257077276205, + "grad_norm": 1.9810537630101772, + "learning_rate": 2.5157774599528627e-06, + "loss": 0.316, + "step": 10144 + }, + { + "epoch": 1.5524100994644223, + "grad_norm": 2.097340227564223, + "learning_rate": 2.5141341029444634e-06, + "loss": 0.243, + "step": 10145 + }, + { + "epoch": 1.5525631216526397, + "grad_norm": 2.322435168108173, + "learning_rate": 2.5124912056723784e-06, + "loss": 0.3103, + "step": 10146 + }, + { + "epoch": 1.5527161438408568, + "grad_norm": 2.3468231639208232, + "learning_rate": 2.5108487682375017e-06, + "loss": 0.3325, + "step": 10147 + }, + { + "epoch": 1.5528691660290743, + "grad_norm": 2.497911269354406, + "learning_rate": 2.509206790740694e-06, + "loss": 0.3676, + "step": 10148 + }, + { + "epoch": 1.5530221882172914, + "grad_norm": 1.8090734411304665, + "learning_rate": 2.5075652732828036e-06, + "loss": 0.2453, + "step": 10149 + }, + { + "epoch": 1.5531752104055088, + "grad_norm": 2.403872379075245, + "learning_rate": 2.5059242159646412e-06, + "loss": 0.2632, + "step": 10150 + }, + { + "epoch": 1.553328232593726, + "grad_norm": 2.430625917359335, + "learning_rate": 2.504283618886989e-06, + "loss": 0.38, + "step": 10151 + }, + { + "epoch": 1.5534812547819434, + "grad_norm": 1.8545837232546376, + "learning_rate": 2.502643482150604e-06, + "loss": 0.2823, + "step": 10152 + }, + { + "epoch": 
1.5536342769701608, + "grad_norm": 1.9418639929290726, + "learning_rate": 2.5010038058562127e-06, + "loss": 0.3061, + "step": 10153 + }, + { + "epoch": 1.5537872991583779, + "grad_norm": 1.8986203674410438, + "learning_rate": 2.499364590104514e-06, + "loss": 0.2776, + "step": 10154 + }, + { + "epoch": 1.5539403213465952, + "grad_norm": 2.0270382335653707, + "learning_rate": 2.4977258349961774e-06, + "loss": 0.2981, + "step": 10155 + }, + { + "epoch": 1.5540933435348125, + "grad_norm": 2.402283314200567, + "learning_rate": 2.496087540631846e-06, + "loss": 0.3866, + "step": 10156 + }, + { + "epoch": 1.5542463657230299, + "grad_norm": 2.310733483309164, + "learning_rate": 2.4944497071121355e-06, + "loss": 0.3839, + "step": 10157 + }, + { + "epoch": 1.5543993879112472, + "grad_norm": 1.8695856951851826, + "learning_rate": 2.4928123345376276e-06, + "loss": 0.249, + "step": 10158 + }, + { + "epoch": 1.5545524100994643, + "grad_norm": 2.1169212162195827, + "learning_rate": 2.491175423008888e-06, + "loss": 0.2898, + "step": 10159 + }, + { + "epoch": 1.5547054322876819, + "grad_norm": 2.161399766428231, + "learning_rate": 2.4895389726264376e-06, + "loss": 0.3391, + "step": 10160 + }, + { + "epoch": 1.554858454475899, + "grad_norm": 2.391172981427365, + "learning_rate": 2.4879029834907762e-06, + "loss": 0.3253, + "step": 10161 + }, + { + "epoch": 1.5550114766641163, + "grad_norm": 2.100970641331779, + "learning_rate": 2.486267455702387e-06, + "loss": 0.3261, + "step": 10162 + }, + { + "epoch": 1.5551644988523337, + "grad_norm": 1.9156665737120335, + "learning_rate": 2.4846323893616996e-06, + "loss": 0.2877, + "step": 10163 + }, + { + "epoch": 1.5553175210405508, + "grad_norm": 2.3248612982758057, + "learning_rate": 2.4829977845691424e-06, + "loss": 0.3276, + "step": 10164 + }, + { + "epoch": 1.5554705432287683, + "grad_norm": 2.032228598566149, + "learning_rate": 2.4813636414251e-06, + "loss": 0.2805, + "step": 10165 + }, + { + "epoch": 1.5556235654169854, + "grad_norm": 1.9488852461732409, + "learning_rate": 2.479729960029924e-06, + "loss": 0.281, + "step": 10166 + }, + { + "epoch": 1.5557765876052028, + "grad_norm": 2.137426975125268, + "learning_rate": 2.4780967404839528e-06, + "loss": 0.3415, + "step": 10167 + }, + { + "epoch": 1.55592960979342, + "grad_norm": 1.8447028628909974, + "learning_rate": 2.4764639828874905e-06, + "loss": 0.2041, + "step": 10168 + }, + { + "epoch": 1.5560826319816372, + "grad_norm": 1.8215233612139778, + "learning_rate": 2.4748316873408006e-06, + "loss": 0.2766, + "step": 10169 + }, + { + "epoch": 1.5562356541698548, + "grad_norm": 2.118588620302535, + "learning_rate": 2.473199853944138e-06, + "loss": 0.3702, + "step": 10170 + }, + { + "epoch": 1.5563886763580719, + "grad_norm": 2.316013267038449, + "learning_rate": 2.4715684827977183e-06, + "loss": 0.3036, + "step": 10171 + }, + { + "epoch": 1.5565416985462892, + "grad_norm": 2.30550773332787, + "learning_rate": 2.4699375740017296e-06, + "loss": 0.3842, + "step": 10172 + }, + { + "epoch": 1.5566947207345065, + "grad_norm": 2.0189253349152816, + "learning_rate": 2.468307127656331e-06, + "loss": 0.2854, + "step": 10173 + }, + { + "epoch": 1.5568477429227237, + "grad_norm": 1.8466817397772421, + "learning_rate": 2.4666771438616568e-06, + "loss": 0.2789, + "step": 10174 + }, + { + "epoch": 1.5570007651109412, + "grad_norm": 2.06378417070918, + "learning_rate": 2.46504762271781e-06, + "loss": 0.2566, + "step": 10175 + }, + { + "epoch": 1.5571537872991583, + "grad_norm": 2.0737045550933497, + "learning_rate": 
2.4634185643248642e-06, + "loss": 0.3565, + "step": 10176 + }, + { + "epoch": 1.5573068094873757, + "grad_norm": 1.941031063446645, + "learning_rate": 2.4617899687828693e-06, + "loss": 0.2481, + "step": 10177 + }, + { + "epoch": 1.557459831675593, + "grad_norm": 2.225129231890635, + "learning_rate": 2.4601618361918413e-06, + "loss": 0.3176, + "step": 10178 + }, + { + "epoch": 1.55761285386381, + "grad_norm": 2.016757570425307, + "learning_rate": 2.45853416665177e-06, + "loss": 0.3122, + "step": 10179 + }, + { + "epoch": 1.5577658760520277, + "grad_norm": 2.17651730016683, + "learning_rate": 2.4569069602626196e-06, + "loss": 0.2992, + "step": 10180 + }, + { + "epoch": 1.5579188982402448, + "grad_norm": 1.9707363895900585, + "learning_rate": 2.4552802171243204e-06, + "loss": 0.3621, + "step": 10181 + }, + { + "epoch": 1.558071920428462, + "grad_norm": 2.2690988936611562, + "learning_rate": 2.4536539373367786e-06, + "loss": 0.32, + "step": 10182 + }, + { + "epoch": 1.5582249426166794, + "grad_norm": 2.260127613008053, + "learning_rate": 2.45202812099987e-06, + "loss": 0.313, + "step": 10183 + }, + { + "epoch": 1.5583779648048968, + "grad_norm": 2.092136322623526, + "learning_rate": 2.4504027682134423e-06, + "loss": 0.3156, + "step": 10184 + }, + { + "epoch": 1.558530986993114, + "grad_norm": 1.9982530177796707, + "learning_rate": 2.4487778790773155e-06, + "loss": 0.2735, + "step": 10185 + }, + { + "epoch": 1.5586840091813312, + "grad_norm": 2.1708897206182507, + "learning_rate": 2.447153453691279e-06, + "loss": 0.3133, + "step": 10186 + }, + { + "epoch": 1.5588370313695485, + "grad_norm": 2.0960282856628942, + "learning_rate": 2.445529492155092e-06, + "loss": 0.3201, + "step": 10187 + }, + { + "epoch": 1.5589900535577659, + "grad_norm": 2.1436054196035044, + "learning_rate": 2.4439059945684997e-06, + "loss": 0.3282, + "step": 10188 + }, + { + "epoch": 1.5591430757459832, + "grad_norm": 1.9695191815716822, + "learning_rate": 2.4422829610311948e-06, + "loss": 0.3069, + "step": 10189 + }, + { + "epoch": 1.5592960979342005, + "grad_norm": 2.1926718704756807, + "learning_rate": 2.4406603916428584e-06, + "loss": 0.3344, + "step": 10190 + }, + { + "epoch": 1.5594491201224177, + "grad_norm": 2.3383201047133855, + "learning_rate": 2.439038286503145e-06, + "loss": 0.336, + "step": 10191 + }, + { + "epoch": 1.559602142310635, + "grad_norm": 1.8372155716325524, + "learning_rate": 2.437416645711662e-06, + "loss": 0.2345, + "step": 10192 + }, + { + "epoch": 1.5597551644988523, + "grad_norm": 2.495769900944102, + "learning_rate": 2.435795469368012e-06, + "loss": 0.3478, + "step": 10193 + }, + { + "epoch": 1.5599081866870697, + "grad_norm": 2.1221966929936404, + "learning_rate": 2.4341747575717554e-06, + "loss": 0.3035, + "step": 10194 + }, + { + "epoch": 1.560061208875287, + "grad_norm": 2.0135930107293802, + "learning_rate": 2.43255451042242e-06, + "loss": 0.299, + "step": 10195 + }, + { + "epoch": 1.560214231063504, + "grad_norm": 2.1633570926567245, + "learning_rate": 2.4309347280195183e-06, + "loss": 0.3154, + "step": 10196 + }, + { + "epoch": 1.5603672532517217, + "grad_norm": 1.9875556110795365, + "learning_rate": 2.4293154104625282e-06, + "loss": 0.2904, + "step": 10197 + }, + { + "epoch": 1.5605202754399388, + "grad_norm": 2.340039983969419, + "learning_rate": 2.4276965578508905e-06, + "loss": 0.2656, + "step": 10198 + }, + { + "epoch": 1.560673297628156, + "grad_norm": 2.221132754924476, + "learning_rate": 2.426078170284032e-06, + "loss": 0.3104, + "step": 10199 + }, + { + "epoch": 
1.5608263198163734, + "grad_norm": 2.073516952937971, + "learning_rate": 2.4244602478613433e-06, + "loss": 0.3356, + "step": 10200 + }, + { + "epoch": 1.5609793420045905, + "grad_norm": 2.488583914365108, + "learning_rate": 2.4228427906821863e-06, + "loss": 0.3038, + "step": 10201 + }, + { + "epoch": 1.561132364192808, + "grad_norm": 2.445490540566737, + "learning_rate": 2.421225798845894e-06, + "loss": 0.3105, + "step": 10202 + }, + { + "epoch": 1.5612853863810252, + "grad_norm": 2.1391009903185396, + "learning_rate": 2.4196092724517763e-06, + "loss": 0.2995, + "step": 10203 + }, + { + "epoch": 1.5614384085692425, + "grad_norm": 2.044543683226895, + "learning_rate": 2.4179932115991056e-06, + "loss": 0.2764, + "step": 10204 + }, + { + "epoch": 1.5615914307574599, + "grad_norm": 2.046036413929888, + "learning_rate": 2.4163776163871334e-06, + "loss": 0.2786, + "step": 10205 + }, + { + "epoch": 1.561744452945677, + "grad_norm": 2.2680242817870875, + "learning_rate": 2.41476248691508e-06, + "loss": 0.3337, + "step": 10206 + }, + { + "epoch": 1.5618974751338945, + "grad_norm": 2.0380320601865853, + "learning_rate": 2.413147823282135e-06, + "loss": 0.33, + "step": 10207 + }, + { + "epoch": 1.5620504973221117, + "grad_norm": 2.040382014149729, + "learning_rate": 2.4115336255874623e-06, + "loss": 0.2632, + "step": 10208 + }, + { + "epoch": 1.562203519510329, + "grad_norm": 2.2044053792612632, + "learning_rate": 2.409919893930196e-06, + "loss": 0.2983, + "step": 10209 + }, + { + "epoch": 1.5623565416985463, + "grad_norm": 2.277740664709357, + "learning_rate": 2.4083066284094415e-06, + "loss": 0.3382, + "step": 10210 + }, + { + "epoch": 1.5625095638867634, + "grad_norm": 2.373251287168811, + "learning_rate": 2.406693829124276e-06, + "loss": 0.3217, + "step": 10211 + }, + { + "epoch": 1.562662586074981, + "grad_norm": 2.196277994030039, + "learning_rate": 2.4050814961737466e-06, + "loss": 0.3034, + "step": 10212 + }, + { + "epoch": 1.562815608263198, + "grad_norm": 1.998217282132098, + "learning_rate": 2.403469629656875e-06, + "loss": 0.2597, + "step": 10213 + }, + { + "epoch": 1.5629686304514154, + "grad_norm": 1.9683414339934968, + "learning_rate": 2.401858229672651e-06, + "loss": 0.3089, + "step": 10214 + }, + { + "epoch": 1.5631216526396328, + "grad_norm": 2.171320953900138, + "learning_rate": 2.4002472963200374e-06, + "loss": 0.3134, + "step": 10215 + }, + { + "epoch": 1.5632746748278499, + "grad_norm": 1.9236456967890725, + "learning_rate": 2.3986368296979643e-06, + "loss": 0.2783, + "step": 10216 + }, + { + "epoch": 1.5634276970160674, + "grad_norm": 2.1561594723846205, + "learning_rate": 2.397026829905347e-06, + "loss": 0.3362, + "step": 10217 + }, + { + "epoch": 1.5635807192042845, + "grad_norm": 2.0638153028428294, + "learning_rate": 2.395417297041052e-06, + "loss": 0.2939, + "step": 10218 + }, + { + "epoch": 1.5637337413925019, + "grad_norm": 2.075780646463065, + "learning_rate": 2.3938082312039267e-06, + "loss": 0.3216, + "step": 10219 + }, + { + "epoch": 1.5638867635807192, + "grad_norm": 2.147494836310276, + "learning_rate": 2.3921996324927988e-06, + "loss": 0.3036, + "step": 10220 + }, + { + "epoch": 1.5640397857689365, + "grad_norm": 2.494807687598165, + "learning_rate": 2.390591501006452e-06, + "loss": 0.4179, + "step": 10221 + }, + { + "epoch": 1.5641928079571539, + "grad_norm": 2.2762875943975107, + "learning_rate": 2.3889838368436445e-06, + "loss": 0.331, + "step": 10222 + }, + { + "epoch": 1.564345830145371, + "grad_norm": 2.0772333504973384, + "learning_rate": 
2.3873766401031205e-06, + "loss": 0.2941, + "step": 10223 + }, + { + "epoch": 1.5644988523335883, + "grad_norm": 2.1005853247996313, + "learning_rate": 2.385769910883573e-06, + "loss": 0.2845, + "step": 10224 + }, + { + "epoch": 1.5646518745218057, + "grad_norm": 2.1720104480820885, + "learning_rate": 2.3841636492836838e-06, + "loss": 0.3295, + "step": 10225 + }, + { + "epoch": 1.564804896710023, + "grad_norm": 2.0423208854212, + "learning_rate": 2.382557855402102e-06, + "loss": 0.2516, + "step": 10226 + }, + { + "epoch": 1.5649579188982403, + "grad_norm": 2.0188972984497027, + "learning_rate": 2.3809525293374357e-06, + "loss": 0.2452, + "step": 10227 + }, + { + "epoch": 1.5651109410864574, + "grad_norm": 2.0508236985553436, + "learning_rate": 2.3793476711882836e-06, + "loss": 0.2987, + "step": 10228 + }, + { + "epoch": 1.565263963274675, + "grad_norm": 2.232455824019605, + "learning_rate": 2.377743281053203e-06, + "loss": 0.2617, + "step": 10229 + }, + { + "epoch": 1.565416985462892, + "grad_norm": 2.2874276475622795, + "learning_rate": 2.3761393590307267e-06, + "loss": 0.3049, + "step": 10230 + }, + { + "epoch": 1.5655700076511094, + "grad_norm": 2.0678605418290714, + "learning_rate": 2.374535905219356e-06, + "loss": 0.3362, + "step": 10231 + }, + { + "epoch": 1.5657230298393268, + "grad_norm": 1.9479610053175493, + "learning_rate": 2.3729329197175668e-06, + "loss": 0.3345, + "step": 10232 + }, + { + "epoch": 1.5658760520275439, + "grad_norm": 2.115965723564736, + "learning_rate": 2.371330402623805e-06, + "loss": 0.3229, + "step": 10233 + }, + { + "epoch": 1.5660290742157614, + "grad_norm": 2.134915489865641, + "learning_rate": 2.3697283540364856e-06, + "loss": 0.3516, + "step": 10234 + }, + { + "epoch": 1.5661820964039785, + "grad_norm": 2.4074271755888987, + "learning_rate": 2.368126774053998e-06, + "loss": 0.2972, + "step": 10235 + }, + { + "epoch": 1.5663351185921959, + "grad_norm": 2.2511960150185435, + "learning_rate": 2.3665256627747012e-06, + "loss": 0.2815, + "step": 10236 + }, + { + "epoch": 1.5664881407804132, + "grad_norm": 2.0602330321925106, + "learning_rate": 2.3649250202969233e-06, + "loss": 0.2911, + "step": 10237 + }, + { + "epoch": 1.5666411629686303, + "grad_norm": 1.927634330204787, + "learning_rate": 2.363324846718974e-06, + "loss": 0.2757, + "step": 10238 + }, + { + "epoch": 1.5667941851568479, + "grad_norm": 2.2909319046053906, + "learning_rate": 2.3617251421391172e-06, + "loss": 0.2859, + "step": 10239 + }, + { + "epoch": 1.566947207345065, + "grad_norm": 1.8964456203975408, + "learning_rate": 2.3601259066555982e-06, + "loss": 0.2124, + "step": 10240 + }, + { + "epoch": 1.5671002295332823, + "grad_norm": 1.9433830612323009, + "learning_rate": 2.358527140366641e-06, + "loss": 0.2759, + "step": 10241 + }, + { + "epoch": 1.5672532517214997, + "grad_norm": 2.1053810325565174, + "learning_rate": 2.356928843370422e-06, + "loss": 0.3441, + "step": 10242 + }, + { + "epoch": 1.5674062739097168, + "grad_norm": 2.1752026446050117, + "learning_rate": 2.3553310157651e-06, + "loss": 0.3235, + "step": 10243 + }, + { + "epoch": 1.5675592960979343, + "grad_norm": 2.1045113271973603, + "learning_rate": 2.3537336576488124e-06, + "loss": 0.2883, + "step": 10244 + }, + { + "epoch": 1.5677123182861514, + "grad_norm": 2.1417921242316855, + "learning_rate": 2.3521367691196474e-06, + "loss": 0.4201, + "step": 10245 + }, + { + "epoch": 1.5678653404743688, + "grad_norm": 2.2202253891721924, + "learning_rate": 2.350540350275684e-06, + "loss": 0.344, + "step": 10246 + }, + { + "epoch": 
1.568018362662586, + "grad_norm": 2.051083146052781, + "learning_rate": 2.3489444012149665e-06, + "loss": 0.3307, + "step": 10247 + }, + { + "epoch": 1.5681713848508032, + "grad_norm": 2.3576621657679113, + "learning_rate": 2.3473489220354985e-06, + "loss": 0.2906, + "step": 10248 + }, + { + "epoch": 1.5683244070390208, + "grad_norm": 2.400771637167478, + "learning_rate": 2.3457539128352737e-06, + "loss": 0.3003, + "step": 10249 + }, + { + "epoch": 1.5684774292272379, + "grad_norm": 2.042581493431355, + "learning_rate": 2.344159373712247e-06, + "loss": 0.2974, + "step": 10250 + }, + { + "epoch": 1.5686304514154552, + "grad_norm": 2.346004763894137, + "learning_rate": 2.3425653047643373e-06, + "loss": 0.3402, + "step": 10251 + }, + { + "epoch": 1.5687834736036725, + "grad_norm": 1.6993995033239386, + "learning_rate": 2.340971706089451e-06, + "loss": 0.2819, + "step": 10252 + }, + { + "epoch": 1.5689364957918899, + "grad_norm": 2.3292922099877766, + "learning_rate": 2.339378577785455e-06, + "loss": 0.3639, + "step": 10253 + }, + { + "epoch": 1.5690895179801072, + "grad_norm": 1.7548228300139943, + "learning_rate": 2.3377859199501886e-06, + "loss": 0.2774, + "step": 10254 + }, + { + "epoch": 1.5692425401683243, + "grad_norm": 2.192534611433887, + "learning_rate": 2.3361937326814633e-06, + "loss": 0.3263, + "step": 10255 + }, + { + "epoch": 1.5693955623565417, + "grad_norm": 2.262468415284445, + "learning_rate": 2.3346020160770632e-06, + "loss": 0.2805, + "step": 10256 + }, + { + "epoch": 1.569548584544759, + "grad_norm": 2.10740520424962, + "learning_rate": 2.3330107702347393e-06, + "loss": 0.3117, + "step": 10257 + }, + { + "epoch": 1.5697016067329763, + "grad_norm": 1.9254659842503297, + "learning_rate": 2.3314199952522176e-06, + "loss": 0.2384, + "step": 10258 + }, + { + "epoch": 1.5698546289211937, + "grad_norm": 2.3472990294430174, + "learning_rate": 2.3298296912271932e-06, + "loss": 0.3014, + "step": 10259 + }, + { + "epoch": 1.5700076511094108, + "grad_norm": 2.123204083750397, + "learning_rate": 2.328239858257335e-06, + "loss": 0.3593, + "step": 10260 + }, + { + "epoch": 1.5701606732976283, + "grad_norm": 2.3856229695225633, + "learning_rate": 2.326650496440278e-06, + "loss": 0.3334, + "step": 10261 + }, + { + "epoch": 1.5703136954858454, + "grad_norm": 2.033254452554158, + "learning_rate": 2.325061605873632e-06, + "loss": 0.3196, + "step": 10262 + }, + { + "epoch": 1.5704667176740628, + "grad_norm": 2.384658331345606, + "learning_rate": 2.3234731866549778e-06, + "loss": 0.2796, + "step": 10263 + }, + { + "epoch": 1.57061973986228, + "grad_norm": 1.9582659952780006, + "learning_rate": 2.3218852388818657e-06, + "loss": 0.2933, + "step": 10264 + }, + { + "epoch": 1.5707727620504972, + "grad_norm": 2.1466251525953166, + "learning_rate": 2.3202977626518187e-06, + "loss": 0.2537, + "step": 10265 + }, + { + "epoch": 1.5709257842387148, + "grad_norm": 2.072612975719027, + "learning_rate": 2.3187107580623257e-06, + "loss": 0.3466, + "step": 10266 + }, + { + "epoch": 1.5710788064269319, + "grad_norm": 1.889187195089687, + "learning_rate": 2.3171242252108607e-06, + "loss": 0.3119, + "step": 10267 + }, + { + "epoch": 1.5712318286151492, + "grad_norm": 2.0271223685590263, + "learning_rate": 2.3155381641948494e-06, + "loss": 0.3211, + "step": 10268 + }, + { + "epoch": 1.5713848508033665, + "grad_norm": 2.0742514867228707, + "learning_rate": 2.313952575111699e-06, + "loss": 0.3236, + "step": 10269 + }, + { + "epoch": 1.5715378729915837, + "grad_norm": 2.4083805585663085, + "learning_rate": 
2.3123674580587942e-06, + "loss": 0.3223, + "step": 10270 + }, + { + "epoch": 1.5716908951798012, + "grad_norm": 1.7294525210795593, + "learning_rate": 2.3107828131334744e-06, + "loss": 0.2704, + "step": 10271 + }, + { + "epoch": 1.5718439173680183, + "grad_norm": 2.3189317261675297, + "learning_rate": 2.30919864043306e-06, + "loss": 0.3368, + "step": 10272 + }, + { + "epoch": 1.5719969395562357, + "grad_norm": 2.194260512799246, + "learning_rate": 2.3076149400548498e-06, + "loss": 0.3096, + "step": 10273 + }, + { + "epoch": 1.572149961744453, + "grad_norm": 1.8457396636416614, + "learning_rate": 2.306031712096093e-06, + "loss": 0.2729, + "step": 10274 + }, + { + "epoch": 1.57230298393267, + "grad_norm": 2.084375191930075, + "learning_rate": 2.3044489566540306e-06, + "loss": 0.3001, + "step": 10275 + }, + { + "epoch": 1.5724560061208877, + "grad_norm": 1.91765039031427, + "learning_rate": 2.3028666738258653e-06, + "loss": 0.2202, + "step": 10276 + }, + { + "epoch": 1.5726090283091048, + "grad_norm": 2.1171884440790776, + "learning_rate": 2.301284863708764e-06, + "loss": 0.303, + "step": 10277 + }, + { + "epoch": 1.572762050497322, + "grad_norm": 2.1586507600026392, + "learning_rate": 2.2997035263998792e-06, + "loss": 0.3683, + "step": 10278 + }, + { + "epoch": 1.5729150726855394, + "grad_norm": 2.3001386303828015, + "learning_rate": 2.298122661996328e-06, + "loss": 0.3143, + "step": 10279 + }, + { + "epoch": 1.5730680948737565, + "grad_norm": 2.340415887451529, + "learning_rate": 2.296542270595188e-06, + "loss": 0.3019, + "step": 10280 + }, + { + "epoch": 1.573221117061974, + "grad_norm": 2.2975152673572947, + "learning_rate": 2.294962352293526e-06, + "loss": 0.3047, + "step": 10281 + }, + { + "epoch": 1.5733741392501912, + "grad_norm": 1.7612950905997558, + "learning_rate": 2.2933829071883673e-06, + "loss": 0.231, + "step": 10282 + }, + { + "epoch": 1.5735271614384085, + "grad_norm": 2.139252431583115, + "learning_rate": 2.291803935376714e-06, + "loss": 0.3253, + "step": 10283 + }, + { + "epoch": 1.5736801836266259, + "grad_norm": 2.2296485507686836, + "learning_rate": 2.2902254369555354e-06, + "loss": 0.3236, + "step": 10284 + }, + { + "epoch": 1.5738332058148432, + "grad_norm": 2.230439426279258, + "learning_rate": 2.2886474120217726e-06, + "loss": 0.3202, + "step": 10285 + }, + { + "epoch": 1.5739862280030605, + "grad_norm": 2.194048175922532, + "learning_rate": 2.287069860672341e-06, + "loss": 0.3116, + "step": 10286 + }, + { + "epoch": 1.5741392501912777, + "grad_norm": 2.104361793722446, + "learning_rate": 2.2854927830041205e-06, + "loss": 0.3093, + "step": 10287 + }, + { + "epoch": 1.574292272379495, + "grad_norm": 2.190600186554258, + "learning_rate": 2.2839161791139685e-06, + "loss": 0.2952, + "step": 10288 + }, + { + "epoch": 1.5744452945677123, + "grad_norm": 2.2380497722604566, + "learning_rate": 2.2823400490987103e-06, + "loss": 0.2993, + "step": 10289 + }, + { + "epoch": 1.5745983167559297, + "grad_norm": 2.0662226158733907, + "learning_rate": 2.2807643930551403e-06, + "loss": 0.2838, + "step": 10290 + }, + { + "epoch": 1.574751338944147, + "grad_norm": 1.8754728763422386, + "learning_rate": 2.279189211080026e-06, + "loss": 0.2747, + "step": 10291 + }, + { + "epoch": 1.574904361132364, + "grad_norm": 2.239515204953391, + "learning_rate": 2.277614503270108e-06, + "loss": 0.3475, + "step": 10292 + }, + { + "epoch": 1.5750573833205816, + "grad_norm": 1.9798738633771396, + "learning_rate": 2.276040269722092e-06, + "loss": 0.3008, + "step": 10293 + }, + { + "epoch": 
1.5752104055087988, + "grad_norm": 2.2716397492829454, + "learning_rate": 2.2744665105326603e-06, + "loss": 0.3376, + "step": 10294 + }, + { + "epoch": 1.575363427697016, + "grad_norm": 2.2329279492017795, + "learning_rate": 2.2728932257984613e-06, + "loss": 0.3152, + "step": 10295 + }, + { + "epoch": 1.5755164498852334, + "grad_norm": 1.999846969202547, + "learning_rate": 2.2713204156161193e-06, + "loss": 0.3223, + "step": 10296 + }, + { + "epoch": 1.5756694720734505, + "grad_norm": 2.186381035023827, + "learning_rate": 2.269748080082225e-06, + "loss": 0.3592, + "step": 10297 + }, + { + "epoch": 1.575822494261668, + "grad_norm": 2.1638727598497742, + "learning_rate": 2.268176219293339e-06, + "loss": 0.3635, + "step": 10298 + }, + { + "epoch": 1.5759755164498852, + "grad_norm": 1.9826793258017292, + "learning_rate": 2.2666048333460046e-06, + "loss": 0.3428, + "step": 10299 + }, + { + "epoch": 1.5761285386381025, + "grad_norm": 2.1235423137566536, + "learning_rate": 2.2650339223367167e-06, + "loss": 0.2813, + "step": 10300 + }, + { + "epoch": 1.5762815608263199, + "grad_norm": 2.150327660568215, + "learning_rate": 2.263463486361953e-06, + "loss": 0.3257, + "step": 10301 + }, + { + "epoch": 1.576434583014537, + "grad_norm": 2.1608998717546952, + "learning_rate": 2.2618935255181673e-06, + "loss": 0.3238, + "step": 10302 + }, + { + "epoch": 1.5765876052027545, + "grad_norm": 2.163568899899916, + "learning_rate": 2.2603240399017668e-06, + "loss": 0.2742, + "step": 10303 + }, + { + "epoch": 1.5767406273909716, + "grad_norm": 2.0529341198278206, + "learning_rate": 2.2587550296091477e-06, + "loss": 0.308, + "step": 10304 + }, + { + "epoch": 1.576893649579189, + "grad_norm": 2.0794813595152464, + "learning_rate": 2.2571864947366685e-06, + "loss": 0.3075, + "step": 10305 + }, + { + "epoch": 1.5770466717674063, + "grad_norm": 2.0384171796033796, + "learning_rate": 2.255618435380651e-06, + "loss": 0.3338, + "step": 10306 + }, + { + "epoch": 1.5771996939556234, + "grad_norm": 2.226350908567524, + "learning_rate": 2.2540508516374036e-06, + "loss": 0.3045, + "step": 10307 + }, + { + "epoch": 1.577352716143841, + "grad_norm": 2.0631861942769785, + "learning_rate": 2.2524837436031997e-06, + "loss": 0.2344, + "step": 10308 + }, + { + "epoch": 1.577505738332058, + "grad_norm": 2.3266694287378527, + "learning_rate": 2.2509171113742724e-06, + "loss": 0.3964, + "step": 10309 + }, + { + "epoch": 1.5776587605202754, + "grad_norm": 1.9944872958494284, + "learning_rate": 2.249350955046842e-06, + "loss": 0.2898, + "step": 10310 + }, + { + "epoch": 1.5778117827084928, + "grad_norm": 1.9506141300360098, + "learning_rate": 2.24778527471709e-06, + "loss": 0.2996, + "step": 10311 + }, + { + "epoch": 1.5779648048967099, + "grad_norm": 1.9074519577115954, + "learning_rate": 2.246220070481171e-06, + "loss": 0.2978, + "step": 10312 + }, + { + "epoch": 1.5781178270849274, + "grad_norm": 1.9938550842973741, + "learning_rate": 2.24465534243521e-06, + "loss": 0.2799, + "step": 10313 + }, + { + "epoch": 1.5782708492731445, + "grad_norm": 1.8507728107065444, + "learning_rate": 2.2430910906753045e-06, + "loss": 0.3082, + "step": 10314 + }, + { + "epoch": 1.5784238714613619, + "grad_norm": 1.9282543196104605, + "learning_rate": 2.2415273152975205e-06, + "loss": 0.2698, + "step": 10315 + }, + { + "epoch": 1.5785768936495792, + "grad_norm": 2.120689019338789, + "learning_rate": 2.2399640163978942e-06, + "loss": 0.316, + "step": 10316 + }, + { + "epoch": 1.5787299158377963, + "grad_norm": 2.193624778585067, + "learning_rate": 
2.238401194072436e-06, + "loss": 0.2874, + "step": 10317 + }, + { + "epoch": 1.5788829380260139, + "grad_norm": 2.302872403542241, + "learning_rate": 2.2368388484171246e-06, + "loss": 0.3184, + "step": 10318 + }, + { + "epoch": 1.579035960214231, + "grad_norm": 1.642833529579855, + "learning_rate": 2.235276979527905e-06, + "loss": 0.2235, + "step": 10319 + }, + { + "epoch": 1.5791889824024483, + "grad_norm": 2.2481428159239334, + "learning_rate": 2.2337155875007076e-06, + "loss": 0.2923, + "step": 10320 + }, + { + "epoch": 1.5793420045906656, + "grad_norm": 1.9982154763461255, + "learning_rate": 2.232154672431416e-06, + "loss": 0.2771, + "step": 10321 + }, + { + "epoch": 1.579495026778883, + "grad_norm": 2.013720888615236, + "learning_rate": 2.2305942344158906e-06, + "loss": 0.276, + "step": 10322 + }, + { + "epoch": 1.5796480489671003, + "grad_norm": 2.068002598244875, + "learning_rate": 2.2290342735499724e-06, + "loss": 0.2944, + "step": 10323 + }, + { + "epoch": 1.5798010711553174, + "grad_norm": 1.9030134400531125, + "learning_rate": 2.227474789929458e-06, + "loss": 0.2344, + "step": 10324 + }, + { + "epoch": 1.5799540933435348, + "grad_norm": 2.022291552569888, + "learning_rate": 2.225915783650119e-06, + "loss": 0.2472, + "step": 10325 + }, + { + "epoch": 1.580107115531752, + "grad_norm": 2.24395570623936, + "learning_rate": 2.2243572548077107e-06, + "loss": 0.3125, + "step": 10326 + }, + { + "epoch": 1.5802601377199694, + "grad_norm": 1.9891422743630058, + "learning_rate": 2.2227992034979363e-06, + "loss": 0.2601, + "step": 10327 + }, + { + "epoch": 1.5804131599081868, + "grad_norm": 1.8983232454344225, + "learning_rate": 2.2212416298164895e-06, + "loss": 0.3107, + "step": 10328 + }, + { + "epoch": 1.5805661820964039, + "grad_norm": 1.7352050344116838, + "learning_rate": 2.219684533859028e-06, + "loss": 0.2608, + "step": 10329 + }, + { + "epoch": 1.5807192042846214, + "grad_norm": 2.0679773793018636, + "learning_rate": 2.21812791572117e-06, + "loss": 0.2791, + "step": 10330 + }, + { + "epoch": 1.5808722264728385, + "grad_norm": 2.0633576159015488, + "learning_rate": 2.216571775498523e-06, + "loss": 0.3305, + "step": 10331 + }, + { + "epoch": 1.5810252486610559, + "grad_norm": 2.0864993072287645, + "learning_rate": 2.215016113286652e-06, + "loss": 0.2381, + "step": 10332 + }, + { + "epoch": 1.5811782708492732, + "grad_norm": 1.9834409122072287, + "learning_rate": 2.213460929181097e-06, + "loss": 0.3014, + "step": 10333 + }, + { + "epoch": 1.5813312930374903, + "grad_norm": 2.2354577819111445, + "learning_rate": 2.211906223277367e-06, + "loss": 0.2997, + "step": 10334 + }, + { + "epoch": 1.5814843152257079, + "grad_norm": 2.2939978595458363, + "learning_rate": 2.210351995670943e-06, + "loss": 0.318, + "step": 10335 + }, + { + "epoch": 1.581637337413925, + "grad_norm": 2.019619517073732, + "learning_rate": 2.2087982464572755e-06, + "loss": 0.3068, + "step": 10336 + }, + { + "epoch": 1.5817903596021423, + "grad_norm": 2.1592548402626175, + "learning_rate": 2.207244975731788e-06, + "loss": 0.255, + "step": 10337 + }, + { + "epoch": 1.5819433817903596, + "grad_norm": 2.0626900857490402, + "learning_rate": 2.2056921835898717e-06, + "loss": 0.2745, + "step": 10338 + }, + { + "epoch": 1.5820964039785768, + "grad_norm": 2.318142925818211, + "learning_rate": 2.20413987012689e-06, + "loss": 0.3219, + "step": 10339 + }, + { + "epoch": 1.5822494261667943, + "grad_norm": 1.997112063237294, + "learning_rate": 2.2025880354381767e-06, + "loss": 0.3089, + "step": 10340 + }, + { + "epoch": 
1.5824024483550114, + "grad_norm": 2.284517687797388, + "learning_rate": 2.2010366796190353e-06, + "loss": 0.3464, + "step": 10341 + }, + { + "epoch": 1.5825554705432288, + "grad_norm": 2.670105379186421, + "learning_rate": 2.1994858027647414e-06, + "loss": 0.3209, + "step": 10342 + }, + { + "epoch": 1.582708492731446, + "grad_norm": 2.0680620302934094, + "learning_rate": 2.197935404970539e-06, + "loss": 0.2762, + "step": 10343 + }, + { + "epoch": 1.5828615149196632, + "grad_norm": 2.0968791504944897, + "learning_rate": 2.1963854863316468e-06, + "loss": 0.2899, + "step": 10344 + }, + { + "epoch": 1.5830145371078808, + "grad_norm": 2.0214319597523103, + "learning_rate": 2.1948360469432494e-06, + "loss": 0.3563, + "step": 10345 + }, + { + "epoch": 1.5831675592960979, + "grad_norm": 2.131484039985648, + "learning_rate": 2.1932870869005042e-06, + "loss": 0.3074, + "step": 10346 + }, + { + "epoch": 1.5833205814843152, + "grad_norm": 2.103344766881095, + "learning_rate": 2.191738606298539e-06, + "loss": 0.3058, + "step": 10347 + }, + { + "epoch": 1.5834736036725325, + "grad_norm": 2.556940059879879, + "learning_rate": 2.1901906052324495e-06, + "loss": 0.4076, + "step": 10348 + }, + { + "epoch": 1.5836266258607496, + "grad_norm": 2.2405335632188517, + "learning_rate": 2.1886430837973115e-06, + "loss": 0.3214, + "step": 10349 + }, + { + "epoch": 1.5837796480489672, + "grad_norm": 2.14456826606491, + "learning_rate": 2.1870960420881584e-06, + "loss": 0.3017, + "step": 10350 + }, + { + "epoch": 1.5839326702371843, + "grad_norm": 2.2376186462192096, + "learning_rate": 2.185549480199999e-06, + "loss": 0.2989, + "step": 10351 + }, + { + "epoch": 1.5840856924254016, + "grad_norm": 2.05365198255844, + "learning_rate": 2.184003398227821e-06, + "loss": 0.2639, + "step": 10352 + }, + { + "epoch": 1.584238714613619, + "grad_norm": 1.5879002011757888, + "learning_rate": 2.182457796266568e-06, + "loss": 0.2415, + "step": 10353 + }, + { + "epoch": 1.5843917368018363, + "grad_norm": 2.2942998647330173, + "learning_rate": 2.180912674411162e-06, + "loss": 0.2657, + "step": 10354 + }, + { + "epoch": 1.5845447589900536, + "grad_norm": 2.1415116546843294, + "learning_rate": 2.1793680327565024e-06, + "loss": 0.3094, + "step": 10355 + }, + { + "epoch": 1.5846977811782708, + "grad_norm": 1.868653381216291, + "learning_rate": 2.177823871397441e-06, + "loss": 0.2852, + "step": 10356 + }, + { + "epoch": 1.584850803366488, + "grad_norm": 2.309292831698454, + "learning_rate": 2.176280190428819e-06, + "loss": 0.3522, + "step": 10357 + }, + { + "epoch": 1.5850038255547054, + "grad_norm": 2.1710738028332863, + "learning_rate": 2.1747369899454386e-06, + "loss": 0.3116, + "step": 10358 + }, + { + "epoch": 1.5851568477429228, + "grad_norm": 2.0734244476848454, + "learning_rate": 2.1731942700420683e-06, + "loss": 0.2549, + "step": 10359 + }, + { + "epoch": 1.58530986993114, + "grad_norm": 2.351848876345775, + "learning_rate": 2.171652030813458e-06, + "loss": 0.325, + "step": 10360 + }, + { + "epoch": 1.5854628921193572, + "grad_norm": 2.257892033882186, + "learning_rate": 2.1701102723543242e-06, + "loss": 0.2825, + "step": 10361 + }, + { + "epoch": 1.5856159143075748, + "grad_norm": 2.0733014553722517, + "learning_rate": 2.1685689947593445e-06, + "loss": 0.3186, + "step": 10362 + }, + { + "epoch": 1.5857689364957919, + "grad_norm": 2.1921386004476706, + "learning_rate": 2.167028198123182e-06, + "loss": 0.332, + "step": 10363 + }, + { + "epoch": 1.5859219586840092, + "grad_norm": 2.220723693801886, + "learning_rate": 
2.16548788254046e-06, + "loss": 0.3319, + "step": 10364 + }, + { + "epoch": 1.5860749808722265, + "grad_norm": 2.0745477917442696, + "learning_rate": 2.1639480481057774e-06, + "loss": 0.2772, + "step": 10365 + }, + { + "epoch": 1.5862280030604436, + "grad_norm": 2.1995867088192798, + "learning_rate": 2.162408694913699e-06, + "loss": 0.322, + "step": 10366 + }, + { + "epoch": 1.5863810252486612, + "grad_norm": 2.029684586828926, + "learning_rate": 2.160869823058763e-06, + "loss": 0.2683, + "step": 10367 + }, + { + "epoch": 1.5865340474368783, + "grad_norm": 2.316636320242865, + "learning_rate": 2.1593314326354787e-06, + "loss": 0.3374, + "step": 10368 + }, + { + "epoch": 1.5866870696250956, + "grad_norm": 2.2433828028014045, + "learning_rate": 2.157793523738324e-06, + "loss": 0.3329, + "step": 10369 + }, + { + "epoch": 1.586840091813313, + "grad_norm": 1.9832856482770373, + "learning_rate": 2.1562560964617473e-06, + "loss": 0.2735, + "step": 10370 + }, + { + "epoch": 1.58699311400153, + "grad_norm": 2.1848410328916987, + "learning_rate": 2.1547191509001687e-06, + "loss": 0.2742, + "step": 10371 + }, + { + "epoch": 1.5871461361897476, + "grad_norm": 2.118690825527389, + "learning_rate": 2.1531826871479787e-06, + "loss": 0.3074, + "step": 10372 + }, + { + "epoch": 1.5872991583779648, + "grad_norm": 2.3314417826081018, + "learning_rate": 2.151646705299536e-06, + "loss": 0.2814, + "step": 10373 + }, + { + "epoch": 1.587452180566182, + "grad_norm": 2.402232417525519, + "learning_rate": 2.1501112054491725e-06, + "loss": 0.3494, + "step": 10374 + }, + { + "epoch": 1.5876052027543994, + "grad_norm": 2.326820597715731, + "learning_rate": 2.1485761876911892e-06, + "loss": 0.3038, + "step": 10375 + }, + { + "epoch": 1.5877582249426165, + "grad_norm": 2.1383455574598713, + "learning_rate": 2.1470416521198567e-06, + "loss": 0.352, + "step": 10376 + }, + { + "epoch": 1.587911247130834, + "grad_norm": 2.2995939874004625, + "learning_rate": 2.1455075988294137e-06, + "loss": 0.3279, + "step": 10377 + }, + { + "epoch": 1.5880642693190512, + "grad_norm": 1.9137006604044404, + "learning_rate": 2.143974027914083e-06, + "loss": 0.3033, + "step": 10378 + }, + { + "epoch": 1.5882172915072685, + "grad_norm": 2.2833225627008074, + "learning_rate": 2.142440939468037e-06, + "loss": 0.3236, + "step": 10379 + }, + { + "epoch": 1.5883703136954859, + "grad_norm": 1.9860651109123406, + "learning_rate": 2.1409083335854287e-06, + "loss": 0.2383, + "step": 10380 + }, + { + "epoch": 1.588523335883703, + "grad_norm": 2.48380903765018, + "learning_rate": 2.1393762103603898e-06, + "loss": 0.3249, + "step": 10381 + }, + { + "epoch": 1.5886763580719205, + "grad_norm": 2.4131936637247184, + "learning_rate": 2.1378445698870064e-06, + "loss": 0.3444, + "step": 10382 + }, + { + "epoch": 1.5888293802601376, + "grad_norm": 2.1244925827371386, + "learning_rate": 2.136313412259342e-06, + "loss": 0.3347, + "step": 10383 + }, + { + "epoch": 1.588982402448355, + "grad_norm": 2.1275395552021252, + "learning_rate": 2.134782737571439e-06, + "loss": 0.2968, + "step": 10384 + }, + { + "epoch": 1.5891354246365723, + "grad_norm": 2.382525615883336, + "learning_rate": 2.1332525459172927e-06, + "loss": 0.3307, + "step": 10385 + }, + { + "epoch": 1.5892884468247896, + "grad_norm": 2.193605115589541, + "learning_rate": 2.131722837390885e-06, + "loss": 0.2865, + "step": 10386 + }, + { + "epoch": 1.589441469013007, + "grad_norm": 2.3407094562764215, + "learning_rate": 2.130193612086161e-06, + "loss": 0.3284, + "step": 10387 + }, + { + "epoch": 
1.589594491201224, + "grad_norm": 2.213402841205263, + "learning_rate": 2.128664870097028e-06, + "loss": 0.31, + "step": 10388 + }, + { + "epoch": 1.5897475133894414, + "grad_norm": 2.0222607907984798, + "learning_rate": 2.127136611517382e-06, + "loss": 0.2872, + "step": 10389 + }, + { + "epoch": 1.5899005355776588, + "grad_norm": 2.2442947647359714, + "learning_rate": 2.1256088364410775e-06, + "loss": 0.2878, + "step": 10390 + }, + { + "epoch": 1.590053557765876, + "grad_norm": 2.240624536205986, + "learning_rate": 2.1240815449619335e-06, + "loss": 0.3405, + "step": 10391 + }, + { + "epoch": 1.5902065799540934, + "grad_norm": 2.1876775626139024, + "learning_rate": 2.1225547371737564e-06, + "loss": 0.3478, + "step": 10392 + }, + { + "epoch": 1.5903596021423105, + "grad_norm": 2.162284723354573, + "learning_rate": 2.1210284131703084e-06, + "loss": 0.3532, + "step": 10393 + }, + { + "epoch": 1.590512624330528, + "grad_norm": 2.1405508919523593, + "learning_rate": 2.119502573045329e-06, + "loss": 0.3358, + "step": 10394 + }, + { + "epoch": 1.5906656465187452, + "grad_norm": 1.9600807168130765, + "learning_rate": 2.117977216892525e-06, + "loss": 0.3341, + "step": 10395 + }, + { + "epoch": 1.5908186687069625, + "grad_norm": 2.314495390443742, + "learning_rate": 2.1164523448055752e-06, + "loss": 0.2895, + "step": 10396 + }, + { + "epoch": 1.5909716908951799, + "grad_norm": 2.201921905625236, + "learning_rate": 2.114927956878128e-06, + "loss": 0.2522, + "step": 10397 + }, + { + "epoch": 1.591124713083397, + "grad_norm": 2.268191757936805, + "learning_rate": 2.1134040532038e-06, + "loss": 0.3256, + "step": 10398 + }, + { + "epoch": 1.5912777352716145, + "grad_norm": 2.2869195645309968, + "learning_rate": 2.1118806338761823e-06, + "loss": 0.3375, + "step": 10399 + }, + { + "epoch": 1.5914307574598316, + "grad_norm": 2.0769558928508727, + "learning_rate": 2.110357698988834e-06, + "loss": 0.2647, + "step": 10400 + }, + { + "epoch": 1.591583779648049, + "grad_norm": 2.4711031090770286, + "learning_rate": 2.1088352486352802e-06, + "loss": 0.3858, + "step": 10401 + }, + { + "epoch": 1.5917368018362663, + "grad_norm": 2.119268678002257, + "learning_rate": 2.1073132829090305e-06, + "loss": 0.2872, + "step": 10402 + }, + { + "epoch": 1.5918898240244834, + "grad_norm": 1.9850108207669523, + "learning_rate": 2.105791801903545e-06, + "loss": 0.3069, + "step": 10403 + }, + { + "epoch": 1.592042846212701, + "grad_norm": 2.224819324782824, + "learning_rate": 2.104270805712265e-06, + "loss": 0.3448, + "step": 10404 + }, + { + "epoch": 1.592195868400918, + "grad_norm": 1.873089072531048, + "learning_rate": 2.1027502944286083e-06, + "loss": 0.2505, + "step": 10405 + }, + { + "epoch": 1.5923488905891354, + "grad_norm": 1.86192926757717, + "learning_rate": 2.101230268145944e-06, + "loss": 0.3051, + "step": 10406 + }, + { + "epoch": 1.5925019127773528, + "grad_norm": 1.9442480263553694, + "learning_rate": 2.099710726957633e-06, + "loss": 0.293, + "step": 10407 + }, + { + "epoch": 1.5926549349655699, + "grad_norm": 2.358985236520384, + "learning_rate": 2.098191670956995e-06, + "loss": 0.3361, + "step": 10408 + }, + { + "epoch": 1.5928079571537874, + "grad_norm": 2.114912121413648, + "learning_rate": 2.0966731002373132e-06, + "loss": 0.3205, + "step": 10409 + }, + { + "epoch": 1.5929609793420045, + "grad_norm": 2.324773058036772, + "learning_rate": 2.0951550148918566e-06, + "loss": 0.3892, + "step": 10410 + }, + { + "epoch": 1.5931140015302219, + "grad_norm": 1.8540830139440694, + "learning_rate": 2.093637415013857e-06, 
+ "loss": 0.2622, + "step": 10411 + }, + { + "epoch": 1.5932670237184392, + "grad_norm": 1.901825704329433, + "learning_rate": 2.0921203006965086e-06, + "loss": 0.2557, + "step": 10412 + }, + { + "epoch": 1.5934200459066563, + "grad_norm": 2.1564336487480387, + "learning_rate": 2.0906036720329904e-06, + "loss": 0.3251, + "step": 10413 + }, + { + "epoch": 1.5935730680948739, + "grad_norm": 2.0254893602452304, + "learning_rate": 2.0890875291164425e-06, + "loss": 0.2756, + "step": 10414 + }, + { + "epoch": 1.593726090283091, + "grad_norm": 2.145515414045669, + "learning_rate": 2.087571872039977e-06, + "loss": 0.311, + "step": 10415 + }, + { + "epoch": 1.5938791124713083, + "grad_norm": 2.2101983287594034, + "learning_rate": 2.0860567008966783e-06, + "loss": 0.3755, + "step": 10416 + }, + { + "epoch": 1.5940321346595256, + "grad_norm": 2.5275320370762353, + "learning_rate": 2.084542015779595e-06, + "loss": 0.3384, + "step": 10417 + }, + { + "epoch": 1.5941851568477428, + "grad_norm": 2.058712242376689, + "learning_rate": 2.083027816781753e-06, + "loss": 0.3568, + "step": 10418 + }, + { + "epoch": 1.5943381790359603, + "grad_norm": 1.9173450107481087, + "learning_rate": 2.081514103996144e-06, + "loss": 0.2783, + "step": 10419 + }, + { + "epoch": 1.5944912012241774, + "grad_norm": 2.0613941843193793, + "learning_rate": 2.080000877515731e-06, + "loss": 0.2779, + "step": 10420 + }, + { + "epoch": 1.5946442234123948, + "grad_norm": 2.0254402953785022, + "learning_rate": 2.0784881374334464e-06, + "loss": 0.2917, + "step": 10421 + }, + { + "epoch": 1.594797245600612, + "grad_norm": 2.200732027895814, + "learning_rate": 2.076975883842196e-06, + "loss": 0.3322, + "step": 10422 + }, + { + "epoch": 1.5949502677888294, + "grad_norm": 2.0881731560154146, + "learning_rate": 2.075464116834851e-06, + "loss": 0.3173, + "step": 10423 + }, + { + "epoch": 1.5951032899770468, + "grad_norm": 2.1056631422662258, + "learning_rate": 2.0739528365042548e-06, + "loss": 0.2872, + "step": 10424 + }, + { + "epoch": 1.5952563121652639, + "grad_norm": 2.169035501067074, + "learning_rate": 2.0724420429432234e-06, + "loss": 0.3098, + "step": 10425 + }, + { + "epoch": 1.5954093343534812, + "grad_norm": 2.0153248522385647, + "learning_rate": 2.070931736244538e-06, + "loss": 0.2683, + "step": 10426 + }, + { + "epoch": 1.5955623565416985, + "grad_norm": 2.215459845696938, + "learning_rate": 2.0694219165009534e-06, + "loss": 0.3179, + "step": 10427 + }, + { + "epoch": 1.5957153787299159, + "grad_norm": 2.1656297130536872, + "learning_rate": 2.0679125838051926e-06, + "loss": 0.3034, + "step": 10428 + }, + { + "epoch": 1.5958684009181332, + "grad_norm": 1.9130239058279197, + "learning_rate": 2.0664037382499514e-06, + "loss": 0.2878, + "step": 10429 + }, + { + "epoch": 1.5960214231063503, + "grad_norm": 2.121543196047268, + "learning_rate": 2.0648953799278905e-06, + "loss": 0.3468, + "step": 10430 + }, + { + "epoch": 1.5961744452945679, + "grad_norm": 1.9881410386253346, + "learning_rate": 2.0633875089316524e-06, + "loss": 0.3357, + "step": 10431 + }, + { + "epoch": 1.596327467482785, + "grad_norm": 1.692514957785927, + "learning_rate": 2.0618801253538323e-06, + "loss": 0.2143, + "step": 10432 + }, + { + "epoch": 1.5964804896710023, + "grad_norm": 2.1161285582341782, + "learning_rate": 2.060373229287005e-06, + "loss": 0.2955, + "step": 10433 + }, + { + "epoch": 1.5966335118592196, + "grad_norm": 2.1256187952301437, + "learning_rate": 2.058866820823723e-06, + "loss": 0.3186, + "step": 10434 + }, + { + "epoch": 1.5967865340474368, + 
"grad_norm": 1.8594388357448477, + "learning_rate": 2.0573609000564933e-06, + "loss": 0.2979, + "step": 10435 + }, + { + "epoch": 1.5969395562356543, + "grad_norm": 2.0752161038423074, + "learning_rate": 2.0558554670777987e-06, + "loss": 0.2839, + "step": 10436 + }, + { + "epoch": 1.5970925784238714, + "grad_norm": 1.8719102046704, + "learning_rate": 2.054350521980104e-06, + "loss": 0.2496, + "step": 10437 + }, + { + "epoch": 1.5972456006120888, + "grad_norm": 2.2235478483167306, + "learning_rate": 2.052846064855821e-06, + "loss": 0.3171, + "step": 10438 + }, + { + "epoch": 1.597398622800306, + "grad_norm": 2.0084905451553143, + "learning_rate": 2.051342095797354e-06, + "loss": 0.2823, + "step": 10439 + }, + { + "epoch": 1.5975516449885232, + "grad_norm": 2.183017243080984, + "learning_rate": 2.049838614897067e-06, + "loss": 0.2913, + "step": 10440 + }, + { + "epoch": 1.5977046671767408, + "grad_norm": 2.010797629418803, + "learning_rate": 2.048335622247286e-06, + "loss": 0.3119, + "step": 10441 + }, + { + "epoch": 1.5978576893649579, + "grad_norm": 1.6995369473672757, + "learning_rate": 2.0468331179403245e-06, + "loss": 0.233, + "step": 10442 + }, + { + "epoch": 1.5980107115531752, + "grad_norm": 2.253316101580062, + "learning_rate": 2.045331102068454e-06, + "loss": 0.3347, + "step": 10443 + }, + { + "epoch": 1.5981637337413925, + "grad_norm": 2.064383868203437, + "learning_rate": 2.0438295747239203e-06, + "loss": 0.2746, + "step": 10444 + }, + { + "epoch": 1.5983167559296096, + "grad_norm": 2.2317677875246336, + "learning_rate": 2.0423285359989366e-06, + "loss": 0.2718, + "step": 10445 + }, + { + "epoch": 1.5984697781178272, + "grad_norm": 2.0477416135369517, + "learning_rate": 2.0408279859856874e-06, + "loss": 0.2805, + "step": 10446 + }, + { + "epoch": 1.5986228003060443, + "grad_norm": 1.957177005619269, + "learning_rate": 2.0393279247763287e-06, + "loss": 0.2795, + "step": 10447 + }, + { + "epoch": 1.5987758224942616, + "grad_norm": 2.3488538209818706, + "learning_rate": 2.0378283524629837e-06, + "loss": 0.3327, + "step": 10448 + }, + { + "epoch": 1.598928844682479, + "grad_norm": 1.9422180459270484, + "learning_rate": 2.036329269137749e-06, + "loss": 0.297, + "step": 10449 + }, + { + "epoch": 1.599081866870696, + "grad_norm": 2.2860116966764186, + "learning_rate": 2.0348306748926873e-06, + "loss": 0.3003, + "step": 10450 + }, + { + "epoch": 1.5992348890589136, + "grad_norm": 2.24122516023989, + "learning_rate": 2.033332569819834e-06, + "loss": 0.3337, + "step": 10451 + }, + { + "epoch": 1.5993879112471308, + "grad_norm": 2.1824835991354146, + "learning_rate": 2.0318349540111924e-06, + "loss": 0.2964, + "step": 10452 + }, + { + "epoch": 1.599540933435348, + "grad_norm": 2.3276355546648695, + "learning_rate": 2.030337827558738e-06, + "loss": 0.3569, + "step": 10453 + }, + { + "epoch": 1.5996939556235654, + "grad_norm": 2.0981843178955915, + "learning_rate": 2.0288411905544158e-06, + "loss": 0.3176, + "step": 10454 + }, + { + "epoch": 1.5998469778117828, + "grad_norm": 2.2062338938417296, + "learning_rate": 2.0273450430901396e-06, + "loss": 0.2734, + "step": 10455 + }, + { + "epoch": 1.6, + "grad_norm": 2.643436905327408, + "learning_rate": 2.0258493852577933e-06, + "loss": 0.329, + "step": 10456 + }, + { + "epoch": 1.6001530221882172, + "grad_norm": 1.964606132543307, + "learning_rate": 2.0243542171492314e-06, + "loss": 0.2629, + "step": 10457 + }, + { + "epoch": 1.6003060443764345, + "grad_norm": 2.0319908205624424, + "learning_rate": 2.0228595388562776e-06, + "loss": 0.2485, + 
"step": 10458 + }, + { + "epoch": 1.6004590665646519, + "grad_norm": 2.157070383503065, + "learning_rate": 2.0213653504707243e-06, + "loss": 0.2879, + "step": 10459 + }, + { + "epoch": 1.6006120887528692, + "grad_norm": 2.0638106425645106, + "learning_rate": 2.019871652084342e-06, + "loss": 0.3038, + "step": 10460 + }, + { + "epoch": 1.6007651109410865, + "grad_norm": 2.413922284247371, + "learning_rate": 2.0183784437888577e-06, + "loss": 0.42, + "step": 10461 + }, + { + "epoch": 1.6009181331293036, + "grad_norm": 2.1985129144555353, + "learning_rate": 2.0168857256759745e-06, + "loss": 0.3118, + "step": 10462 + }, + { + "epoch": 1.6010711553175212, + "grad_norm": 2.2967862222343305, + "learning_rate": 2.0153934978373745e-06, + "loss": 0.3459, + "step": 10463 + }, + { + "epoch": 1.6012241775057383, + "grad_norm": 2.1976923086941325, + "learning_rate": 2.0139017603646925e-06, + "loss": 0.3005, + "step": 10464 + }, + { + "epoch": 1.6013771996939556, + "grad_norm": 1.9323315456919672, + "learning_rate": 2.012410513349543e-06, + "loss": 0.2619, + "step": 10465 + }, + { + "epoch": 1.601530221882173, + "grad_norm": 2.7793989397342838, + "learning_rate": 2.010919756883517e-06, + "loss": 0.3693, + "step": 10466 + }, + { + "epoch": 1.60168324407039, + "grad_norm": 2.1105341449495194, + "learning_rate": 2.009429491058157e-06, + "loss": 0.291, + "step": 10467 + }, + { + "epoch": 1.6018362662586076, + "grad_norm": 2.212110475794262, + "learning_rate": 2.0079397159649938e-06, + "loss": 0.3009, + "step": 10468 + }, + { + "epoch": 1.6019892884468248, + "grad_norm": 2.2904429553853576, + "learning_rate": 2.0064504316955204e-06, + "loss": 0.2809, + "step": 10469 + }, + { + "epoch": 1.602142310635042, + "grad_norm": 2.2952348089169443, + "learning_rate": 2.0049616383411906e-06, + "loss": 0.3095, + "step": 10470 + }, + { + "epoch": 1.6022953328232594, + "grad_norm": 2.0420024184363377, + "learning_rate": 2.0034733359934476e-06, + "loss": 0.3069, + "step": 10471 + }, + { + "epoch": 1.6024483550114765, + "grad_norm": 2.087684753510197, + "learning_rate": 2.001985524743689e-06, + "loss": 0.2769, + "step": 10472 + }, + { + "epoch": 1.602601377199694, + "grad_norm": 2.3146003021955575, + "learning_rate": 2.0004982046832875e-06, + "loss": 0.3284, + "step": 10473 + }, + { + "epoch": 1.6027543993879112, + "grad_norm": 2.3341543365189037, + "learning_rate": 1.9990113759035856e-06, + "loss": 0.2941, + "step": 10474 + }, + { + "epoch": 1.6029074215761285, + "grad_norm": 2.42343083766534, + "learning_rate": 1.9975250384958954e-06, + "loss": 0.3841, + "step": 10475 + }, + { + "epoch": 1.6030604437643459, + "grad_norm": 2.0000642760572793, + "learning_rate": 1.9960391925514975e-06, + "loss": 0.2387, + "step": 10476 + }, + { + "epoch": 1.603213465952563, + "grad_norm": 2.106691781994404, + "learning_rate": 1.9945538381616456e-06, + "loss": 0.2961, + "step": 10477 + }, + { + "epoch": 1.6033664881407805, + "grad_norm": 2.0613737551747042, + "learning_rate": 1.9930689754175604e-06, + "loss": 0.3309, + "step": 10478 + }, + { + "epoch": 1.6035195103289976, + "grad_norm": 2.4398182852589105, + "learning_rate": 1.9915846044104313e-06, + "loss": 0.3037, + "step": 10479 + }, + { + "epoch": 1.603672532517215, + "grad_norm": 2.1090739150374, + "learning_rate": 1.9901007252314185e-06, + "loss": 0.2903, + "step": 10480 + }, + { + "epoch": 1.6038255547054323, + "grad_norm": 2.144499019468979, + "learning_rate": 1.988617337971661e-06, + "loss": 0.31, + "step": 10481 + }, + { + "epoch": 1.6039785768936494, + "grad_norm": 2.026883977576024, 
+ "learning_rate": 1.9871344427222504e-06, + "loss": 0.3783, + "step": 10482 + }, + { + "epoch": 1.604131599081867, + "grad_norm": 2.411293659913249, + "learning_rate": 1.9856520395742575e-06, + "loss": 0.3794, + "step": 10483 + }, + { + "epoch": 1.604284621270084, + "grad_norm": 2.0058040325716755, + "learning_rate": 1.9841701286187297e-06, + "loss": 0.2296, + "step": 10484 + }, + { + "epoch": 1.6044376434583014, + "grad_norm": 2.0780361687783278, + "learning_rate": 1.982688709946671e-06, + "loss": 0.2714, + "step": 10485 + }, + { + "epoch": 1.6045906656465188, + "grad_norm": 2.0576034473812266, + "learning_rate": 1.9812077836490595e-06, + "loss": 0.3636, + "step": 10486 + }, + { + "epoch": 1.604743687834736, + "grad_norm": 1.9414393221409934, + "learning_rate": 1.979727349816852e-06, + "loss": 0.3066, + "step": 10487 + }, + { + "epoch": 1.6048967100229534, + "grad_norm": 2.4956241751854003, + "learning_rate": 1.9782474085409597e-06, + "loss": 0.3205, + "step": 10488 + }, + { + "epoch": 1.6050497322111705, + "grad_norm": 2.1620399546925717, + "learning_rate": 1.9767679599122767e-06, + "loss": 0.3238, + "step": 10489 + }, + { + "epoch": 1.6052027543993879, + "grad_norm": 2.3001838451511665, + "learning_rate": 1.9752890040216644e-06, + "loss": 0.3707, + "step": 10490 + }, + { + "epoch": 1.6053557765876052, + "grad_norm": 2.1205275312048237, + "learning_rate": 1.9738105409599405e-06, + "loss": 0.3203, + "step": 10491 + }, + { + "epoch": 1.6055087987758225, + "grad_norm": 2.3261888986198023, + "learning_rate": 1.972332570817913e-06, + "loss": 0.3607, + "step": 10492 + }, + { + "epoch": 1.6056618209640399, + "grad_norm": 2.2311365973332733, + "learning_rate": 1.97085509368635e-06, + "loss": 0.2721, + "step": 10493 + }, + { + "epoch": 1.605814843152257, + "grad_norm": 2.0157940050326655, + "learning_rate": 1.9693781096559794e-06, + "loss": 0.2747, + "step": 10494 + }, + { + "epoch": 1.6059678653404745, + "grad_norm": 1.9952768637916527, + "learning_rate": 1.9679016188175193e-06, + "loss": 0.2393, + "step": 10495 + }, + { + "epoch": 1.6061208875286916, + "grad_norm": 2.3814074053620207, + "learning_rate": 1.9664256212616417e-06, + "loss": 0.3275, + "step": 10496 + }, + { + "epoch": 1.606273909716909, + "grad_norm": 2.105466025822687, + "learning_rate": 1.9649501170789943e-06, + "loss": 0.2923, + "step": 10497 + }, + { + "epoch": 1.6064269319051263, + "grad_norm": 1.8103329861441821, + "learning_rate": 1.963475106360193e-06, + "loss": 0.2546, + "step": 10498 + }, + { + "epoch": 1.6065799540933434, + "grad_norm": 1.981345827940942, + "learning_rate": 1.962000589195825e-06, + "loss": 0.2861, + "step": 10499 + }, + { + "epoch": 1.606732976281561, + "grad_norm": 2.031188520976353, + "learning_rate": 1.960526565676445e-06, + "loss": 0.3322, + "step": 10500 + }, + { + "epoch": 1.606885998469778, + "grad_norm": 2.073844327814389, + "learning_rate": 1.9590530358925796e-06, + "loss": 0.3111, + "step": 10501 + }, + { + "epoch": 1.6070390206579954, + "grad_norm": 2.224144695075106, + "learning_rate": 1.9575799999347247e-06, + "loss": 0.3317, + "step": 10502 + }, + { + "epoch": 1.6071920428462128, + "grad_norm": 2.2658544063551456, + "learning_rate": 1.9561074578933424e-06, + "loss": 0.4216, + "step": 10503 + }, + { + "epoch": 1.6073450650344299, + "grad_norm": 2.3868575362093907, + "learning_rate": 1.9546354098588693e-06, + "loss": 0.3201, + "step": 10504 + }, + { + "epoch": 1.6074980872226474, + "grad_norm": 2.052778963043461, + "learning_rate": 1.9531638559217104e-06, + "loss": 0.3202, + "step": 10505 + 
}, + { + "epoch": 1.6076511094108645, + "grad_norm": 2.027431317380972, + "learning_rate": 1.951692796172238e-06, + "loss": 0.3444, + "step": 10506 + }, + { + "epoch": 1.6078041315990819, + "grad_norm": 1.9896009671346127, + "learning_rate": 1.9502222307007956e-06, + "loss": 0.2569, + "step": 10507 + }, + { + "epoch": 1.6079571537872992, + "grad_norm": 2.3888546056899918, + "learning_rate": 1.948752159597698e-06, + "loss": 0.3455, + "step": 10508 + }, + { + "epoch": 1.6081101759755163, + "grad_norm": 2.3012402578354036, + "learning_rate": 1.9472825829532226e-06, + "loss": 0.3532, + "step": 10509 + }, + { + "epoch": 1.6082631981637339, + "grad_norm": 2.214180934529892, + "learning_rate": 1.945813500857633e-06, + "loss": 0.3095, + "step": 10510 + }, + { + "epoch": 1.608416220351951, + "grad_norm": 2.406222516683202, + "learning_rate": 1.9443449134011416e-06, + "loss": 0.3571, + "step": 10511 + }, + { + "epoch": 1.6085692425401683, + "grad_norm": 1.972700407695371, + "learning_rate": 1.9428768206739403e-06, + "loss": 0.3136, + "step": 10512 + }, + { + "epoch": 1.6087222647283856, + "grad_norm": 2.103254444351623, + "learning_rate": 1.9414092227661976e-06, + "loss": 0.2693, + "step": 10513 + }, + { + "epoch": 1.6088752869166028, + "grad_norm": 2.0014590653047346, + "learning_rate": 1.939942119768038e-06, + "loss": 0.2823, + "step": 10514 + }, + { + "epoch": 1.6090283091048203, + "grad_norm": 2.558600077343717, + "learning_rate": 1.9384755117695607e-06, + "loss": 0.4164, + "step": 10515 + }, + { + "epoch": 1.6091813312930374, + "grad_norm": 1.9254914747715335, + "learning_rate": 1.9370093988608453e-06, + "loss": 0.2698, + "step": 10516 + }, + { + "epoch": 1.6093343534812548, + "grad_norm": 2.3909848914674203, + "learning_rate": 1.935543781131919e-06, + "loss": 0.3229, + "step": 10517 + }, + { + "epoch": 1.609487375669472, + "grad_norm": 2.024294601203651, + "learning_rate": 1.934078658672801e-06, + "loss": 0.2838, + "step": 10518 + }, + { + "epoch": 1.6096403978576894, + "grad_norm": 2.5391493263896154, + "learning_rate": 1.9326140315734685e-06, + "loss": 0.418, + "step": 10519 + }, + { + "epoch": 1.6097934200459068, + "grad_norm": 2.267744207696907, + "learning_rate": 1.931149899923863e-06, + "loss": 0.2905, + "step": 10520 + }, + { + "epoch": 1.6099464422341239, + "grad_norm": 1.9815819153211391, + "learning_rate": 1.9296862638139103e-06, + "loss": 0.2529, + "step": 10521 + }, + { + "epoch": 1.6100994644223412, + "grad_norm": 1.8905960118777305, + "learning_rate": 1.928223123333498e-06, + "loss": 0.2605, + "step": 10522 + }, + { + "epoch": 1.6102524866105585, + "grad_norm": 2.1424726322335386, + "learning_rate": 1.926760478572476e-06, + "loss": 0.322, + "step": 10523 + }, + { + "epoch": 1.6104055087987759, + "grad_norm": 2.2819861093889036, + "learning_rate": 1.9252983296206784e-06, + "loss": 0.2884, + "step": 10524 + }, + { + "epoch": 1.6105585309869932, + "grad_norm": 2.0572968938301877, + "learning_rate": 1.9238366765678972e-06, + "loss": 0.3299, + "step": 10525 + }, + { + "epoch": 1.6107115531752103, + "grad_norm": 1.969348700776263, + "learning_rate": 1.922375519503902e-06, + "loss": 0.2684, + "step": 10526 + }, + { + "epoch": 1.6108645753634276, + "grad_norm": 2.202527841852906, + "learning_rate": 1.9209148585184244e-06, + "loss": 0.3269, + "step": 10527 + }, + { + "epoch": 1.611017597551645, + "grad_norm": 1.940551357103671, + "learning_rate": 1.9194546937011716e-06, + "loss": 0.2926, + "step": 10528 + }, + { + "epoch": 1.6111706197398623, + "grad_norm": 2.4080905545015825, + 
"learning_rate": 1.9179950251418168e-06, + "loss": 0.3119, + "step": 10529 + }, + { + "epoch": 1.6113236419280796, + "grad_norm": 2.1771118306491495, + "learning_rate": 1.9165358529300047e-06, + "loss": 0.2689, + "step": 10530 + }, + { + "epoch": 1.6114766641162968, + "grad_norm": 2.2555773274378237, + "learning_rate": 1.915077177155349e-06, + "loss": 0.319, + "step": 10531 + }, + { + "epoch": 1.6116296863045143, + "grad_norm": 2.0593411592193664, + "learning_rate": 1.9136189979074314e-06, + "loss": 0.3501, + "step": 10532 + }, + { + "epoch": 1.6117827084927314, + "grad_norm": 1.712499352328928, + "learning_rate": 1.9121613152758067e-06, + "loss": 0.2723, + "step": 10533 + }, + { + "epoch": 1.6119357306809488, + "grad_norm": 2.229661991037623, + "learning_rate": 1.910704129349994e-06, + "loss": 0.3884, + "step": 10534 + }, + { + "epoch": 1.612088752869166, + "grad_norm": 2.44309670412781, + "learning_rate": 1.909247440219487e-06, + "loss": 0.3624, + "step": 10535 + }, + { + "epoch": 1.6122417750573832, + "grad_norm": 2.0981658994548904, + "learning_rate": 1.907791247973746e-06, + "loss": 0.3294, + "step": 10536 + }, + { + "epoch": 1.6123947972456008, + "grad_norm": 2.2664865350975956, + "learning_rate": 1.906335552702201e-06, + "loss": 0.3435, + "step": 10537 + }, + { + "epoch": 1.6125478194338179, + "grad_norm": 2.166203828647484, + "learning_rate": 1.9048803544942518e-06, + "loss": 0.3217, + "step": 10538 + }, + { + "epoch": 1.6127008416220352, + "grad_norm": 2.1129488376182204, + "learning_rate": 1.9034256534392692e-06, + "loss": 0.2719, + "step": 10539 + }, + { + "epoch": 1.6128538638102525, + "grad_norm": 1.9130793331631273, + "learning_rate": 1.9019714496265906e-06, + "loss": 0.2827, + "step": 10540 + }, + { + "epoch": 1.6130068859984696, + "grad_norm": 1.9783507773097913, + "learning_rate": 1.9005177431455223e-06, + "loss": 0.3302, + "step": 10541 + }, + { + "epoch": 1.6131599081866872, + "grad_norm": 2.266976194472374, + "learning_rate": 1.8990645340853498e-06, + "loss": 0.3105, + "step": 10542 + }, + { + "epoch": 1.6133129303749043, + "grad_norm": 2.447531198231755, + "learning_rate": 1.8976118225353135e-06, + "loss": 0.337, + "step": 10543 + }, + { + "epoch": 1.6134659525631216, + "grad_norm": 1.9466005333498533, + "learning_rate": 1.896159608584629e-06, + "loss": 0.2737, + "step": 10544 + }, + { + "epoch": 1.613618974751339, + "grad_norm": 2.270820493266619, + "learning_rate": 1.8947078923224905e-06, + "loss": 0.3383, + "step": 10545 + }, + { + "epoch": 1.613771996939556, + "grad_norm": 2.175481962547, + "learning_rate": 1.8932566738380432e-06, + "loss": 0.2804, + "step": 10546 + }, + { + "epoch": 1.6139250191277736, + "grad_norm": 2.3916081886581337, + "learning_rate": 1.891805953220419e-06, + "loss": 0.3189, + "step": 10547 + }, + { + "epoch": 1.6140780413159908, + "grad_norm": 2.199411097369911, + "learning_rate": 1.890355730558715e-06, + "loss": 0.3678, + "step": 10548 + }, + { + "epoch": 1.614231063504208, + "grad_norm": 2.186939011479005, + "learning_rate": 1.888906005941984e-06, + "loss": 0.3018, + "step": 10549 + }, + { + "epoch": 1.6143840856924254, + "grad_norm": 2.1798549473165565, + "learning_rate": 1.887456779459269e-06, + "loss": 0.2894, + "step": 10550 + }, + { + "epoch": 1.6145371078806425, + "grad_norm": 2.273224534202083, + "learning_rate": 1.8860080511995727e-06, + "loss": 0.3557, + "step": 10551 + }, + { + "epoch": 1.61469013006886, + "grad_norm": 2.1166615630191603, + "learning_rate": 1.884559821251859e-06, + "loss": 0.3081, + "step": 10552 + }, + { + 
"epoch": 1.6148431522570772, + "grad_norm": 1.9044929221608777, + "learning_rate": 1.8831120897050759e-06, + "loss": 0.2839, + "step": 10553 + }, + { + "epoch": 1.6149961744452945, + "grad_norm": 2.1024143177946377, + "learning_rate": 1.8816648566481343e-06, + "loss": 0.2778, + "step": 10554 + }, + { + "epoch": 1.6151491966335119, + "grad_norm": 2.1048915778898163, + "learning_rate": 1.8802181221699124e-06, + "loss": 0.3376, + "step": 10555 + }, + { + "epoch": 1.6153022188217292, + "grad_norm": 2.187077064266125, + "learning_rate": 1.8787718863592597e-06, + "loss": 0.3192, + "step": 10556 + }, + { + "epoch": 1.6154552410099465, + "grad_norm": 1.9487619847012356, + "learning_rate": 1.8773261493049965e-06, + "loss": 0.2797, + "step": 10557 + }, + { + "epoch": 1.6156082631981636, + "grad_norm": 2.1489496371148378, + "learning_rate": 1.8758809110959098e-06, + "loss": 0.3082, + "step": 10558 + }, + { + "epoch": 1.615761285386381, + "grad_norm": 1.877140278626581, + "learning_rate": 1.8744361718207593e-06, + "loss": 0.3091, + "step": 10559 + }, + { + "epoch": 1.6159143075745983, + "grad_norm": 2.4222748132697367, + "learning_rate": 1.87299193156827e-06, + "loss": 0.321, + "step": 10560 + }, + { + "epoch": 1.6160673297628156, + "grad_norm": 2.402037261974164, + "learning_rate": 1.8715481904271393e-06, + "loss": 0.3186, + "step": 10561 + }, + { + "epoch": 1.616220351951033, + "grad_norm": 1.842256510948127, + "learning_rate": 1.8701049484860312e-06, + "loss": 0.274, + "step": 10562 + }, + { + "epoch": 1.61637337413925, + "grad_norm": 2.1205220163507255, + "learning_rate": 1.8686622058335867e-06, + "loss": 0.2792, + "step": 10563 + }, + { + "epoch": 1.6165263963274676, + "grad_norm": 2.161144153109064, + "learning_rate": 1.8672199625584042e-06, + "loss": 0.3228, + "step": 10564 + }, + { + "epoch": 1.6166794185156848, + "grad_norm": 2.288586202317026, + "learning_rate": 1.8657782187490558e-06, + "loss": 0.3729, + "step": 10565 + }, + { + "epoch": 1.616832440703902, + "grad_norm": 2.110696657148073, + "learning_rate": 1.8643369744940942e-06, + "loss": 0.312, + "step": 10566 + }, + { + "epoch": 1.6169854628921194, + "grad_norm": 2.0382296277093466, + "learning_rate": 1.862896229882023e-06, + "loss": 0.2529, + "step": 10567 + }, + { + "epoch": 1.6171384850803365, + "grad_norm": 2.040328917756369, + "learning_rate": 1.8614559850013247e-06, + "loss": 0.2704, + "step": 10568 + }, + { + "epoch": 1.617291507268554, + "grad_norm": 2.0325733210458905, + "learning_rate": 1.8600162399404576e-06, + "loss": 0.2916, + "step": 10569 + }, + { + "epoch": 1.6174445294567712, + "grad_norm": 1.8749942403353452, + "learning_rate": 1.8585769947878318e-06, + "loss": 0.2755, + "step": 10570 + }, + { + "epoch": 1.6175975516449885, + "grad_norm": 1.9462632302198313, + "learning_rate": 1.8571382496318446e-06, + "loss": 0.2756, + "step": 10571 + }, + { + "epoch": 1.6177505738332059, + "grad_norm": 2.0840998504651407, + "learning_rate": 1.8557000045608553e-06, + "loss": 0.2953, + "step": 10572 + }, + { + "epoch": 1.617903596021423, + "grad_norm": 2.3883921872480305, + "learning_rate": 1.8542622596631831e-06, + "loss": 0.3609, + "step": 10573 + }, + { + "epoch": 1.6180566182096405, + "grad_norm": 2.256482256007678, + "learning_rate": 1.8528250150271365e-06, + "loss": 0.342, + "step": 10574 + }, + { + "epoch": 1.6182096403978576, + "grad_norm": 2.3045778230254768, + "learning_rate": 1.8513882707409791e-06, + "loss": 0.3323, + "step": 10575 + }, + { + "epoch": 1.618362662586075, + "grad_norm": 2.3223204819305505, + "learning_rate": 
1.8499520268929406e-06, + "loss": 0.3327, + "step": 10576 + }, + { + "epoch": 1.6185156847742923, + "grad_norm": 2.1041983316848696, + "learning_rate": 1.8485162835712334e-06, + "loss": 0.327, + "step": 10577 + }, + { + "epoch": 1.6186687069625094, + "grad_norm": 1.9349892975599834, + "learning_rate": 1.8470810408640305e-06, + "loss": 0.2856, + "step": 10578 + }, + { + "epoch": 1.618821729150727, + "grad_norm": 2.1178930440291057, + "learning_rate": 1.8456462988594748e-06, + "loss": 0.3653, + "step": 10579 + }, + { + "epoch": 1.618974751338944, + "grad_norm": 2.1031709608253695, + "learning_rate": 1.8442120576456802e-06, + "loss": 0.2744, + "step": 10580 + }, + { + "epoch": 1.6191277735271614, + "grad_norm": 2.478206559028707, + "learning_rate": 1.8427783173107283e-06, + "loss": 0.3079, + "step": 10581 + }, + { + "epoch": 1.6192807957153788, + "grad_norm": 1.8838818778239124, + "learning_rate": 1.8413450779426723e-06, + "loss": 0.2817, + "step": 10582 + }, + { + "epoch": 1.6194338179035959, + "grad_norm": 1.912703665415085, + "learning_rate": 1.8399123396295305e-06, + "loss": 0.3116, + "step": 10583 + }, + { + "epoch": 1.6195868400918134, + "grad_norm": 2.2889397537082345, + "learning_rate": 1.8384801024592957e-06, + "loss": 0.3419, + "step": 10584 + }, + { + "epoch": 1.6197398622800305, + "grad_norm": 2.012218099198962, + "learning_rate": 1.8370483665199246e-06, + "loss": 0.247, + "step": 10585 + }, + { + "epoch": 1.6198928844682479, + "grad_norm": 2.6818823744928593, + "learning_rate": 1.8356171318993477e-06, + "loss": 0.3282, + "step": 10586 + }, + { + "epoch": 1.6200459066564652, + "grad_norm": 2.2120147507952304, + "learning_rate": 1.8341863986854624e-06, + "loss": 0.3462, + "step": 10587 + }, + { + "epoch": 1.6201989288446825, + "grad_norm": 2.052125974963057, + "learning_rate": 1.8327561669661343e-06, + "loss": 0.3083, + "step": 10588 + }, + { + "epoch": 1.6203519510328999, + "grad_norm": 1.9840591498263909, + "learning_rate": 1.8313264368292005e-06, + "loss": 0.2652, + "step": 10589 + }, + { + "epoch": 1.620504973221117, + "grad_norm": 2.4214798106894975, + "learning_rate": 1.8298972083624667e-06, + "loss": 0.348, + "step": 10590 + }, + { + "epoch": 1.6206579954093343, + "grad_norm": 1.8997166893876836, + "learning_rate": 1.8284684816537045e-06, + "loss": 0.3332, + "step": 10591 + }, + { + "epoch": 1.6208110175975516, + "grad_norm": 2.369531878415244, + "learning_rate": 1.8270402567906654e-06, + "loss": 0.3397, + "step": 10592 + }, + { + "epoch": 1.620964039785769, + "grad_norm": 2.170929041681945, + "learning_rate": 1.825612533861053e-06, + "loss": 0.2398, + "step": 10593 + }, + { + "epoch": 1.6211170619739863, + "grad_norm": 1.9795611289660413, + "learning_rate": 1.8241853129525522e-06, + "loss": 0.2896, + "step": 10594 + }, + { + "epoch": 1.6212700841622034, + "grad_norm": 2.155517297499104, + "learning_rate": 1.8227585941528192e-06, + "loss": 0.2697, + "step": 10595 + }, + { + "epoch": 1.621423106350421, + "grad_norm": 2.228766404040802, + "learning_rate": 1.8213323775494684e-06, + "loss": 0.2509, + "step": 10596 + }, + { + "epoch": 1.621576128538638, + "grad_norm": 2.187183161177902, + "learning_rate": 1.81990666323009e-06, + "loss": 0.3133, + "step": 10597 + }, + { + "epoch": 1.6217291507268554, + "grad_norm": 1.8590016521156796, + "learning_rate": 1.8184814512822479e-06, + "loss": 0.2984, + "step": 10598 + }, + { + "epoch": 1.6218821729150728, + "grad_norm": 2.134756508469202, + "learning_rate": 1.8170567417934615e-06, + "loss": 0.3212, + "step": 10599 + }, + { + "epoch": 
1.6220351951032899, + "grad_norm": 1.975640103428586, + "learning_rate": 1.815632534851235e-06, + "loss": 0.344, + "step": 10600 + }, + { + "epoch": 1.6221882172915074, + "grad_norm": 1.9895208241922568, + "learning_rate": 1.8142088305430339e-06, + "loss": 0.3345, + "step": 10601 + }, + { + "epoch": 1.6223412394797245, + "grad_norm": 2.0141924746839344, + "learning_rate": 1.8127856289562873e-06, + "loss": 0.2954, + "step": 10602 + }, + { + "epoch": 1.6224942616679419, + "grad_norm": 2.392543783330661, + "learning_rate": 1.8113629301784052e-06, + "loss": 0.3212, + "step": 10603 + }, + { + "epoch": 1.6226472838561592, + "grad_norm": 2.2038116103794976, + "learning_rate": 1.8099407342967635e-06, + "loss": 0.2888, + "step": 10604 + }, + { + "epoch": 1.6228003060443763, + "grad_norm": 2.0600551971159806, + "learning_rate": 1.8085190413986953e-06, + "loss": 0.3045, + "step": 10605 + }, + { + "epoch": 1.6229533282325939, + "grad_norm": 1.9894692711668345, + "learning_rate": 1.807097851571521e-06, + "loss": 0.3068, + "step": 10606 + }, + { + "epoch": 1.623106350420811, + "grad_norm": 2.2340507250131147, + "learning_rate": 1.8056771649025173e-06, + "loss": 0.3865, + "step": 10607 + }, + { + "epoch": 1.6232593726090283, + "grad_norm": 1.8290540957109105, + "learning_rate": 1.8042569814789367e-06, + "loss": 0.245, + "step": 10608 + }, + { + "epoch": 1.6234123947972456, + "grad_norm": 2.3069059295735155, + "learning_rate": 1.8028373013879964e-06, + "loss": 0.3339, + "step": 10609 + }, + { + "epoch": 1.6235654169854628, + "grad_norm": 2.1424771142080723, + "learning_rate": 1.801418124716884e-06, + "loss": 0.2891, + "step": 10610 + }, + { + "epoch": 1.6237184391736803, + "grad_norm": 2.4355336467958035, + "learning_rate": 1.7999994515527586e-06, + "loss": 0.3452, + "step": 10611 + }, + { + "epoch": 1.6238714613618974, + "grad_norm": 2.112129584337102, + "learning_rate": 1.7985812819827452e-06, + "loss": 0.2674, + "step": 10612 + }, + { + "epoch": 1.6240244835501148, + "grad_norm": 1.8521996070155238, + "learning_rate": 1.7971636160939388e-06, + "loss": 0.263, + "step": 10613 + }, + { + "epoch": 1.624177505738332, + "grad_norm": 2.216065359992707, + "learning_rate": 1.7957464539734048e-06, + "loss": 0.3601, + "step": 10614 + }, + { + "epoch": 1.6243305279265492, + "grad_norm": 1.7979366955047242, + "learning_rate": 1.7943297957081762e-06, + "loss": 0.2903, + "step": 10615 + }, + { + "epoch": 1.6244835501147668, + "grad_norm": 2.39989702737361, + "learning_rate": 1.7929136413852567e-06, + "loss": 0.428, + "step": 10616 + }, + { + "epoch": 1.6246365723029839, + "grad_norm": 2.2140572585003557, + "learning_rate": 1.7914979910916152e-06, + "loss": 0.31, + "step": 10617 + }, + { + "epoch": 1.6247895944912012, + "grad_norm": 2.4071733588786715, + "learning_rate": 1.7900828449141949e-06, + "loss": 0.3316, + "step": 10618 + }, + { + "epoch": 1.6249426166794185, + "grad_norm": 2.1107088272132812, + "learning_rate": 1.788668202939904e-06, + "loss": 0.3253, + "step": 10619 + }, + { + "epoch": 1.6250956388676359, + "grad_norm": 2.166748268503527, + "learning_rate": 1.7872540652556192e-06, + "loss": 0.3268, + "step": 10620 + }, + { + "epoch": 1.6252486610558532, + "grad_norm": 2.0274159462473382, + "learning_rate": 1.785840431948196e-06, + "loss": 0.2423, + "step": 10621 + }, + { + "epoch": 1.6254016832440703, + "grad_norm": 2.2267309500004413, + "learning_rate": 1.7844273031044435e-06, + "loss": 0.2898, + "step": 10622 + }, + { + "epoch": 1.6255547054322876, + "grad_norm": 2.1327367988472776, + "learning_rate": 
1.783014678811147e-06, + "loss": 0.2974, + "step": 10623 + }, + { + "epoch": 1.625707727620505, + "grad_norm": 2.054036296303469, + "learning_rate": 1.781602559155069e-06, + "loss": 0.303, + "step": 10624 + }, + { + "epoch": 1.6258607498087223, + "grad_norm": 1.9841387608214618, + "learning_rate": 1.7801909442229258e-06, + "loss": 0.2797, + "step": 10625 + }, + { + "epoch": 1.6260137719969396, + "grad_norm": 1.964548815517046, + "learning_rate": 1.7787798341014107e-06, + "loss": 0.2808, + "step": 10626 + }, + { + "epoch": 1.6261667941851567, + "grad_norm": 2.1037655301339906, + "learning_rate": 1.7773692288771927e-06, + "loss": 0.2973, + "step": 10627 + }, + { + "epoch": 1.6263198163733743, + "grad_norm": 2.024239937854239, + "learning_rate": 1.7759591286368915e-06, + "loss": 0.2694, + "step": 10628 + }, + { + "epoch": 1.6264728385615914, + "grad_norm": 2.094301076866119, + "learning_rate": 1.774549533467116e-06, + "loss": 0.2871, + "step": 10629 + }, + { + "epoch": 1.6266258607498087, + "grad_norm": 1.9001204144586141, + "learning_rate": 1.773140443454434e-06, + "loss": 0.2899, + "step": 10630 + }, + { + "epoch": 1.626778882938026, + "grad_norm": 2.0305966737469117, + "learning_rate": 1.7717318586853772e-06, + "loss": 0.3259, + "step": 10631 + }, + { + "epoch": 1.6269319051262432, + "grad_norm": 2.132174412861507, + "learning_rate": 1.7703237792464567e-06, + "loss": 0.3068, + "step": 10632 + }, + { + "epoch": 1.6270849273144607, + "grad_norm": 2.267922670169627, + "learning_rate": 1.7689162052241515e-06, + "loss": 0.3581, + "step": 10633 + }, + { + "epoch": 1.6272379495026779, + "grad_norm": 2.0521196074770414, + "learning_rate": 1.767509136704897e-06, + "loss": 0.2833, + "step": 10634 + }, + { + "epoch": 1.6273909716908952, + "grad_norm": 1.95384614994354, + "learning_rate": 1.7661025737751148e-06, + "loss": 0.2801, + "step": 10635 + }, + { + "epoch": 1.6275439938791125, + "grad_norm": 2.073529431779787, + "learning_rate": 1.7646965165211837e-06, + "loss": 0.2925, + "step": 10636 + }, + { + "epoch": 1.6276970160673296, + "grad_norm": 1.681491994873035, + "learning_rate": 1.7632909650294571e-06, + "loss": 0.2322, + "step": 10637 + }, + { + "epoch": 1.6278500382555472, + "grad_norm": 2.013831593099779, + "learning_rate": 1.7618859193862547e-06, + "loss": 0.246, + "step": 10638 + }, + { + "epoch": 1.6280030604437643, + "grad_norm": 2.222352955584193, + "learning_rate": 1.7604813796778652e-06, + "loss": 0.3341, + "step": 10639 + }, + { + "epoch": 1.6281560826319816, + "grad_norm": 2.0854233819651253, + "learning_rate": 1.7590773459905475e-06, + "loss": 0.2933, + "step": 10640 + }, + { + "epoch": 1.628309104820199, + "grad_norm": 2.1521488694164495, + "learning_rate": 1.7576738184105292e-06, + "loss": 0.3917, + "step": 10641 + }, + { + "epoch": 1.628462127008416, + "grad_norm": 2.0691690612534135, + "learning_rate": 1.7562707970240046e-06, + "loss": 0.2824, + "step": 10642 + }, + { + "epoch": 1.6286151491966336, + "grad_norm": 2.4170362752448065, + "learning_rate": 1.7548682819171414e-06, + "loss": 0.4363, + "step": 10643 + }, + { + "epoch": 1.6287681713848507, + "grad_norm": 2.362068837715601, + "learning_rate": 1.7534662731760687e-06, + "loss": 0.3469, + "step": 10644 + }, + { + "epoch": 1.628921193573068, + "grad_norm": 2.3419814264762713, + "learning_rate": 1.7520647708868977e-06, + "loss": 0.3054, + "step": 10645 + }, + { + "epoch": 1.6290742157612854, + "grad_norm": 2.180124099872857, + "learning_rate": 1.7506637751356936e-06, + "loss": 0.309, + "step": 10646 + }, + { + "epoch": 
1.6292272379495025, + "grad_norm": 2.1503373897058373, + "learning_rate": 1.7492632860084957e-06, + "loss": 0.3065, + "step": 10647 + }, + { + "epoch": 1.62938026013772, + "grad_norm": 1.7921363423401993, + "learning_rate": 1.7478633035913217e-06, + "loss": 0.275, + "step": 10648 + }, + { + "epoch": 1.6295332823259372, + "grad_norm": 2.052364874867607, + "learning_rate": 1.7464638279701385e-06, + "loss": 0.3333, + "step": 10649 + }, + { + "epoch": 1.6296863045141545, + "grad_norm": 2.027999984299312, + "learning_rate": 1.7450648592309039e-06, + "loss": 0.2747, + "step": 10650 + }, + { + "epoch": 1.6298393267023719, + "grad_norm": 1.9461536696532593, + "learning_rate": 1.7436663974595314e-06, + "loss": 0.2983, + "step": 10651 + }, + { + "epoch": 1.629992348890589, + "grad_norm": 2.1762010864690153, + "learning_rate": 1.742268442741899e-06, + "loss": 0.3249, + "step": 10652 + }, + { + "epoch": 1.6301453710788065, + "grad_norm": 1.848456650261626, + "learning_rate": 1.7408709951638692e-06, + "loss": 0.2466, + "step": 10653 + }, + { + "epoch": 1.6302983932670236, + "grad_norm": 2.1106357287709314, + "learning_rate": 1.7394740548112644e-06, + "loss": 0.2985, + "step": 10654 + }, + { + "epoch": 1.630451415455241, + "grad_norm": 2.2833521016179117, + "learning_rate": 1.738077621769867e-06, + "loss": 0.2894, + "step": 10655 + }, + { + "epoch": 1.6306044376434583, + "grad_norm": 2.0606386731477, + "learning_rate": 1.7366816961254463e-06, + "loss": 0.2897, + "step": 10656 + }, + { + "epoch": 1.6307574598316756, + "grad_norm": 1.9264653530292764, + "learning_rate": 1.7352862779637302e-06, + "loss": 0.3205, + "step": 10657 + }, + { + "epoch": 1.630910482019893, + "grad_norm": 2.0009189375636045, + "learning_rate": 1.7338913673704138e-06, + "loss": 0.2725, + "step": 10658 + }, + { + "epoch": 1.63106350420811, + "grad_norm": 2.1654583638020024, + "learning_rate": 1.7324969644311672e-06, + "loss": 0.2803, + "step": 10659 + }, + { + "epoch": 1.6312165263963274, + "grad_norm": 1.6338114436592361, + "learning_rate": 1.7311030692316244e-06, + "loss": 0.2143, + "step": 10660 + }, + { + "epoch": 1.6313695485845447, + "grad_norm": 2.2733662110358415, + "learning_rate": 1.72970968185739e-06, + "loss": 0.295, + "step": 10661 + }, + { + "epoch": 1.631522570772762, + "grad_norm": 2.1242848843223703, + "learning_rate": 1.7283168023940366e-06, + "loss": 0.3347, + "step": 10662 + }, + { + "epoch": 1.6316755929609794, + "grad_norm": 2.0020305645316747, + "learning_rate": 1.7269244309271083e-06, + "loss": 0.263, + "step": 10663 + }, + { + "epoch": 1.6318286151491965, + "grad_norm": 2.4292633335719582, + "learning_rate": 1.7255325675421154e-06, + "loss": 0.3645, + "step": 10664 + }, + { + "epoch": 1.631981637337414, + "grad_norm": 2.2540963885960483, + "learning_rate": 1.7241412123245372e-06, + "loss": 0.3293, + "step": 10665 + }, + { + "epoch": 1.6321346595256312, + "grad_norm": 1.7674776566718227, + "learning_rate": 1.722750365359822e-06, + "loss": 0.2674, + "step": 10666 + }, + { + "epoch": 1.6322876817138485, + "grad_norm": 2.0795766976304413, + "learning_rate": 1.7213600267333884e-06, + "loss": 0.351, + "step": 10667 + }, + { + "epoch": 1.6324407039020659, + "grad_norm": 2.539896141459289, + "learning_rate": 1.7199701965306214e-06, + "loss": 0.3219, + "step": 10668 + }, + { + "epoch": 1.632593726090283, + "grad_norm": 2.151179350291576, + "learning_rate": 1.718580874836877e-06, + "loss": 0.299, + "step": 10669 + }, + { + "epoch": 1.6327467482785005, + "grad_norm": 2.59810278830666, + "learning_rate": 
1.717192061737477e-06, + "loss": 0.3523, + "step": 10670 + }, + { + "epoch": 1.6328997704667176, + "grad_norm": 1.9421911778296748, + "learning_rate": 1.7158037573177156e-06, + "loss": 0.2264, + "step": 10671 + }, + { + "epoch": 1.633052792654935, + "grad_norm": 2.4319925226909413, + "learning_rate": 1.714415961662853e-06, + "loss": 0.3519, + "step": 10672 + }, + { + "epoch": 1.6332058148431523, + "grad_norm": 2.196304094029701, + "learning_rate": 1.7130286748581183e-06, + "loss": 0.2792, + "step": 10673 + }, + { + "epoch": 1.6333588370313694, + "grad_norm": 1.8814963990285223, + "learning_rate": 1.7116418969887149e-06, + "loss": 0.254, + "step": 10674 + }, + { + "epoch": 1.633511859219587, + "grad_norm": 2.163967544140257, + "learning_rate": 1.7102556281398053e-06, + "loss": 0.3448, + "step": 10675 + }, + { + "epoch": 1.633664881407804, + "grad_norm": 2.3130978412784096, + "learning_rate": 1.7088698683965243e-06, + "loss": 0.3146, + "step": 10676 + }, + { + "epoch": 1.6338179035960214, + "grad_norm": 1.9372326688003392, + "learning_rate": 1.7074846178439853e-06, + "loss": 0.2683, + "step": 10677 + }, + { + "epoch": 1.6339709257842387, + "grad_norm": 2.136531609644606, + "learning_rate": 1.7060998765672531e-06, + "loss": 0.2711, + "step": 10678 + }, + { + "epoch": 1.6341239479724559, + "grad_norm": 2.2288583019578354, + "learning_rate": 1.7047156446513723e-06, + "loss": 0.3449, + "step": 10679 + }, + { + "epoch": 1.6342769701606734, + "grad_norm": 1.9287035630634422, + "learning_rate": 1.7033319221813593e-06, + "loss": 0.279, + "step": 10680 + }, + { + "epoch": 1.6344299923488905, + "grad_norm": 1.9373764849343669, + "learning_rate": 1.7019487092421838e-06, + "loss": 0.311, + "step": 10681 + }, + { + "epoch": 1.6345830145371079, + "grad_norm": 2.1086222036034785, + "learning_rate": 1.7005660059188034e-06, + "loss": 0.3082, + "step": 10682 + }, + { + "epoch": 1.6347360367253252, + "grad_norm": 1.992491784010115, + "learning_rate": 1.6991838122961358e-06, + "loss": 0.2944, + "step": 10683 + }, + { + "epoch": 1.6348890589135423, + "grad_norm": 1.948514553292706, + "learning_rate": 1.6978021284590562e-06, + "loss": 0.3036, + "step": 10684 + }, + { + "epoch": 1.6350420811017599, + "grad_norm": 2.3051469233814905, + "learning_rate": 1.6964209544924304e-06, + "loss": 0.302, + "step": 10685 + }, + { + "epoch": 1.635195103289977, + "grad_norm": 2.0805217135051053, + "learning_rate": 1.695040290481077e-06, + "loss": 0.2704, + "step": 10686 + }, + { + "epoch": 1.6353481254781943, + "grad_norm": 2.130342610927485, + "learning_rate": 1.6936601365097883e-06, + "loss": 0.3191, + "step": 10687 + }, + { + "epoch": 1.6355011476664116, + "grad_norm": 2.1838518937601763, + "learning_rate": 1.692280492663325e-06, + "loss": 0.2934, + "step": 10688 + }, + { + "epoch": 1.635654169854629, + "grad_norm": 1.6849116928654915, + "learning_rate": 1.6909013590264178e-06, + "loss": 0.1704, + "step": 10689 + }, + { + "epoch": 1.6358071920428463, + "grad_norm": 2.2691614114392835, + "learning_rate": 1.689522735683763e-06, + "loss": 0.285, + "step": 10690 + }, + { + "epoch": 1.6359602142310634, + "grad_norm": 2.0610432754287547, + "learning_rate": 1.6881446227200272e-06, + "loss": 0.2869, + "step": 10691 + }, + { + "epoch": 1.6361132364192807, + "grad_norm": 2.220578815693314, + "learning_rate": 1.686767020219846e-06, + "loss": 0.263, + "step": 10692 + }, + { + "epoch": 1.636266258607498, + "grad_norm": 2.237510475416876, + "learning_rate": 1.6853899282678243e-06, + "loss": 0.3804, + "step": 10693 + }, + { + "epoch": 
1.6364192807957154, + "grad_norm": 2.3743835475077018, + "learning_rate": 1.6840133469485342e-06, + "loss": 0.3615, + "step": 10694 + }, + { + "epoch": 1.6365723029839327, + "grad_norm": 2.0430532081213353, + "learning_rate": 1.682637276346517e-06, + "loss": 0.3135, + "step": 10695 + }, + { + "epoch": 1.6367253251721499, + "grad_norm": 2.203751506438757, + "learning_rate": 1.681261716546282e-06, + "loss": 0.3146, + "step": 10696 + }, + { + "epoch": 1.6368783473603674, + "grad_norm": 2.203912076629452, + "learning_rate": 1.6798866676323078e-06, + "loss": 0.2949, + "step": 10697 + }, + { + "epoch": 1.6370313695485845, + "grad_norm": 2.1840574837792697, + "learning_rate": 1.6785121296890416e-06, + "loss": 0.2636, + "step": 10698 + }, + { + "epoch": 1.6371843917368019, + "grad_norm": 1.9430320860711479, + "learning_rate": 1.6771381028009003e-06, + "loss": 0.2496, + "step": 10699 + }, + { + "epoch": 1.6373374139250192, + "grad_norm": 2.8639026682037336, + "learning_rate": 1.6757645870522665e-06, + "loss": 0.2489, + "step": 10700 + }, + { + "epoch": 1.6374904361132363, + "grad_norm": 1.9705293616477655, + "learning_rate": 1.6743915825274948e-06, + "loss": 0.3078, + "step": 10701 + }, + { + "epoch": 1.6376434583014539, + "grad_norm": 2.1633250353549585, + "learning_rate": 1.6730190893109032e-06, + "loss": 0.303, + "step": 10702 + }, + { + "epoch": 1.637796480489671, + "grad_norm": 2.1591337070007635, + "learning_rate": 1.671647107486789e-06, + "loss": 0.2965, + "step": 10703 + }, + { + "epoch": 1.6379495026778883, + "grad_norm": 2.228367036532636, + "learning_rate": 1.6702756371394046e-06, + "loss": 0.3568, + "step": 10704 + }, + { + "epoch": 1.6381025248661056, + "grad_norm": 2.035251795089624, + "learning_rate": 1.668904678352977e-06, + "loss": 0.2833, + "step": 10705 + }, + { + "epoch": 1.6382555470543227, + "grad_norm": 2.1454705760944965, + "learning_rate": 1.6675342312117094e-06, + "loss": 0.3327, + "step": 10706 + }, + { + "epoch": 1.6384085692425403, + "grad_norm": 1.845039677873378, + "learning_rate": 1.6661642957997592e-06, + "loss": 0.2323, + "step": 10707 + }, + { + "epoch": 1.6385615914307574, + "grad_norm": 2.128005587177012, + "learning_rate": 1.6647948722012586e-06, + "loss": 0.3273, + "step": 10708 + }, + { + "epoch": 1.6387146136189747, + "grad_norm": 2.483060924862472, + "learning_rate": 1.6634259605003178e-06, + "loss": 0.3343, + "step": 10709 + }, + { + "epoch": 1.638867635807192, + "grad_norm": 1.8476913614331738, + "learning_rate": 1.6620575607809963e-06, + "loss": 0.241, + "step": 10710 + }, + { + "epoch": 1.6390206579954092, + "grad_norm": 1.873225854131616, + "learning_rate": 1.6606896731273414e-06, + "loss": 0.2667, + "step": 10711 + }, + { + "epoch": 1.6391736801836267, + "grad_norm": 2.0053587873486225, + "learning_rate": 1.6593222976233591e-06, + "loss": 0.3382, + "step": 10712 + }, + { + "epoch": 1.6393267023718439, + "grad_norm": 2.381386238829408, + "learning_rate": 1.6579554343530192e-06, + "loss": 0.3578, + "step": 10713 + }, + { + "epoch": 1.6394797245600612, + "grad_norm": 2.0143214868755015, + "learning_rate": 1.6565890834002718e-06, + "loss": 0.2752, + "step": 10714 + }, + { + "epoch": 1.6396327467482785, + "grad_norm": 2.0593276322342104, + "learning_rate": 1.6552232448490314e-06, + "loss": 0.3165, + "step": 10715 + }, + { + "epoch": 1.6397857689364956, + "grad_norm": 2.1151191992071348, + "learning_rate": 1.6538579187831715e-06, + "loss": 0.3537, + "step": 10716 + }, + { + "epoch": 1.6399387911247132, + "grad_norm": 2.2127685314861774, + "learning_rate": 
1.6524931052865501e-06, + "loss": 0.2841, + "step": 10717 + }, + { + "epoch": 1.6400918133129303, + "grad_norm": 2.193174920060312, + "learning_rate": 1.6511288044429818e-06, + "loss": 0.2926, + "step": 10718 + }, + { + "epoch": 1.6402448355011476, + "grad_norm": 2.010676753287093, + "learning_rate": 1.6497650163362556e-06, + "loss": 0.2297, + "step": 10719 + }, + { + "epoch": 1.640397857689365, + "grad_norm": 2.2276182035195373, + "learning_rate": 1.6484017410501251e-06, + "loss": 0.3368, + "step": 10720 + }, + { + "epoch": 1.6405508798775823, + "grad_norm": 2.036294692508882, + "learning_rate": 1.647038978668316e-06, + "loss": 0.2753, + "step": 10721 + }, + { + "epoch": 1.6407039020657996, + "grad_norm": 1.9936948700499255, + "learning_rate": 1.6456767292745201e-06, + "loss": 0.2486, + "step": 10722 + }, + { + "epoch": 1.6408569242540167, + "grad_norm": 2.0439149869667412, + "learning_rate": 1.644314992952395e-06, + "loss": 0.27, + "step": 10723 + }, + { + "epoch": 1.641009946442234, + "grad_norm": 2.1513599382318205, + "learning_rate": 1.6429537697855802e-06, + "loss": 0.3388, + "step": 10724 + }, + { + "epoch": 1.6411629686304514, + "grad_norm": 2.5542635210630666, + "learning_rate": 1.6415930598576647e-06, + "loss": 0.3008, + "step": 10725 + }, + { + "epoch": 1.6413159908186687, + "grad_norm": 2.1803917893956997, + "learning_rate": 1.6402328632522147e-06, + "loss": 0.3298, + "step": 10726 + }, + { + "epoch": 1.641469013006886, + "grad_norm": 2.0606574091597727, + "learning_rate": 1.6388731800527725e-06, + "loss": 0.2659, + "step": 10727 + }, + { + "epoch": 1.6416220351951032, + "grad_norm": 2.5470074979530706, + "learning_rate": 1.6375140103428355e-06, + "loss": 0.3826, + "step": 10728 + }, + { + "epoch": 1.6417750573833207, + "grad_norm": 1.9157050495130552, + "learning_rate": 1.636155354205875e-06, + "loss": 0.2514, + "step": 10729 + }, + { + "epoch": 1.6419280795715379, + "grad_norm": 2.312294859804467, + "learning_rate": 1.6347972117253386e-06, + "loss": 0.3057, + "step": 10730 + }, + { + "epoch": 1.6420811017597552, + "grad_norm": 2.2051790746379005, + "learning_rate": 1.633439582984625e-06, + "loss": 0.3248, + "step": 10731 + }, + { + "epoch": 1.6422341239479725, + "grad_norm": 2.1339386560489295, + "learning_rate": 1.6320824680671199e-06, + "loss": 0.2573, + "step": 10732 + }, + { + "epoch": 1.6423871461361896, + "grad_norm": 2.2289140984293496, + "learning_rate": 1.6307258670561676e-06, + "loss": 0.2824, + "step": 10733 + }, + { + "epoch": 1.6425401683244072, + "grad_norm": 1.9393319401090998, + "learning_rate": 1.6293697800350761e-06, + "loss": 0.238, + "step": 10734 + }, + { + "epoch": 1.6426931905126243, + "grad_norm": 2.1686155278174, + "learning_rate": 1.6280142070871352e-06, + "loss": 0.2616, + "step": 10735 + }, + { + "epoch": 1.6428462127008416, + "grad_norm": 1.712168777957528, + "learning_rate": 1.626659148295595e-06, + "loss": 0.2245, + "step": 10736 + }, + { + "epoch": 1.642999234889059, + "grad_norm": 2.2094323994725866, + "learning_rate": 1.6253046037436693e-06, + "loss": 0.314, + "step": 10737 + }, + { + "epoch": 1.643152257077276, + "grad_norm": 1.836421884614005, + "learning_rate": 1.6239505735145511e-06, + "loss": 0.2484, + "step": 10738 + }, + { + "epoch": 1.6433052792654936, + "grad_norm": 2.1152088410801375, + "learning_rate": 1.6225970576913963e-06, + "loss": 0.3049, + "step": 10739 + }, + { + "epoch": 1.6434583014537107, + "grad_norm": 2.0485816420127, + "learning_rate": 1.6212440563573284e-06, + "loss": 0.2677, + "step": 10740 + }, + { + "epoch": 
1.643611323641928, + "grad_norm": 2.003129581012719, + "learning_rate": 1.6198915695954408e-06, + "loss": 0.2557, + "step": 10741 + }, + { + "epoch": 1.6437643458301454, + "grad_norm": 1.7381668985451146, + "learning_rate": 1.6185395974887952e-06, + "loss": 0.2338, + "step": 10742 + }, + { + "epoch": 1.6439173680183625, + "grad_norm": 2.0542402267329876, + "learning_rate": 1.6171881401204215e-06, + "loss": 0.2303, + "step": 10743 + }, + { + "epoch": 1.64407039020658, + "grad_norm": 2.116449407226027, + "learning_rate": 1.6158371975733178e-06, + "loss": 0.2836, + "step": 10744 + }, + { + "epoch": 1.6442234123947972, + "grad_norm": 2.3970780751820944, + "learning_rate": 1.6144867699304512e-06, + "loss": 0.3241, + "step": 10745 + }, + { + "epoch": 1.6443764345830145, + "grad_norm": 1.993119568552427, + "learning_rate": 1.6131368572747564e-06, + "loss": 0.2613, + "step": 10746 + }, + { + "epoch": 1.6445294567712319, + "grad_norm": 2.109118877668952, + "learning_rate": 1.611787459689136e-06, + "loss": 0.2621, + "step": 10747 + }, + { + "epoch": 1.644682478959449, + "grad_norm": 2.1555885076869563, + "learning_rate": 1.6104385772564625e-06, + "loss": 0.2521, + "step": 10748 + }, + { + "epoch": 1.6448355011476665, + "grad_norm": 2.425326814567966, + "learning_rate": 1.6090902100595774e-06, + "loss": 0.3719, + "step": 10749 + }, + { + "epoch": 1.6449885233358836, + "grad_norm": 2.1643534965090483, + "learning_rate": 1.6077423581812869e-06, + "loss": 0.3239, + "step": 10750 + }, + { + "epoch": 1.645141545524101, + "grad_norm": 1.9450833734330513, + "learning_rate": 1.606395021704369e-06, + "loss": 0.2902, + "step": 10751 + }, + { + "epoch": 1.6452945677123183, + "grad_norm": 2.1136401370200053, + "learning_rate": 1.6050482007115687e-06, + "loss": 0.3279, + "step": 10752 + }, + { + "epoch": 1.6454475899005354, + "grad_norm": 2.065969679238715, + "learning_rate": 1.6037018952855998e-06, + "loss": 0.2373, + "step": 10753 + }, + { + "epoch": 1.645600612088753, + "grad_norm": 2.377631789499768, + "learning_rate": 1.6023561055091441e-06, + "loss": 0.2707, + "step": 10754 + }, + { + "epoch": 1.64575363427697, + "grad_norm": 1.8720142905869108, + "learning_rate": 1.6010108314648498e-06, + "loss": 0.2948, + "step": 10755 + }, + { + "epoch": 1.6459066564651874, + "grad_norm": 2.0211503929517196, + "learning_rate": 1.5996660732353409e-06, + "loss": 0.2744, + "step": 10756 + }, + { + "epoch": 1.6460596786534047, + "grad_norm": 1.8960009862980085, + "learning_rate": 1.5983218309031989e-06, + "loss": 0.296, + "step": 10757 + }, + { + "epoch": 1.646212700841622, + "grad_norm": 2.3533780465336545, + "learning_rate": 1.5969781045509792e-06, + "loss": 0.3347, + "step": 10758 + }, + { + "epoch": 1.6463657230298394, + "grad_norm": 1.8612954689985926, + "learning_rate": 1.595634894261211e-06, + "loss": 0.2747, + "step": 10759 + }, + { + "epoch": 1.6465187452180565, + "grad_norm": 2.2717995170800385, + "learning_rate": 1.5942922001163775e-06, + "loss": 0.3192, + "step": 10760 + }, + { + "epoch": 1.6466717674062739, + "grad_norm": 2.157957154061846, + "learning_rate": 1.5929500221989448e-06, + "loss": 0.3263, + "step": 10761 + }, + { + "epoch": 1.6468247895944912, + "grad_norm": 2.285376952361559, + "learning_rate": 1.5916083605913435e-06, + "loss": 0.312, + "step": 10762 + }, + { + "epoch": 1.6469778117827085, + "grad_norm": 2.137538885073889, + "learning_rate": 1.5902672153759613e-06, + "loss": 0.273, + "step": 10763 + }, + { + "epoch": 1.6471308339709259, + "grad_norm": 2.1379714062032216, + "learning_rate": 
1.58892658663517e-06, + "loss": 0.3213, + "step": 10764 + }, + { + "epoch": 1.647283856159143, + "grad_norm": 1.8926610476687151, + "learning_rate": 1.5875864744513048e-06, + "loss": 0.2281, + "step": 10765 + }, + { + "epoch": 1.6474368783473605, + "grad_norm": 2.3552325347343146, + "learning_rate": 1.5862468789066587e-06, + "loss": 0.2999, + "step": 10766 + }, + { + "epoch": 1.6475899005355776, + "grad_norm": 2.0190965406363457, + "learning_rate": 1.5849078000835083e-06, + "loss": 0.2834, + "step": 10767 + }, + { + "epoch": 1.647742922723795, + "grad_norm": 1.8876247283806757, + "learning_rate": 1.583569238064091e-06, + "loss": 0.251, + "step": 10768 + }, + { + "epoch": 1.6478959449120123, + "grad_norm": 2.0416052122157824, + "learning_rate": 1.582231192930611e-06, + "loss": 0.3513, + "step": 10769 + }, + { + "epoch": 1.6480489671002294, + "grad_norm": 1.949524372805771, + "learning_rate": 1.580893664765245e-06, + "loss": 0.2473, + "step": 10770 + }, + { + "epoch": 1.648201989288447, + "grad_norm": 2.0017501920680085, + "learning_rate": 1.5795566536501339e-06, + "loss": 0.3034, + "step": 10771 + }, + { + "epoch": 1.648355011476664, + "grad_norm": 2.3663624170073243, + "learning_rate": 1.5782201596673908e-06, + "loss": 0.3085, + "step": 10772 + }, + { + "epoch": 1.6485080336648814, + "grad_norm": 1.9866391576555056, + "learning_rate": 1.5768841828990934e-06, + "loss": 0.2308, + "step": 10773 + }, + { + "epoch": 1.6486610558530987, + "grad_norm": 1.9819239180672437, + "learning_rate": 1.5755487234272892e-06, + "loss": 0.3391, + "step": 10774 + }, + { + "epoch": 1.6488140780413159, + "grad_norm": 2.210194731319105, + "learning_rate": 1.5742137813339942e-06, + "loss": 0.2937, + "step": 10775 + }, + { + "epoch": 1.6489671002295334, + "grad_norm": 2.347026374742834, + "learning_rate": 1.5728793567011934e-06, + "loss": 0.3295, + "step": 10776 + }, + { + "epoch": 1.6491201224177505, + "grad_norm": 2.005811317294377, + "learning_rate": 1.5715454496108384e-06, + "loss": 0.2935, + "step": 10777 + }, + { + "epoch": 1.6492731446059679, + "grad_norm": 2.153939130371352, + "learning_rate": 1.57021206014485e-06, + "loss": 0.3447, + "step": 10778 + }, + { + "epoch": 1.6494261667941852, + "grad_norm": 2.067837021033556, + "learning_rate": 1.5688791883851152e-06, + "loss": 0.273, + "step": 10779 + }, + { + "epoch": 1.6495791889824023, + "grad_norm": 2.0123842804250747, + "learning_rate": 1.5675468344134936e-06, + "loss": 0.2966, + "step": 10780 + }, + { + "epoch": 1.6497322111706199, + "grad_norm": 2.142451040406505, + "learning_rate": 1.566214998311808e-06, + "loss": 0.2641, + "step": 10781 + }, + { + "epoch": 1.649885233358837, + "grad_norm": 2.073097588130826, + "learning_rate": 1.5648836801618527e-06, + "loss": 0.27, + "step": 10782 + }, + { + "epoch": 1.6500382555470543, + "grad_norm": 2.483413467106734, + "learning_rate": 1.5635528800453892e-06, + "loss": 0.3521, + "step": 10783 + }, + { + "epoch": 1.6501912777352716, + "grad_norm": 2.1582050153024945, + "learning_rate": 1.562222598044144e-06, + "loss": 0.2844, + "step": 10784 + }, + { + "epoch": 1.6503442999234887, + "grad_norm": 2.176784737134013, + "learning_rate": 1.5608928342398232e-06, + "loss": 0.2599, + "step": 10785 + }, + { + "epoch": 1.6504973221117063, + "grad_norm": 2.5654567717802217, + "learning_rate": 1.5595635887140847e-06, + "loss": 0.3485, + "step": 10786 + }, + { + "epoch": 1.6506503442999234, + "grad_norm": 2.190836740951483, + "learning_rate": 1.5582348615485632e-06, + "loss": 0.2981, + "step": 10787 + }, + { + "epoch": 
1.6508033664881407, + "grad_norm": 1.7488115769482173, + "learning_rate": 1.5569066528248676e-06, + "loss": 0.2574, + "step": 10788 + }, + { + "epoch": 1.650956388676358, + "grad_norm": 1.919495686623538, + "learning_rate": 1.5555789626245599e-06, + "loss": 0.2921, + "step": 10789 + }, + { + "epoch": 1.6511094108645754, + "grad_norm": 2.268063202918329, + "learning_rate": 1.5542517910291843e-06, + "loss": 0.3139, + "step": 10790 + }, + { + "epoch": 1.6512624330527927, + "grad_norm": 2.049890249473881, + "learning_rate": 1.5529251381202492e-06, + "loss": 0.301, + "step": 10791 + }, + { + "epoch": 1.6514154552410099, + "grad_norm": 2.2210056204592146, + "learning_rate": 1.5515990039792217e-06, + "loss": 0.2149, + "step": 10792 + }, + { + "epoch": 1.6515684774292272, + "grad_norm": 1.8970844610815365, + "learning_rate": 1.5502733886875521e-06, + "loss": 0.2351, + "step": 10793 + }, + { + "epoch": 1.6517214996174445, + "grad_norm": 1.987111276832506, + "learning_rate": 1.5489482923266519e-06, + "loss": 0.2797, + "step": 10794 + }, + { + "epoch": 1.6518745218056619, + "grad_norm": 2.1966652245652547, + "learning_rate": 1.5476237149778928e-06, + "loss": 0.3025, + "step": 10795 + }, + { + "epoch": 1.6520275439938792, + "grad_norm": 1.7725625214287049, + "learning_rate": 1.5462996567226296e-06, + "loss": 0.2398, + "step": 10796 + }, + { + "epoch": 1.6521805661820963, + "grad_norm": 2.095813332604148, + "learning_rate": 1.5449761176421752e-06, + "loss": 0.3259, + "step": 10797 + }, + { + "epoch": 1.6523335883703139, + "grad_norm": 2.1249341291903736, + "learning_rate": 1.5436530978178132e-06, + "loss": 0.3051, + "step": 10798 + }, + { + "epoch": 1.652486610558531, + "grad_norm": 2.414881397404002, + "learning_rate": 1.5423305973307966e-06, + "loss": 0.355, + "step": 10799 + }, + { + "epoch": 1.6526396327467483, + "grad_norm": 2.101162420996157, + "learning_rate": 1.541008616262345e-06, + "loss": 0.3226, + "step": 10800 + }, + { + "epoch": 1.6527926549349656, + "grad_norm": 2.1531043110040433, + "learning_rate": 1.5396871546936453e-06, + "loss": 0.2871, + "step": 10801 + }, + { + "epoch": 1.6529456771231827, + "grad_norm": 2.235894912765083, + "learning_rate": 1.5383662127058552e-06, + "loss": 0.2943, + "step": 10802 + }, + { + "epoch": 1.6530986993114003, + "grad_norm": 2.3541968452917006, + "learning_rate": 1.5370457903800973e-06, + "loss": 0.3262, + "step": 10803 + }, + { + "epoch": 1.6532517214996174, + "grad_norm": 1.954556761150049, + "learning_rate": 1.5357258877974645e-06, + "loss": 0.2851, + "step": 10804 + }, + { + "epoch": 1.6534047436878347, + "grad_norm": 2.526920108336962, + "learning_rate": 1.5344065050390155e-06, + "loss": 0.3662, + "step": 10805 + }, + { + "epoch": 1.653557765876052, + "grad_norm": 2.179565146522856, + "learning_rate": 1.5330876421857855e-06, + "loss": 0.3278, + "step": 10806 + }, + { + "epoch": 1.6537107880642692, + "grad_norm": 2.074137636510761, + "learning_rate": 1.531769299318764e-06, + "loss": 0.3092, + "step": 10807 + }, + { + "epoch": 1.6538638102524867, + "grad_norm": 2.525170469543575, + "learning_rate": 1.530451476518915e-06, + "loss": 0.3664, + "step": 10808 + }, + { + "epoch": 1.6540168324407039, + "grad_norm": 2.3703063766098875, + "learning_rate": 1.5291341738671782e-06, + "loss": 0.3352, + "step": 10809 + }, + { + "epoch": 1.6541698546289212, + "grad_norm": 2.3707433160320037, + "learning_rate": 1.5278173914444472e-06, + "loss": 0.3119, + "step": 10810 + }, + { + "epoch": 1.6543228768171385, + "grad_norm": 2.281609337415562, + "learning_rate": 
1.5265011293315923e-06, + "loss": 0.2825, + "step": 10811 + }, + { + "epoch": 1.6544758990053556, + "grad_norm": 2.3882911358515, + "learning_rate": 1.5251853876094557e-06, + "loss": 0.2972, + "step": 10812 + }, + { + "epoch": 1.6546289211935732, + "grad_norm": 2.0222330062798695, + "learning_rate": 1.523870166358834e-06, + "loss": 0.3287, + "step": 10813 + }, + { + "epoch": 1.6547819433817903, + "grad_norm": 2.1272748346654566, + "learning_rate": 1.5225554656605057e-06, + "loss": 0.2728, + "step": 10814 + }, + { + "epoch": 1.6549349655700076, + "grad_norm": 2.120587577042761, + "learning_rate": 1.5212412855952131e-06, + "loss": 0.3192, + "step": 10815 + }, + { + "epoch": 1.655087987758225, + "grad_norm": 2.259649605135267, + "learning_rate": 1.519927626243658e-06, + "loss": 0.3069, + "step": 10816 + }, + { + "epoch": 1.655241009946442, + "grad_norm": 1.7862660695016823, + "learning_rate": 1.5186144876865228e-06, + "loss": 0.2747, + "step": 10817 + }, + { + "epoch": 1.6553940321346596, + "grad_norm": 2.1200121696415875, + "learning_rate": 1.517301870004454e-06, + "loss": 0.2835, + "step": 10818 + }, + { + "epoch": 1.6555470543228767, + "grad_norm": 2.0400460017927635, + "learning_rate": 1.515989773278058e-06, + "loss": 0.28, + "step": 10819 + }, + { + "epoch": 1.655700076511094, + "grad_norm": 1.8889820609466452, + "learning_rate": 1.514678197587921e-06, + "loss": 0.2624, + "step": 10820 + }, + { + "epoch": 1.6558530986993114, + "grad_norm": 1.9934048791905552, + "learning_rate": 1.5133671430145913e-06, + "loss": 0.2727, + "step": 10821 + }, + { + "epoch": 1.6560061208875287, + "grad_norm": 1.9503046841502563, + "learning_rate": 1.5120566096385846e-06, + "loss": 0.302, + "step": 10822 + }, + { + "epoch": 1.656159143075746, + "grad_norm": 1.8025401263324794, + "learning_rate": 1.5107465975403868e-06, + "loss": 0.2487, + "step": 10823 + }, + { + "epoch": 1.6563121652639632, + "grad_norm": 2.2381643970107246, + "learning_rate": 1.509437106800451e-06, + "loss": 0.3447, + "step": 10824 + }, + { + "epoch": 1.6564651874521805, + "grad_norm": 2.213589525316061, + "learning_rate": 1.5081281374991975e-06, + "loss": 0.3024, + "step": 10825 + }, + { + "epoch": 1.6566182096403979, + "grad_norm": 2.1435019979757364, + "learning_rate": 1.5068196897170152e-06, + "loss": 0.3324, + "step": 10826 + }, + { + "epoch": 1.6567712318286152, + "grad_norm": 2.666594275663528, + "learning_rate": 1.505511763534262e-06, + "loss": 0.3843, + "step": 10827 + }, + { + "epoch": 1.6569242540168325, + "grad_norm": 2.273364682415013, + "learning_rate": 1.5042043590312616e-06, + "loss": 0.2817, + "step": 10828 + }, + { + "epoch": 1.6570772762050496, + "grad_norm": 1.972543812149342, + "learning_rate": 1.5028974762883065e-06, + "loss": 0.3084, + "step": 10829 + }, + { + "epoch": 1.6572302983932672, + "grad_norm": 1.9225873877071888, + "learning_rate": 1.5015911153856588e-06, + "loss": 0.2399, + "step": 10830 + }, + { + "epoch": 1.6573833205814843, + "grad_norm": 1.9259450792736108, + "learning_rate": 1.5002852764035468e-06, + "loss": 0.2644, + "step": 10831 + }, + { + "epoch": 1.6575363427697016, + "grad_norm": 2.25980625558073, + "learning_rate": 1.4989799594221676e-06, + "loss": 0.2851, + "step": 10832 + }, + { + "epoch": 1.657689364957919, + "grad_norm": 1.9883780300425007, + "learning_rate": 1.4976751645216846e-06, + "loss": 0.2178, + "step": 10833 + }, + { + "epoch": 1.657842387146136, + "grad_norm": 2.194469310656908, + "learning_rate": 1.4963708917822283e-06, + "loss": 0.2982, + "step": 10834 + }, + { + "epoch": 
1.6579954093343536, + "grad_norm": 2.216289901686091, + "learning_rate": 1.4950671412839068e-06, + "loss": 0.3195, + "step": 10835 + }, + { + "epoch": 1.6581484315225707, + "grad_norm": 2.5717793300199836, + "learning_rate": 1.4937639131067817e-06, + "loss": 0.3651, + "step": 10836 + }, + { + "epoch": 1.658301453710788, + "grad_norm": 2.137589206573856, + "learning_rate": 1.4924612073308887e-06, + "loss": 0.3486, + "step": 10837 + }, + { + "epoch": 1.6584544758990054, + "grad_norm": 2.154092460386197, + "learning_rate": 1.4911590240362395e-06, + "loss": 0.2565, + "step": 10838 + }, + { + "epoch": 1.6586074980872225, + "grad_norm": 2.0526315485709814, + "learning_rate": 1.489857363302799e-06, + "loss": 0.2636, + "step": 10839 + }, + { + "epoch": 1.65876052027544, + "grad_norm": 2.214596346525897, + "learning_rate": 1.488556225210508e-06, + "loss": 0.3014, + "step": 10840 + }, + { + "epoch": 1.6589135424636572, + "grad_norm": 2.0302186159970566, + "learning_rate": 1.4872556098392798e-06, + "loss": 0.306, + "step": 10841 + }, + { + "epoch": 1.6590665646518745, + "grad_norm": 1.9744823847536939, + "learning_rate": 1.485955517268982e-06, + "loss": 0.2682, + "step": 10842 + }, + { + "epoch": 1.6592195868400919, + "grad_norm": 1.970294762784387, + "learning_rate": 1.4846559475794653e-06, + "loss": 0.3153, + "step": 10843 + }, + { + "epoch": 1.659372609028309, + "grad_norm": 2.0777672103857747, + "learning_rate": 1.4833569008505422e-06, + "loss": 0.2852, + "step": 10844 + }, + { + "epoch": 1.6595256312165265, + "grad_norm": 2.187430071032332, + "learning_rate": 1.4820583771619845e-06, + "loss": 0.2573, + "step": 10845 + }, + { + "epoch": 1.6596786534047436, + "grad_norm": 2.263106010448733, + "learning_rate": 1.4807603765935451e-06, + "loss": 0.2328, + "step": 10846 + }, + { + "epoch": 1.659831675592961, + "grad_norm": 2.2069090905920876, + "learning_rate": 1.4794628992249427e-06, + "loss": 0.3041, + "step": 10847 + }, + { + "epoch": 1.6599846977811783, + "grad_norm": 2.2081482884286685, + "learning_rate": 1.4781659451358498e-06, + "loss": 0.3292, + "step": 10848 + }, + { + "epoch": 1.6601377199693954, + "grad_norm": 2.1366601858632275, + "learning_rate": 1.4768695144059276e-06, + "loss": 0.2761, + "step": 10849 + }, + { + "epoch": 1.660290742157613, + "grad_norm": 2.106645047341696, + "learning_rate": 1.4755736071147897e-06, + "loss": 0.3162, + "step": 10850 + }, + { + "epoch": 1.66044376434583, + "grad_norm": 2.3656997479027413, + "learning_rate": 1.474278223342026e-06, + "loss": 0.3084, + "step": 10851 + }, + { + "epoch": 1.6605967865340474, + "grad_norm": 2.192837488975101, + "learning_rate": 1.4729833631671887e-06, + "loss": 0.2746, + "step": 10852 + }, + { + "epoch": 1.6607498087222647, + "grad_norm": 2.139378747762477, + "learning_rate": 1.4716890266698002e-06, + "loss": 0.325, + "step": 10853 + }, + { + "epoch": 1.660902830910482, + "grad_norm": 2.3221793138150773, + "learning_rate": 1.470395213929352e-06, + "loss": 0.3499, + "step": 10854 + }, + { + "epoch": 1.6610558530986994, + "grad_norm": 1.8374502022826251, + "learning_rate": 1.4691019250253025e-06, + "loss": 0.2696, + "step": 10855 + }, + { + "epoch": 1.6612088752869165, + "grad_norm": 2.166262856949546, + "learning_rate": 1.4678091600370758e-06, + "loss": 0.2904, + "step": 10856 + }, + { + "epoch": 1.6613618974751339, + "grad_norm": 2.2757916046025533, + "learning_rate": 1.4665169190440664e-06, + "loss": 0.2943, + "step": 10857 + }, + { + "epoch": 1.6615149196633512, + "grad_norm": 1.9638029604739888, + "learning_rate": 
1.4652252021256375e-06, + "loss": 0.2033, + "step": 10858 + }, + { + "epoch": 1.6616679418515685, + "grad_norm": 2.235095501903873, + "learning_rate": 1.4639340093611165e-06, + "loss": 0.3472, + "step": 10859 + }, + { + "epoch": 1.6618209640397859, + "grad_norm": 2.1923690145521393, + "learning_rate": 1.4626433408298014e-06, + "loss": 0.3573, + "step": 10860 + }, + { + "epoch": 1.661973986228003, + "grad_norm": 1.927484721477276, + "learning_rate": 1.4613531966109561e-06, + "loss": 0.2985, + "step": 10861 + }, + { + "epoch": 1.6621270084162203, + "grad_norm": 2.0199650132275404, + "learning_rate": 1.4600635767838155e-06, + "loss": 0.2646, + "step": 10862 + }, + { + "epoch": 1.6622800306044376, + "grad_norm": 2.2089164022086027, + "learning_rate": 1.458774481427576e-06, + "loss": 0.3142, + "step": 10863 + }, + { + "epoch": 1.662433052792655, + "grad_norm": 2.0269913480817245, + "learning_rate": 1.4574859106214144e-06, + "loss": 0.2498, + "step": 10864 + }, + { + "epoch": 1.6625860749808723, + "grad_norm": 2.3356780814963773, + "learning_rate": 1.4561978644444596e-06, + "loss": 0.2692, + "step": 10865 + }, + { + "epoch": 1.6627390971690894, + "grad_norm": 2.055522130474578, + "learning_rate": 1.4549103429758138e-06, + "loss": 0.2867, + "step": 10866 + }, + { + "epoch": 1.662892119357307, + "grad_norm": 1.9009789290007288, + "learning_rate": 1.4536233462945582e-06, + "loss": 0.266, + "step": 10867 + }, + { + "epoch": 1.663045141545524, + "grad_norm": 1.8610570330440441, + "learning_rate": 1.4523368744797239e-06, + "loss": 0.2756, + "step": 10868 + }, + { + "epoch": 1.6631981637337414, + "grad_norm": 2.0347304270079927, + "learning_rate": 1.4510509276103179e-06, + "loss": 0.2777, + "step": 10869 + }, + { + "epoch": 1.6633511859219587, + "grad_norm": 2.0341956783646835, + "learning_rate": 1.4497655057653237e-06, + "loss": 0.2307, + "step": 10870 + }, + { + "epoch": 1.6635042081101759, + "grad_norm": 2.103395181152377, + "learning_rate": 1.4484806090236736e-06, + "loss": 0.2885, + "step": 10871 + }, + { + "epoch": 1.6636572302983934, + "grad_norm": 2.4448504211780824, + "learning_rate": 1.4471962374642845e-06, + "loss": 0.3329, + "step": 10872 + }, + { + "epoch": 1.6638102524866105, + "grad_norm": 2.098333249073084, + "learning_rate": 1.4459123911660366e-06, + "loss": 0.2922, + "step": 10873 + }, + { + "epoch": 1.6639632746748279, + "grad_norm": 2.0247796530057687, + "learning_rate": 1.444629070207767e-06, + "loss": 0.2942, + "step": 10874 + }, + { + "epoch": 1.6641162968630452, + "grad_norm": 2.1636751847253457, + "learning_rate": 1.443346274668298e-06, + "loss": 0.3873, + "step": 10875 + }, + { + "epoch": 1.6642693190512623, + "grad_norm": 2.231073006220205, + "learning_rate": 1.44206400462641e-06, + "loss": 0.3651, + "step": 10876 + }, + { + "epoch": 1.6644223412394799, + "grad_norm": 1.656595975610177, + "learning_rate": 1.4407822601608457e-06, + "loss": 0.2371, + "step": 10877 + }, + { + "epoch": 1.664575363427697, + "grad_norm": 1.969119693034, + "learning_rate": 1.4395010413503297e-06, + "loss": 0.235, + "step": 10878 + }, + { + "epoch": 1.6647283856159143, + "grad_norm": 1.8045403787736842, + "learning_rate": 1.4382203482735425e-06, + "loss": 0.2718, + "step": 10879 + }, + { + "epoch": 1.6648814078041316, + "grad_norm": 1.9410478871577124, + "learning_rate": 1.4369401810091377e-06, + "loss": 0.3118, + "step": 10880 + }, + { + "epoch": 1.6650344299923487, + "grad_norm": 2.1581941318540356, + "learning_rate": 1.435660539635736e-06, + "loss": 0.2823, + "step": 10881 + }, + { + "epoch": 
1.6651874521805663, + "grad_norm": 2.3185022235247192, + "learning_rate": 1.4343814242319243e-06, + "loss": 0.4697, + "step": 10882 + }, + { + "epoch": 1.6653404743687834, + "grad_norm": 1.7032804581381125, + "learning_rate": 1.4331028348762577e-06, + "loss": 0.2148, + "step": 10883 + }, + { + "epoch": 1.6654934965570007, + "grad_norm": 2.0705076500454287, + "learning_rate": 1.43182477164726e-06, + "loss": 0.2949, + "step": 10884 + }, + { + "epoch": 1.665646518745218, + "grad_norm": 2.0861927158849665, + "learning_rate": 1.4305472346234227e-06, + "loss": 0.3027, + "step": 10885 + }, + { + "epoch": 1.6657995409334352, + "grad_norm": 2.204515921292248, + "learning_rate": 1.4292702238832035e-06, + "loss": 0.3218, + "step": 10886 + }, + { + "epoch": 1.6659525631216527, + "grad_norm": 2.1139004154729792, + "learning_rate": 1.4279937395050258e-06, + "loss": 0.2753, + "step": 10887 + }, + { + "epoch": 1.6661055853098699, + "grad_norm": 1.9279461931220945, + "learning_rate": 1.4267177815672906e-06, + "loss": 0.2917, + "step": 10888 + }, + { + "epoch": 1.6662586074980872, + "grad_norm": 1.877735560651574, + "learning_rate": 1.4254423501483538e-06, + "loss": 0.2467, + "step": 10889 + }, + { + "epoch": 1.6664116296863045, + "grad_norm": 2.3057435678936398, + "learning_rate": 1.4241674453265441e-06, + "loss": 0.289, + "step": 10890 + }, + { + "epoch": 1.6665646518745219, + "grad_norm": 2.1293163309803123, + "learning_rate": 1.4228930671801645e-06, + "loss": 0.2736, + "step": 10891 + }, + { + "epoch": 1.6667176740627392, + "grad_norm": 2.3130308684961656, + "learning_rate": 1.421619215787473e-06, + "loss": 0.3645, + "step": 10892 + }, + { + "epoch": 1.6668706962509563, + "grad_norm": 1.9476549249402393, + "learning_rate": 1.4203458912267032e-06, + "loss": 0.1831, + "step": 10893 + }, + { + "epoch": 1.6670237184391736, + "grad_norm": 1.5150591864711775, + "learning_rate": 1.4190730935760589e-06, + "loss": 0.2004, + "step": 10894 + }, + { + "epoch": 1.667176740627391, + "grad_norm": 2.1337667385478625, + "learning_rate": 1.4178008229137008e-06, + "loss": 0.3594, + "step": 10895 + }, + { + "epoch": 1.6673297628156083, + "grad_norm": 2.0534710712272726, + "learning_rate": 1.416529079317771e-06, + "loss": 0.2937, + "step": 10896 + }, + { + "epoch": 1.6674827850038256, + "grad_norm": 2.0695438008325646, + "learning_rate": 1.4152578628663706e-06, + "loss": 0.3173, + "step": 10897 + }, + { + "epoch": 1.6676358071920427, + "grad_norm": 1.994886244529576, + "learning_rate": 1.4139871736375643e-06, + "loss": 0.3141, + "step": 10898 + }, + { + "epoch": 1.6677888293802603, + "grad_norm": 2.387752398099085, + "learning_rate": 1.4127170117093958e-06, + "loss": 0.375, + "step": 10899 + }, + { + "epoch": 1.6679418515684774, + "grad_norm": 2.190527276810829, + "learning_rate": 1.4114473771598702e-06, + "loss": 0.2836, + "step": 10900 + }, + { + "epoch": 1.6680948737566947, + "grad_norm": 2.670201984085481, + "learning_rate": 1.4101782700669597e-06, + "loss": 0.3888, + "step": 10901 + }, + { + "epoch": 1.668247895944912, + "grad_norm": 2.1367696600688304, + "learning_rate": 1.4089096905086053e-06, + "loss": 0.3103, + "step": 10902 + }, + { + "epoch": 1.6684009181331292, + "grad_norm": 1.7876575207949825, + "learning_rate": 1.4076416385627146e-06, + "loss": 0.2303, + "step": 10903 + }, + { + "epoch": 1.6685539403213467, + "grad_norm": 2.411778716322336, + "learning_rate": 1.4063741143071652e-06, + "loss": 0.2621, + "step": 10904 + }, + { + "epoch": 1.6687069625095639, + "grad_norm": 2.160323074264381, + "learning_rate": 
1.4051071178197996e-06, + "loss": 0.2919, + "step": 10905 + }, + { + "epoch": 1.6688599846977812, + "grad_norm": 2.0159113695934696, + "learning_rate": 1.4038406491784285e-06, + "loss": 0.2663, + "step": 10906 + }, + { + "epoch": 1.6690130068859985, + "grad_norm": 1.8991370060184651, + "learning_rate": 1.4025747084608322e-06, + "loss": 0.2995, + "step": 10907 + }, + { + "epoch": 1.6691660290742156, + "grad_norm": 1.9419152106094528, + "learning_rate": 1.401309295744756e-06, + "loss": 0.2903, + "step": 10908 + }, + { + "epoch": 1.6693190512624332, + "grad_norm": 1.7963355100642329, + "learning_rate": 1.4000444111079147e-06, + "loss": 0.2973, + "step": 10909 + }, + { + "epoch": 1.6694720734506503, + "grad_norm": 1.8830633868108124, + "learning_rate": 1.3987800546279895e-06, + "loss": 0.2105, + "step": 10910 + }, + { + "epoch": 1.6696250956388676, + "grad_norm": 2.2856483161991092, + "learning_rate": 1.397516226382629e-06, + "loss": 0.2671, + "step": 10911 + }, + { + "epoch": 1.669778117827085, + "grad_norm": 1.771961190145075, + "learning_rate": 1.3962529264494507e-06, + "loss": 0.1899, + "step": 10912 + }, + { + "epoch": 1.669931140015302, + "grad_norm": 1.8975790556274168, + "learning_rate": 1.3949901549060373e-06, + "loss": 0.2446, + "step": 10913 + }, + { + "epoch": 1.6700841622035196, + "grad_norm": 1.596883540271513, + "learning_rate": 1.3937279118299417e-06, + "loss": 0.1948, + "step": 10914 + }, + { + "epoch": 1.6702371843917367, + "grad_norm": 2.1678140693547583, + "learning_rate": 1.392466197298683e-06, + "loss": 0.2797, + "step": 10915 + }, + { + "epoch": 1.670390206579954, + "grad_norm": 2.5530718585428835, + "learning_rate": 1.3912050113897457e-06, + "loss": 0.3456, + "step": 10916 + }, + { + "epoch": 1.6705432287681714, + "grad_norm": 2.055834230738391, + "learning_rate": 1.3899443541805912e-06, + "loss": 0.272, + "step": 10917 + }, + { + "epoch": 1.6706962509563885, + "grad_norm": 2.2459829700144107, + "learning_rate": 1.3886842257486344e-06, + "loss": 0.3651, + "step": 10918 + }, + { + "epoch": 1.670849273144606, + "grad_norm": 2.1164224680206662, + "learning_rate": 1.3874246261712644e-06, + "loss": 0.2782, + "step": 10919 + }, + { + "epoch": 1.6710022953328232, + "grad_norm": 1.8515975103873454, + "learning_rate": 1.3861655555258457e-06, + "loss": 0.2176, + "step": 10920 + }, + { + "epoch": 1.6711553175210405, + "grad_norm": 2.025232486640509, + "learning_rate": 1.3849070138896948e-06, + "loss": 0.2984, + "step": 10921 + }, + { + "epoch": 1.6713083397092579, + "grad_norm": 2.182041754809988, + "learning_rate": 1.3836490013401049e-06, + "loss": 0.3218, + "step": 10922 + }, + { + "epoch": 1.6714613618974752, + "grad_norm": 2.104884022879343, + "learning_rate": 1.3823915179543411e-06, + "loss": 0.3369, + "step": 10923 + }, + { + "epoch": 1.6716143840856925, + "grad_norm": 2.259995500180198, + "learning_rate": 1.381134563809623e-06, + "loss": 0.3001, + "step": 10924 + }, + { + "epoch": 1.6717674062739096, + "grad_norm": 1.9805033206567582, + "learning_rate": 1.3798781389831505e-06, + "loss": 0.245, + "step": 10925 + }, + { + "epoch": 1.671920428462127, + "grad_norm": 2.108247142452472, + "learning_rate": 1.3786222435520846e-06, + "loss": 0.2678, + "step": 10926 + }, + { + "epoch": 1.6720734506503443, + "grad_norm": 2.2290200835316747, + "learning_rate": 1.377366877593551e-06, + "loss": 0.2508, + "step": 10927 + }, + { + "epoch": 1.6722264728385616, + "grad_norm": 2.2448037125405973, + "learning_rate": 1.3761120411846506e-06, + "loss": 0.3183, + "step": 10928 + }, + { + "epoch": 
1.672379495026779, + "grad_norm": 2.168758056304277, + "learning_rate": 1.374857734402446e-06, + "loss": 0.3377, + "step": 10929 + }, + { + "epoch": 1.672532517214996, + "grad_norm": 1.8955123459943806, + "learning_rate": 1.3736039573239712e-06, + "loss": 0.2873, + "step": 10930 + }, + { + "epoch": 1.6726855394032136, + "grad_norm": 2.1207109975957437, + "learning_rate": 1.3723507100262234e-06, + "loss": 0.2861, + "step": 10931 + }, + { + "epoch": 1.6728385615914307, + "grad_norm": 1.9763204898257254, + "learning_rate": 1.37109799258617e-06, + "loss": 0.2584, + "step": 10932 + }, + { + "epoch": 1.672991583779648, + "grad_norm": 2.475408425541011, + "learning_rate": 1.3698458050807451e-06, + "loss": 0.3446, + "step": 10933 + }, + { + "epoch": 1.6731446059678654, + "grad_norm": 2.092829234547134, + "learning_rate": 1.3685941475868502e-06, + "loss": 0.3435, + "step": 10934 + }, + { + "epoch": 1.6732976281560825, + "grad_norm": 2.40580483114001, + "learning_rate": 1.3673430201813553e-06, + "loss": 0.3217, + "step": 10935 + }, + { + "epoch": 1.6734506503443, + "grad_norm": 2.17627753965489, + "learning_rate": 1.3660924229410976e-06, + "loss": 0.2959, + "step": 10936 + }, + { + "epoch": 1.6736036725325172, + "grad_norm": 2.753710906264951, + "learning_rate": 1.3648423559428792e-06, + "loss": 0.3364, + "step": 10937 + }, + { + "epoch": 1.6737566947207345, + "grad_norm": 2.4017439533471885, + "learning_rate": 1.3635928192634728e-06, + "loss": 0.3363, + "step": 10938 + }, + { + "epoch": 1.6739097169089519, + "grad_norm": 2.0006436609312455, + "learning_rate": 1.3623438129796173e-06, + "loss": 0.3522, + "step": 10939 + }, + { + "epoch": 1.674062739097169, + "grad_norm": 2.0945862255681624, + "learning_rate": 1.361095337168019e-06, + "loss": 0.283, + "step": 10940 + }, + { + "epoch": 1.6742157612853865, + "grad_norm": 1.9868929754295699, + "learning_rate": 1.3598473919053524e-06, + "loss": 0.2842, + "step": 10941 + }, + { + "epoch": 1.6743687834736036, + "grad_norm": 1.7594827525908399, + "learning_rate": 1.3585999772682568e-06, + "loss": 0.2647, + "step": 10942 + }, + { + "epoch": 1.674521805661821, + "grad_norm": 2.061653215879647, + "learning_rate": 1.3573530933333423e-06, + "loss": 0.2889, + "step": 10943 + }, + { + "epoch": 1.6746748278500383, + "grad_norm": 2.0181626743821615, + "learning_rate": 1.3561067401771854e-06, + "loss": 0.3083, + "step": 10944 + }, + { + "epoch": 1.6748278500382554, + "grad_norm": 2.2322001593250973, + "learning_rate": 1.354860917876325e-06, + "loss": 0.3274, + "step": 10945 + }, + { + "epoch": 1.674980872226473, + "grad_norm": 2.033061742018054, + "learning_rate": 1.3536156265072808e-06, + "loss": 0.2822, + "step": 10946 + }, + { + "epoch": 1.67513389441469, + "grad_norm": 2.0820930216656577, + "learning_rate": 1.352370866146524e-06, + "loss": 0.2432, + "step": 10947 + }, + { + "epoch": 1.6752869166029074, + "grad_norm": 2.2752337340265045, + "learning_rate": 1.3511266368704989e-06, + "loss": 0.2551, + "step": 10948 + }, + { + "epoch": 1.6754399387911247, + "grad_norm": 2.2610051373444007, + "learning_rate": 1.3498829387556268e-06, + "loss": 0.3106, + "step": 10949 + }, + { + "epoch": 1.6755929609793418, + "grad_norm": 1.7493537756146116, + "learning_rate": 1.3486397718782796e-06, + "loss": 0.2036, + "step": 10950 + }, + { + "epoch": 1.6757459831675594, + "grad_norm": 2.3185756256801606, + "learning_rate": 1.3473971363148074e-06, + "loss": 0.3175, + "step": 10951 + }, + { + "epoch": 1.6758990053557765, + "grad_norm": 2.425341043817578, + "learning_rate": 
1.3461550321415295e-06, + "loss": 0.3125, + "step": 10952 + }, + { + "epoch": 1.6760520275439938, + "grad_norm": 2.432317825748467, + "learning_rate": 1.3449134594347213e-06, + "loss": 0.3109, + "step": 10953 + }, + { + "epoch": 1.6762050497322112, + "grad_norm": 2.5743220175885297, + "learning_rate": 1.3436724182706396e-06, + "loss": 0.305, + "step": 10954 + }, + { + "epoch": 1.6763580719204285, + "grad_norm": 1.9770393582495958, + "learning_rate": 1.342431908725499e-06, + "loss": 0.2984, + "step": 10955 + }, + { + "epoch": 1.6765110941086458, + "grad_norm": 2.1248514001509116, + "learning_rate": 1.3411919308754794e-06, + "loss": 0.3008, + "step": 10956 + }, + { + "epoch": 1.676664116296863, + "grad_norm": 2.0468319850996872, + "learning_rate": 1.3399524847967405e-06, + "loss": 0.2727, + "step": 10957 + }, + { + "epoch": 1.6768171384850803, + "grad_norm": 2.036751420201803, + "learning_rate": 1.3387135705653986e-06, + "loss": 0.274, + "step": 10958 + }, + { + "epoch": 1.6769701606732976, + "grad_norm": 1.9180774151723226, + "learning_rate": 1.3374751882575354e-06, + "loss": 0.2751, + "step": 10959 + }, + { + "epoch": 1.677123182861515, + "grad_norm": 2.4520454556972746, + "learning_rate": 1.336237337949211e-06, + "loss": 0.3382, + "step": 10960 + }, + { + "epoch": 1.6772762050497323, + "grad_norm": 1.761323796608222, + "learning_rate": 1.3350000197164436e-06, + "loss": 0.2501, + "step": 10961 + }, + { + "epoch": 1.6774292272379494, + "grad_norm": 2.062316820272847, + "learning_rate": 1.3337632336352234e-06, + "loss": 0.2693, + "step": 10962 + }, + { + "epoch": 1.677582249426167, + "grad_norm": 2.0720852517253214, + "learning_rate": 1.3325269797815066e-06, + "loss": 0.3164, + "step": 10963 + }, + { + "epoch": 1.677735271614384, + "grad_norm": 2.27703499150179, + "learning_rate": 1.3312912582312143e-06, + "loss": 0.3367, + "step": 10964 + }, + { + "epoch": 1.6778882938026014, + "grad_norm": 1.960915018382373, + "learning_rate": 1.3300560690602382e-06, + "loss": 0.2368, + "step": 10965 + }, + { + "epoch": 1.6780413159908187, + "grad_norm": 1.9766829049595693, + "learning_rate": 1.328821412344433e-06, + "loss": 0.2469, + "step": 10966 + }, + { + "epoch": 1.6781943381790358, + "grad_norm": 1.9250997866626893, + "learning_rate": 1.3275872881596319e-06, + "loss": 0.2875, + "step": 10967 + }, + { + "epoch": 1.6783473603672534, + "grad_norm": 1.549837070098533, + "learning_rate": 1.3263536965816203e-06, + "loss": 0.2024, + "step": 10968 + }, + { + "epoch": 1.6785003825554705, + "grad_norm": 1.7642085141734583, + "learning_rate": 1.3251206376861569e-06, + "loss": 0.248, + "step": 10969 + }, + { + "epoch": 1.6786534047436878, + "grad_norm": 2.0836891565839486, + "learning_rate": 1.3238881115489755e-06, + "loss": 0.2646, + "step": 10970 + }, + { + "epoch": 1.6788064269319052, + "grad_norm": 2.008808530117174, + "learning_rate": 1.3226561182457642e-06, + "loss": 0.3069, + "step": 10971 + }, + { + "epoch": 1.6789594491201223, + "grad_norm": 2.25601227776761, + "learning_rate": 1.321424657852185e-06, + "loss": 0.3359, + "step": 10972 + }, + { + "epoch": 1.6791124713083398, + "grad_norm": 2.028677462966108, + "learning_rate": 1.3201937304438728e-06, + "loss": 0.3192, + "step": 10973 + }, + { + "epoch": 1.679265493496557, + "grad_norm": 1.9466678524250804, + "learning_rate": 1.3189633360964137e-06, + "loss": 0.3153, + "step": 10974 + }, + { + "epoch": 1.6794185156847743, + "grad_norm": 2.045679294294252, + "learning_rate": 1.3177334748853798e-06, + "loss": 0.3224, + "step": 10975 + }, + { + "epoch": 
1.6795715378729916, + "grad_norm": 1.902697425925076, + "learning_rate": 1.3165041468863004e-06, + "loss": 0.2409, + "step": 10976 + }, + { + "epoch": 1.6797245600612087, + "grad_norm": 1.9497283206798874, + "learning_rate": 1.3152753521746676e-06, + "loss": 0.2643, + "step": 10977 + }, + { + "epoch": 1.6798775822494263, + "grad_norm": 2.1252780328224996, + "learning_rate": 1.3140470908259517e-06, + "loss": 0.327, + "step": 10978 + }, + { + "epoch": 1.6800306044376434, + "grad_norm": 2.0928532425925597, + "learning_rate": 1.312819362915585e-06, + "loss": 0.2725, + "step": 10979 + }, + { + "epoch": 1.6801836266258607, + "grad_norm": 2.2119579928711413, + "learning_rate": 1.3115921685189625e-06, + "loss": 0.2886, + "step": 10980 + }, + { + "epoch": 1.680336648814078, + "grad_norm": 1.7936452479086349, + "learning_rate": 1.3103655077114563e-06, + "loss": 0.2175, + "step": 10981 + }, + { + "epoch": 1.6804896710022952, + "grad_norm": 2.2272606564737445, + "learning_rate": 1.3091393805683972e-06, + "loss": 0.3237, + "step": 10982 + }, + { + "epoch": 1.6806426931905127, + "grad_norm": 2.356171938700306, + "learning_rate": 1.3079137871650894e-06, + "loss": 0.2936, + "step": 10983 + }, + { + "epoch": 1.6807957153787298, + "grad_norm": 2.232352653535342, + "learning_rate": 1.306688727576798e-06, + "loss": 0.3052, + "step": 10984 + }, + { + "epoch": 1.6809487375669472, + "grad_norm": 2.1782319417814153, + "learning_rate": 1.3054642018787612e-06, + "loss": 0.3149, + "step": 10985 + }, + { + "epoch": 1.6811017597551645, + "grad_norm": 2.2406002945095738, + "learning_rate": 1.3042402101461804e-06, + "loss": 0.2902, + "step": 10986 + }, + { + "epoch": 1.6812547819433816, + "grad_norm": 1.9244081104162174, + "learning_rate": 1.303016752454226e-06, + "loss": 0.2905, + "step": 10987 + }, + { + "epoch": 1.6814078041315992, + "grad_norm": 1.797026733705302, + "learning_rate": 1.3017938288780363e-06, + "loss": 0.3086, + "step": 10988 + }, + { + "epoch": 1.6815608263198163, + "grad_norm": 1.9509682390662428, + "learning_rate": 1.300571439492715e-06, + "loss": 0.2575, + "step": 10989 + }, + { + "epoch": 1.6817138485080336, + "grad_norm": 2.45570148479363, + "learning_rate": 1.2993495843733339e-06, + "loss": 0.3735, + "step": 10990 + }, + { + "epoch": 1.681866870696251, + "grad_norm": 1.949368989482816, + "learning_rate": 1.298128263594932e-06, + "loss": 0.3218, + "step": 10991 + }, + { + "epoch": 1.6820198928844683, + "grad_norm": 2.067653056965363, + "learning_rate": 1.2969074772325151e-06, + "loss": 0.3221, + "step": 10992 + }, + { + "epoch": 1.6821729150726856, + "grad_norm": 2.311033815856632, + "learning_rate": 1.2956872253610576e-06, + "loss": 0.3396, + "step": 10993 + }, + { + "epoch": 1.6823259372609027, + "grad_norm": 1.9032585435009628, + "learning_rate": 1.294467508055498e-06, + "loss": 0.2379, + "step": 10994 + }, + { + "epoch": 1.68247895944912, + "grad_norm": 2.2597516199257406, + "learning_rate": 1.2932483253907446e-06, + "loss": 0.2719, + "step": 10995 + }, + { + "epoch": 1.6826319816373374, + "grad_norm": 2.196069288251133, + "learning_rate": 1.2920296774416742e-06, + "loss": 0.3325, + "step": 10996 + }, + { + "epoch": 1.6827850038255547, + "grad_norm": 2.089356908329131, + "learning_rate": 1.2908115642831254e-06, + "loss": 0.2846, + "step": 10997 + }, + { + "epoch": 1.682938026013772, + "grad_norm": 2.500008607734077, + "learning_rate": 1.2895939859899075e-06, + "loss": 0.3552, + "step": 10998 + }, + { + "epoch": 1.6830910482019892, + "grad_norm": 2.214635489148657, + "learning_rate": 
1.2883769426368032e-06, + "loss": 0.3502, + "step": 10999 + }, + { + "epoch": 1.6832440703902067, + "grad_norm": 1.915744367799281, + "learning_rate": 1.2871604342985478e-06, + "loss": 0.2487, + "step": 11000 + }, + { + "epoch": 1.6833970925784238, + "grad_norm": 2.2836247432485948, + "learning_rate": 1.2859444610498539e-06, + "loss": 0.293, + "step": 11001 + }, + { + "epoch": 1.6835501147666412, + "grad_norm": 1.904948231183335, + "learning_rate": 1.284729022965403e-06, + "loss": 0.2842, + "step": 11002 + }, + { + "epoch": 1.6837031369548585, + "grad_norm": 2.108321600771051, + "learning_rate": 1.2835141201198343e-06, + "loss": 0.2656, + "step": 11003 + }, + { + "epoch": 1.6838561591430756, + "grad_norm": 2.093179930514413, + "learning_rate": 1.2822997525877633e-06, + "loss": 0.2816, + "step": 11004 + }, + { + "epoch": 1.6840091813312932, + "grad_norm": 2.377483764466178, + "learning_rate": 1.281085920443772e-06, + "loss": 0.3617, + "step": 11005 + }, + { + "epoch": 1.6841622035195103, + "grad_norm": 2.167029247250692, + "learning_rate": 1.2798726237623971e-06, + "loss": 0.2789, + "step": 11006 + }, + { + "epoch": 1.6843152257077276, + "grad_norm": 2.182866340215894, + "learning_rate": 1.2786598626181613e-06, + "loss": 0.2687, + "step": 11007 + }, + { + "epoch": 1.684468247895945, + "grad_norm": 1.9979186288108053, + "learning_rate": 1.2774476370855426e-06, + "loss": 0.2822, + "step": 11008 + }, + { + "epoch": 1.684621270084162, + "grad_norm": 2.1936633301966597, + "learning_rate": 1.276235947238983e-06, + "loss": 0.2928, + "step": 11009 + }, + { + "epoch": 1.6847742922723796, + "grad_norm": 2.104500943307297, + "learning_rate": 1.2750247931529035e-06, + "loss": 0.2523, + "step": 11010 + }, + { + "epoch": 1.6849273144605967, + "grad_norm": 1.920693123555798, + "learning_rate": 1.273814174901684e-06, + "loss": 0.3069, + "step": 11011 + }, + { + "epoch": 1.685080336648814, + "grad_norm": 2.299114587660011, + "learning_rate": 1.272604092559674e-06, + "loss": 0.295, + "step": 11012 + }, + { + "epoch": 1.6852333588370314, + "grad_norm": 2.1949856289025647, + "learning_rate": 1.2713945462011868e-06, + "loss": 0.2994, + "step": 11013 + }, + { + "epoch": 1.6853863810252485, + "grad_norm": 1.9841400340099198, + "learning_rate": 1.2701855359005077e-06, + "loss": 0.2852, + "step": 11014 + }, + { + "epoch": 1.685539403213466, + "grad_norm": 2.4095612568256866, + "learning_rate": 1.2689770617318864e-06, + "loss": 0.2926, + "step": 11015 + }, + { + "epoch": 1.6856924254016832, + "grad_norm": 2.067956237774031, + "learning_rate": 1.2677691237695399e-06, + "loss": 0.3413, + "step": 11016 + }, + { + "epoch": 1.6858454475899005, + "grad_norm": 2.2597156371173814, + "learning_rate": 1.2665617220876513e-06, + "loss": 0.3331, + "step": 11017 + }, + { + "epoch": 1.6859984697781178, + "grad_norm": 2.1700644455923297, + "learning_rate": 1.2653548567603736e-06, + "loss": 0.3578, + "step": 11018 + }, + { + "epoch": 1.686151491966335, + "grad_norm": 2.0662294654838527, + "learning_rate": 1.2641485278618237e-06, + "loss": 0.2941, + "step": 11019 + }, + { + "epoch": 1.6863045141545525, + "grad_norm": 2.129039860992622, + "learning_rate": 1.2629427354660872e-06, + "loss": 0.3242, + "step": 11020 + }, + { + "epoch": 1.6864575363427696, + "grad_norm": 2.3578673412858, + "learning_rate": 1.2617374796472181e-06, + "loss": 0.3425, + "step": 11021 + }, + { + "epoch": 1.686610558530987, + "grad_norm": 2.057803857866164, + "learning_rate": 1.260532760479234e-06, + "loss": 0.2395, + "step": 11022 + }, + { + "epoch": 
1.6867635807192043, + "grad_norm": 2.0059512760022593, + "learning_rate": 1.259328578036122e-06, + "loss": 0.3397, + "step": 11023 + }, + { + "epoch": 1.6869166029074216, + "grad_norm": 2.049651286807147, + "learning_rate": 1.2581249323918354e-06, + "loss": 0.2789, + "step": 11024 + }, + { + "epoch": 1.687069625095639, + "grad_norm": 2.499375308441647, + "learning_rate": 1.2569218236202952e-06, + "loss": 0.3722, + "step": 11025 + }, + { + "epoch": 1.687222647283856, + "grad_norm": 1.960400132055227, + "learning_rate": 1.2557192517953897e-06, + "loss": 0.2968, + "step": 11026 + }, + { + "epoch": 1.6873756694720734, + "grad_norm": 2.1366266710985413, + "learning_rate": 1.2545172169909703e-06, + "loss": 0.283, + "step": 11027 + }, + { + "epoch": 1.6875286916602907, + "grad_norm": 2.039190519536008, + "learning_rate": 1.2533157192808644e-06, + "loss": 0.3212, + "step": 11028 + }, + { + "epoch": 1.687681713848508, + "grad_norm": 2.0949669870871093, + "learning_rate": 1.252114758738856e-06, + "loss": 0.2679, + "step": 11029 + }, + { + "epoch": 1.6878347360367254, + "grad_norm": 2.0416473705727487, + "learning_rate": 1.250914335438701e-06, + "loss": 0.2916, + "step": 11030 + }, + { + "epoch": 1.6879877582249425, + "grad_norm": 2.09341563269475, + "learning_rate": 1.2497144494541258e-06, + "loss": 0.2946, + "step": 11031 + }, + { + "epoch": 1.68814078041316, + "grad_norm": 2.270626399599111, + "learning_rate": 1.248515100858817e-06, + "loss": 0.331, + "step": 11032 + }, + { + "epoch": 1.6882938026013772, + "grad_norm": 2.2390309838409883, + "learning_rate": 1.247316289726429e-06, + "loss": 0.3092, + "step": 11033 + }, + { + "epoch": 1.6884468247895945, + "grad_norm": 2.1983290793456254, + "learning_rate": 1.2461180161305919e-06, + "loss": 0.3022, + "step": 11034 + }, + { + "epoch": 1.6885998469778118, + "grad_norm": 2.0568056713653875, + "learning_rate": 1.24492028014489e-06, + "loss": 0.3016, + "step": 11035 + }, + { + "epoch": 1.688752869166029, + "grad_norm": 2.3206498756421152, + "learning_rate": 1.2437230818428846e-06, + "loss": 0.3531, + "step": 11036 + }, + { + "epoch": 1.6889058913542465, + "grad_norm": 2.0813455867828643, + "learning_rate": 1.2425264212981024e-06, + "loss": 0.2628, + "step": 11037 + }, + { + "epoch": 1.6890589135424636, + "grad_norm": 2.0413458094502785, + "learning_rate": 1.241330298584027e-06, + "loss": 0.3088, + "step": 11038 + }, + { + "epoch": 1.689211935730681, + "grad_norm": 2.1582679198860637, + "learning_rate": 1.2401347137741248e-06, + "loss": 0.3091, + "step": 11039 + }, + { + "epoch": 1.6893649579188983, + "grad_norm": 2.069240644552984, + "learning_rate": 1.2389396669418185e-06, + "loss": 0.2866, + "step": 11040 + }, + { + "epoch": 1.6895179801071154, + "grad_norm": 1.9584455619267858, + "learning_rate": 1.2377451581605015e-06, + "loss": 0.3061, + "step": 11041 + }, + { + "epoch": 1.689671002295333, + "grad_norm": 2.3277588997885412, + "learning_rate": 1.2365511875035319e-06, + "loss": 0.3369, + "step": 11042 + }, + { + "epoch": 1.68982402448355, + "grad_norm": 1.9800477950424002, + "learning_rate": 1.2353577550442363e-06, + "loss": 0.2405, + "step": 11043 + }, + { + "epoch": 1.6899770466717674, + "grad_norm": 2.160902013523082, + "learning_rate": 1.2341648608559088e-06, + "loss": 0.2896, + "step": 11044 + }, + { + "epoch": 1.6901300688599847, + "grad_norm": 2.1642624810194877, + "learning_rate": 1.2329725050118091e-06, + "loss": 0.2678, + "step": 11045 + }, + { + "epoch": 1.6902830910482018, + "grad_norm": 2.209833345083377, + "learning_rate": 
1.2317806875851646e-06, + "loss": 0.2937, + "step": 11046 + }, + { + "epoch": 1.6904361132364194, + "grad_norm": 2.402841329362798, + "learning_rate": 1.23058940864917e-06, + "loss": 0.286, + "step": 11047 + }, + { + "epoch": 1.6905891354246365, + "grad_norm": 2.522322338055443, + "learning_rate": 1.2293986682769832e-06, + "loss": 0.3434, + "step": 11048 + }, + { + "epoch": 1.6907421576128538, + "grad_norm": 1.9879433395929236, + "learning_rate": 1.2282084665417404e-06, + "loss": 0.2557, + "step": 11049 + }, + { + "epoch": 1.6908951798010712, + "grad_norm": 2.0561928258750366, + "learning_rate": 1.2270188035165277e-06, + "loss": 0.2496, + "step": 11050 + }, + { + "epoch": 1.6910482019892883, + "grad_norm": 1.9106083750167469, + "learning_rate": 1.2258296792744084e-06, + "loss": 0.3045, + "step": 11051 + }, + { + "epoch": 1.6912012241775058, + "grad_norm": 1.8611621144706574, + "learning_rate": 1.224641093888418e-06, + "loss": 0.2356, + "step": 11052 + }, + { + "epoch": 1.691354246365723, + "grad_norm": 2.020755692551469, + "learning_rate": 1.2234530474315453e-06, + "loss": 0.2391, + "step": 11053 + }, + { + "epoch": 1.6915072685539403, + "grad_norm": 2.3678631524009353, + "learning_rate": 1.2222655399767524e-06, + "loss": 0.2883, + "step": 11054 + }, + { + "epoch": 1.6916602907421576, + "grad_norm": 2.246798245562036, + "learning_rate": 1.2210785715969765e-06, + "loss": 0.3168, + "step": 11055 + }, + { + "epoch": 1.691813312930375, + "grad_norm": 2.1634726407960088, + "learning_rate": 1.2198921423651034e-06, + "loss": 0.2602, + "step": 11056 + }, + { + "epoch": 1.6919663351185923, + "grad_norm": 1.8821433603848743, + "learning_rate": 1.218706252354005e-06, + "loss": 0.2641, + "step": 11057 + }, + { + "epoch": 1.6921193573068094, + "grad_norm": 2.1581611539677863, + "learning_rate": 1.2175209016365098e-06, + "loss": 0.3223, + "step": 11058 + }, + { + "epoch": 1.6922723794950267, + "grad_norm": 2.07129171472233, + "learning_rate": 1.2163360902854094e-06, + "loss": 0.3231, + "step": 11059 + }, + { + "epoch": 1.692425401683244, + "grad_norm": 2.270422976688089, + "learning_rate": 1.2151518183734735e-06, + "loss": 0.311, + "step": 11060 + }, + { + "epoch": 1.6925784238714614, + "grad_norm": 1.8203368474711896, + "learning_rate": 1.2139680859734326e-06, + "loss": 0.2481, + "step": 11061 + }, + { + "epoch": 1.6927314460596787, + "grad_norm": 2.0614554546840553, + "learning_rate": 1.2127848931579788e-06, + "loss": 0.2811, + "step": 11062 + }, + { + "epoch": 1.6928844682478958, + "grad_norm": 2.196661166866587, + "learning_rate": 1.2116022399997828e-06, + "loss": 0.3167, + "step": 11063 + }, + { + "epoch": 1.6930374904361134, + "grad_norm": 2.077107983851203, + "learning_rate": 1.2104201265714743e-06, + "loss": 0.2713, + "step": 11064 + }, + { + "epoch": 1.6931905126243305, + "grad_norm": 2.0755681276903424, + "learning_rate": 1.2092385529456497e-06, + "loss": 0.2502, + "step": 11065 + }, + { + "epoch": 1.6933435348125478, + "grad_norm": 2.22983973981828, + "learning_rate": 1.2080575191948763e-06, + "loss": 0.2748, + "step": 11066 + }, + { + "epoch": 1.6934965570007652, + "grad_norm": 2.2026304230846323, + "learning_rate": 1.206877025391684e-06, + "loss": 0.2916, + "step": 11067 + }, + { + "epoch": 1.6936495791889823, + "grad_norm": 2.1680872094915915, + "learning_rate": 1.2056970716085724e-06, + "loss": 0.2673, + "step": 11068 + }, + { + "epoch": 1.6938026013771998, + "grad_norm": 1.849240136022882, + "learning_rate": 1.2045176579180074e-06, + "loss": 0.2087, + "step": 11069 + }, + { + "epoch": 
1.693955623565417, + "grad_norm": 2.0659329721045308, + "learning_rate": 1.2033387843924215e-06, + "loss": 0.2818, + "step": 11070 + }, + { + "epoch": 1.6941086457536343, + "grad_norm": 2.2135172712744176, + "learning_rate": 1.202160451104213e-06, + "loss": 0.3076, + "step": 11071 + }, + { + "epoch": 1.6942616679418516, + "grad_norm": 2.20420719101936, + "learning_rate": 1.2009826581257488e-06, + "loss": 0.3086, + "step": 11072 + }, + { + "epoch": 1.6944146901300687, + "grad_norm": 1.9144814530586822, + "learning_rate": 1.1998054055293617e-06, + "loss": 0.4203, + "step": 11073 + }, + { + "epoch": 1.6945677123182863, + "grad_norm": 2.5558070625916343, + "learning_rate": 1.1986286933873503e-06, + "loss": 0.2861, + "step": 11074 + }, + { + "epoch": 1.6947207345065034, + "grad_norm": 2.2374112894675595, + "learning_rate": 1.1974525217719835e-06, + "loss": 0.3367, + "step": 11075 + }, + { + "epoch": 1.6948737566947207, + "grad_norm": 2.1946913555112637, + "learning_rate": 1.1962768907554923e-06, + "loss": 0.3348, + "step": 11076 + }, + { + "epoch": 1.695026778882938, + "grad_norm": 2.151829815904901, + "learning_rate": 1.1951018004100757e-06, + "loss": 0.3246, + "step": 11077 + }, + { + "epoch": 1.6951798010711552, + "grad_norm": 2.2967753420143513, + "learning_rate": 1.1939272508079058e-06, + "loss": 0.335, + "step": 11078 + }, + { + "epoch": 1.6953328232593727, + "grad_norm": 2.2577136557529838, + "learning_rate": 1.192753242021112e-06, + "loss": 0.2895, + "step": 11079 + }, + { + "epoch": 1.6954858454475898, + "grad_norm": 2.128267511682266, + "learning_rate": 1.1915797741217928e-06, + "loss": 0.257, + "step": 11080 + }, + { + "epoch": 1.6956388676358072, + "grad_norm": 2.4002431250160567, + "learning_rate": 1.190406847182023e-06, + "loss": 0.3277, + "step": 11081 + }, + { + "epoch": 1.6957918898240245, + "grad_norm": 2.2038719394242023, + "learning_rate": 1.1892344612738305e-06, + "loss": 0.263, + "step": 11082 + }, + { + "epoch": 1.6959449120122416, + "grad_norm": 1.8881112103439488, + "learning_rate": 1.1880626164692154e-06, + "loss": 0.2243, + "step": 11083 + }, + { + "epoch": 1.6960979342004592, + "grad_norm": 2.138352754268959, + "learning_rate": 1.1868913128401504e-06, + "loss": 0.2879, + "step": 11084 + }, + { + "epoch": 1.6962509563886763, + "grad_norm": 1.828210483145325, + "learning_rate": 1.1857205504585645e-06, + "loss": 0.2238, + "step": 11085 + }, + { + "epoch": 1.6964039785768936, + "grad_norm": 2.5405874955917516, + "learning_rate": 1.1845503293963623e-06, + "loss": 0.3505, + "step": 11086 + }, + { + "epoch": 1.696557000765111, + "grad_norm": 2.111535006004857, + "learning_rate": 1.183380649725413e-06, + "loss": 0.2496, + "step": 11087 + }, + { + "epoch": 1.696710022953328, + "grad_norm": 2.424057326610335, + "learning_rate": 1.1822115115175448e-06, + "loss": 0.3544, + "step": 11088 + }, + { + "epoch": 1.6968630451415456, + "grad_norm": 2.4592854858100512, + "learning_rate": 1.1810429148445645e-06, + "loss": 0.2626, + "step": 11089 + }, + { + "epoch": 1.6970160673297627, + "grad_norm": 2.042198436206543, + "learning_rate": 1.1798748597782417e-06, + "loss": 0.2835, + "step": 11090 + }, + { + "epoch": 1.69716908951798, + "grad_norm": 2.275740572056835, + "learning_rate": 1.1787073463903033e-06, + "loss": 0.2623, + "step": 11091 + }, + { + "epoch": 1.6973221117061974, + "grad_norm": 2.158557298967889, + "learning_rate": 1.1775403747524582e-06, + "loss": 0.2947, + "step": 11092 + }, + { + "epoch": 1.6974751338944147, + "grad_norm": 1.9757088684466695, + "learning_rate": 
1.176373944936372e-06, + "loss": 0.2579, + "step": 11093 + }, + { + "epoch": 1.697628156082632, + "grad_norm": 1.7875604110832137, + "learning_rate": 1.1752080570136814e-06, + "loss": 0.2421, + "step": 11094 + }, + { + "epoch": 1.6977811782708492, + "grad_norm": 2.2847682190407674, + "learning_rate": 1.174042711055986e-06, + "loss": 0.2785, + "step": 11095 + }, + { + "epoch": 1.6979342004590665, + "grad_norm": 2.1586205655942345, + "learning_rate": 1.172877907134855e-06, + "loss": 0.2986, + "step": 11096 + }, + { + "epoch": 1.6980872226472838, + "grad_norm": 2.5074890553104403, + "learning_rate": 1.1717136453218236e-06, + "loss": 0.3686, + "step": 11097 + }, + { + "epoch": 1.6982402448355012, + "grad_norm": 2.01025453080292, + "learning_rate": 1.1705499256883934e-06, + "loss": 0.2587, + "step": 11098 + }, + { + "epoch": 1.6983932670237185, + "grad_norm": 2.3308542569292303, + "learning_rate": 1.1693867483060328e-06, + "loss": 0.2932, + "step": 11099 + }, + { + "epoch": 1.6985462892119356, + "grad_norm": 2.234928679181359, + "learning_rate": 1.1682241132461791e-06, + "loss": 0.2752, + "step": 11100 + }, + { + "epoch": 1.6986993114001532, + "grad_norm": 2.4713138884866472, + "learning_rate": 1.1670620205802319e-06, + "loss": 0.3389, + "step": 11101 + }, + { + "epoch": 1.6988523335883703, + "grad_norm": 2.471201205818322, + "learning_rate": 1.1659004703795607e-06, + "loss": 0.2308, + "step": 11102 + }, + { + "epoch": 1.6990053557765876, + "grad_norm": 1.7215349224563574, + "learning_rate": 1.1647394627155006e-06, + "loss": 0.1932, + "step": 11103 + }, + { + "epoch": 1.699158377964805, + "grad_norm": 2.0922403083721925, + "learning_rate": 1.1635789976593536e-06, + "loss": 0.2619, + "step": 11104 + }, + { + "epoch": 1.699311400153022, + "grad_norm": 2.302314103427617, + "learning_rate": 1.1624190752823895e-06, + "loss": 0.3499, + "step": 11105 + }, + { + "epoch": 1.6994644223412396, + "grad_norm": 1.869357598102231, + "learning_rate": 1.1612596956558398e-06, + "loss": 0.2962, + "step": 11106 + }, + { + "epoch": 1.6996174445294567, + "grad_norm": 2.0397788159015935, + "learning_rate": 1.1601008588509143e-06, + "loss": 0.2743, + "step": 11107 + }, + { + "epoch": 1.699770466717674, + "grad_norm": 2.0520554519115093, + "learning_rate": 1.1589425649387743e-06, + "loss": 0.2592, + "step": 11108 + }, + { + "epoch": 1.6999234889058914, + "grad_norm": 1.7852914422848998, + "learning_rate": 1.1577848139905568e-06, + "loss": 0.2239, + "step": 11109 + }, + { + "epoch": 1.7000765110941085, + "grad_norm": 2.014202409879043, + "learning_rate": 1.1566276060773674e-06, + "loss": 0.2447, + "step": 11110 + }, + { + "epoch": 1.700229533282326, + "grad_norm": 1.989579755348068, + "learning_rate": 1.1554709412702714e-06, + "loss": 0.2221, + "step": 11111 + }, + { + "epoch": 1.7003825554705432, + "grad_norm": 2.090257985515126, + "learning_rate": 1.154314819640302e-06, + "loss": 0.3446, + "step": 11112 + }, + { + "epoch": 1.7005355776587605, + "grad_norm": 1.9125910003610571, + "learning_rate": 1.153159241258469e-06, + "loss": 0.2864, + "step": 11113 + }, + { + "epoch": 1.7006885998469778, + "grad_norm": 2.1613839402350314, + "learning_rate": 1.1520042061957315e-06, + "loss": 0.2825, + "step": 11114 + }, + { + "epoch": 1.700841622035195, + "grad_norm": 2.3295656766209616, + "learning_rate": 1.1508497145230314e-06, + "loss": 0.3438, + "step": 11115 + }, + { + "epoch": 1.7009946442234125, + "grad_norm": 2.2090564568144075, + "learning_rate": 1.1496957663112697e-06, + "loss": 0.3128, + "step": 11116 + }, + { + "epoch": 
1.7011476664116296, + "grad_norm": 2.0492676006983297, + "learning_rate": 1.1485423616313107e-06, + "loss": 0.2841, + "step": 11117 + }, + { + "epoch": 1.701300688599847, + "grad_norm": 2.1697518217573113, + "learning_rate": 1.1473895005539937e-06, + "loss": 0.2625, + "step": 11118 + }, + { + "epoch": 1.7014537107880643, + "grad_norm": 1.9730217228325013, + "learning_rate": 1.1462371831501207e-06, + "loss": 0.2775, + "step": 11119 + }, + { + "epoch": 1.7016067329762814, + "grad_norm": 2.088602724372186, + "learning_rate": 1.1450854094904551e-06, + "loss": 0.3239, + "step": 11120 + }, + { + "epoch": 1.701759755164499, + "grad_norm": 2.3255899147858443, + "learning_rate": 1.143934179645737e-06, + "loss": 0.3086, + "step": 11121 + }, + { + "epoch": 1.701912777352716, + "grad_norm": 1.9272329964493535, + "learning_rate": 1.1427834936866655e-06, + "loss": 0.2355, + "step": 11122 + }, + { + "epoch": 1.7020657995409334, + "grad_norm": 1.9946843187147796, + "learning_rate": 1.1416333516839107e-06, + "loss": 0.2835, + "step": 11123 + }, + { + "epoch": 1.7022188217291507, + "grad_norm": 2.173321803166349, + "learning_rate": 1.1404837537081049e-06, + "loss": 0.2785, + "step": 11124 + }, + { + "epoch": 1.702371843917368, + "grad_norm": 2.0278180668942745, + "learning_rate": 1.1393346998298505e-06, + "loss": 0.266, + "step": 11125 + }, + { + "epoch": 1.7025248661055854, + "grad_norm": 2.0094825725857173, + "learning_rate": 1.1381861901197144e-06, + "loss": 0.2798, + "step": 11126 + }, + { + "epoch": 1.7026778882938025, + "grad_norm": 2.098979114428069, + "learning_rate": 1.1370382246482337e-06, + "loss": 0.2667, + "step": 11127 + }, + { + "epoch": 1.7028309104820198, + "grad_norm": 2.0523444053923305, + "learning_rate": 1.1358908034859062e-06, + "loss": 0.3097, + "step": 11128 + }, + { + "epoch": 1.7029839326702372, + "grad_norm": 2.1537059294575456, + "learning_rate": 1.1347439267032023e-06, + "loss": 0.2674, + "step": 11129 + }, + { + "epoch": 1.7031369548584545, + "grad_norm": 2.1006487237785727, + "learning_rate": 1.1335975943705523e-06, + "loss": 0.2938, + "step": 11130 + }, + { + "epoch": 1.7032899770466718, + "grad_norm": 2.3639342633413114, + "learning_rate": 1.1324518065583633e-06, + "loss": 0.2944, + "step": 11131 + }, + { + "epoch": 1.703442999234889, + "grad_norm": 2.082932273742042, + "learning_rate": 1.1313065633369968e-06, + "loss": 0.2364, + "step": 11132 + }, + { + "epoch": 1.7035960214231065, + "grad_norm": 2.188856557820004, + "learning_rate": 1.1301618647767876e-06, + "loss": 0.3076, + "step": 11133 + }, + { + "epoch": 1.7037490436113236, + "grad_norm": 2.000951821270597, + "learning_rate": 1.1290177109480406e-06, + "loss": 0.2625, + "step": 11134 + }, + { + "epoch": 1.703902065799541, + "grad_norm": 2.430335169813233, + "learning_rate": 1.127874101921018e-06, + "loss": 0.3042, + "step": 11135 + }, + { + "epoch": 1.7040550879877583, + "grad_norm": 2.281204762892098, + "learning_rate": 1.1267310377659524e-06, + "loss": 0.3133, + "step": 11136 + }, + { + "epoch": 1.7042081101759754, + "grad_norm": 1.9930439380863587, + "learning_rate": 1.1255885185530502e-06, + "loss": 0.2929, + "step": 11137 + }, + { + "epoch": 1.704361132364193, + "grad_norm": 1.9680645229256954, + "learning_rate": 1.12444654435247e-06, + "loss": 0.2042, + "step": 11138 + }, + { + "epoch": 1.70451415455241, + "grad_norm": 2.0093847715801285, + "learning_rate": 1.123305115234351e-06, + "loss": 0.349, + "step": 11139 + }, + { + "epoch": 1.7046671767406274, + "grad_norm": 2.1094826918613556, + "learning_rate": 
1.122164231268793e-06, + "loss": 0.2989, + "step": 11140 + }, + { + "epoch": 1.7048201989288447, + "grad_norm": 2.169821861197907, + "learning_rate": 1.1210238925258554e-06, + "loss": 0.2632, + "step": 11141 + }, + { + "epoch": 1.7049732211170618, + "grad_norm": 2.4069773754922466, + "learning_rate": 1.1198840990755777e-06, + "loss": 0.3368, + "step": 11142 + }, + { + "epoch": 1.7051262433052794, + "grad_norm": 2.017085258934159, + "learning_rate": 1.1187448509879562e-06, + "loss": 0.2816, + "step": 11143 + }, + { + "epoch": 1.7052792654934965, + "grad_norm": 1.9728566967575223, + "learning_rate": 1.117606148332957e-06, + "loss": 0.284, + "step": 11144 + }, + { + "epoch": 1.7054322876817138, + "grad_norm": 1.9431671387155085, + "learning_rate": 1.116467991180512e-06, + "loss": 0.2646, + "step": 11145 + }, + { + "epoch": 1.7055853098699312, + "grad_norm": 1.9854827370978707, + "learning_rate": 1.115330379600521e-06, + "loss": 0.2952, + "step": 11146 + }, + { + "epoch": 1.7057383320581483, + "grad_norm": 2.280487685992356, + "learning_rate": 1.114193313662848e-06, + "loss": 0.2895, + "step": 11147 + }, + { + "epoch": 1.7058913542463658, + "grad_norm": 2.287105129323108, + "learning_rate": 1.113056793437326e-06, + "loss": 0.2921, + "step": 11148 + }, + { + "epoch": 1.706044376434583, + "grad_norm": 2.1288322442429384, + "learning_rate": 1.1119208189937514e-06, + "loss": 0.2851, + "step": 11149 + }, + { + "epoch": 1.7061973986228003, + "grad_norm": 2.0158199871640665, + "learning_rate": 1.1107853904018896e-06, + "loss": 0.2171, + "step": 11150 + }, + { + "epoch": 1.7063504208110176, + "grad_norm": 1.9626837031523319, + "learning_rate": 1.1096505077314723e-06, + "loss": 0.2358, + "step": 11151 + }, + { + "epoch": 1.7065034429992347, + "grad_norm": 2.305121265245296, + "learning_rate": 1.1085161710521962e-06, + "loss": 0.2749, + "step": 11152 + }, + { + "epoch": 1.7066564651874523, + "grad_norm": 2.038485482391815, + "learning_rate": 1.1073823804337258e-06, + "loss": 0.2631, + "step": 11153 + }, + { + "epoch": 1.7068094873756694, + "grad_norm": 2.2882425227783862, + "learning_rate": 1.1062491359456917e-06, + "loss": 0.3128, + "step": 11154 + }, + { + "epoch": 1.7069625095638867, + "grad_norm": 2.2012392539786134, + "learning_rate": 1.1051164376576896e-06, + "loss": 0.276, + "step": 11155 + }, + { + "epoch": 1.707115531752104, + "grad_norm": 2.0394408445761663, + "learning_rate": 1.1039842856392856e-06, + "loss": 0.307, + "step": 11156 + }, + { + "epoch": 1.7072685539403214, + "grad_norm": 2.128922991238398, + "learning_rate": 1.102852679960007e-06, + "loss": 0.2901, + "step": 11157 + }, + { + "epoch": 1.7074215761285387, + "grad_norm": 1.8881752359487909, + "learning_rate": 1.101721620689352e-06, + "loss": 0.298, + "step": 11158 + }, + { + "epoch": 1.7075745983167558, + "grad_norm": 2.0841273220620664, + "learning_rate": 1.1005911078967802e-06, + "loss": 0.2725, + "step": 11159 + }, + { + "epoch": 1.7077276205049732, + "grad_norm": 1.994898698810137, + "learning_rate": 1.0994611416517264e-06, + "loss": 0.2369, + "step": 11160 + }, + { + "epoch": 1.7078806426931905, + "grad_norm": 2.0006084196277003, + "learning_rate": 1.0983317220235823e-06, + "loss": 0.2582, + "step": 11161 + }, + { + "epoch": 1.7080336648814078, + "grad_norm": 2.1415344517825736, + "learning_rate": 1.0972028490817077e-06, + "loss": 0.2784, + "step": 11162 + }, + { + "epoch": 1.7081866870696252, + "grad_norm": 1.8469449110302756, + "learning_rate": 1.0960745228954384e-06, + "loss": 0.2703, + "step": 11163 + }, + { + "epoch": 
1.7083397092578423, + "grad_norm": 2.0745692309473167, + "learning_rate": 1.0949467435340623e-06, + "loss": 0.2781, + "step": 11164 + }, + { + "epoch": 1.7084927314460598, + "grad_norm": 2.0757462371321473, + "learning_rate": 1.0938195110668425e-06, + "loss": 0.3172, + "step": 11165 + }, + { + "epoch": 1.708645753634277, + "grad_norm": 1.9539949708197906, + "learning_rate": 1.092692825563011e-06, + "loss": 0.2808, + "step": 11166 + }, + { + "epoch": 1.7087987758224943, + "grad_norm": 1.8315528166629997, + "learning_rate": 1.091566687091755e-06, + "loss": 0.2459, + "step": 11167 + }, + { + "epoch": 1.7089517980107116, + "grad_norm": 1.7884907895676587, + "learning_rate": 1.0904410957222411e-06, + "loss": 0.234, + "step": 11168 + }, + { + "epoch": 1.7091048201989287, + "grad_norm": 2.2466456776934662, + "learning_rate": 1.0893160515235957e-06, + "loss": 0.3361, + "step": 11169 + }, + { + "epoch": 1.7092578423871463, + "grad_norm": 2.4207583570260023, + "learning_rate": 1.0881915545649058e-06, + "loss": 0.3154, + "step": 11170 + }, + { + "epoch": 1.7094108645753634, + "grad_norm": 1.9753841454069814, + "learning_rate": 1.0870676049152385e-06, + "loss": 0.2679, + "step": 11171 + }, + { + "epoch": 1.7095638867635807, + "grad_norm": 2.3190131346279514, + "learning_rate": 1.0859442026436185e-06, + "loss": 0.2965, + "step": 11172 + }, + { + "epoch": 1.709716908951798, + "grad_norm": 2.160003228975004, + "learning_rate": 1.0848213478190328e-06, + "loss": 0.3465, + "step": 11173 + }, + { + "epoch": 1.7098699311400152, + "grad_norm": 2.3473105298032264, + "learning_rate": 1.0836990405104475e-06, + "loss": 0.3369, + "step": 11174 + }, + { + "epoch": 1.7100229533282327, + "grad_norm": 2.014596179323918, + "learning_rate": 1.082577280786784e-06, + "loss": 0.269, + "step": 11175 + }, + { + "epoch": 1.7101759755164498, + "grad_norm": 2.057023392344061, + "learning_rate": 1.081456068716935e-06, + "loss": 0.3078, + "step": 11176 + }, + { + "epoch": 1.7103289977046672, + "grad_norm": 2.444349548800692, + "learning_rate": 1.0803354043697588e-06, + "loss": 0.3089, + "step": 11177 + }, + { + "epoch": 1.7104820198928845, + "grad_norm": 1.9832948768618603, + "learning_rate": 1.0792152878140794e-06, + "loss": 0.3157, + "step": 11178 + }, + { + "epoch": 1.7106350420811016, + "grad_norm": 2.1437773295090126, + "learning_rate": 1.078095719118687e-06, + "loss": 0.3203, + "step": 11179 + }, + { + "epoch": 1.7107880642693192, + "grad_norm": 1.9014695196455216, + "learning_rate": 1.0769766983523388e-06, + "loss": 0.2788, + "step": 11180 + }, + { + "epoch": 1.7109410864575363, + "grad_norm": 1.8859102107863197, + "learning_rate": 1.0758582255837591e-06, + "loss": 0.2665, + "step": 11181 + }, + { + "epoch": 1.7110941086457536, + "grad_norm": 2.0492514664079473, + "learning_rate": 1.0747403008816382e-06, + "loss": 0.2679, + "step": 11182 + }, + { + "epoch": 1.711247130833971, + "grad_norm": 2.0658287803286197, + "learning_rate": 1.07362292431463e-06, + "loss": 0.2491, + "step": 11183 + }, + { + "epoch": 1.711400153022188, + "grad_norm": 2.1805943892160475, + "learning_rate": 1.0725060959513578e-06, + "loss": 0.3342, + "step": 11184 + }, + { + "epoch": 1.7115531752104056, + "grad_norm": 1.9814368426589373, + "learning_rate": 1.0713898158604119e-06, + "loss": 0.2748, + "step": 11185 + }, + { + "epoch": 1.7117061973986227, + "grad_norm": 2.0479335820218343, + "learning_rate": 1.0702740841103455e-06, + "loss": 0.3067, + "step": 11186 + }, + { + "epoch": 1.71185921958684, + "grad_norm": 2.2402698668164467, + "learning_rate": 
1.0691589007696811e-06, + "loss": 0.2936, + "step": 11187 + }, + { + "epoch": 1.7120122417750574, + "grad_norm": 2.175527755245013, + "learning_rate": 1.0680442659069046e-06, + "loss": 0.3096, + "step": 11188 + }, + { + "epoch": 1.7121652639632745, + "grad_norm": 2.259874022178763, + "learning_rate": 1.0669301795904762e-06, + "loss": 0.3045, + "step": 11189 + }, + { + "epoch": 1.712318286151492, + "grad_norm": 2.206801471311079, + "learning_rate": 1.0658166418888094e-06, + "loss": 0.2307, + "step": 11190 + }, + { + "epoch": 1.7124713083397092, + "grad_norm": 2.0808032911944663, + "learning_rate": 1.0647036528702915e-06, + "loss": 0.3301, + "step": 11191 + }, + { + "epoch": 1.7126243305279265, + "grad_norm": 2.009233915278547, + "learning_rate": 1.06359121260328e-06, + "loss": 0.2486, + "step": 11192 + }, + { + "epoch": 1.7127773527161438, + "grad_norm": 1.9884306466496928, + "learning_rate": 1.0624793211560913e-06, + "loss": 0.236, + "step": 11193 + }, + { + "epoch": 1.7129303749043612, + "grad_norm": 1.9343940879298103, + "learning_rate": 1.0613679785970087e-06, + "loss": 0.2245, + "step": 11194 + }, + { + "epoch": 1.7130833970925785, + "grad_norm": 2.068900757215732, + "learning_rate": 1.06025718499429e-06, + "loss": 0.336, + "step": 11195 + }, + { + "epoch": 1.7132364192807956, + "grad_norm": 2.107983069680888, + "learning_rate": 1.059146940416147e-06, + "loss": 0.2883, + "step": 11196 + }, + { + "epoch": 1.713389441469013, + "grad_norm": 2.0972134931039657, + "learning_rate": 1.0580372449307686e-06, + "loss": 0.2593, + "step": 11197 + }, + { + "epoch": 1.7135424636572303, + "grad_norm": 2.2987713281363344, + "learning_rate": 1.0569280986063058e-06, + "loss": 0.3548, + "step": 11198 + }, + { + "epoch": 1.7136954858454476, + "grad_norm": 2.059416623438375, + "learning_rate": 1.0558195015108708e-06, + "loss": 0.2374, + "step": 11199 + }, + { + "epoch": 1.713848508033665, + "grad_norm": 2.161279175677584, + "learning_rate": 1.0547114537125514e-06, + "loss": 0.2702, + "step": 11200 + }, + { + "epoch": 1.714001530221882, + "grad_norm": 1.9833873999128637, + "learning_rate": 1.0536039552793987e-06, + "loss": 0.2869, + "step": 11201 + }, + { + "epoch": 1.7141545524100996, + "grad_norm": 1.941243048171417, + "learning_rate": 1.0524970062794203e-06, + "loss": 0.248, + "step": 11202 + }, + { + "epoch": 1.7143075745983167, + "grad_norm": 2.215030856608608, + "learning_rate": 1.0513906067806067e-06, + "loss": 0.2854, + "step": 11203 + }, + { + "epoch": 1.714460596786534, + "grad_norm": 2.09891972731733, + "learning_rate": 1.0502847568509023e-06, + "loss": 0.2748, + "step": 11204 + }, + { + "epoch": 1.7146136189747514, + "grad_norm": 1.9231481988683312, + "learning_rate": 1.0491794565582225e-06, + "loss": 0.227, + "step": 11205 + }, + { + "epoch": 1.7147666411629685, + "grad_norm": 2.1008690413719227, + "learning_rate": 1.0480747059704488e-06, + "loss": 0.2827, + "step": 11206 + }, + { + "epoch": 1.714919663351186, + "grad_norm": 1.9290761821050166, + "learning_rate": 1.0469705051554269e-06, + "loss": 0.3185, + "step": 11207 + }, + { + "epoch": 1.7150726855394032, + "grad_norm": 2.343551545185132, + "learning_rate": 1.0458668541809714e-06, + "loss": 0.3501, + "step": 11208 + }, + { + "epoch": 1.7152257077276205, + "grad_norm": 1.788433718216971, + "learning_rate": 1.0447637531148592e-06, + "loss": 0.2199, + "step": 11209 + }, + { + "epoch": 1.7153787299158378, + "grad_norm": 2.0652724541004863, + "learning_rate": 1.0436612020248404e-06, + "loss": 0.2759, + "step": 11210 + }, + { + "epoch": 
1.715531752104055, + "grad_norm": 1.9281228885043027, + "learning_rate": 1.0425592009786246e-06, + "loss": 0.2673, + "step": 11211 + }, + { + "epoch": 1.7156847742922725, + "grad_norm": 2.106942568657105, + "learning_rate": 1.0414577500438873e-06, + "loss": 0.2637, + "step": 11212 + }, + { + "epoch": 1.7158377964804896, + "grad_norm": 2.0071831488097227, + "learning_rate": 1.0403568492882786e-06, + "loss": 0.2315, + "step": 11213 + }, + { + "epoch": 1.715990818668707, + "grad_norm": 2.0902800708364024, + "learning_rate": 1.039256498779405e-06, + "loss": 0.2754, + "step": 11214 + }, + { + "epoch": 1.7161438408569243, + "grad_norm": 1.998381374019673, + "learning_rate": 1.0381566985848423e-06, + "loss": 0.2503, + "step": 11215 + }, + { + "epoch": 1.7162968630451414, + "grad_norm": 2.275807287322682, + "learning_rate": 1.0370574487721396e-06, + "loss": 0.2725, + "step": 11216 + }, + { + "epoch": 1.716449885233359, + "grad_norm": 2.4486035736270746, + "learning_rate": 1.0359587494087985e-06, + "loss": 0.2977, + "step": 11217 + }, + { + "epoch": 1.716602907421576, + "grad_norm": 1.9189903077463175, + "learning_rate": 1.0348606005622997e-06, + "loss": 0.2505, + "step": 11218 + }, + { + "epoch": 1.7167559296097934, + "grad_norm": 2.1347410423324025, + "learning_rate": 1.0337630023000856e-06, + "loss": 0.287, + "step": 11219 + }, + { + "epoch": 1.7169089517980107, + "grad_norm": 2.043348875111778, + "learning_rate": 1.032665954689558e-06, + "loss": 0.2275, + "step": 11220 + }, + { + "epoch": 1.7170619739862278, + "grad_norm": 1.9169361145288593, + "learning_rate": 1.031569457798095e-06, + "loss": 0.2564, + "step": 11221 + }, + { + "epoch": 1.7172149961744454, + "grad_norm": 2.4828468009673257, + "learning_rate": 1.0304735116930397e-06, + "loss": 0.3424, + "step": 11222 + }, + { + "epoch": 1.7173680183626625, + "grad_norm": 1.8955725882532888, + "learning_rate": 1.0293781164416905e-06, + "loss": 0.3574, + "step": 11223 + }, + { + "epoch": 1.7175210405508798, + "grad_norm": 2.096578082910678, + "learning_rate": 1.0282832721113267e-06, + "loss": 0.351, + "step": 11224 + }, + { + "epoch": 1.7176740627390972, + "grad_norm": 1.9892607721302973, + "learning_rate": 1.0271889787691846e-06, + "loss": 0.2568, + "step": 11225 + }, + { + "epoch": 1.7178270849273145, + "grad_norm": 2.2149399450508884, + "learning_rate": 1.0260952364824694e-06, + "loss": 0.2597, + "step": 11226 + }, + { + "epoch": 1.7179801071155318, + "grad_norm": 2.198504328031844, + "learning_rate": 1.0250020453183506e-06, + "loss": 0.2871, + "step": 11227 + }, + { + "epoch": 1.718133129303749, + "grad_norm": 1.8657046369191856, + "learning_rate": 1.023909405343968e-06, + "loss": 0.2746, + "step": 11228 + }, + { + "epoch": 1.7182861514919663, + "grad_norm": 2.522595038063424, + "learning_rate": 1.0228173166264233e-06, + "loss": 0.2711, + "step": 11229 + }, + { + "epoch": 1.7184391736801836, + "grad_norm": 2.0394131673625937, + "learning_rate": 1.0217257792327862e-06, + "loss": 0.3378, + "step": 11230 + }, + { + "epoch": 1.718592195868401, + "grad_norm": 2.3529953949487568, + "learning_rate": 1.020634793230092e-06, + "loss": 0.3383, + "step": 11231 + }, + { + "epoch": 1.7187452180566183, + "grad_norm": 2.491294117442281, + "learning_rate": 1.0195443586853415e-06, + "loss": 0.4019, + "step": 11232 + }, + { + "epoch": 1.7188982402448354, + "grad_norm": 1.903192324592772, + "learning_rate": 1.0184544756655047e-06, + "loss": 0.2835, + "step": 11233 + }, + { + "epoch": 1.719051262433053, + "grad_norm": 2.2662638217854467, + "learning_rate": 
1.0173651442375154e-06, + "loss": 0.3071, + "step": 11234 + }, + { + "epoch": 1.71920428462127, + "grad_norm": 2.1920337232874973, + "learning_rate": 1.0162763644682715e-06, + "loss": 0.3151, + "step": 11235 + }, + { + "epoch": 1.7193573068094874, + "grad_norm": 1.8404140119518602, + "learning_rate": 1.0151881364246407e-06, + "loss": 0.2563, + "step": 11236 + }, + { + "epoch": 1.7195103289977047, + "grad_norm": 2.371469181227642, + "learning_rate": 1.0141004601734562e-06, + "loss": 0.3661, + "step": 11237 + }, + { + "epoch": 1.7196633511859218, + "grad_norm": 2.676750375304351, + "learning_rate": 1.0130133357815142e-06, + "loss": 0.3352, + "step": 11238 + }, + { + "epoch": 1.7198163733741394, + "grad_norm": 2.2395476317147556, + "learning_rate": 1.0119267633155816e-06, + "loss": 0.316, + "step": 11239 + }, + { + "epoch": 1.7199693955623565, + "grad_norm": 1.9873318256316577, + "learning_rate": 1.0108407428423873e-06, + "loss": 0.2805, + "step": 11240 + }, + { + "epoch": 1.7201224177505738, + "grad_norm": 2.1685902722728874, + "learning_rate": 1.0097552744286277e-06, + "loss": 0.3208, + "step": 11241 + }, + { + "epoch": 1.7202754399387912, + "grad_norm": 2.0648830031892103, + "learning_rate": 1.0086703581409696e-06, + "loss": 0.2799, + "step": 11242 + }, + { + "epoch": 1.7204284621270083, + "grad_norm": 2.168635886474466, + "learning_rate": 1.0075859940460374e-06, + "loss": 0.2345, + "step": 11243 + }, + { + "epoch": 1.7205814843152258, + "grad_norm": 2.061383471866174, + "learning_rate": 1.006502182210426e-06, + "loss": 0.3173, + "step": 11244 + }, + { + "epoch": 1.720734506503443, + "grad_norm": 2.2431859409567343, + "learning_rate": 1.0054189227007027e-06, + "loss": 0.3387, + "step": 11245 + }, + { + "epoch": 1.7208875286916603, + "grad_norm": 1.9175774670047985, + "learning_rate": 1.0043362155833857e-06, + "loss": 0.2928, + "step": 11246 + }, + { + "epoch": 1.7210405508798776, + "grad_norm": 1.7032224043623452, + "learning_rate": 1.0032540609249752e-06, + "loss": 0.2237, + "step": 11247 + }, + { + "epoch": 1.7211935730680947, + "grad_norm": 2.3240615291172344, + "learning_rate": 1.0021724587919302e-06, + "loss": 0.3035, + "step": 11248 + }, + { + "epoch": 1.7213465952563123, + "grad_norm": 2.1714863936300706, + "learning_rate": 1.0010914092506706e-06, + "loss": 0.3228, + "step": 11249 + }, + { + "epoch": 1.7214996174445294, + "grad_norm": 1.8727184181754257, + "learning_rate": 1.0000109123675938e-06, + "loss": 0.2461, + "step": 11250 + }, + { + "epoch": 1.7216526396327467, + "grad_norm": 1.8880016801810797, + "learning_rate": 9.989309682090564e-07, + "loss": 0.2402, + "step": 11251 + }, + { + "epoch": 1.721805661820964, + "grad_norm": 2.0474369307251807, + "learning_rate": 9.978515768413766e-07, + "loss": 0.2758, + "step": 11252 + }, + { + "epoch": 1.7219586840091812, + "grad_norm": 2.183342201937387, + "learning_rate": 9.967727383308501e-07, + "loss": 0.3161, + "step": 11253 + }, + { + "epoch": 1.7221117061973987, + "grad_norm": 2.1222715461235153, + "learning_rate": 9.956944527437308e-07, + "loss": 0.3003, + "step": 11254 + }, + { + "epoch": 1.7222647283856158, + "grad_norm": 1.9626862552546822, + "learning_rate": 9.946167201462399e-07, + "loss": 0.2209, + "step": 11255 + }, + { + "epoch": 1.7224177505738332, + "grad_norm": 2.2158250843339684, + "learning_rate": 9.935395406045655e-07, + "loss": 0.2603, + "step": 11256 + }, + { + "epoch": 1.7225707727620505, + "grad_norm": 2.039833535788292, + "learning_rate": 9.924629141848607e-07, + "loss": 0.2453, + "step": 11257 + }, + { + "epoch": 
1.7227237949502678, + "grad_norm": 2.3722448003118073, + "learning_rate": 9.913868409532467e-07, + "loss": 0.3115, + "step": 11258 + }, + { + "epoch": 1.7228768171384852, + "grad_norm": 2.0730045335293257, + "learning_rate": 9.903113209758098e-07, + "loss": 0.3535, + "step": 11259 + }, + { + "epoch": 1.7230298393267023, + "grad_norm": 2.475071828831205, + "learning_rate": 9.892363543185989e-07, + "loss": 0.3188, + "step": 11260 + }, + { + "epoch": 1.7231828615149196, + "grad_norm": 2.053885122377709, + "learning_rate": 9.88161941047635e-07, + "loss": 0.2961, + "step": 11261 + }, + { + "epoch": 1.723335883703137, + "grad_norm": 1.9970096655581508, + "learning_rate": 9.870880812289008e-07, + "loss": 0.2553, + "step": 11262 + }, + { + "epoch": 1.7234889058913543, + "grad_norm": 2.091151342045369, + "learning_rate": 9.860147749283455e-07, + "loss": 0.3145, + "step": 11263 + }, + { + "epoch": 1.7236419280795716, + "grad_norm": 2.3526633638739427, + "learning_rate": 9.849420222118866e-07, + "loss": 0.3026, + "step": 11264 + }, + { + "epoch": 1.7237949502677887, + "grad_norm": 2.2486718187261983, + "learning_rate": 9.838698231454048e-07, + "loss": 0.3352, + "step": 11265 + }, + { + "epoch": 1.7239479724560063, + "grad_norm": 2.134344080040552, + "learning_rate": 9.827981777947481e-07, + "loss": 0.2745, + "step": 11266 + }, + { + "epoch": 1.7241009946442234, + "grad_norm": 2.13120696519989, + "learning_rate": 9.817270862257312e-07, + "loss": 0.2227, + "step": 11267 + }, + { + "epoch": 1.7242540168324407, + "grad_norm": 1.9544916604476297, + "learning_rate": 9.80656548504133e-07, + "loss": 0.2757, + "step": 11268 + }, + { + "epoch": 1.724407039020658, + "grad_norm": 2.305241911383372, + "learning_rate": 9.795865646957015e-07, + "loss": 0.2935, + "step": 11269 + }, + { + "epoch": 1.7245600612088752, + "grad_norm": 2.217856703258211, + "learning_rate": 9.785171348661438e-07, + "loss": 0.321, + "step": 11270 + }, + { + "epoch": 1.7247130833970927, + "grad_norm": 2.2401253166837325, + "learning_rate": 9.774482590811451e-07, + "loss": 0.2609, + "step": 11271 + }, + { + "epoch": 1.7248661055853098, + "grad_norm": 2.379006393238006, + "learning_rate": 9.763799374063442e-07, + "loss": 0.3083, + "step": 11272 + }, + { + "epoch": 1.7250191277735272, + "grad_norm": 2.0967622566022364, + "learning_rate": 9.753121699073487e-07, + "loss": 0.2765, + "step": 11273 + }, + { + "epoch": 1.7251721499617445, + "grad_norm": 2.2026134591533024, + "learning_rate": 9.742449566497424e-07, + "loss": 0.2821, + "step": 11274 + }, + { + "epoch": 1.7253251721499616, + "grad_norm": 2.0261515121698928, + "learning_rate": 9.7317829769906e-07, + "loss": 0.3014, + "step": 11275 + }, + { + "epoch": 1.7254781943381792, + "grad_norm": 2.019153648269614, + "learning_rate": 9.7211219312081e-07, + "loss": 0.2442, + "step": 11276 + }, + { + "epoch": 1.7256312165263963, + "grad_norm": 2.430551797468562, + "learning_rate": 9.71046642980471e-07, + "loss": 0.3102, + "step": 11277 + }, + { + "epoch": 1.7257842387146136, + "grad_norm": 1.950502258473303, + "learning_rate": 9.699816473434753e-07, + "loss": 0.3363, + "step": 11278 + }, + { + "epoch": 1.725937260902831, + "grad_norm": 2.198363605772209, + "learning_rate": 9.68917206275234e-07, + "loss": 0.3117, + "step": 11279 + }, + { + "epoch": 1.726090283091048, + "grad_norm": 1.9226727381010975, + "learning_rate": 9.678533198411188e-07, + "loss": 0.2341, + "step": 11280 + }, + { + "epoch": 1.7262433052792656, + "grad_norm": 2.200988192635039, + "learning_rate": 9.667899881064624e-07, + "loss": 
0.3561, + "step": 11281 + }, + { + "epoch": 1.7263963274674827, + "grad_norm": 2.4511019558072196, + "learning_rate": 9.657272111365712e-07, + "loss": 0.3339, + "step": 11282 + }, + { + "epoch": 1.7265493496557, + "grad_norm": 2.649575488293189, + "learning_rate": 9.646649889967152e-07, + "loss": 0.3311, + "step": 11283 + }, + { + "epoch": 1.7267023718439174, + "grad_norm": 2.0152891336023866, + "learning_rate": 9.63603321752129e-07, + "loss": 0.2774, + "step": 11284 + }, + { + "epoch": 1.7268553940321345, + "grad_norm": 2.0731551227423015, + "learning_rate": 9.625422094680126e-07, + "loss": 0.2537, + "step": 11285 + }, + { + "epoch": 1.727008416220352, + "grad_norm": 1.947496310724743, + "learning_rate": 9.614816522095339e-07, + "loss": 0.2203, + "step": 11286 + }, + { + "epoch": 1.7271614384085692, + "grad_norm": 2.3574204738706803, + "learning_rate": 9.604216500418262e-07, + "loss": 0.2976, + "step": 11287 + }, + { + "epoch": 1.7273144605967865, + "grad_norm": 2.014474596823735, + "learning_rate": 9.593622030299876e-07, + "loss": 0.339, + "step": 11288 + }, + { + "epoch": 1.7274674827850038, + "grad_norm": 1.8008681537701139, + "learning_rate": 9.583033112390838e-07, + "loss": 0.2211, + "step": 11289 + }, + { + "epoch": 1.7276205049732212, + "grad_norm": 2.0716788361002685, + "learning_rate": 9.57244974734145e-07, + "loss": 0.2579, + "step": 11290 + }, + { + "epoch": 1.7277735271614385, + "grad_norm": 2.2269332519553537, + "learning_rate": 9.561871935801647e-07, + "loss": 0.2713, + "step": 11291 + }, + { + "epoch": 1.7279265493496556, + "grad_norm": 2.4042579479130257, + "learning_rate": 9.551299678421133e-07, + "loss": 0.3419, + "step": 11292 + }, + { + "epoch": 1.728079571537873, + "grad_norm": 2.0798852529452336, + "learning_rate": 9.540732975849122e-07, + "loss": 0.2732, + "step": 11293 + }, + { + "epoch": 1.7282325937260903, + "grad_norm": 2.345583325761405, + "learning_rate": 9.530171828734558e-07, + "loss": 0.2879, + "step": 11294 + }, + { + "epoch": 1.7283856159143076, + "grad_norm": 2.1856914083684194, + "learning_rate": 9.519616237726104e-07, + "loss": 0.2799, + "step": 11295 + }, + { + "epoch": 1.728538638102525, + "grad_norm": 1.9641914474915316, + "learning_rate": 9.509066203471962e-07, + "loss": 0.2643, + "step": 11296 + }, + { + "epoch": 1.728691660290742, + "grad_norm": 2.2317078996984523, + "learning_rate": 9.498521726620036e-07, + "loss": 0.3013, + "step": 11297 + }, + { + "epoch": 1.7288446824789594, + "grad_norm": 2.0330881449616705, + "learning_rate": 9.487982807817975e-07, + "loss": 0.2861, + "step": 11298 + }, + { + "epoch": 1.7289977046671767, + "grad_norm": 2.0998242046821702, + "learning_rate": 9.477449447712938e-07, + "loss": 0.3496, + "step": 11299 + }, + { + "epoch": 1.729150726855394, + "grad_norm": 2.57819425032008, + "learning_rate": 9.466921646951888e-07, + "loss": 0.3567, + "step": 11300 + }, + { + "epoch": 1.7293037490436114, + "grad_norm": 2.2586641955898257, + "learning_rate": 9.456399406181349e-07, + "loss": 0.284, + "step": 11301 + }, + { + "epoch": 1.7294567712318285, + "grad_norm": 2.4828821297651373, + "learning_rate": 9.445882726047507e-07, + "loss": 0.2696, + "step": 11302 + }, + { + "epoch": 1.729609793420046, + "grad_norm": 1.850980352969131, + "learning_rate": 9.435371607196264e-07, + "loss": 0.2184, + "step": 11303 + }, + { + "epoch": 1.7297628156082632, + "grad_norm": 2.557017556726781, + "learning_rate": 9.424866050273163e-07, + "loss": 0.2398, + "step": 11304 + }, + { + "epoch": 1.7299158377964805, + "grad_norm": 1.9141775522394715, + 
"learning_rate": 9.41436605592333e-07, + "loss": 0.2731, + "step": 11305 + }, + { + "epoch": 1.7300688599846978, + "grad_norm": 2.0152008196879665, + "learning_rate": 9.403871624791672e-07, + "loss": 0.2778, + "step": 11306 + }, + { + "epoch": 1.730221882172915, + "grad_norm": 2.533871929801867, + "learning_rate": 9.393382757522673e-07, + "loss": 0.2577, + "step": 11307 + }, + { + "epoch": 1.7303749043611325, + "grad_norm": 2.151577434545533, + "learning_rate": 9.382899454760497e-07, + "loss": 0.2912, + "step": 11308 + }, + { + "epoch": 1.7305279265493496, + "grad_norm": 2.002420979004715, + "learning_rate": 9.372421717148961e-07, + "loss": 0.2804, + "step": 11309 + }, + { + "epoch": 1.730680948737567, + "grad_norm": 1.9167255742661933, + "learning_rate": 9.361949545331539e-07, + "loss": 0.2903, + "step": 11310 + }, + { + "epoch": 1.7308339709257843, + "grad_norm": 1.7526154281756776, + "learning_rate": 9.351482939951373e-07, + "loss": 0.2414, + "step": 11311 + }, + { + "epoch": 1.7309869931140014, + "grad_norm": 2.298302566978914, + "learning_rate": 9.341021901651259e-07, + "loss": 0.2814, + "step": 11312 + }, + { + "epoch": 1.731140015302219, + "grad_norm": 2.151736437727825, + "learning_rate": 9.330566431073663e-07, + "loss": 0.2432, + "step": 11313 + }, + { + "epoch": 1.731293037490436, + "grad_norm": 2.0864141985693156, + "learning_rate": 9.320116528860667e-07, + "loss": 0.2428, + "step": 11314 + }, + { + "epoch": 1.7314460596786534, + "grad_norm": 2.0619824325768095, + "learning_rate": 9.309672195654063e-07, + "loss": 0.2943, + "step": 11315 + }, + { + "epoch": 1.7315990818668707, + "grad_norm": 2.142611756371609, + "learning_rate": 9.299233432095267e-07, + "loss": 0.2184, + "step": 11316 + }, + { + "epoch": 1.7317521040550878, + "grad_norm": 2.1460629954558548, + "learning_rate": 9.28880023882538e-07, + "loss": 0.336, + "step": 11317 + }, + { + "epoch": 1.7319051262433054, + "grad_norm": 2.203744055586528, + "learning_rate": 9.27837261648512e-07, + "loss": 0.2863, + "step": 11318 + }, + { + "epoch": 1.7320581484315225, + "grad_norm": 2.2301200311317104, + "learning_rate": 9.267950565714911e-07, + "loss": 0.3067, + "step": 11319 + }, + { + "epoch": 1.7322111706197398, + "grad_norm": 1.977711646870203, + "learning_rate": 9.257534087154796e-07, + "loss": 0.2776, + "step": 11320 + }, + { + "epoch": 1.7323641928079572, + "grad_norm": 2.1372521289802386, + "learning_rate": 9.247123181444517e-07, + "loss": 0.2968, + "step": 11321 + }, + { + "epoch": 1.7325172149961743, + "grad_norm": 2.118149417733798, + "learning_rate": 9.23671784922342e-07, + "loss": 0.2988, + "step": 11322 + }, + { + "epoch": 1.7326702371843918, + "grad_norm": 2.1422567198478872, + "learning_rate": 9.226318091130537e-07, + "loss": 0.2463, + "step": 11323 + }, + { + "epoch": 1.732823259372609, + "grad_norm": 2.0668194086851748, + "learning_rate": 9.215923907804591e-07, + "loss": 0.3208, + "step": 11324 + }, + { + "epoch": 1.7329762815608263, + "grad_norm": 2.5807343354347423, + "learning_rate": 9.205535299883894e-07, + "loss": 0.2877, + "step": 11325 + }, + { + "epoch": 1.7331293037490436, + "grad_norm": 2.09122486977293, + "learning_rate": 9.195152268006457e-07, + "loss": 0.2624, + "step": 11326 + }, + { + "epoch": 1.733282325937261, + "grad_norm": 2.220035139943399, + "learning_rate": 9.184774812809972e-07, + "loss": 0.2811, + "step": 11327 + }, + { + "epoch": 1.7334353481254783, + "grad_norm": 2.311077102527897, + "learning_rate": 9.174402934931704e-07, + "loss": 0.3409, + "step": 11328 + }, + { + "epoch": 
1.7335883703136954, + "grad_norm": 1.845809541515683, + "learning_rate": 9.164036635008689e-07, + "loss": 0.259, + "step": 11329 + }, + { + "epoch": 1.7337413925019127, + "grad_norm": 1.9894773477892849, + "learning_rate": 9.15367591367754e-07, + "loss": 0.2562, + "step": 11330 + }, + { + "epoch": 1.73389441469013, + "grad_norm": 2.343597507954915, + "learning_rate": 9.143320771574526e-07, + "loss": 0.2868, + "step": 11331 + }, + { + "epoch": 1.7340474368783474, + "grad_norm": 2.3318031296705426, + "learning_rate": 9.132971209335628e-07, + "loss": 0.3158, + "step": 11332 + }, + { + "epoch": 1.7342004590665647, + "grad_norm": 2.009158194023136, + "learning_rate": 9.122627227596458e-07, + "loss": 0.2978, + "step": 11333 + }, + { + "epoch": 1.7343534812547818, + "grad_norm": 2.0812300521920086, + "learning_rate": 9.112288826992233e-07, + "loss": 0.3239, + "step": 11334 + }, + { + "epoch": 1.7345065034429994, + "grad_norm": 1.9694566051586355, + "learning_rate": 9.10195600815792e-07, + "loss": 0.302, + "step": 11335 + }, + { + "epoch": 1.7346595256312165, + "grad_norm": 2.1698146071646707, + "learning_rate": 9.091628771728078e-07, + "loss": 0.3485, + "step": 11336 + }, + { + "epoch": 1.7348125478194338, + "grad_norm": 2.2078184817883404, + "learning_rate": 9.081307118336957e-07, + "loss": 0.2642, + "step": 11337 + }, + { + "epoch": 1.7349655700076512, + "grad_norm": 2.0498493991285347, + "learning_rate": 9.070991048618438e-07, + "loss": 0.269, + "step": 11338 + }, + { + "epoch": 1.7351185921958683, + "grad_norm": 1.9622688176545964, + "learning_rate": 9.060680563206082e-07, + "loss": 0.218, + "step": 11339 + }, + { + "epoch": 1.7352716143840858, + "grad_norm": 2.0330487439105545, + "learning_rate": 9.05037566273308e-07, + "loss": 0.2984, + "step": 11340 + }, + { + "epoch": 1.735424636572303, + "grad_norm": 2.174098256735964, + "learning_rate": 9.040076347832305e-07, + "loss": 0.2946, + "step": 11341 + }, + { + "epoch": 1.7355776587605203, + "grad_norm": 1.9391762352970459, + "learning_rate": 9.029782619136285e-07, + "loss": 0.2331, + "step": 11342 + }, + { + "epoch": 1.7357306809487376, + "grad_norm": 1.954971032676284, + "learning_rate": 9.01949447727719e-07, + "loss": 0.2687, + "step": 11343 + }, + { + "epoch": 1.7358837031369547, + "grad_norm": 2.3946998205627383, + "learning_rate": 9.009211922886862e-07, + "loss": 0.3037, + "step": 11344 + }, + { + "epoch": 1.7360367253251723, + "grad_norm": 2.4884493360178195, + "learning_rate": 8.998934956596772e-07, + "loss": 0.3187, + "step": 11345 + }, + { + "epoch": 1.7361897475133894, + "grad_norm": 2.0318519644823403, + "learning_rate": 8.988663579038104e-07, + "loss": 0.2894, + "step": 11346 + }, + { + "epoch": 1.7363427697016067, + "grad_norm": 2.032057463388814, + "learning_rate": 8.978397790841631e-07, + "loss": 0.2795, + "step": 11347 + }, + { + "epoch": 1.736495791889824, + "grad_norm": 1.9829708365779646, + "learning_rate": 8.968137592637838e-07, + "loss": 0.2309, + "step": 11348 + }, + { + "epoch": 1.7366488140780412, + "grad_norm": 1.9451190583188658, + "learning_rate": 8.95788298505682e-07, + "loss": 0.2661, + "step": 11349 + }, + { + "epoch": 1.7368018362662587, + "grad_norm": 2.2877897156250455, + "learning_rate": 8.947633968728376e-07, + "loss": 0.2577, + "step": 11350 + }, + { + "epoch": 1.7369548584544758, + "grad_norm": 2.28996390901099, + "learning_rate": 8.937390544281932e-07, + "loss": 0.2877, + "step": 11351 + }, + { + "epoch": 1.7371078806426932, + "grad_norm": 2.1909353599002626, + "learning_rate": 8.927152712346542e-07, + 
"loss": 0.2953, + "step": 11352 + }, + { + "epoch": 1.7372609028309105, + "grad_norm": 2.0225927534092505, + "learning_rate": 8.916920473551039e-07, + "loss": 0.2364, + "step": 11353 + }, + { + "epoch": 1.7374139250191276, + "grad_norm": 2.195364625764178, + "learning_rate": 8.906693828523738e-07, + "loss": 0.2801, + "step": 11354 + }, + { + "epoch": 1.7375669472073452, + "grad_norm": 2.198796276058231, + "learning_rate": 8.896472777892706e-07, + "loss": 0.2753, + "step": 11355 + }, + { + "epoch": 1.7377199693955623, + "grad_norm": 1.9924419768513708, + "learning_rate": 8.886257322285729e-07, + "loss": 0.2271, + "step": 11356 + }, + { + "epoch": 1.7378729915837796, + "grad_norm": 1.8575121798898055, + "learning_rate": 8.876047462330084e-07, + "loss": 0.2319, + "step": 11357 + }, + { + "epoch": 1.738026013771997, + "grad_norm": 2.0271128466885022, + "learning_rate": 8.865843198652858e-07, + "loss": 0.279, + "step": 11358 + }, + { + "epoch": 1.7381790359602143, + "grad_norm": 2.020993282119618, + "learning_rate": 8.855644531880747e-07, + "loss": 0.2546, + "step": 11359 + }, + { + "epoch": 1.7383320581484316, + "grad_norm": 2.116368172177348, + "learning_rate": 8.84545146264002e-07, + "loss": 0.3017, + "step": 11360 + }, + { + "epoch": 1.7384850803366487, + "grad_norm": 2.1967153978718184, + "learning_rate": 8.835263991556742e-07, + "loss": 0.2769, + "step": 11361 + }, + { + "epoch": 1.738638102524866, + "grad_norm": 1.9077855748756016, + "learning_rate": 8.825082119256557e-07, + "loss": 0.2808, + "step": 11362 + }, + { + "epoch": 1.7387911247130834, + "grad_norm": 1.8821456287630594, + "learning_rate": 8.814905846364719e-07, + "loss": 0.2419, + "step": 11363 + }, + { + "epoch": 1.7389441469013007, + "grad_norm": 2.3135659597892784, + "learning_rate": 8.804735173506251e-07, + "loss": 0.3392, + "step": 11364 + }, + { + "epoch": 1.739097169089518, + "grad_norm": 1.8049680049774777, + "learning_rate": 8.794570101305744e-07, + "loss": 0.2129, + "step": 11365 + }, + { + "epoch": 1.7392501912777352, + "grad_norm": 2.355594127900009, + "learning_rate": 8.784410630387497e-07, + "loss": 0.2976, + "step": 11366 + }, + { + "epoch": 1.7394032134659527, + "grad_norm": 1.8566497792068795, + "learning_rate": 8.774256761375432e-07, + "loss": 0.2239, + "step": 11367 + }, + { + "epoch": 1.7395562356541698, + "grad_norm": 1.8551089045311278, + "learning_rate": 8.764108494893131e-07, + "loss": 0.222, + "step": 11368 + }, + { + "epoch": 1.7397092578423872, + "grad_norm": 2.11851018960962, + "learning_rate": 8.753965831563838e-07, + "loss": 0.2562, + "step": 11369 + }, + { + "epoch": 1.7398622800306045, + "grad_norm": 2.2494424386365672, + "learning_rate": 8.743828772010465e-07, + "loss": 0.3498, + "step": 11370 + }, + { + "epoch": 1.7400153022188216, + "grad_norm": 2.0475301438582627, + "learning_rate": 8.733697316855572e-07, + "loss": 0.314, + "step": 11371 + }, + { + "epoch": 1.7401683244070392, + "grad_norm": 2.346917348304334, + "learning_rate": 8.723571466721348e-07, + "loss": 0.2817, + "step": 11372 + }, + { + "epoch": 1.7403213465952563, + "grad_norm": 1.9556503799353762, + "learning_rate": 8.713451222229674e-07, + "loss": 0.2514, + "step": 11373 + }, + { + "epoch": 1.7404743687834736, + "grad_norm": 1.909780103481461, + "learning_rate": 8.703336584002098e-07, + "loss": 0.2623, + "step": 11374 + }, + { + "epoch": 1.740627390971691, + "grad_norm": 2.2751671168907084, + "learning_rate": 8.693227552659766e-07, + "loss": 0.3077, + "step": 11375 + }, + { + "epoch": 1.740780413159908, + "grad_norm": 
2.295512749361854, + "learning_rate": 8.683124128823506e-07, + "loss": 0.3109, + "step": 11376 + }, + { + "epoch": 1.7409334353481256, + "grad_norm": 2.070742958849601, + "learning_rate": 8.673026313113853e-07, + "loss": 0.2412, + "step": 11377 + }, + { + "epoch": 1.7410864575363427, + "grad_norm": 2.107583228263569, + "learning_rate": 8.662934106150922e-07, + "loss": 0.2787, + "step": 11378 + }, + { + "epoch": 1.74123947972456, + "grad_norm": 2.361860363040369, + "learning_rate": 8.652847508554497e-07, + "loss": 0.3423, + "step": 11379 + }, + { + "epoch": 1.7413925019127774, + "grad_norm": 2.0767444703186912, + "learning_rate": 8.642766520944091e-07, + "loss": 0.3141, + "step": 11380 + }, + { + "epoch": 1.7415455241009945, + "grad_norm": 2.336321547404426, + "learning_rate": 8.632691143938754e-07, + "loss": 0.3341, + "step": 11381 + }, + { + "epoch": 1.741698546289212, + "grad_norm": 2.0940993397565943, + "learning_rate": 8.622621378157292e-07, + "loss": 0.2899, + "step": 11382 + }, + { + "epoch": 1.7418515684774292, + "grad_norm": 2.3391721829229533, + "learning_rate": 8.612557224218154e-07, + "loss": 0.3682, + "step": 11383 + }, + { + "epoch": 1.7420045906656465, + "grad_norm": 2.4228550269184, + "learning_rate": 8.602498682739347e-07, + "loss": 0.2575, + "step": 11384 + }, + { + "epoch": 1.7421576128538638, + "grad_norm": 2.0087616832226405, + "learning_rate": 8.592445754338652e-07, + "loss": 0.2201, + "step": 11385 + }, + { + "epoch": 1.742310635042081, + "grad_norm": 2.1070525512343026, + "learning_rate": 8.582398439633466e-07, + "loss": 0.2497, + "step": 11386 + }, + { + "epoch": 1.7424636572302985, + "grad_norm": 2.08953895418418, + "learning_rate": 8.572356739240817e-07, + "loss": 0.2649, + "step": 11387 + }, + { + "epoch": 1.7426166794185156, + "grad_norm": 2.144426001085784, + "learning_rate": 8.562320653777401e-07, + "loss": 0.2698, + "step": 11388 + }, + { + "epoch": 1.742769701606733, + "grad_norm": 2.063685815560944, + "learning_rate": 8.55229018385958e-07, + "loss": 0.2662, + "step": 11389 + }, + { + "epoch": 1.7429227237949503, + "grad_norm": 2.0046930646619363, + "learning_rate": 8.542265330103372e-07, + "loss": 0.217, + "step": 11390 + }, + { + "epoch": 1.7430757459831676, + "grad_norm": 2.0568254195121796, + "learning_rate": 8.53224609312443e-07, + "loss": 0.2466, + "step": 11391 + }, + { + "epoch": 1.743228768171385, + "grad_norm": 2.342779252411613, + "learning_rate": 8.522232473538073e-07, + "loss": 0.3322, + "step": 11392 + }, + { + "epoch": 1.743381790359602, + "grad_norm": 2.245448570805698, + "learning_rate": 8.512224471959296e-07, + "loss": 0.2831, + "step": 11393 + }, + { + "epoch": 1.7435348125478194, + "grad_norm": 2.3014573795661146, + "learning_rate": 8.50222208900271e-07, + "loss": 0.2902, + "step": 11394 + }, + { + "epoch": 1.7436878347360367, + "grad_norm": 1.9968036086457233, + "learning_rate": 8.492225325282599e-07, + "loss": 0.2555, + "step": 11395 + }, + { + "epoch": 1.743840856924254, + "grad_norm": 1.9143267854598263, + "learning_rate": 8.482234181412918e-07, + "loss": 0.2421, + "step": 11396 + }, + { + "epoch": 1.7439938791124714, + "grad_norm": 1.7719945723268122, + "learning_rate": 8.472248658007253e-07, + "loss": 0.2534, + "step": 11397 + }, + { + "epoch": 1.7441469013006885, + "grad_norm": 2.533973732151635, + "learning_rate": 8.46226875567886e-07, + "loss": 0.3271, + "step": 11398 + }, + { + "epoch": 1.744299923488906, + "grad_norm": 2.2236967026045082, + "learning_rate": 8.452294475040645e-07, + "loss": 0.3054, + "step": 11399 + }, + { + 
"epoch": 1.7444529456771232, + "grad_norm": 2.1544617140659237, + "learning_rate": 8.442325816705154e-07, + "loss": 0.2555, + "step": 11400 + }, + { + "epoch": 1.7446059678653405, + "grad_norm": 2.3671783149900247, + "learning_rate": 8.43236278128462e-07, + "loss": 0.3112, + "step": 11401 + }, + { + "epoch": 1.7447589900535578, + "grad_norm": 2.125329113033378, + "learning_rate": 8.422405369390874e-07, + "loss": 0.2517, + "step": 11402 + }, + { + "epoch": 1.744912012241775, + "grad_norm": 2.374595888527881, + "learning_rate": 8.412453581635505e-07, + "loss": 0.3258, + "step": 11403 + }, + { + "epoch": 1.7450650344299925, + "grad_norm": 2.1344730686834503, + "learning_rate": 8.402507418629646e-07, + "loss": 0.3029, + "step": 11404 + }, + { + "epoch": 1.7452180566182096, + "grad_norm": 2.2387729700388648, + "learning_rate": 8.39256688098411e-07, + "loss": 0.2967, + "step": 11405 + }, + { + "epoch": 1.745371078806427, + "grad_norm": 1.959205870616328, + "learning_rate": 8.382631969309451e-07, + "loss": 0.2543, + "step": 11406 + }, + { + "epoch": 1.7455241009946443, + "grad_norm": 2.007488255339045, + "learning_rate": 8.37270268421575e-07, + "loss": 0.2798, + "step": 11407 + }, + { + "epoch": 1.7456771231828614, + "grad_norm": 2.152705889958382, + "learning_rate": 8.362779026312817e-07, + "loss": 0.2928, + "step": 11408 + }, + { + "epoch": 1.745830145371079, + "grad_norm": 2.005074106876521, + "learning_rate": 8.352860996210133e-07, + "loss": 0.2823, + "step": 11409 + }, + { + "epoch": 1.745983167559296, + "grad_norm": 2.692710912856644, + "learning_rate": 8.342948594516753e-07, + "loss": 0.3304, + "step": 11410 + }, + { + "epoch": 1.7461361897475134, + "grad_norm": 2.5533912783203205, + "learning_rate": 8.333041821841481e-07, + "loss": 0.3511, + "step": 11411 + }, + { + "epoch": 1.7462892119357307, + "grad_norm": 2.031777309784537, + "learning_rate": 8.32314067879274e-07, + "loss": 0.2852, + "step": 11412 + }, + { + "epoch": 1.7464422341239478, + "grad_norm": 1.735112109022947, + "learning_rate": 8.313245165978522e-07, + "loss": 0.2149, + "step": 11413 + }, + { + "epoch": 1.7465952563121654, + "grad_norm": 1.7149314804299296, + "learning_rate": 8.303355284006631e-07, + "loss": 0.1944, + "step": 11414 + }, + { + "epoch": 1.7467482785003825, + "grad_norm": 2.34769130403817, + "learning_rate": 8.293471033484412e-07, + "loss": 0.2967, + "step": 11415 + }, + { + "epoch": 1.7469013006885998, + "grad_norm": 2.104585496737917, + "learning_rate": 8.283592415018871e-07, + "loss": 0.3018, + "step": 11416 + }, + { + "epoch": 1.7470543228768172, + "grad_norm": 2.1709189972513117, + "learning_rate": 8.273719429216732e-07, + "loss": 0.3429, + "step": 11417 + }, + { + "epoch": 1.7472073450650343, + "grad_norm": 2.033005114625582, + "learning_rate": 8.26385207668432e-07, + "loss": 0.2769, + "step": 11418 + }, + { + "epoch": 1.7473603672532518, + "grad_norm": 2.200352392592713, + "learning_rate": 8.25399035802763e-07, + "loss": 0.3022, + "step": 11419 + }, + { + "epoch": 1.747513389441469, + "grad_norm": 2.016402621506828, + "learning_rate": 8.244134273852289e-07, + "loss": 0.3039, + "step": 11420 + }, + { + "epoch": 1.7476664116296863, + "grad_norm": 1.9838466411531992, + "learning_rate": 8.234283824763623e-07, + "loss": 0.2549, + "step": 11421 + }, + { + "epoch": 1.7478194338179036, + "grad_norm": 1.9775094872488335, + "learning_rate": 8.22443901136658e-07, + "loss": 0.2676, + "step": 11422 + }, + { + "epoch": 1.7479724560061207, + "grad_norm": 1.9390339024939802, + "learning_rate": 8.214599834265757e-07, + 
"loss": 0.2548, + "step": 11423 + }, + { + "epoch": 1.7481254781943383, + "grad_norm": 2.2990351085347966, + "learning_rate": 8.204766294065436e-07, + "loss": 0.2939, + "step": 11424 + }, + { + "epoch": 1.7482785003825554, + "grad_norm": 2.1371254621600824, + "learning_rate": 8.194938391369511e-07, + "loss": 0.2853, + "step": 11425 + }, + { + "epoch": 1.7484315225707727, + "grad_norm": 1.8783787360270152, + "learning_rate": 8.185116126781567e-07, + "loss": 0.2473, + "step": 11426 + }, + { + "epoch": 1.74858454475899, + "grad_norm": 1.946085021023166, + "learning_rate": 8.175299500904821e-07, + "loss": 0.2571, + "step": 11427 + }, + { + "epoch": 1.7487375669472074, + "grad_norm": 2.2973323888353607, + "learning_rate": 8.165488514342157e-07, + "loss": 0.3026, + "step": 11428 + }, + { + "epoch": 1.7488905891354247, + "grad_norm": 2.2079906556541835, + "learning_rate": 8.155683167696105e-07, + "loss": 0.3072, + "step": 11429 + }, + { + "epoch": 1.7490436113236418, + "grad_norm": 2.1061825703626034, + "learning_rate": 8.145883461568837e-07, + "loss": 0.287, + "step": 11430 + }, + { + "epoch": 1.7491966335118592, + "grad_norm": 2.4976507044518965, + "learning_rate": 8.136089396562186e-07, + "loss": 0.3586, + "step": 11431 + }, + { + "epoch": 1.7493496557000765, + "grad_norm": 2.1563888177726134, + "learning_rate": 8.126300973277701e-07, + "loss": 0.2401, + "step": 11432 + }, + { + "epoch": 1.7495026778882938, + "grad_norm": 2.4782391257527436, + "learning_rate": 8.116518192316459e-07, + "loss": 0.3337, + "step": 11433 + }, + { + "epoch": 1.7496557000765112, + "grad_norm": 2.0657477292079895, + "learning_rate": 8.106741054279277e-07, + "loss": 0.2599, + "step": 11434 + }, + { + "epoch": 1.7498087222647283, + "grad_norm": 2.4612207172641676, + "learning_rate": 8.096969559766643e-07, + "loss": 0.2549, + "step": 11435 + }, + { + "epoch": 1.7499617444529458, + "grad_norm": 2.118689141461657, + "learning_rate": 8.087203709378622e-07, + "loss": 0.2663, + "step": 11436 + }, + { + "epoch": 1.750114766641163, + "grad_norm": 2.19481740548431, + "learning_rate": 8.077443503714965e-07, + "loss": 0.3231, + "step": 11437 + }, + { + "epoch": 1.7502677888293803, + "grad_norm": 2.345923801834366, + "learning_rate": 8.06768894337514e-07, + "loss": 0.3122, + "step": 11438 + }, + { + "epoch": 1.7504208110175976, + "grad_norm": 2.1517185673359727, + "learning_rate": 8.057940028958145e-07, + "loss": 0.271, + "step": 11439 + }, + { + "epoch": 1.7505738332058147, + "grad_norm": 2.047360134838788, + "learning_rate": 8.048196761062743e-07, + "loss": 0.2962, + "step": 11440 + }, + { + "epoch": 1.7507268553940323, + "grad_norm": 2.166641555801906, + "learning_rate": 8.038459140287325e-07, + "loss": 0.3112, + "step": 11441 + }, + { + "epoch": 1.7508798775822494, + "grad_norm": 2.050956415007141, + "learning_rate": 8.028727167229844e-07, + "loss": 0.2088, + "step": 11442 + }, + { + "epoch": 1.7510328997704667, + "grad_norm": 1.7371176140968658, + "learning_rate": 8.019000842488045e-07, + "loss": 0.2038, + "step": 11443 + }, + { + "epoch": 1.751185921958684, + "grad_norm": 2.948667707658099, + "learning_rate": 8.00928016665925e-07, + "loss": 0.2795, + "step": 11444 + }, + { + "epoch": 1.7513389441469012, + "grad_norm": 1.9595754810553292, + "learning_rate": 7.999565140340393e-07, + "loss": 0.3042, + "step": 11445 + }, + { + "epoch": 1.7514919663351187, + "grad_norm": 1.9510168138436947, + "learning_rate": 7.989855764128163e-07, + "loss": 0.2728, + "step": 11446 + }, + { + "epoch": 1.7516449885233358, + "grad_norm": 
2.25768939840928, + "learning_rate": 7.980152038618838e-07, + "loss": 0.2869, + "step": 11447 + }, + { + "epoch": 1.7517980107115532, + "grad_norm": 2.322938772528777, + "learning_rate": 7.970453964408365e-07, + "loss": 0.3438, + "step": 11448 + }, + { + "epoch": 1.7519510328997705, + "grad_norm": 2.1999953646735477, + "learning_rate": 7.960761542092332e-07, + "loss": 0.3005, + "step": 11449 + }, + { + "epoch": 1.7521040550879876, + "grad_norm": 1.7289761044406273, + "learning_rate": 7.951074772265987e-07, + "loss": 0.222, + "step": 11450 + }, + { + "epoch": 1.7522570772762052, + "grad_norm": 2.190680943604739, + "learning_rate": 7.941393655524243e-07, + "loss": 0.3138, + "step": 11451 + }, + { + "epoch": 1.7524100994644223, + "grad_norm": 2.425449360415477, + "learning_rate": 7.931718192461657e-07, + "loss": 0.3134, + "step": 11452 + }, + { + "epoch": 1.7525631216526396, + "grad_norm": 1.8979134354611327, + "learning_rate": 7.92204838367242e-07, + "loss": 0.3067, + "step": 11453 + }, + { + "epoch": 1.752716143840857, + "grad_norm": 1.9714110258957196, + "learning_rate": 7.912384229750402e-07, + "loss": 0.2503, + "step": 11454 + }, + { + "epoch": 1.752869166029074, + "grad_norm": 2.2808248096299493, + "learning_rate": 7.902725731289107e-07, + "loss": 0.2858, + "step": 11455 + }, + { + "epoch": 1.7530221882172916, + "grad_norm": 1.7634975671903566, + "learning_rate": 7.893072888881748e-07, + "loss": 0.2124, + "step": 11456 + }, + { + "epoch": 1.7531752104055087, + "grad_norm": 1.990459773666613, + "learning_rate": 7.883425703121083e-07, + "loss": 0.2959, + "step": 11457 + }, + { + "epoch": 1.753328232593726, + "grad_norm": 1.952214898201334, + "learning_rate": 7.873784174599586e-07, + "loss": 0.2631, + "step": 11458 + }, + { + "epoch": 1.7534812547819434, + "grad_norm": 2.8143096353414445, + "learning_rate": 7.864148303909447e-07, + "loss": 0.3382, + "step": 11459 + }, + { + "epoch": 1.7536342769701607, + "grad_norm": 2.1359237551802837, + "learning_rate": 7.85451809164236e-07, + "loss": 0.233, + "step": 11460 + }, + { + "epoch": 1.753787299158378, + "grad_norm": 2.358983205654095, + "learning_rate": 7.844893538389808e-07, + "loss": 0.2735, + "step": 11461 + }, + { + "epoch": 1.7539403213465952, + "grad_norm": 2.186624182602075, + "learning_rate": 7.835274644742874e-07, + "loss": 0.2891, + "step": 11462 + }, + { + "epoch": 1.7540933435348125, + "grad_norm": 2.1643668630024058, + "learning_rate": 7.82566141129224e-07, + "loss": 0.3178, + "step": 11463 + }, + { + "epoch": 1.7542463657230298, + "grad_norm": 2.391814174037397, + "learning_rate": 7.816053838628346e-07, + "loss": 0.3334, + "step": 11464 + }, + { + "epoch": 1.7543993879112472, + "grad_norm": 2.127914629053998, + "learning_rate": 7.80645192734123e-07, + "loss": 0.3049, + "step": 11465 + }, + { + "epoch": 1.7545524100994645, + "grad_norm": 2.3015822780937505, + "learning_rate": 7.796855678020531e-07, + "loss": 0.3718, + "step": 11466 + }, + { + "epoch": 1.7547054322876816, + "grad_norm": 2.1525976489722987, + "learning_rate": 7.787265091255636e-07, + "loss": 0.2538, + "step": 11467 + }, + { + "epoch": 1.7548584544758992, + "grad_norm": 2.2044691852542035, + "learning_rate": 7.777680167635537e-07, + "loss": 0.2651, + "step": 11468 + }, + { + "epoch": 1.7550114766641163, + "grad_norm": 2.046414617913969, + "learning_rate": 7.768100907748866e-07, + "loss": 0.2679, + "step": 11469 + }, + { + "epoch": 1.7551644988523336, + "grad_norm": 2.2233483966772716, + "learning_rate": 7.758527312183939e-07, + "loss": 0.3119, + "step": 11470 + }, + 
{ + "epoch": 1.755317521040551, + "grad_norm": 2.0313291587701934, + "learning_rate": 7.748959381528698e-07, + "loss": 0.2891, + "step": 11471 + }, + { + "epoch": 1.755470543228768, + "grad_norm": 2.393618769392913, + "learning_rate": 7.73939711637075e-07, + "loss": 0.3505, + "step": 11472 + }, + { + "epoch": 1.7556235654169856, + "grad_norm": 2.128114574717289, + "learning_rate": 7.729840517297349e-07, + "loss": 0.3469, + "step": 11473 + }, + { + "epoch": 1.7557765876052027, + "grad_norm": 2.2834270947072324, + "learning_rate": 7.720289584895413e-07, + "loss": 0.297, + "step": 11474 + }, + { + "epoch": 1.75592960979342, + "grad_norm": 2.3363821368142124, + "learning_rate": 7.710744319751484e-07, + "loss": 0.3029, + "step": 11475 + }, + { + "epoch": 1.7560826319816374, + "grad_norm": 2.12918511038581, + "learning_rate": 7.701204722451783e-07, + "loss": 0.2405, + "step": 11476 + }, + { + "epoch": 1.7562356541698545, + "grad_norm": 2.187618497157304, + "learning_rate": 7.691670793582185e-07, + "loss": 0.2519, + "step": 11477 + }, + { + "epoch": 1.756388676358072, + "grad_norm": 2.3802505932165023, + "learning_rate": 7.682142533728187e-07, + "loss": 0.3085, + "step": 11478 + }, + { + "epoch": 1.7565416985462892, + "grad_norm": 2.090745528324452, + "learning_rate": 7.672619943474968e-07, + "loss": 0.2837, + "step": 11479 + }, + { + "epoch": 1.7566947207345065, + "grad_norm": 2.149798905042403, + "learning_rate": 7.663103023407337e-07, + "loss": 0.269, + "step": 11480 + }, + { + "epoch": 1.7568477429227238, + "grad_norm": 2.8280663649694042, + "learning_rate": 7.653591774109781e-07, + "loss": 0.3053, + "step": 11481 + }, + { + "epoch": 1.757000765110941, + "grad_norm": 2.0802449500599085, + "learning_rate": 7.644086196166423e-07, + "loss": 0.3047, + "step": 11482 + }, + { + "epoch": 1.7571537872991585, + "grad_norm": 1.946712250652231, + "learning_rate": 7.634586290161017e-07, + "loss": 0.2859, + "step": 11483 + }, + { + "epoch": 1.7573068094873756, + "grad_norm": 2.1479427054523903, + "learning_rate": 7.625092056676997e-07, + "loss": 0.2553, + "step": 11484 + }, + { + "epoch": 1.757459831675593, + "grad_norm": 2.0637537859831996, + "learning_rate": 7.615603496297475e-07, + "loss": 0.2663, + "step": 11485 + }, + { + "epoch": 1.7576128538638103, + "grad_norm": 2.345797647798903, + "learning_rate": 7.606120609605128e-07, + "loss": 0.2572, + "step": 11486 + }, + { + "epoch": 1.7577658760520274, + "grad_norm": 2.0232236730906106, + "learning_rate": 7.596643397182357e-07, + "loss": 0.2539, + "step": 11487 + }, + { + "epoch": 1.757918898240245, + "grad_norm": 2.1486066998428193, + "learning_rate": 7.587171859611219e-07, + "loss": 0.333, + "step": 11488 + }, + { + "epoch": 1.758071920428462, + "grad_norm": 1.887984788614984, + "learning_rate": 7.577705997473372e-07, + "loss": 0.2489, + "step": 11489 + }, + { + "epoch": 1.7582249426166794, + "grad_norm": 1.8155595847795774, + "learning_rate": 7.568245811350139e-07, + "loss": 0.2399, + "step": 11490 + }, + { + "epoch": 1.7583779648048967, + "grad_norm": 2.2819860715358384, + "learning_rate": 7.558791301822555e-07, + "loss": 0.3071, + "step": 11491 + }, + { + "epoch": 1.758530986993114, + "grad_norm": 2.005584975186744, + "learning_rate": 7.549342469471199e-07, + "loss": 0.2336, + "step": 11492 + }, + { + "epoch": 1.7586840091813314, + "grad_norm": 2.289801482583614, + "learning_rate": 7.539899314876409e-07, + "loss": 0.3048, + "step": 11493 + }, + { + "epoch": 1.7588370313695485, + "grad_norm": 2.121216076992854, + "learning_rate": 
7.530461838618119e-07, + "loss": 0.3218, + "step": 11494 + }, + { + "epoch": 1.7589900535577658, + "grad_norm": 2.236459890017379, + "learning_rate": 7.521030041275879e-07, + "loss": 0.267, + "step": 11495 + }, + { + "epoch": 1.7591430757459832, + "grad_norm": 2.2002981567336466, + "learning_rate": 7.51160392342899e-07, + "loss": 0.2827, + "step": 11496 + }, + { + "epoch": 1.7592960979342005, + "grad_norm": 2.0246238778322327, + "learning_rate": 7.502183485656311e-07, + "loss": 0.2729, + "step": 11497 + }, + { + "epoch": 1.7594491201224178, + "grad_norm": 2.2739189494236993, + "learning_rate": 7.492768728536404e-07, + "loss": 0.2925, + "step": 11498 + }, + { + "epoch": 1.759602142310635, + "grad_norm": 1.9264721755974386, + "learning_rate": 7.483359652647448e-07, + "loss": 0.2605, + "step": 11499 + }, + { + "epoch": 1.7597551644988525, + "grad_norm": 2.0289411572643523, + "learning_rate": 7.473956258567316e-07, + "loss": 0.2722, + "step": 11500 + }, + { + "epoch": 1.7599081866870696, + "grad_norm": 1.6723850393620163, + "learning_rate": 7.46455854687349e-07, + "loss": 0.1741, + "step": 11501 + }, + { + "epoch": 1.760061208875287, + "grad_norm": 2.3182455245711857, + "learning_rate": 7.45516651814312e-07, + "loss": 0.2993, + "step": 11502 + }, + { + "epoch": 1.7602142310635043, + "grad_norm": 2.6195480262424002, + "learning_rate": 7.44578017295301e-07, + "loss": 0.3212, + "step": 11503 + }, + { + "epoch": 1.7603672532517214, + "grad_norm": 1.762441987335934, + "learning_rate": 7.436399511879622e-07, + "loss": 0.2604, + "step": 11504 + }, + { + "epoch": 1.760520275439939, + "grad_norm": 1.7135814424977367, + "learning_rate": 7.42702453549905e-07, + "loss": 0.2559, + "step": 11505 + }, + { + "epoch": 1.760673297628156, + "grad_norm": 2.2057167673996303, + "learning_rate": 7.417655244387045e-07, + "loss": 0.2687, + "step": 11506 + }, + { + "epoch": 1.7608263198163734, + "grad_norm": 2.0842795751715477, + "learning_rate": 7.408291639119014e-07, + "loss": 0.2647, + "step": 11507 + }, + { + "epoch": 1.7609793420045907, + "grad_norm": 2.034933453675627, + "learning_rate": 7.398933720270018e-07, + "loss": 0.3, + "step": 11508 + }, + { + "epoch": 1.7611323641928078, + "grad_norm": 2.130052559162876, + "learning_rate": 7.389581488414754e-07, + "loss": 0.3058, + "step": 11509 + }, + { + "epoch": 1.7612853863810254, + "grad_norm": 1.902586949096245, + "learning_rate": 7.380234944127595e-07, + "loss": 0.2671, + "step": 11510 + }, + { + "epoch": 1.7614384085692425, + "grad_norm": 2.0243430655818573, + "learning_rate": 7.370894087982528e-07, + "loss": 0.2402, + "step": 11511 + }, + { + "epoch": 1.7615914307574598, + "grad_norm": 2.0713333991695277, + "learning_rate": 7.361558920553225e-07, + "loss": 0.2736, + "step": 11512 + }, + { + "epoch": 1.7617444529456772, + "grad_norm": 2.6500941540661587, + "learning_rate": 7.352229442412984e-07, + "loss": 0.3045, + "step": 11513 + }, + { + "epoch": 1.7618974751338943, + "grad_norm": 2.2426275693857325, + "learning_rate": 7.342905654134802e-07, + "loss": 0.2958, + "step": 11514 + }, + { + "epoch": 1.7620504973221118, + "grad_norm": 2.187412096709696, + "learning_rate": 7.333587556291233e-07, + "loss": 0.2794, + "step": 11515 + }, + { + "epoch": 1.762203519510329, + "grad_norm": 2.086117649436215, + "learning_rate": 7.324275149454563e-07, + "loss": 0.2924, + "step": 11516 + }, + { + "epoch": 1.7623565416985463, + "grad_norm": 2.3882021880357027, + "learning_rate": 7.314968434196734e-07, + "loss": 0.272, + "step": 11517 + }, + { + "epoch": 1.7625095638867636, + 
"grad_norm": 2.189237862691424, + "learning_rate": 7.305667411089257e-07, + "loss": 0.2529, + "step": 11518 + }, + { + "epoch": 1.7626625860749807, + "grad_norm": 2.2909008823470076, + "learning_rate": 7.296372080703351e-07, + "loss": 0.3102, + "step": 11519 + }, + { + "epoch": 1.7628156082631983, + "grad_norm": 2.479912838161086, + "learning_rate": 7.287082443609938e-07, + "loss": 0.3642, + "step": 11520 + }, + { + "epoch": 1.7629686304514154, + "grad_norm": 2.396033551508515, + "learning_rate": 7.27779850037944e-07, + "loss": 0.3179, + "step": 11521 + }, + { + "epoch": 1.7631216526396327, + "grad_norm": 2.0599105222120686, + "learning_rate": 7.268520251582101e-07, + "loss": 0.2572, + "step": 11522 + }, + { + "epoch": 1.76327467482785, + "grad_norm": 2.1985313014313226, + "learning_rate": 7.259247697787697e-07, + "loss": 0.2397, + "step": 11523 + }, + { + "epoch": 1.7634276970160672, + "grad_norm": 2.5257026577043353, + "learning_rate": 7.249980839565673e-07, + "loss": 0.4124, + "step": 11524 + }, + { + "epoch": 1.7635807192042847, + "grad_norm": 2.1187741614819218, + "learning_rate": 7.240719677485186e-07, + "loss": 0.2284, + "step": 11525 + }, + { + "epoch": 1.7637337413925018, + "grad_norm": 2.27650166500535, + "learning_rate": 7.23146421211498e-07, + "loss": 0.3091, + "step": 11526 + }, + { + "epoch": 1.7638867635807192, + "grad_norm": 2.1431620058394687, + "learning_rate": 7.222214444023468e-07, + "loss": 0.2679, + "step": 11527 + }, + { + "epoch": 1.7640397857689365, + "grad_norm": 2.2747276067202393, + "learning_rate": 7.212970373778705e-07, + "loss": 0.2761, + "step": 11528 + }, + { + "epoch": 1.7641928079571538, + "grad_norm": 2.3816843053242156, + "learning_rate": 7.203732001948427e-07, + "loss": 0.3181, + "step": 11529 + }, + { + "epoch": 1.7643458301453712, + "grad_norm": 1.9811359822858945, + "learning_rate": 7.194499329099991e-07, + "loss": 0.241, + "step": 11530 + }, + { + "epoch": 1.7644988523335883, + "grad_norm": 2.076332988962881, + "learning_rate": 7.185272355800399e-07, + "loss": 0.318, + "step": 11531 + }, + { + "epoch": 1.7646518745218056, + "grad_norm": 2.5309491908046593, + "learning_rate": 7.176051082616331e-07, + "loss": 0.3363, + "step": 11532 + }, + { + "epoch": 1.764804896710023, + "grad_norm": 1.9335674180540168, + "learning_rate": 7.16683551011409e-07, + "loss": 0.2686, + "step": 11533 + }, + { + "epoch": 1.7649579188982403, + "grad_norm": 1.986235927705645, + "learning_rate": 7.157625638859634e-07, + "loss": 0.2505, + "step": 11534 + }, + { + "epoch": 1.7651109410864576, + "grad_norm": 2.013218268869103, + "learning_rate": 7.14842146941862e-07, + "loss": 0.2755, + "step": 11535 + }, + { + "epoch": 1.7652639632746747, + "grad_norm": 2.5751645961695533, + "learning_rate": 7.139223002356266e-07, + "loss": 0.2704, + "step": 11536 + }, + { + "epoch": 1.7654169854628923, + "grad_norm": 1.9783689343552875, + "learning_rate": 7.130030238237484e-07, + "loss": 0.2792, + "step": 11537 + }, + { + "epoch": 1.7655700076511094, + "grad_norm": 1.9929281594053494, + "learning_rate": 7.120843177626879e-07, + "loss": 0.2913, + "step": 11538 + }, + { + "epoch": 1.7657230298393267, + "grad_norm": 2.3890388963972895, + "learning_rate": 7.111661821088633e-07, + "loss": 0.3186, + "step": 11539 + }, + { + "epoch": 1.765876052027544, + "grad_norm": 2.3135962005718786, + "learning_rate": 7.102486169186595e-07, + "loss": 0.2763, + "step": 11540 + }, + { + "epoch": 1.7660290742157612, + "grad_norm": 1.9568397396801063, + "learning_rate": 7.093316222484337e-07, + "loss": 0.2673, + 
"step": 11541 + }, + { + "epoch": 1.7661820964039787, + "grad_norm": 1.940537721405934, + "learning_rate": 7.08415198154494e-07, + "loss": 0.2478, + "step": 11542 + }, + { + "epoch": 1.7663351185921958, + "grad_norm": 2.2076985256142763, + "learning_rate": 7.074993446931288e-07, + "loss": 0.277, + "step": 11543 + }, + { + "epoch": 1.7664881407804132, + "grad_norm": 1.9708964282837547, + "learning_rate": 7.065840619205821e-07, + "loss": 0.2539, + "step": 11544 + }, + { + "epoch": 1.7666411629686305, + "grad_norm": 2.2308721958293627, + "learning_rate": 7.056693498930611e-07, + "loss": 0.3503, + "step": 11545 + }, + { + "epoch": 1.7667941851568476, + "grad_norm": 2.3264206303422523, + "learning_rate": 7.047552086667464e-07, + "loss": 0.3175, + "step": 11546 + }, + { + "epoch": 1.7669472073450652, + "grad_norm": 2.2786931204458947, + "learning_rate": 7.038416382977797e-07, + "loss": 0.2961, + "step": 11547 + }, + { + "epoch": 1.7671002295332823, + "grad_norm": 2.401697495954377, + "learning_rate": 7.029286388422608e-07, + "loss": 0.3503, + "step": 11548 + }, + { + "epoch": 1.7672532517214996, + "grad_norm": 2.2087065408054634, + "learning_rate": 7.020162103562667e-07, + "loss": 0.2838, + "step": 11549 + }, + { + "epoch": 1.767406273909717, + "grad_norm": 2.0496835744113904, + "learning_rate": 7.011043528958306e-07, + "loss": 0.2834, + "step": 11550 + }, + { + "epoch": 1.767559296097934, + "grad_norm": 2.0960520988601146, + "learning_rate": 7.001930665169543e-07, + "loss": 0.2617, + "step": 11551 + }, + { + "epoch": 1.7677123182861516, + "grad_norm": 2.1397333109537247, + "learning_rate": 6.992823512756019e-07, + "loss": 0.3063, + "step": 11552 + }, + { + "epoch": 1.7678653404743687, + "grad_norm": 2.28631592306058, + "learning_rate": 6.983722072277055e-07, + "loss": 0.2877, + "step": 11553 + }, + { + "epoch": 1.768018362662586, + "grad_norm": 1.7910000877177028, + "learning_rate": 6.974626344291602e-07, + "loss": 0.2054, + "step": 11554 + }, + { + "epoch": 1.7681713848508034, + "grad_norm": 1.8451767717460463, + "learning_rate": 6.965536329358258e-07, + "loss": 0.2168, + "step": 11555 + }, + { + "epoch": 1.7683244070390205, + "grad_norm": 2.277969473191107, + "learning_rate": 6.956452028035287e-07, + "loss": 0.2725, + "step": 11556 + }, + { + "epoch": 1.768477429227238, + "grad_norm": 1.8798605342727712, + "learning_rate": 6.947373440880578e-07, + "loss": 0.2392, + "step": 11557 + }, + { + "epoch": 1.7686304514154552, + "grad_norm": 2.018158526897199, + "learning_rate": 6.938300568451695e-07, + "loss": 0.2837, + "step": 11558 + }, + { + "epoch": 1.7687834736036725, + "grad_norm": 2.082724016431595, + "learning_rate": 6.929233411305836e-07, + "loss": 0.297, + "step": 11559 + }, + { + "epoch": 1.7689364957918898, + "grad_norm": 2.4526921631636722, + "learning_rate": 6.920171969999845e-07, + "loss": 0.2738, + "step": 11560 + }, + { + "epoch": 1.7690895179801072, + "grad_norm": 2.137846305201505, + "learning_rate": 6.911116245090222e-07, + "loss": 0.2852, + "step": 11561 + }, + { + "epoch": 1.7692425401683245, + "grad_norm": 2.1361546200502093, + "learning_rate": 6.90206623713312e-07, + "loss": 0.3244, + "step": 11562 + }, + { + "epoch": 1.7693955623565416, + "grad_norm": 2.2487698262458062, + "learning_rate": 6.893021946684309e-07, + "loss": 0.3367, + "step": 11563 + }, + { + "epoch": 1.769548584544759, + "grad_norm": 2.0760080354126487, + "learning_rate": 6.883983374299296e-07, + "loss": 0.3091, + "step": 11564 + }, + { + "epoch": 1.7697016067329763, + "grad_norm": 2.10544391417939, + 
"learning_rate": 6.874950520533108e-07, + "loss": 0.3197, + "step": 11565 + }, + { + "epoch": 1.7698546289211936, + "grad_norm": 2.161015077218226, + "learning_rate": 6.86592338594051e-07, + "loss": 0.2401, + "step": 11566 + }, + { + "epoch": 1.770007651109411, + "grad_norm": 2.291425465652151, + "learning_rate": 6.856901971075936e-07, + "loss": 0.2838, + "step": 11567 + }, + { + "epoch": 1.770160673297628, + "grad_norm": 1.9249319567789467, + "learning_rate": 6.847886276493365e-07, + "loss": 0.2095, + "step": 11568 + }, + { + "epoch": 1.7703136954858456, + "grad_norm": 2.20987764398357, + "learning_rate": 6.8388763027465e-07, + "loss": 0.2599, + "step": 11569 + }, + { + "epoch": 1.7704667176740627, + "grad_norm": 2.3077010605825703, + "learning_rate": 6.82987205038872e-07, + "loss": 0.3059, + "step": 11570 + }, + { + "epoch": 1.77061973986228, + "grad_norm": 2.2411642688887174, + "learning_rate": 6.820873519972949e-07, + "loss": 0.2815, + "step": 11571 + }, + { + "epoch": 1.7707727620504974, + "grad_norm": 2.2461254208083763, + "learning_rate": 6.811880712051866e-07, + "loss": 0.3286, + "step": 11572 + }, + { + "epoch": 1.7709257842387145, + "grad_norm": 2.4101762861822853, + "learning_rate": 6.802893627177753e-07, + "loss": 0.2896, + "step": 11573 + }, + { + "epoch": 1.771078806426932, + "grad_norm": 2.2324724632302635, + "learning_rate": 6.793912265902514e-07, + "loss": 0.2563, + "step": 11574 + }, + { + "epoch": 1.7712318286151492, + "grad_norm": 2.409887429701208, + "learning_rate": 6.78493662877775e-07, + "loss": 0.337, + "step": 11575 + }, + { + "epoch": 1.7713848508033665, + "grad_norm": 2.211937176981524, + "learning_rate": 6.77596671635471e-07, + "loss": 0.2453, + "step": 11576 + }, + { + "epoch": 1.7715378729915838, + "grad_norm": 1.940927053286327, + "learning_rate": 6.767002529184219e-07, + "loss": 0.2219, + "step": 11577 + }, + { + "epoch": 1.771690895179801, + "grad_norm": 2.122117699775585, + "learning_rate": 6.758044067816849e-07, + "loss": 0.2919, + "step": 11578 + }, + { + "epoch": 1.7718439173680185, + "grad_norm": 2.014861880625214, + "learning_rate": 6.749091332802748e-07, + "loss": 0.2203, + "step": 11579 + }, + { + "epoch": 1.7719969395562356, + "grad_norm": 2.134298312181937, + "learning_rate": 6.740144324691755e-07, + "loss": 0.2862, + "step": 11580 + }, + { + "epoch": 1.772149961744453, + "grad_norm": 2.1255099152844896, + "learning_rate": 6.73120304403333e-07, + "loss": 0.2669, + "step": 11581 + }, + { + "epoch": 1.7723029839326703, + "grad_norm": 2.523914791255019, + "learning_rate": 6.722267491376599e-07, + "loss": 0.3238, + "step": 11582 + }, + { + "epoch": 1.7724560061208874, + "grad_norm": 1.9316026331139717, + "learning_rate": 6.713337667270325e-07, + "loss": 0.2682, + "step": 11583 + }, + { + "epoch": 1.772609028309105, + "grad_norm": 2.3472903220055366, + "learning_rate": 6.704413572262924e-07, + "loss": 0.316, + "step": 11584 + }, + { + "epoch": 1.772762050497322, + "grad_norm": 2.054524648699194, + "learning_rate": 6.695495206902458e-07, + "loss": 0.2599, + "step": 11585 + }, + { + "epoch": 1.7729150726855394, + "grad_norm": 1.8121343266405059, + "learning_rate": 6.686582571736643e-07, + "loss": 0.2625, + "step": 11586 + }, + { + "epoch": 1.7730680948737567, + "grad_norm": 2.1207606831716674, + "learning_rate": 6.677675667312844e-07, + "loss": 0.2734, + "step": 11587 + }, + { + "epoch": 1.7732211170619738, + "grad_norm": 2.4304987605550656, + "learning_rate": 6.668774494178054e-07, + "loss": 0.3168, + "step": 11588 + }, + { + "epoch": 
1.7733741392501914, + "grad_norm": 1.9828230674015517, + "learning_rate": 6.659879052878925e-07, + "loss": 0.2348, + "step": 11589 + }, + { + "epoch": 1.7735271614384085, + "grad_norm": 2.460569333288936, + "learning_rate": 6.650989343961788e-07, + "loss": 0.3821, + "step": 11590 + }, + { + "epoch": 1.7736801836266258, + "grad_norm": 2.016231564778463, + "learning_rate": 6.642105367972573e-07, + "loss": 0.2461, + "step": 11591 + }, + { + "epoch": 1.7738332058148432, + "grad_norm": 1.9688510542138224, + "learning_rate": 6.633227125456887e-07, + "loss": 0.242, + "step": 11592 + }, + { + "epoch": 1.7739862280030605, + "grad_norm": 1.953541653477738, + "learning_rate": 6.624354616959971e-07, + "loss": 0.2461, + "step": 11593 + }, + { + "epoch": 1.7741392501912778, + "grad_norm": 2.1083010117470486, + "learning_rate": 6.615487843026735e-07, + "loss": 0.2774, + "step": 11594 + }, + { + "epoch": 1.774292272379495, + "grad_norm": 2.1541068089107656, + "learning_rate": 6.606626804201688e-07, + "loss": 0.2902, + "step": 11595 + }, + { + "epoch": 1.7744452945677123, + "grad_norm": 2.285536978638378, + "learning_rate": 6.597771501029082e-07, + "loss": 0.2622, + "step": 11596 + }, + { + "epoch": 1.7745983167559296, + "grad_norm": 2.319317293092176, + "learning_rate": 6.588921934052705e-07, + "loss": 0.2898, + "step": 11597 + }, + { + "epoch": 1.774751338944147, + "grad_norm": 1.993917651888502, + "learning_rate": 6.580078103816034e-07, + "loss": 0.2264, + "step": 11598 + }, + { + "epoch": 1.7749043611323643, + "grad_norm": 1.9890587475176882, + "learning_rate": 6.571240010862267e-07, + "loss": 0.2461, + "step": 11599 + }, + { + "epoch": 1.7750573833205814, + "grad_norm": 1.7738468768909665, + "learning_rate": 6.562407655734116e-07, + "loss": 0.2365, + "step": 11600 + }, + { + "epoch": 1.775210405508799, + "grad_norm": 2.229496564560051, + "learning_rate": 6.553581038974055e-07, + "loss": 0.3035, + "step": 11601 + }, + { + "epoch": 1.775363427697016, + "grad_norm": 2.0336619601974375, + "learning_rate": 6.544760161124153e-07, + "loss": 0.2633, + "step": 11602 + }, + { + "epoch": 1.7755164498852334, + "grad_norm": 2.002039939027311, + "learning_rate": 6.535945022726109e-07, + "loss": 0.2538, + "step": 11603 + }, + { + "epoch": 1.7756694720734507, + "grad_norm": 2.0847731213098033, + "learning_rate": 6.527135624321324e-07, + "loss": 0.3249, + "step": 11604 + }, + { + "epoch": 1.7758224942616678, + "grad_norm": 2.2660603591564517, + "learning_rate": 6.51833196645083e-07, + "loss": 0.3123, + "step": 11605 + }, + { + "epoch": 1.7759755164498854, + "grad_norm": 2.432517369709772, + "learning_rate": 6.509534049655231e-07, + "loss": 0.2786, + "step": 11606 + }, + { + "epoch": 1.7761285386381025, + "grad_norm": 1.8314023909408828, + "learning_rate": 6.500741874474903e-07, + "loss": 0.2399, + "step": 11607 + }, + { + "epoch": 1.7762815608263198, + "grad_norm": 2.32761232065426, + "learning_rate": 6.491955441449793e-07, + "loss": 0.3315, + "step": 11608 + }, + { + "epoch": 1.7764345830145372, + "grad_norm": 2.27982039867596, + "learning_rate": 6.483174751119503e-07, + "loss": 0.289, + "step": 11609 + }, + { + "epoch": 1.7765876052027543, + "grad_norm": 2.112027368923221, + "learning_rate": 6.47439980402329e-07, + "loss": 0.2515, + "step": 11610 + }, + { + "epoch": 1.7767406273909718, + "grad_norm": 2.2189829473835383, + "learning_rate": 6.465630600700057e-07, + "loss": 0.2844, + "step": 11611 + }, + { + "epoch": 1.776893649579189, + "grad_norm": 1.9160559036153286, + "learning_rate": 6.456867141688361e-07, + 
"loss": 0.2474, + "step": 11612 + }, + { + "epoch": 1.7770466717674063, + "grad_norm": 1.8834851653508435, + "learning_rate": 6.448109427526394e-07, + "loss": 0.3078, + "step": 11613 + }, + { + "epoch": 1.7771996939556236, + "grad_norm": 2.0496891897542695, + "learning_rate": 6.439357458752005e-07, + "loss": 0.2768, + "step": 11614 + }, + { + "epoch": 1.7773527161438407, + "grad_norm": 2.071621909374972, + "learning_rate": 6.430611235902684e-07, + "loss": 0.2088, + "step": 11615 + }, + { + "epoch": 1.7775057383320583, + "grad_norm": 2.405026711662064, + "learning_rate": 6.421870759515558e-07, + "loss": 0.3089, + "step": 11616 + }, + { + "epoch": 1.7776587605202754, + "grad_norm": 2.082176155954287, + "learning_rate": 6.413136030127465e-07, + "loss": 0.2935, + "step": 11617 + }, + { + "epoch": 1.7778117827084927, + "grad_norm": 2.0675961828515295, + "learning_rate": 6.404407048274775e-07, + "loss": 0.3031, + "step": 11618 + }, + { + "epoch": 1.77796480489671, + "grad_norm": 2.359503068353368, + "learning_rate": 6.395683814493592e-07, + "loss": 0.3404, + "step": 11619 + }, + { + "epoch": 1.7781178270849272, + "grad_norm": 2.4968081203038293, + "learning_rate": 6.386966329319666e-07, + "loss": 0.3414, + "step": 11620 + }, + { + "epoch": 1.7782708492731447, + "grad_norm": 2.034343682425657, + "learning_rate": 6.378254593288347e-07, + "loss": 0.2631, + "step": 11621 + }, + { + "epoch": 1.7784238714613618, + "grad_norm": 2.01888433987026, + "learning_rate": 6.36954860693465e-07, + "loss": 0.2474, + "step": 11622 + }, + { + "epoch": 1.7785768936495792, + "grad_norm": 2.239054272129354, + "learning_rate": 6.360848370793293e-07, + "loss": 0.3453, + "step": 11623 + }, + { + "epoch": 1.7787299158377965, + "grad_norm": 1.8388079956214614, + "learning_rate": 6.352153885398516e-07, + "loss": 0.2245, + "step": 11624 + }, + { + "epoch": 1.7788829380260138, + "grad_norm": 2.3830486932199335, + "learning_rate": 6.343465151284334e-07, + "loss": 0.3301, + "step": 11625 + }, + { + "epoch": 1.7790359602142312, + "grad_norm": 2.0508156507592705, + "learning_rate": 6.334782168984365e-07, + "loss": 0.2308, + "step": 11626 + }, + { + "epoch": 1.7791889824024483, + "grad_norm": 2.086349526363687, + "learning_rate": 6.326104939031818e-07, + "loss": 0.2734, + "step": 11627 + }, + { + "epoch": 1.7793420045906656, + "grad_norm": 2.1454319811649882, + "learning_rate": 6.317433461959622e-07, + "loss": 0.2935, + "step": 11628 + }, + { + "epoch": 1.779495026778883, + "grad_norm": 2.3671829733376453, + "learning_rate": 6.308767738300358e-07, + "loss": 0.3281, + "step": 11629 + }, + { + "epoch": 1.7796480489671003, + "grad_norm": 2.192383371880891, + "learning_rate": 6.30010776858615e-07, + "loss": 0.3156, + "step": 11630 + }, + { + "epoch": 1.7798010711553176, + "grad_norm": 2.131210440220201, + "learning_rate": 6.291453553348892e-07, + "loss": 0.3044, + "step": 11631 + }, + { + "epoch": 1.7799540933435347, + "grad_norm": 2.5597526386839844, + "learning_rate": 6.282805093120059e-07, + "loss": 0.3019, + "step": 11632 + }, + { + "epoch": 1.780107115531752, + "grad_norm": 2.3159736097903587, + "learning_rate": 6.274162388430794e-07, + "loss": 0.2879, + "step": 11633 + }, + { + "epoch": 1.7802601377199694, + "grad_norm": 2.4289419941363324, + "learning_rate": 6.265525439811881e-07, + "loss": 0.354, + "step": 11634 + }, + { + "epoch": 1.7804131599081867, + "grad_norm": 2.023080467977148, + "learning_rate": 6.256894247793732e-07, + "loss": 0.2547, + "step": 11635 + }, + { + "epoch": 1.780566182096404, + "grad_norm": 
1.9144497659810062, + "learning_rate": 6.248268812906433e-07, + "loss": 0.2787, + "step": 11636 + }, + { + "epoch": 1.7807192042846212, + "grad_norm": 2.104341239947922, + "learning_rate": 6.239649135679704e-07, + "loss": 0.2601, + "step": 11637 + }, + { + "epoch": 1.7808722264728387, + "grad_norm": 1.8563273314684772, + "learning_rate": 6.231035216642922e-07, + "loss": 0.2237, + "step": 11638 + }, + { + "epoch": 1.7810252486610558, + "grad_norm": 1.9084349351630756, + "learning_rate": 6.222427056325087e-07, + "loss": 0.2434, + "step": 11639 + }, + { + "epoch": 1.7811782708492732, + "grad_norm": 2.20964827735286, + "learning_rate": 6.213824655254874e-07, + "loss": 0.3002, + "step": 11640 + }, + { + "epoch": 1.7813312930374905, + "grad_norm": 2.0264820560965933, + "learning_rate": 6.205228013960574e-07, + "loss": 0.2534, + "step": 11641 + }, + { + "epoch": 1.7814843152257076, + "grad_norm": 2.2198244041607924, + "learning_rate": 6.196637132970151e-07, + "loss": 0.2982, + "step": 11642 + }, + { + "epoch": 1.7816373374139252, + "grad_norm": 2.051614021142709, + "learning_rate": 6.188052012811207e-07, + "loss": 0.2739, + "step": 11643 + }, + { + "epoch": 1.7817903596021423, + "grad_norm": 2.148961658639848, + "learning_rate": 6.179472654010976e-07, + "loss": 0.2943, + "step": 11644 + }, + { + "epoch": 1.7819433817903596, + "grad_norm": 1.96673620662787, + "learning_rate": 6.170899057096347e-07, + "loss": 0.3192, + "step": 11645 + }, + { + "epoch": 1.782096403978577, + "grad_norm": 2.0614212600984705, + "learning_rate": 6.162331222593898e-07, + "loss": 0.335, + "step": 11646 + }, + { + "epoch": 1.782249426166794, + "grad_norm": 2.0945513989942457, + "learning_rate": 6.153769151029765e-07, + "loss": 0.2588, + "step": 11647 + }, + { + "epoch": 1.7824024483550116, + "grad_norm": 2.131680603392567, + "learning_rate": 6.145212842929793e-07, + "loss": 0.3115, + "step": 11648 + }, + { + "epoch": 1.7825554705432287, + "grad_norm": 2.169429836682061, + "learning_rate": 6.136662298819484e-07, + "loss": 0.3033, + "step": 11649 + }, + { + "epoch": 1.782708492731446, + "grad_norm": 1.8813273070863772, + "learning_rate": 6.128117519223919e-07, + "loss": 0.2579, + "step": 11650 + }, + { + "epoch": 1.7828615149196634, + "grad_norm": 1.9967126429259028, + "learning_rate": 6.119578504667878e-07, + "loss": 0.2296, + "step": 11651 + }, + { + "epoch": 1.7830145371078805, + "grad_norm": 2.325664541784967, + "learning_rate": 6.111045255675808e-07, + "loss": 0.262, + "step": 11652 + }, + { + "epoch": 1.783167559296098, + "grad_norm": 2.1955597124128086, + "learning_rate": 6.102517772771721e-07, + "loss": 0.3015, + "step": 11653 + }, + { + "epoch": 1.7833205814843152, + "grad_norm": 2.2385583608451967, + "learning_rate": 6.093996056479356e-07, + "loss": 0.3167, + "step": 11654 + }, + { + "epoch": 1.7834736036725325, + "grad_norm": 2.0869614373628185, + "learning_rate": 6.085480107322072e-07, + "loss": 0.3075, + "step": 11655 + }, + { + "epoch": 1.7836266258607498, + "grad_norm": 1.8924749927757958, + "learning_rate": 6.076969925822829e-07, + "loss": 0.2345, + "step": 11656 + }, + { + "epoch": 1.783779648048967, + "grad_norm": 2.347694523952781, + "learning_rate": 6.068465512504296e-07, + "loss": 0.3419, + "step": 11657 + }, + { + "epoch": 1.7839326702371845, + "grad_norm": 2.1575066247384034, + "learning_rate": 6.059966867888778e-07, + "loss": 0.2692, + "step": 11658 + }, + { + "epoch": 1.7840856924254016, + "grad_norm": 2.4130816506654935, + "learning_rate": 6.051473992498158e-07, + "loss": 0.3215, + "step": 11659 + 
}, + { + "epoch": 1.784238714613619, + "grad_norm": 1.8151835358364201, + "learning_rate": 6.042986886854063e-07, + "loss": 0.252, + "step": 11660 + }, + { + "epoch": 1.7843917368018363, + "grad_norm": 1.9735591729644815, + "learning_rate": 6.03450555147771e-07, + "loss": 0.2327, + "step": 11661 + }, + { + "epoch": 1.7845447589900536, + "grad_norm": 2.292921280603086, + "learning_rate": 6.026029986889959e-07, + "loss": 0.3124, + "step": 11662 + }, + { + "epoch": 1.784697781178271, + "grad_norm": 2.143060264993622, + "learning_rate": 6.017560193611338e-07, + "loss": 0.2716, + "step": 11663 + }, + { + "epoch": 1.784850803366488, + "grad_norm": 2.1064055632602052, + "learning_rate": 6.00909617216201e-07, + "loss": 0.324, + "step": 11664 + }, + { + "epoch": 1.7850038255547054, + "grad_norm": 2.491721456754776, + "learning_rate": 6.000637923061769e-07, + "loss": 0.2778, + "step": 11665 + }, + { + "epoch": 1.7851568477429227, + "grad_norm": 1.9397667295022485, + "learning_rate": 5.99218544683009e-07, + "loss": 0.2748, + "step": 11666 + }, + { + "epoch": 1.78530986993114, + "grad_norm": 2.0315429084869323, + "learning_rate": 5.983738743986068e-07, + "loss": 0.2424, + "step": 11667 + }, + { + "epoch": 1.7854628921193574, + "grad_norm": 2.3518604760030946, + "learning_rate": 5.975297815048442e-07, + "loss": 0.3663, + "step": 11668 + }, + { + "epoch": 1.7856159143075745, + "grad_norm": 2.260235840291961, + "learning_rate": 5.9668626605356e-07, + "loss": 0.3246, + "step": 11669 + }, + { + "epoch": 1.785768936495792, + "grad_norm": 2.2893769844413283, + "learning_rate": 5.958433280965581e-07, + "loss": 0.2896, + "step": 11670 + }, + { + "epoch": 1.7859219586840092, + "grad_norm": 2.2230273719541893, + "learning_rate": 5.950009676856061e-07, + "loss": 0.36, + "step": 11671 + }, + { + "epoch": 1.7860749808722265, + "grad_norm": 2.3200719885128858, + "learning_rate": 5.941591848724381e-07, + "loss": 0.3401, + "step": 11672 + }, + { + "epoch": 1.7862280030604438, + "grad_norm": 2.1974597281976593, + "learning_rate": 5.933179797087507e-07, + "loss": 0.3361, + "step": 11673 + }, + { + "epoch": 1.786381025248661, + "grad_norm": 2.5283285291331223, + "learning_rate": 5.924773522462024e-07, + "loss": 0.2941, + "step": 11674 + }, + { + "epoch": 1.7865340474368785, + "grad_norm": 2.112766463182009, + "learning_rate": 5.916373025364264e-07, + "loss": 0.2504, + "step": 11675 + }, + { + "epoch": 1.7866870696250956, + "grad_norm": 2.1968271904468812, + "learning_rate": 5.907978306310081e-07, + "loss": 0.311, + "step": 11676 + }, + { + "epoch": 1.786840091813313, + "grad_norm": 2.21326844605421, + "learning_rate": 5.899589365815018e-07, + "loss": 0.346, + "step": 11677 + }, + { + "epoch": 1.7869931140015303, + "grad_norm": 2.249810646236595, + "learning_rate": 5.891206204394329e-07, + "loss": 0.2816, + "step": 11678 + }, + { + "epoch": 1.7871461361897474, + "grad_norm": 1.8485888703819569, + "learning_rate": 5.882828822562814e-07, + "loss": 0.2307, + "step": 11679 + }, + { + "epoch": 1.787299158377965, + "grad_norm": 2.0267399285945404, + "learning_rate": 5.87445722083494e-07, + "loss": 0.2479, + "step": 11680 + }, + { + "epoch": 1.787452180566182, + "grad_norm": 2.0911961020778653, + "learning_rate": 5.866091399724916e-07, + "loss": 0.3178, + "step": 11681 + }, + { + "epoch": 1.7876052027543994, + "grad_norm": 1.9366386472362302, + "learning_rate": 5.857731359746433e-07, + "loss": 0.2728, + "step": 11682 + }, + { + "epoch": 1.7877582249426167, + "grad_norm": 2.0375420240316817, + "learning_rate": 
5.849377101412967e-07, + "loss": 0.3075, + "step": 11683 + }, + { + "epoch": 1.7879112471308338, + "grad_norm": 1.7988385396357176, + "learning_rate": 5.841028625237589e-07, + "loss": 0.2719, + "step": 11684 + }, + { + "epoch": 1.7880642693190514, + "grad_norm": 2.5279457807415464, + "learning_rate": 5.832685931732963e-07, + "loss": 0.3454, + "step": 11685 + }, + { + "epoch": 1.7882172915072685, + "grad_norm": 2.313295462058856, + "learning_rate": 5.824349021411502e-07, + "loss": 0.256, + "step": 11686 + }, + { + "epoch": 1.7883703136954858, + "grad_norm": 2.1886780113954254, + "learning_rate": 5.816017894785197e-07, + "loss": 0.2694, + "step": 11687 + }, + { + "epoch": 1.7885233358837032, + "grad_norm": 1.8681046262205732, + "learning_rate": 5.807692552365652e-07, + "loss": 0.2614, + "step": 11688 + }, + { + "epoch": 1.7886763580719203, + "grad_norm": 2.0027057223970317, + "learning_rate": 5.799372994664199e-07, + "loss": 0.259, + "step": 11689 + }, + { + "epoch": 1.7888293802601378, + "grad_norm": 2.337302769224805, + "learning_rate": 5.791059222191764e-07, + "loss": 0.2759, + "step": 11690 + }, + { + "epoch": 1.788982402448355, + "grad_norm": 2.121766483543593, + "learning_rate": 5.782751235458939e-07, + "loss": 0.323, + "step": 11691 + }, + { + "epoch": 1.7891354246365723, + "grad_norm": 1.9704435983259665, + "learning_rate": 5.774449034975938e-07, + "loss": 0.3234, + "step": 11692 + }, + { + "epoch": 1.7892884468247896, + "grad_norm": 2.0983690476734305, + "learning_rate": 5.766152621252629e-07, + "loss": 0.2725, + "step": 11693 + }, + { + "epoch": 1.789441469013007, + "grad_norm": 2.1429735780924744, + "learning_rate": 5.757861994798542e-07, + "loss": 0.2911, + "step": 11694 + }, + { + "epoch": 1.7895944912012243, + "grad_norm": 1.9899945550379567, + "learning_rate": 5.749577156122821e-07, + "loss": 0.2584, + "step": 11695 + }, + { + "epoch": 1.7897475133894414, + "grad_norm": 2.1339230085642242, + "learning_rate": 5.741298105734272e-07, + "loss": 0.2873, + "step": 11696 + }, + { + "epoch": 1.7899005355776587, + "grad_norm": 1.9769567723769148, + "learning_rate": 5.733024844141355e-07, + "loss": 0.2111, + "step": 11697 + }, + { + "epoch": 1.790053557765876, + "grad_norm": 1.8199371530775728, + "learning_rate": 5.724757371852141e-07, + "loss": 0.2833, + "step": 11698 + }, + { + "epoch": 1.7902065799540934, + "grad_norm": 2.204375861271401, + "learning_rate": 5.716495689374413e-07, + "loss": 0.3179, + "step": 11699 + }, + { + "epoch": 1.7903596021423107, + "grad_norm": 2.392939646494985, + "learning_rate": 5.708239797215509e-07, + "loss": 0.2879, + "step": 11700 + }, + { + "epoch": 1.7905126243305278, + "grad_norm": 2.262298036101752, + "learning_rate": 5.699989695882458e-07, + "loss": 0.3352, + "step": 11701 + }, + { + "epoch": 1.7906656465187454, + "grad_norm": 1.9027329378061686, + "learning_rate": 5.691745385881964e-07, + "loss": 0.1847, + "step": 11702 + }, + { + "epoch": 1.7908186687069625, + "grad_norm": 2.150646438874415, + "learning_rate": 5.683506867720301e-07, + "loss": 0.2933, + "step": 11703 + }, + { + "epoch": 1.7909716908951798, + "grad_norm": 2.2417842603845526, + "learning_rate": 5.675274141903464e-07, + "loss": 0.2996, + "step": 11704 + }, + { + "epoch": 1.7911247130833972, + "grad_norm": 2.346125193672816, + "learning_rate": 5.66704720893706e-07, + "loss": 0.2515, + "step": 11705 + }, + { + "epoch": 1.7912777352716143, + "grad_norm": 1.9921160410033858, + "learning_rate": 5.658826069326284e-07, + "loss": 0.2964, + "step": 11706 + }, + { + "epoch": 1.7914307574598318, 
+ "grad_norm": 2.1445987832520053, + "learning_rate": 5.650610723576078e-07, + "loss": 0.2592, + "step": 11707 + }, + { + "epoch": 1.791583779648049, + "grad_norm": 2.0694519109933727, + "learning_rate": 5.642401172190981e-07, + "loss": 0.2388, + "step": 11708 + }, + { + "epoch": 1.7917368018362663, + "grad_norm": 2.238611258670583, + "learning_rate": 5.634197415675124e-07, + "loss": 0.2617, + "step": 11709 + }, + { + "epoch": 1.7918898240244836, + "grad_norm": 2.151535537763062, + "learning_rate": 5.625999454532383e-07, + "loss": 0.2704, + "step": 11710 + }, + { + "epoch": 1.7920428462127007, + "grad_norm": 2.059430799019778, + "learning_rate": 5.617807289266208e-07, + "loss": 0.3045, + "step": 11711 + }, + { + "epoch": 1.7921958684009183, + "grad_norm": 2.119929153330778, + "learning_rate": 5.609620920379721e-07, + "loss": 0.3171, + "step": 11712 + }, + { + "epoch": 1.7923488905891354, + "grad_norm": 1.9604155333595272, + "learning_rate": 5.601440348375653e-07, + "loss": 0.2426, + "step": 11713 + }, + { + "epoch": 1.7925019127773527, + "grad_norm": 1.9606199783978195, + "learning_rate": 5.593265573756434e-07, + "loss": 0.2463, + "step": 11714 + }, + { + "epoch": 1.79265493496557, + "grad_norm": 2.0879423430699737, + "learning_rate": 5.585096597024098e-07, + "loss": 0.2984, + "step": 11715 + }, + { + "epoch": 1.7928079571537872, + "grad_norm": 2.1097963203212102, + "learning_rate": 5.576933418680331e-07, + "loss": 0.3156, + "step": 11716 + }, + { + "epoch": 1.7929609793420047, + "grad_norm": 1.9424617217380995, + "learning_rate": 5.568776039226465e-07, + "loss": 0.2702, + "step": 11717 + }, + { + "epoch": 1.7931140015302218, + "grad_norm": 2.0457163778091196, + "learning_rate": 5.560624459163488e-07, + "loss": 0.2134, + "step": 11718 + }, + { + "epoch": 1.7932670237184392, + "grad_norm": 2.2486400242745916, + "learning_rate": 5.552478678992001e-07, + "loss": 0.244, + "step": 11719 + }, + { + "epoch": 1.7934200459066565, + "grad_norm": 2.1376451976669166, + "learning_rate": 5.544338699212281e-07, + "loss": 0.216, + "step": 11720 + }, + { + "epoch": 1.7935730680948736, + "grad_norm": 2.388209980608498, + "learning_rate": 5.536204520324239e-07, + "loss": 0.2641, + "step": 11721 + }, + { + "epoch": 1.7937260902830912, + "grad_norm": 2.1460112607255053, + "learning_rate": 5.528076142827422e-07, + "loss": 0.2468, + "step": 11722 + }, + { + "epoch": 1.7938791124713083, + "grad_norm": 2.0739753388599462, + "learning_rate": 5.519953567221025e-07, + "loss": 0.2704, + "step": 11723 + }, + { + "epoch": 1.7940321346595256, + "grad_norm": 2.0422144203755295, + "learning_rate": 5.511836794003889e-07, + "loss": 0.2772, + "step": 11724 + }, + { + "epoch": 1.794185156847743, + "grad_norm": 2.0214617735698037, + "learning_rate": 5.503725823674488e-07, + "loss": 0.2763, + "step": 11725 + }, + { + "epoch": 1.7943381790359603, + "grad_norm": 1.7177450706562944, + "learning_rate": 5.49562065673096e-07, + "loss": 0.2216, + "step": 11726 + }, + { + "epoch": 1.7944912012241776, + "grad_norm": 2.0555841425573567, + "learning_rate": 5.48752129367105e-07, + "loss": 0.2227, + "step": 11727 + }, + { + "epoch": 1.7946442234123947, + "grad_norm": 2.080887556455304, + "learning_rate": 5.479427734992226e-07, + "loss": 0.2297, + "step": 11728 + }, + { + "epoch": 1.794797245600612, + "grad_norm": 2.1233295078008037, + "learning_rate": 5.471339981191503e-07, + "loss": 0.3034, + "step": 11729 + }, + { + "epoch": 1.7949502677888294, + "grad_norm": 2.3167717650903046, + "learning_rate": 5.463258032765573e-07, + "loss": 0.2657, + 
"step": 11730 + }, + { + "epoch": 1.7951032899770467, + "grad_norm": 2.319827835372855, + "learning_rate": 5.455181890210814e-07, + "loss": 0.2852, + "step": 11731 + }, + { + "epoch": 1.795256312165264, + "grad_norm": 1.9427112608034016, + "learning_rate": 5.447111554023199e-07, + "loss": 0.2446, + "step": 11732 + }, + { + "epoch": 1.7954093343534812, + "grad_norm": 2.0329376925996603, + "learning_rate": 5.43904702469833e-07, + "loss": 0.2396, + "step": 11733 + }, + { + "epoch": 1.7955623565416987, + "grad_norm": 1.7308500793905726, + "learning_rate": 5.430988302731544e-07, + "loss": 0.2522, + "step": 11734 + }, + { + "epoch": 1.7957153787299158, + "grad_norm": 2.1219125389826554, + "learning_rate": 5.422935388617689e-07, + "loss": 0.2545, + "step": 11735 + }, + { + "epoch": 1.7958684009181332, + "grad_norm": 1.972590600455119, + "learning_rate": 5.414888282851372e-07, + "loss": 0.2451, + "step": 11736 + }, + { + "epoch": 1.7960214231063505, + "grad_norm": 1.9834371644107958, + "learning_rate": 5.406846985926805e-07, + "loss": 0.2387, + "step": 11737 + }, + { + "epoch": 1.7961744452945676, + "grad_norm": 1.9935226037345846, + "learning_rate": 5.398811498337786e-07, + "loss": 0.2365, + "step": 11738 + }, + { + "epoch": 1.7963274674827852, + "grad_norm": 3.541082073396699, + "learning_rate": 5.39078182057785e-07, + "loss": 0.3011, + "step": 11739 + }, + { + "epoch": 1.7964804896710023, + "grad_norm": 2.2449387681269695, + "learning_rate": 5.382757953140105e-07, + "loss": 0.2998, + "step": 11740 + }, + { + "epoch": 1.7966335118592196, + "grad_norm": 2.373753123628044, + "learning_rate": 5.374739896517345e-07, + "loss": 0.3223, + "step": 11741 + }, + { + "epoch": 1.796786534047437, + "grad_norm": 2.3114047848746972, + "learning_rate": 5.366727651201986e-07, + "loss": 0.2996, + "step": 11742 + }, + { + "epoch": 1.796939556235654, + "grad_norm": 2.0196626112232257, + "learning_rate": 5.358721217686091e-07, + "loss": 0.2923, + "step": 11743 + }, + { + "epoch": 1.7970925784238716, + "grad_norm": 2.035219870357041, + "learning_rate": 5.350720596461367e-07, + "loss": 0.2943, + "step": 11744 + }, + { + "epoch": 1.7972456006120887, + "grad_norm": 2.2123252681687724, + "learning_rate": 5.342725788019154e-07, + "loss": 0.2645, + "step": 11745 + }, + { + "epoch": 1.797398622800306, + "grad_norm": 2.1569997981072664, + "learning_rate": 5.334736792850448e-07, + "loss": 0.2261, + "step": 11746 + }, + { + "epoch": 1.7975516449885234, + "grad_norm": 2.420526338846463, + "learning_rate": 5.326753611445901e-07, + "loss": 0.2685, + "step": 11747 + }, + { + "epoch": 1.7977046671767405, + "grad_norm": 2.333183084239707, + "learning_rate": 5.318776244295776e-07, + "loss": 0.2612, + "step": 11748 + }, + { + "epoch": 1.797857689364958, + "grad_norm": 2.0257483589647833, + "learning_rate": 5.310804691889993e-07, + "loss": 0.2629, + "step": 11749 + }, + { + "epoch": 1.7980107115531752, + "grad_norm": 2.0788523474030485, + "learning_rate": 5.302838954718115e-07, + "loss": 0.3047, + "step": 11750 + }, + { + "epoch": 1.7981637337413925, + "grad_norm": 2.1004294540442534, + "learning_rate": 5.294879033269362e-07, + "loss": 0.2874, + "step": 11751 + }, + { + "epoch": 1.7983167559296098, + "grad_norm": 2.1730252469221836, + "learning_rate": 5.286924928032566e-07, + "loss": 0.2628, + "step": 11752 + }, + { + "epoch": 1.798469778117827, + "grad_norm": 2.1484332701793125, + "learning_rate": 5.278976639496236e-07, + "loss": 0.2705, + "step": 11753 + }, + { + "epoch": 1.7986228003060445, + "grad_norm": 2.0795707539418697, + 
"learning_rate": 5.271034168148504e-07, + "loss": 0.2895, + "step": 11754 + }, + { + "epoch": 1.7987758224942616, + "grad_norm": 2.0650753087535003, + "learning_rate": 5.263097514477145e-07, + "loss": 0.2599, + "step": 11755 + }, + { + "epoch": 1.798928844682479, + "grad_norm": 2.2929545293487643, + "learning_rate": 5.25516667896957e-07, + "loss": 0.3368, + "step": 11756 + }, + { + "epoch": 1.7990818668706963, + "grad_norm": 2.171294433545801, + "learning_rate": 5.247241662112878e-07, + "loss": 0.3341, + "step": 11757 + }, + { + "epoch": 1.7992348890589134, + "grad_norm": 2.510967028431956, + "learning_rate": 5.239322464393726e-07, + "loss": 0.3286, + "step": 11758 + }, + { + "epoch": 1.799387911247131, + "grad_norm": 2.5373624082576667, + "learning_rate": 5.231409086298489e-07, + "loss": 0.3282, + "step": 11759 + }, + { + "epoch": 1.799540933435348, + "grad_norm": 2.3404830830993664, + "learning_rate": 5.223501528313179e-07, + "loss": 0.2916, + "step": 11760 + }, + { + "epoch": 1.7996939556235654, + "grad_norm": 2.4516704197186145, + "learning_rate": 5.215599790923387e-07, + "loss": 0.3127, + "step": 11761 + }, + { + "epoch": 1.7998469778117827, + "grad_norm": 1.8951882844356727, + "learning_rate": 5.2077038746144e-07, + "loss": 0.2737, + "step": 11762 + }, + { + "epoch": 1.8, + "grad_norm": 2.1140050714971848, + "learning_rate": 5.199813779871187e-07, + "loss": 0.3123, + "step": 11763 + }, + { + "epoch": 1.8001530221882174, + "grad_norm": 2.3629890157765683, + "learning_rate": 5.191929507178228e-07, + "loss": 0.2923, + "step": 11764 + }, + { + "epoch": 1.8003060443764345, + "grad_norm": 2.2207657842982704, + "learning_rate": 5.18405105701979e-07, + "loss": 0.2737, + "step": 11765 + }, + { + "epoch": 1.8004590665646518, + "grad_norm": 2.2328795744392687, + "learning_rate": 5.176178429879708e-07, + "loss": 0.2769, + "step": 11766 + }, + { + "epoch": 1.8006120887528692, + "grad_norm": 2.2494304651674804, + "learning_rate": 5.16831162624144e-07, + "loss": 0.305, + "step": 11767 + }, + { + "epoch": 1.8007651109410865, + "grad_norm": 2.097178530222374, + "learning_rate": 5.160450646588144e-07, + "loss": 0.2687, + "step": 11768 + }, + { + "epoch": 1.8009181331293038, + "grad_norm": 1.949079996975221, + "learning_rate": 5.152595491402612e-07, + "loss": 0.2349, + "step": 11769 + }, + { + "epoch": 1.801071155317521, + "grad_norm": 1.9194856467463366, + "learning_rate": 5.144746161167202e-07, + "loss": 0.2499, + "step": 11770 + }, + { + "epoch": 1.8012241775057385, + "grad_norm": 1.9094906109398266, + "learning_rate": 5.136902656364018e-07, + "loss": 0.263, + "step": 11771 + }, + { + "epoch": 1.8013771996939556, + "grad_norm": 2.0543659422279172, + "learning_rate": 5.129064977474752e-07, + "loss": 0.2677, + "step": 11772 + }, + { + "epoch": 1.801530221882173, + "grad_norm": 2.037500846294362, + "learning_rate": 5.121233124980751e-07, + "loss": 0.277, + "step": 11773 + }, + { + "epoch": 1.8016832440703903, + "grad_norm": 2.5064908670823125, + "learning_rate": 5.113407099362977e-07, + "loss": 0.3037, + "step": 11774 + }, + { + "epoch": 1.8018362662586074, + "grad_norm": 2.3767799879674065, + "learning_rate": 5.105586901102078e-07, + "loss": 0.3141, + "step": 11775 + }, + { + "epoch": 1.801989288446825, + "grad_norm": 2.1263927705629877, + "learning_rate": 5.097772530678325e-07, + "loss": 0.2535, + "step": 11776 + }, + { + "epoch": 1.802142310635042, + "grad_norm": 2.101381107662464, + "learning_rate": 5.089963988571589e-07, + "loss": 0.2888, + "step": 11777 + }, + { + "epoch": 1.8022953328232594, + 
"grad_norm": 2.1648239103094844, + "learning_rate": 5.082161275261499e-07, + "loss": 0.2802, + "step": 11778 + }, + { + "epoch": 1.8024483550114767, + "grad_norm": 2.4059903176752617, + "learning_rate": 5.074364391227182e-07, + "loss": 0.2784, + "step": 11779 + }, + { + "epoch": 1.8026013771996938, + "grad_norm": 2.0832771851068754, + "learning_rate": 5.066573336947489e-07, + "loss": 0.3067, + "step": 11780 + }, + { + "epoch": 1.8027543993879114, + "grad_norm": 1.8546774228621359, + "learning_rate": 5.058788112900925e-07, + "loss": 0.2336, + "step": 11781 + }, + { + "epoch": 1.8029074215761285, + "grad_norm": 1.9287318570324383, + "learning_rate": 5.051008719565597e-07, + "loss": 0.2113, + "step": 11782 + }, + { + "epoch": 1.8030604437643458, + "grad_norm": 2.329499263317703, + "learning_rate": 5.043235157419246e-07, + "loss": 0.2933, + "step": 11783 + }, + { + "epoch": 1.8032134659525632, + "grad_norm": 2.0964211634689702, + "learning_rate": 5.035467426939322e-07, + "loss": 0.3175, + "step": 11784 + }, + { + "epoch": 1.8033664881407803, + "grad_norm": 1.8844374172641056, + "learning_rate": 5.027705528602822e-07, + "loss": 0.2303, + "step": 11785 + }, + { + "epoch": 1.8035195103289978, + "grad_norm": 1.9323244223377913, + "learning_rate": 5.019949462886476e-07, + "loss": 0.2753, + "step": 11786 + }, + { + "epoch": 1.803672532517215, + "grad_norm": 2.3979087439781517, + "learning_rate": 5.012199230266612e-07, + "loss": 0.2833, + "step": 11787 + }, + { + "epoch": 1.8038255547054323, + "grad_norm": 2.066289773156986, + "learning_rate": 5.004454831219152e-07, + "loss": 0.2935, + "step": 11788 + }, + { + "epoch": 1.8039785768936496, + "grad_norm": 2.037769789992835, + "learning_rate": 4.99671626621977e-07, + "loss": 0.2397, + "step": 11789 + }, + { + "epoch": 1.8041315990818667, + "grad_norm": 2.0152504446404653, + "learning_rate": 4.988983535743697e-07, + "loss": 0.2613, + "step": 11790 + }, + { + "epoch": 1.8042846212700843, + "grad_norm": 1.7574012569673563, + "learning_rate": 4.981256640265808e-07, + "loss": 0.2328, + "step": 11791 + }, + { + "epoch": 1.8044376434583014, + "grad_norm": 2.189575784945479, + "learning_rate": 4.97353558026068e-07, + "loss": 0.3055, + "step": 11792 + }, + { + "epoch": 1.8045906656465187, + "grad_norm": 2.131528056103428, + "learning_rate": 4.965820356202478e-07, + "loss": 0.2894, + "step": 11793 + }, + { + "epoch": 1.804743687834736, + "grad_norm": 2.1817272506538496, + "learning_rate": 4.958110968565022e-07, + "loss": 0.3182, + "step": 11794 + }, + { + "epoch": 1.8048967100229534, + "grad_norm": 2.547644929513796, + "learning_rate": 4.950407417821779e-07, + "loss": 0.2626, + "step": 11795 + }, + { + "epoch": 1.8050497322111707, + "grad_norm": 2.012545629577886, + "learning_rate": 4.942709704445858e-07, + "loss": 0.298, + "step": 11796 + }, + { + "epoch": 1.8052027543993878, + "grad_norm": 2.108012508898975, + "learning_rate": 4.935017828909994e-07, + "loss": 0.2551, + "step": 11797 + }, + { + "epoch": 1.8053557765876052, + "grad_norm": 2.2952470925887303, + "learning_rate": 4.927331791686585e-07, + "loss": 0.26, + "step": 11798 + }, + { + "epoch": 1.8055087987758225, + "grad_norm": 2.075299206713513, + "learning_rate": 4.919651593247654e-07, + "loss": 0.2831, + "step": 11799 + }, + { + "epoch": 1.8056618209640398, + "grad_norm": 2.3021425499492483, + "learning_rate": 4.911977234064868e-07, + "loss": 0.2906, + "step": 11800 + }, + { + "epoch": 1.8058148431522572, + "grad_norm": 1.9590477977426395, + "learning_rate": 4.904308714609562e-07, + "loss": 0.2897, + 
"step": 11801 + }, + { + "epoch": 1.8059678653404743, + "grad_norm": 1.9965251043147465, + "learning_rate": 4.896646035352659e-07, + "loss": 0.3132, + "step": 11802 + }, + { + "epoch": 1.8061208875286918, + "grad_norm": 2.373562135530326, + "learning_rate": 4.888989196764782e-07, + "loss": 0.3023, + "step": 11803 + }, + { + "epoch": 1.806273909716909, + "grad_norm": 2.1986562880820486, + "learning_rate": 4.881338199316144e-07, + "loss": 0.2941, + "step": 11804 + }, + { + "epoch": 1.8064269319051263, + "grad_norm": 2.040180272914321, + "learning_rate": 4.873693043476646e-07, + "loss": 0.2436, + "step": 11805 + }, + { + "epoch": 1.8065799540933436, + "grad_norm": 1.918033241126225, + "learning_rate": 4.86605372971577e-07, + "loss": 0.2649, + "step": 11806 + }, + { + "epoch": 1.8067329762815607, + "grad_norm": 2.5039834522606426, + "learning_rate": 4.858420258502727e-07, + "loss": 0.459, + "step": 11807 + }, + { + "epoch": 1.8068859984697783, + "grad_norm": 2.1500788017787653, + "learning_rate": 4.850792630306289e-07, + "loss": 0.2669, + "step": 11808 + }, + { + "epoch": 1.8070390206579954, + "grad_norm": 1.7573674117294718, + "learning_rate": 4.84317084559488e-07, + "loss": 0.2311, + "step": 11809 + }, + { + "epoch": 1.8071920428462127, + "grad_norm": 2.116328054003031, + "learning_rate": 4.835554904836626e-07, + "loss": 0.2959, + "step": 11810 + }, + { + "epoch": 1.80734506503443, + "grad_norm": 1.9941371350603454, + "learning_rate": 4.82794480849923e-07, + "loss": 0.2193, + "step": 11811 + }, + { + "epoch": 1.8074980872226472, + "grad_norm": 2.4572585198173367, + "learning_rate": 4.82034055705004e-07, + "loss": 0.3226, + "step": 11812 + }, + { + "epoch": 1.8076511094108647, + "grad_norm": 2.18333804741628, + "learning_rate": 4.812742150956107e-07, + "loss": 0.2796, + "step": 11813 + }, + { + "epoch": 1.8078041315990818, + "grad_norm": 2.015761932746089, + "learning_rate": 4.805149590684022e-07, + "loss": 0.2538, + "step": 11814 + }, + { + "epoch": 1.8079571537872992, + "grad_norm": 2.2635890211246497, + "learning_rate": 4.797562876700124e-07, + "loss": 0.2918, + "step": 11815 + }, + { + "epoch": 1.8081101759755165, + "grad_norm": 2.0019627008557994, + "learning_rate": 4.78998200947034e-07, + "loss": 0.2271, + "step": 11816 + }, + { + "epoch": 1.8082631981637336, + "grad_norm": 1.9257373438476535, + "learning_rate": 4.782406989460197e-07, + "loss": 0.2475, + "step": 11817 + }, + { + "epoch": 1.8084162203519512, + "grad_norm": 2.1176835046895177, + "learning_rate": 4.774837817134937e-07, + "loss": 0.2816, + "step": 11818 + }, + { + "epoch": 1.8085692425401683, + "grad_norm": 2.15777176903622, + "learning_rate": 4.7672744929594396e-07, + "loss": 0.3033, + "step": 11819 + }, + { + "epoch": 1.8087222647283856, + "grad_norm": 1.9143295838287768, + "learning_rate": 4.759717017398124e-07, + "loss": 0.2498, + "step": 11820 + }, + { + "epoch": 1.808875286916603, + "grad_norm": 2.0434931578114623, + "learning_rate": 4.752165390915198e-07, + "loss": 0.2489, + "step": 11821 + }, + { + "epoch": 1.80902830910482, + "grad_norm": 1.6784767679091706, + "learning_rate": 4.744619613974399e-07, + "loss": 0.2, + "step": 11822 + }, + { + "epoch": 1.8091813312930376, + "grad_norm": 2.313772731023759, + "learning_rate": 4.737079687039148e-07, + "loss": 0.3154, + "step": 11823 + }, + { + "epoch": 1.8093343534812547, + "grad_norm": 2.30936650960722, + "learning_rate": 4.7295456105725057e-07, + "loss": 0.2594, + "step": 11824 + }, + { + "epoch": 1.809487375669472, + "grad_norm": 2.0716315153696208, + "learning_rate": 
4.7220173850371695e-07, + "loss": 0.2652, + "step": 11825 + }, + { + "epoch": 1.8096403978576894, + "grad_norm": 1.9868270558385652, + "learning_rate": 4.71449501089547e-07, + "loss": 0.2478, + "step": 11826 + }, + { + "epoch": 1.8097934200459067, + "grad_norm": 2.018269885599993, + "learning_rate": 4.706978488609393e-07, + "loss": 0.2329, + "step": 11827 + }, + { + "epoch": 1.809946442234124, + "grad_norm": 1.9684741598276265, + "learning_rate": 4.6994678186405685e-07, + "loss": 0.2501, + "step": 11828 + }, + { + "epoch": 1.8100994644223412, + "grad_norm": 1.933993539998423, + "learning_rate": 4.691963001450228e-07, + "loss": 0.2868, + "step": 11829 + }, + { + "epoch": 1.8102524866105585, + "grad_norm": 2.3696993136051736, + "learning_rate": 4.684464037499292e-07, + "loss": 0.3, + "step": 11830 + }, + { + "epoch": 1.8104055087987758, + "grad_norm": 2.2717448328519603, + "learning_rate": 4.676970927248292e-07, + "loss": 0.2647, + "step": 11831 + }, + { + "epoch": 1.8105585309869932, + "grad_norm": 2.005296412078862, + "learning_rate": 4.6694836711574153e-07, + "loss": 0.2521, + "step": 11832 + }, + { + "epoch": 1.8107115531752105, + "grad_norm": 1.752049954837269, + "learning_rate": 4.662002269686483e-07, + "loss": 0.2542, + "step": 11833 + }, + { + "epoch": 1.8108645753634276, + "grad_norm": 2.015368287875028, + "learning_rate": 4.654526723294961e-07, + "loss": 0.2845, + "step": 11834 + }, + { + "epoch": 1.8110175975516452, + "grad_norm": 2.137987633418499, + "learning_rate": 4.6470570324419374e-07, + "loss": 0.3099, + "step": 11835 + }, + { + "epoch": 1.8111706197398623, + "grad_norm": 2.2337190632103794, + "learning_rate": 4.6395931975861673e-07, + "loss": 0.268, + "step": 11836 + }, + { + "epoch": 1.8113236419280796, + "grad_norm": 2.0287393644206366, + "learning_rate": 4.6321352191860293e-07, + "loss": 0.2506, + "step": 11837 + }, + { + "epoch": 1.811476664116297, + "grad_norm": 2.0219857424282845, + "learning_rate": 4.6246830976995336e-07, + "loss": 0.4011, + "step": 11838 + }, + { + "epoch": 1.811629686304514, + "grad_norm": 2.4153059420724565, + "learning_rate": 4.617236833584393e-07, + "loss": 0.2586, + "step": 11839 + }, + { + "epoch": 1.8117827084927316, + "grad_norm": 2.1473162421467507, + "learning_rate": 4.6097964272978634e-07, + "loss": 0.2317, + "step": 11840 + }, + { + "epoch": 1.8119357306809487, + "grad_norm": 2.129796298413445, + "learning_rate": 4.6023618792968794e-07, + "loss": 0.2965, + "step": 11841 + }, + { + "epoch": 1.812088752869166, + "grad_norm": 2.186068024749278, + "learning_rate": 4.5949331900380865e-07, + "loss": 0.2816, + "step": 11842 + }, + { + "epoch": 1.8122417750573834, + "grad_norm": 2.0460758575279896, + "learning_rate": 4.5875103599776426e-07, + "loss": 0.2681, + "step": 11843 + }, + { + "epoch": 1.8123947972456005, + "grad_norm": 2.3159308326608423, + "learning_rate": 4.5800933895714606e-07, + "loss": 0.2898, + "step": 11844 + }, + { + "epoch": 1.812547819433818, + "grad_norm": 2.3681240793421754, + "learning_rate": 4.5726822792750425e-07, + "loss": 0.2783, + "step": 11845 + }, + { + "epoch": 1.8127008416220352, + "grad_norm": 2.2807501390604608, + "learning_rate": 4.565277029543491e-07, + "loss": 0.2452, + "step": 11846 + }, + { + "epoch": 1.8128538638102525, + "grad_norm": 2.0219388437967574, + "learning_rate": 4.5578776408316315e-07, + "loss": 0.233, + "step": 11847 + }, + { + "epoch": 1.8130068859984698, + "grad_norm": 1.944753820724033, + "learning_rate": 4.5504841135938893e-07, + "loss": 0.2404, + "step": 11848 + }, + { + "epoch": 
1.813159908186687, + "grad_norm": 1.7793328730279039, + "learning_rate": 4.543096448284301e-07, + "loss": 0.2104, + "step": 11849 + }, + { + "epoch": 1.8133129303749045, + "grad_norm": 2.0493902412848954, + "learning_rate": 4.5357146453565926e-07, + "loss": 0.2623, + "step": 11850 + }, + { + "epoch": 1.8134659525631216, + "grad_norm": 1.8090947119690544, + "learning_rate": 4.5283387052641125e-07, + "loss": 0.1942, + "step": 11851 + }, + { + "epoch": 1.813618974751339, + "grad_norm": 2.0275436226010153, + "learning_rate": 4.520968628459832e-07, + "loss": 0.2955, + "step": 11852 + }, + { + "epoch": 1.8137719969395563, + "grad_norm": 2.4728132882804537, + "learning_rate": 4.5136044153963887e-07, + "loss": 0.3628, + "step": 11853 + }, + { + "epoch": 1.8139250191277734, + "grad_norm": 2.2678757394918745, + "learning_rate": 4.5062460665260434e-07, + "loss": 0.302, + "step": 11854 + }, + { + "epoch": 1.814078041315991, + "grad_norm": 1.9115212445776606, + "learning_rate": 4.498893582300701e-07, + "loss": 0.24, + "step": 11855 + }, + { + "epoch": 1.814231063504208, + "grad_norm": 1.9723840834983504, + "learning_rate": 4.491546963171911e-07, + "loss": 0.2545, + "step": 11856 + }, + { + "epoch": 1.8143840856924254, + "grad_norm": 1.9544300904005711, + "learning_rate": 4.4842062095908467e-07, + "loss": 0.2047, + "step": 11857 + }, + { + "epoch": 1.8145371078806427, + "grad_norm": 2.0727551080590167, + "learning_rate": 4.476871322008336e-07, + "loss": 0.2165, + "step": 11858 + }, + { + "epoch": 1.8146901300688598, + "grad_norm": 2.13104978350104, + "learning_rate": 4.4695423008748406e-07, + "loss": 0.3208, + "step": 11859 + }, + { + "epoch": 1.8148431522570774, + "grad_norm": 2.242696903489749, + "learning_rate": 4.4622191466404894e-07, + "loss": 0.3112, + "step": 11860 + }, + { + "epoch": 1.8149961744452945, + "grad_norm": 2.2981606041884333, + "learning_rate": 4.454901859755001e-07, + "loss": 0.3035, + "step": 11861 + }, + { + "epoch": 1.8151491966335118, + "grad_norm": 2.1319246519937507, + "learning_rate": 4.4475904406677483e-07, + "loss": 0.2548, + "step": 11862 + }, + { + "epoch": 1.8153022188217292, + "grad_norm": 2.152578096052732, + "learning_rate": 4.440284889827795e-07, + "loss": 0.2502, + "step": 11863 + }, + { + "epoch": 1.8154552410099465, + "grad_norm": 2.2940645041026757, + "learning_rate": 4.4329852076837597e-07, + "loss": 0.3296, + "step": 11864 + }, + { + "epoch": 1.8156082631981638, + "grad_norm": 2.199915066734886, + "learning_rate": 4.4256913946839506e-07, + "loss": 0.3083, + "step": 11865 + }, + { + "epoch": 1.815761285386381, + "grad_norm": 2.1312699700865285, + "learning_rate": 4.418403451276354e-07, + "loss": 0.343, + "step": 11866 + }, + { + "epoch": 1.8159143075745983, + "grad_norm": 2.1295718756882573, + "learning_rate": 4.411121377908489e-07, + "loss": 0.2527, + "step": 11867 + }, + { + "epoch": 1.8160673297628156, + "grad_norm": 2.0468010619606978, + "learning_rate": 4.4038451750276213e-07, + "loss": 0.2323, + "step": 11868 + }, + { + "epoch": 1.816220351951033, + "grad_norm": 2.037380646764395, + "learning_rate": 4.396574843080603e-07, + "loss": 0.3027, + "step": 11869 + }, + { + "epoch": 1.8163733741392503, + "grad_norm": 2.4229862444006147, + "learning_rate": 4.3893103825139115e-07, + "loss": 0.3327, + "step": 11870 + }, + { + "epoch": 1.8165263963274674, + "grad_norm": 2.2136068015881913, + "learning_rate": 4.3820517937737005e-07, + "loss": 0.3462, + "step": 11871 + }, + { + "epoch": 1.816679418515685, + "grad_norm": 2.2981031674405994, + "learning_rate": 
4.3747990773057693e-07, + "loss": 0.3054, + "step": 11872 + }, + { + "epoch": 1.816832440703902, + "grad_norm": 2.050569164217128, + "learning_rate": 4.367552233555494e-07, + "loss": 0.2547, + "step": 11873 + }, + { + "epoch": 1.8169854628921194, + "grad_norm": 2.1447671798370855, + "learning_rate": 4.3603112629679534e-07, + "loss": 0.2512, + "step": 11874 + }, + { + "epoch": 1.8171384850803367, + "grad_norm": 2.229682016061022, + "learning_rate": 4.353076165987846e-07, + "loss": 0.2623, + "step": 11875 + }, + { + "epoch": 1.8172915072685538, + "grad_norm": 2.3664239681453663, + "learning_rate": 4.3458469430595063e-07, + "loss": 0.2964, + "step": 11876 + }, + { + "epoch": 1.8174445294567714, + "grad_norm": 2.0532090128786655, + "learning_rate": 4.338623594626912e-07, + "loss": 0.3125, + "step": 11877 + }, + { + "epoch": 1.8175975516449885, + "grad_norm": 1.9329806232041002, + "learning_rate": 4.3314061211336633e-07, + "loss": 0.2434, + "step": 11878 + }, + { + "epoch": 1.8177505738332058, + "grad_norm": 2.076618181262312, + "learning_rate": 4.3241945230230286e-07, + "loss": 0.2888, + "step": 11879 + }, + { + "epoch": 1.8179035960214232, + "grad_norm": 2.740737723939962, + "learning_rate": 4.316988800737887e-07, + "loss": 0.3424, + "step": 11880 + }, + { + "epoch": 1.8180566182096403, + "grad_norm": 2.0235334902556255, + "learning_rate": 4.309788954720784e-07, + "loss": 0.2841, + "step": 11881 + }, + { + "epoch": 1.8182096403978578, + "grad_norm": 1.8685472019677516, + "learning_rate": 4.302594985413877e-07, + "loss": 0.2695, + "step": 11882 + }, + { + "epoch": 1.818362662586075, + "grad_norm": 2.088752310739199, + "learning_rate": 4.295406893258991e-07, + "loss": 0.2887, + "step": 11883 + }, + { + "epoch": 1.8185156847742923, + "grad_norm": 2.1044895242632147, + "learning_rate": 4.2882246786975614e-07, + "loss": 0.285, + "step": 11884 + }, + { + "epoch": 1.8186687069625096, + "grad_norm": 1.986025076615623, + "learning_rate": 4.2810483421706796e-07, + "loss": 0.2584, + "step": 11885 + }, + { + "epoch": 1.8188217291507267, + "grad_norm": 2.493855607868984, + "learning_rate": 4.273877884119071e-07, + "loss": 0.3174, + "step": 11886 + }, + { + "epoch": 1.8189747513389443, + "grad_norm": 2.3424713449133083, + "learning_rate": 4.2667133049831166e-07, + "loss": 0.2856, + "step": 11887 + }, + { + "epoch": 1.8191277735271614, + "grad_norm": 2.159188268341222, + "learning_rate": 4.259554605202787e-07, + "loss": 0.295, + "step": 11888 + }, + { + "epoch": 1.8192807957153787, + "grad_norm": 1.9226041677235508, + "learning_rate": 4.2524017852177746e-07, + "loss": 0.2502, + "step": 11889 + }, + { + "epoch": 1.819433817903596, + "grad_norm": 2.0557886809068306, + "learning_rate": 4.245254845467317e-07, + "loss": 0.2254, + "step": 11890 + }, + { + "epoch": 1.8195868400918132, + "grad_norm": 2.2139733495822735, + "learning_rate": 4.238113786390352e-07, + "loss": 0.3166, + "step": 11891 + }, + { + "epoch": 1.8197398622800307, + "grad_norm": 1.8869752641971287, + "learning_rate": 4.230978608425462e-07, + "loss": 0.2612, + "step": 11892 + }, + { + "epoch": 1.8198928844682478, + "grad_norm": 2.361534050142558, + "learning_rate": 4.223849312010808e-07, + "loss": 0.2905, + "step": 11893 + }, + { + "epoch": 1.8200459066564652, + "grad_norm": 2.473559879765051, + "learning_rate": 4.2167258975842394e-07, + "loss": 0.2726, + "step": 11894 + }, + { + "epoch": 1.8201989288446825, + "grad_norm": 1.8092190590853954, + "learning_rate": 4.209608365583262e-07, + "loss": 0.2333, + "step": 11895 + }, + { + "epoch": 
1.8203519510328998, + "grad_norm": 1.9546857994825566, + "learning_rate": 4.202496716444948e-07, + "loss": 0.221, + "step": 11896 + }, + { + "epoch": 1.8205049732211172, + "grad_norm": 2.0141397192525066, + "learning_rate": 4.195390950606082e-07, + "loss": 0.2491, + "step": 11897 + }, + { + "epoch": 1.8206579954093343, + "grad_norm": 2.1127973177494597, + "learning_rate": 4.1882910685030587e-07, + "loss": 0.2451, + "step": 11898 + }, + { + "epoch": 1.8208110175975516, + "grad_norm": 1.9673277852549478, + "learning_rate": 4.181197070571874e-07, + "loss": 0.2684, + "step": 11899 + }, + { + "epoch": 1.820964039785769, + "grad_norm": 2.358968570632875, + "learning_rate": 4.174108957248246e-07, + "loss": 0.2537, + "step": 11900 + }, + { + "epoch": 1.8211170619739863, + "grad_norm": 2.1309862940664765, + "learning_rate": 4.16702672896746e-07, + "loss": 0.3338, + "step": 11901 + }, + { + "epoch": 1.8212700841622036, + "grad_norm": 2.170521550489901, + "learning_rate": 4.1599503861644355e-07, + "loss": 0.2589, + "step": 11902 + }, + { + "epoch": 1.8214231063504207, + "grad_norm": 1.9231878873342532, + "learning_rate": 4.152879929273812e-07, + "loss": 0.1625, + "step": 11903 + }, + { + "epoch": 1.8215761285386383, + "grad_norm": 2.136045983086775, + "learning_rate": 4.145815358729777e-07, + "loss": 0.2598, + "step": 11904 + }, + { + "epoch": 1.8217291507268554, + "grad_norm": 2.0576412417232857, + "learning_rate": 4.1387566749662045e-07, + "loss": 0.2408, + "step": 11905 + }, + { + "epoch": 1.8218821729150727, + "grad_norm": 1.9763125602432094, + "learning_rate": 4.131703878416604e-07, + "loss": 0.2477, + "step": 11906 + }, + { + "epoch": 1.82203519510329, + "grad_norm": 2.3306832460828155, + "learning_rate": 4.124656969514107e-07, + "loss": 0.2894, + "step": 11907 + }, + { + "epoch": 1.8221882172915072, + "grad_norm": 2.175829010880975, + "learning_rate": 4.117615948691489e-07, + "loss": 0.2145, + "step": 11908 + }, + { + "epoch": 1.8223412394797247, + "grad_norm": 1.8907633407981779, + "learning_rate": 4.11058081638116e-07, + "loss": 0.2225, + "step": 11909 + }, + { + "epoch": 1.8224942616679418, + "grad_norm": 2.2137627125533275, + "learning_rate": 4.103551573015196e-07, + "loss": 0.2826, + "step": 11910 + }, + { + "epoch": 1.8226472838561592, + "grad_norm": 2.6138023580709238, + "learning_rate": 4.096528219025275e-07, + "loss": 0.3311, + "step": 11911 + }, + { + "epoch": 1.8228003060443765, + "grad_norm": 2.483862774064997, + "learning_rate": 4.0895107548427293e-07, + "loss": 0.3195, + "step": 11912 + }, + { + "epoch": 1.8229533282325936, + "grad_norm": 2.487626032754856, + "learning_rate": 4.082499180898536e-07, + "loss": 0.2799, + "step": 11913 + }, + { + "epoch": 1.8231063504208112, + "grad_norm": 2.1376354412056844, + "learning_rate": 4.0754934976233065e-07, + "loss": 0.2945, + "step": 11914 + }, + { + "epoch": 1.8232593726090283, + "grad_norm": 1.574613938962143, + "learning_rate": 4.068493705447274e-07, + "loss": 0.1875, + "step": 11915 + }, + { + "epoch": 1.8234123947972456, + "grad_norm": 2.1105301064666726, + "learning_rate": 4.0614998048003284e-07, + "loss": 0.2669, + "step": 11916 + }, + { + "epoch": 1.823565416985463, + "grad_norm": 2.268105261960538, + "learning_rate": 4.054511796111982e-07, + "loss": 0.2442, + "step": 11917 + }, + { + "epoch": 1.82371843917368, + "grad_norm": 2.281379829918933, + "learning_rate": 4.047529679811424e-07, + "loss": 0.2729, + "step": 11918 + }, + { + "epoch": 1.8238714613618976, + "grad_norm": 1.9162797641433544, + "learning_rate": 
4.040553456327434e-07, + "loss": 0.2425, + "step": 11919 + }, + { + "epoch": 1.8240244835501147, + "grad_norm": 2.123847206220364, + "learning_rate": 4.0335831260884363e-07, + "loss": 0.256, + "step": 11920 + }, + { + "epoch": 1.824177505738332, + "grad_norm": 1.9678421561811497, + "learning_rate": 4.026618689522543e-07, + "loss": 0.255, + "step": 11921 + }, + { + "epoch": 1.8243305279265494, + "grad_norm": 2.1499690708656307, + "learning_rate": 4.019660147057436e-07, + "loss": 0.2687, + "step": 11922 + }, + { + "epoch": 1.8244835501147665, + "grad_norm": 1.839216684375747, + "learning_rate": 4.012707499120472e-07, + "loss": 0.2095, + "step": 11923 + }, + { + "epoch": 1.824636572302984, + "grad_norm": 1.8671693115789987, + "learning_rate": 4.005760746138654e-07, + "loss": 0.2566, + "step": 11924 + }, + { + "epoch": 1.8247895944912012, + "grad_norm": 2.007621689391616, + "learning_rate": 3.998819888538585e-07, + "loss": 0.2492, + "step": 11925 + }, + { + "epoch": 1.8249426166794185, + "grad_norm": 2.33315783039703, + "learning_rate": 3.9918849267465587e-07, + "loss": 0.2988, + "step": 11926 + }, + { + "epoch": 1.8250956388676358, + "grad_norm": 2.2086776410871973, + "learning_rate": 3.984955861188478e-07, + "loss": 0.2698, + "step": 11927 + }, + { + "epoch": 1.8252486610558531, + "grad_norm": 2.0325254374514365, + "learning_rate": 3.978032692289846e-07, + "loss": 0.2833, + "step": 11928 + }, + { + "epoch": 1.8254016832440705, + "grad_norm": 1.9777584449620282, + "learning_rate": 3.9711154204758686e-07, + "loss": 0.2598, + "step": 11929 + }, + { + "epoch": 1.8255547054322876, + "grad_norm": 2.0637922928930887, + "learning_rate": 3.964204046171383e-07, + "loss": 0.2631, + "step": 11930 + }, + { + "epoch": 1.825707727620505, + "grad_norm": 2.3317485527455073, + "learning_rate": 3.957298569800794e-07, + "loss": 0.2423, + "step": 11931 + }, + { + "epoch": 1.8258607498087223, + "grad_norm": 2.252331720437728, + "learning_rate": 3.950398991788229e-07, + "loss": 0.2706, + "step": 11932 + }, + { + "epoch": 1.8260137719969396, + "grad_norm": 2.163044989421106, + "learning_rate": 3.9435053125574164e-07, + "loss": 0.2988, + "step": 11933 + }, + { + "epoch": 1.826166794185157, + "grad_norm": 2.129347941639684, + "learning_rate": 3.936617532531717e-07, + "loss": 0.297, + "step": 11934 + }, + { + "epoch": 1.826319816373374, + "grad_norm": 2.0345026352262123, + "learning_rate": 3.929735652134137e-07, + "loss": 0.2762, + "step": 11935 + }, + { + "epoch": 1.8264728385615916, + "grad_norm": 2.0560842105913837, + "learning_rate": 3.922859671787316e-07, + "loss": 0.3026, + "step": 11936 + }, + { + "epoch": 1.8266258607498087, + "grad_norm": 1.9701130038274002, + "learning_rate": 3.915989591913538e-07, + "loss": 0.2759, + "step": 11937 + }, + { + "epoch": 1.826778882938026, + "grad_norm": 1.8164126230552349, + "learning_rate": 3.9091254129347225e-07, + "loss": 0.2115, + "step": 11938 + }, + { + "epoch": 1.8269319051262434, + "grad_norm": 2.095226323648143, + "learning_rate": 3.902267135272431e-07, + "loss": 0.2918, + "step": 11939 + }, + { + "epoch": 1.8270849273144605, + "grad_norm": 1.8632956629908928, + "learning_rate": 3.8954147593478486e-07, + "loss": 0.2635, + "step": 11940 + }, + { + "epoch": 1.827237949502678, + "grad_norm": 2.125554599686935, + "learning_rate": 3.888568285581795e-07, + "loss": 0.2541, + "step": 11941 + }, + { + "epoch": 1.8273909716908951, + "grad_norm": 1.8686548914535643, + "learning_rate": 3.8817277143947786e-07, + "loss": 0.2687, + "step": 11942 + }, + { + "epoch": 
1.8275439938791125, + "grad_norm": 2.2724859585571004, + "learning_rate": 3.874893046206862e-07, + "loss": 0.2784, + "step": 11943 + }, + { + "epoch": 1.8276970160673298, + "grad_norm": 1.9277596501627852, + "learning_rate": 3.8680642814378e-07, + "loss": 0.2747, + "step": 11944 + }, + { + "epoch": 1.827850038255547, + "grad_norm": 1.8385682044969542, + "learning_rate": 3.861241420507e-07, + "loss": 0.2125, + "step": 11945 + }, + { + "epoch": 1.8280030604437645, + "grad_norm": 2.273734952165846, + "learning_rate": 3.8544244638334617e-07, + "loss": 0.2406, + "step": 11946 + }, + { + "epoch": 1.8281560826319816, + "grad_norm": 1.6741109276874935, + "learning_rate": 3.8476134118358153e-07, + "loss": 0.1967, + "step": 11947 + }, + { + "epoch": 1.828309104820199, + "grad_norm": 2.0544841920395363, + "learning_rate": 3.8408082649324165e-07, + "loss": 0.2309, + "step": 11948 + }, + { + "epoch": 1.8284621270084163, + "grad_norm": 2.120247464040846, + "learning_rate": 3.8340090235411186e-07, + "loss": 0.2576, + "step": 11949 + }, + { + "epoch": 1.8286151491966334, + "grad_norm": 2.19249778357952, + "learning_rate": 3.827215688079555e-07, + "loss": 0.2595, + "step": 11950 + }, + { + "epoch": 1.828768171384851, + "grad_norm": 2.0334839445427293, + "learning_rate": 3.8204282589649144e-07, + "loss": 0.3497, + "step": 11951 + }, + { + "epoch": 1.828921193573068, + "grad_norm": 2.414610411001209, + "learning_rate": 3.8136467366140073e-07, + "loss": 0.3199, + "step": 11952 + }, + { + "epoch": 1.8290742157612854, + "grad_norm": 2.293890111032897, + "learning_rate": 3.8068711214433453e-07, + "loss": 0.3205, + "step": 11953 + }, + { + "epoch": 1.8292272379495027, + "grad_norm": 2.141584926134872, + "learning_rate": 3.8001014138690396e-07, + "loss": 0.2612, + "step": 11954 + }, + { + "epoch": 1.8293802601377198, + "grad_norm": 1.9667031981212337, + "learning_rate": 3.793337614306847e-07, + "loss": 0.3036, + "step": 11955 + }, + { + "epoch": 1.8295332823259374, + "grad_norm": 1.8036232157888064, + "learning_rate": 3.7865797231721456e-07, + "loss": 0.2635, + "step": 11956 + }, + { + "epoch": 1.8296863045141545, + "grad_norm": 2.0793528801360943, + "learning_rate": 3.779827740879982e-07, + "loss": 0.2694, + "step": 11957 + }, + { + "epoch": 1.8298393267023718, + "grad_norm": 2.3880010286275724, + "learning_rate": 3.773081667845002e-07, + "loss": 0.26, + "step": 11958 + }, + { + "epoch": 1.8299923488905891, + "grad_norm": 1.922707375441064, + "learning_rate": 3.76634150448153e-07, + "loss": 0.2438, + "step": 11959 + }, + { + "epoch": 1.8301453710788065, + "grad_norm": 1.9779623539916547, + "learning_rate": 3.75960725120349e-07, + "loss": 0.2355, + "step": 11960 + }, + { + "epoch": 1.8302983932670238, + "grad_norm": 2.116293706736384, + "learning_rate": 3.752878908424462e-07, + "loss": 0.2952, + "step": 11961 + }, + { + "epoch": 1.830451415455241, + "grad_norm": 2.0257737983103787, + "learning_rate": 3.7461564765576607e-07, + "loss": 0.2281, + "step": 11962 + }, + { + "epoch": 1.8306044376434583, + "grad_norm": 1.8736255320167776, + "learning_rate": 3.7394399560159336e-07, + "loss": 0.2413, + "step": 11963 + }, + { + "epoch": 1.8307574598316756, + "grad_norm": 2.3071687308411466, + "learning_rate": 3.732729347211772e-07, + "loss": 0.2546, + "step": 11964 + }, + { + "epoch": 1.830910482019893, + "grad_norm": 1.873675215731886, + "learning_rate": 3.726024650557303e-07, + "loss": 0.2861, + "step": 11965 + }, + { + "epoch": 1.8310635042081103, + "grad_norm": 2.7959098918112555, + "learning_rate": 
3.7193258664642964e-07, + "loss": 0.276, + "step": 11966 + }, + { + "epoch": 1.8312165263963274, + "grad_norm": 2.3147884301052275, + "learning_rate": 3.712632995344123e-07, + "loss": 0.2683, + "step": 11967 + }, + { + "epoch": 1.8313695485845447, + "grad_norm": 2.1285332905357786, + "learning_rate": 3.7059460376078547e-07, + "loss": 0.2911, + "step": 11968 + }, + { + "epoch": 1.831522570772762, + "grad_norm": 1.9785370093599328, + "learning_rate": 3.6992649936661294e-07, + "loss": 0.2751, + "step": 11969 + }, + { + "epoch": 1.8316755929609794, + "grad_norm": 1.9944491857282887, + "learning_rate": 3.6925898639292634e-07, + "loss": 0.2614, + "step": 11970 + }, + { + "epoch": 1.8318286151491967, + "grad_norm": 2.1796578318532, + "learning_rate": 3.6859206488072396e-07, + "loss": 0.3357, + "step": 11971 + }, + { + "epoch": 1.8319816373374138, + "grad_norm": 2.266227460375382, + "learning_rate": 3.6792573487095975e-07, + "loss": 0.2783, + "step": 11972 + }, + { + "epoch": 1.8321346595256314, + "grad_norm": 2.113008303916984, + "learning_rate": 3.6725999640455533e-07, + "loss": 0.3254, + "step": 11973 + }, + { + "epoch": 1.8322876817138485, + "grad_norm": 1.8368245644367691, + "learning_rate": 3.6659484952240033e-07, + "loss": 0.2533, + "step": 11974 + }, + { + "epoch": 1.8324407039020658, + "grad_norm": 1.9882890052049291, + "learning_rate": 3.6593029426534086e-07, + "loss": 0.2611, + "step": 11975 + }, + { + "epoch": 1.8325937260902831, + "grad_norm": 1.8442393407352757, + "learning_rate": 3.652663306741899e-07, + "loss": 0.2888, + "step": 11976 + }, + { + "epoch": 1.8327467482785003, + "grad_norm": 2.227356989444283, + "learning_rate": 3.6460295878972704e-07, + "loss": 0.3018, + "step": 11977 + }, + { + "epoch": 1.8328997704667178, + "grad_norm": 2.6299960205316566, + "learning_rate": 3.639401786526875e-07, + "loss": 0.2764, + "step": 11978 + }, + { + "epoch": 1.833052792654935, + "grad_norm": 2.3886068477380373, + "learning_rate": 3.6327799030377976e-07, + "loss": 0.282, + "step": 11979 + }, + { + "epoch": 1.8332058148431523, + "grad_norm": 2.3571824622687125, + "learning_rate": 3.6261639378367133e-07, + "loss": 0.2807, + "step": 11980 + }, + { + "epoch": 1.8333588370313696, + "grad_norm": 2.2983280032511173, + "learning_rate": 3.6195538913298856e-07, + "loss": 0.2866, + "step": 11981 + }, + { + "epoch": 1.8335118592195867, + "grad_norm": 2.280510188770605, + "learning_rate": 3.6129497639233123e-07, + "loss": 0.2887, + "step": 11982 + }, + { + "epoch": 1.8336648814078043, + "grad_norm": 2.2769950664737117, + "learning_rate": 3.606351556022558e-07, + "loss": 0.3011, + "step": 11983 + }, + { + "epoch": 1.8338179035960214, + "grad_norm": 2.183026594822403, + "learning_rate": 3.599759268032854e-07, + "loss": 0.2637, + "step": 11984 + }, + { + "epoch": 1.8339709257842387, + "grad_norm": 2.033810868967627, + "learning_rate": 3.5931729003590544e-07, + "loss": 0.2677, + "step": 11985 + }, + { + "epoch": 1.834123947972456, + "grad_norm": 2.0557534905143453, + "learning_rate": 3.586592453405646e-07, + "loss": 0.2727, + "step": 11986 + }, + { + "epoch": 1.8342769701606731, + "grad_norm": 2.005984349800885, + "learning_rate": 3.580017927576773e-07, + "loss": 0.3202, + "step": 11987 + }, + { + "epoch": 1.8344299923488907, + "grad_norm": 2.070018685158, + "learning_rate": 3.57344932327619e-07, + "loss": 0.2542, + "step": 11988 + }, + { + "epoch": 1.8345830145371078, + "grad_norm": 2.3376594691999077, + "learning_rate": 3.5668866409073075e-07, + "loss": 0.2996, + "step": 11989 + }, + { + "epoch": 
1.8347360367253251, + "grad_norm": 2.441617675543389, + "learning_rate": 3.56032988087317e-07, + "loss": 0.3233, + "step": 11990 + }, + { + "epoch": 1.8348890589135425, + "grad_norm": 2.4082276906172493, + "learning_rate": 3.5537790435764443e-07, + "loss": 0.3318, + "step": 11991 + }, + { + "epoch": 1.8350420811017596, + "grad_norm": 2.585091922064814, + "learning_rate": 3.5472341294194413e-07, + "loss": 0.2845, + "step": 11992 + }, + { + "epoch": 1.8351951032899771, + "grad_norm": 2.2536038309694173, + "learning_rate": 3.540695138804129e-07, + "loss": 0.303, + "step": 11993 + }, + { + "epoch": 1.8353481254781943, + "grad_norm": 2.255993235887367, + "learning_rate": 3.5341620721320746e-07, + "loss": 0.2994, + "step": 11994 + }, + { + "epoch": 1.8355011476664116, + "grad_norm": 1.9197326776399797, + "learning_rate": 3.527634929804502e-07, + "loss": 0.2682, + "step": 11995 + }, + { + "epoch": 1.835654169854629, + "grad_norm": 2.0008368864449113, + "learning_rate": 3.521113712222268e-07, + "loss": 0.2804, + "step": 11996 + }, + { + "epoch": 1.8358071920428463, + "grad_norm": 1.7878197925898167, + "learning_rate": 3.514598419785875e-07, + "loss": 0.2248, + "step": 11997 + }, + { + "epoch": 1.8359602142310636, + "grad_norm": 2.0309007271436763, + "learning_rate": 3.508089052895436e-07, + "loss": 0.2969, + "step": 11998 + }, + { + "epoch": 1.8361132364192807, + "grad_norm": 1.9657444590479858, + "learning_rate": 3.50158561195072e-07, + "loss": 0.2682, + "step": 11999 + }, + { + "epoch": 1.836266258607498, + "grad_norm": 2.395003585976222, + "learning_rate": 3.4950880973511626e-07, + "loss": 0.3577, + "step": 12000 + }, + { + "epoch": 1.8364192807957154, + "grad_norm": 2.4444206059912723, + "learning_rate": 3.488596509495756e-07, + "loss": 0.5389, + "step": 12001 + }, + { + "epoch": 1.8365723029839327, + "grad_norm": 2.045241268445169, + "learning_rate": 3.4821108487831936e-07, + "loss": 0.2474, + "step": 12002 + }, + { + "epoch": 1.83672532517215, + "grad_norm": 2.0052203274872338, + "learning_rate": 3.4756311156117995e-07, + "loss": 0.2466, + "step": 12003 + }, + { + "epoch": 1.8368783473603671, + "grad_norm": 2.0755297021165817, + "learning_rate": 3.4691573103794894e-07, + "loss": 0.2683, + "step": 12004 + }, + { + "epoch": 1.8370313695485847, + "grad_norm": 2.2371449837594946, + "learning_rate": 3.4626894334838457e-07, + "loss": 0.3022, + "step": 12005 + }, + { + "epoch": 1.8371843917368018, + "grad_norm": 2.4836021577730865, + "learning_rate": 3.456227485322128e-07, + "loss": 0.3733, + "step": 12006 + }, + { + "epoch": 1.8373374139250191, + "grad_norm": 2.1804822352614504, + "learning_rate": 3.4497714662911406e-07, + "loss": 0.2799, + "step": 12007 + }, + { + "epoch": 1.8374904361132365, + "grad_norm": 2.1739341705368256, + "learning_rate": 3.443321376787401e-07, + "loss": 0.2481, + "step": 12008 + }, + { + "epoch": 1.8376434583014536, + "grad_norm": 1.9035207841071022, + "learning_rate": 3.436877217207046e-07, + "loss": 0.2046, + "step": 12009 + }, + { + "epoch": 1.8377964804896711, + "grad_norm": 2.5314565993420812, + "learning_rate": 3.4304389879457835e-07, + "loss": 0.2207, + "step": 12010 + }, + { + "epoch": 1.8379495026778883, + "grad_norm": 2.4787411745654127, + "learning_rate": 3.424006689399073e-07, + "loss": 0.3614, + "step": 12011 + }, + { + "epoch": 1.8381025248661056, + "grad_norm": 1.8064961409465559, + "learning_rate": 3.4175803219619217e-07, + "loss": 0.24, + "step": 12012 + }, + { + "epoch": 1.838255547054323, + "grad_norm": 2.2357013035541717, + "learning_rate": 
3.4111598860289696e-07, + "loss": 0.2798, + "step": 12013 + }, + { + "epoch": 1.83840856924254, + "grad_norm": 2.1526836237095313, + "learning_rate": 3.4047453819945564e-07, + "loss": 0.2627, + "step": 12014 + }, + { + "epoch": 1.8385615914307576, + "grad_norm": 2.2397589758269048, + "learning_rate": 3.398336810252623e-07, + "loss": 0.3025, + "step": 12015 + }, + { + "epoch": 1.8387146136189747, + "grad_norm": 2.284066487090659, + "learning_rate": 3.39193417119672e-07, + "loss": 0.3526, + "step": 12016 + }, + { + "epoch": 1.838867635807192, + "grad_norm": 2.1539273397928618, + "learning_rate": 3.385537465220079e-07, + "loss": 0.3264, + "step": 12017 + }, + { + "epoch": 1.8390206579954094, + "grad_norm": 2.0446058854531945, + "learning_rate": 3.3791466927155403e-07, + "loss": 0.2517, + "step": 12018 + }, + { + "epoch": 1.8391736801836265, + "grad_norm": 1.8921978306564606, + "learning_rate": 3.37276185407559e-07, + "loss": 0.2678, + "step": 12019 + }, + { + "epoch": 1.839326702371844, + "grad_norm": 1.9626988958729294, + "learning_rate": 3.366382949692326e-07, + "loss": 0.2971, + "step": 12020 + }, + { + "epoch": 1.8394797245600611, + "grad_norm": 2.0565364790390217, + "learning_rate": 3.360009979957546e-07, + "loss": 0.2869, + "step": 12021 + }, + { + "epoch": 1.8396327467482785, + "grad_norm": 2.1341795864028397, + "learning_rate": 3.353642945262592e-07, + "loss": 0.2688, + "step": 12022 + }, + { + "epoch": 1.8397857689364958, + "grad_norm": 2.4168517975060135, + "learning_rate": 3.347281845998496e-07, + "loss": 0.3241, + "step": 12023 + }, + { + "epoch": 1.839938791124713, + "grad_norm": 2.3182262108339344, + "learning_rate": 3.340926682555956e-07, + "loss": 0.2966, + "step": 12024 + }, + { + "epoch": 1.8400918133129305, + "grad_norm": 2.116133961225955, + "learning_rate": 3.3345774553252273e-07, + "loss": 0.3218, + "step": 12025 + }, + { + "epoch": 1.8402448355011476, + "grad_norm": 2.1324931497746125, + "learning_rate": 3.328234164696242e-07, + "loss": 0.3341, + "step": 12026 + }, + { + "epoch": 1.840397857689365, + "grad_norm": 1.9944972529438794, + "learning_rate": 3.3218968110586114e-07, + "loss": 0.2519, + "step": 12027 + }, + { + "epoch": 1.8405508798775823, + "grad_norm": 1.9909357917518529, + "learning_rate": 3.3155653948014674e-07, + "loss": 0.2383, + "step": 12028 + }, + { + "epoch": 1.8407039020657996, + "grad_norm": 2.515119551859431, + "learning_rate": 3.3092399163137113e-07, + "loss": 0.3151, + "step": 12029 + }, + { + "epoch": 1.840856924254017, + "grad_norm": 2.3275887638623285, + "learning_rate": 3.302920375983787e-07, + "loss": 0.2848, + "step": 12030 + }, + { + "epoch": 1.841009946442234, + "grad_norm": 1.9495815422530196, + "learning_rate": 3.2966067741997844e-07, + "loss": 0.2069, + "step": 12031 + }, + { + "epoch": 1.8411629686304514, + "grad_norm": 1.9592391943120646, + "learning_rate": 3.290299111349471e-07, + "loss": 0.2031, + "step": 12032 + }, + { + "epoch": 1.8413159908186687, + "grad_norm": 2.4296933720566023, + "learning_rate": 3.283997387820248e-07, + "loss": 0.3, + "step": 12033 + }, + { + "epoch": 1.841469013006886, + "grad_norm": 2.056443705826707, + "learning_rate": 3.2777016039990615e-07, + "loss": 0.2354, + "step": 12034 + }, + { + "epoch": 1.8416220351951034, + "grad_norm": 2.4922123003136005, + "learning_rate": 3.2714117602726137e-07, + "loss": 0.2721, + "step": 12035 + }, + { + "epoch": 1.8417750573833205, + "grad_norm": 2.2539525516934087, + "learning_rate": 3.265127857027184e-07, + "loss": 0.2819, + "step": 12036 + }, + { + "epoch": 
1.841928079571538, + "grad_norm": 2.3067629986694316, + "learning_rate": 3.2588498946486634e-07, + "loss": 0.2877, + "step": 12037 + }, + { + "epoch": 1.8420811017597551, + "grad_norm": 2.1901125523396123, + "learning_rate": 3.2525778735226444e-07, + "loss": 0.2798, + "step": 12038 + }, + { + "epoch": 1.8422341239479725, + "grad_norm": 2.3560154126153017, + "learning_rate": 3.2463117940342846e-07, + "loss": 0.3309, + "step": 12039 + }, + { + "epoch": 1.8423871461361898, + "grad_norm": 2.0695678919381315, + "learning_rate": 3.2400516565684217e-07, + "loss": 0.2819, + "step": 12040 + }, + { + "epoch": 1.842540168324407, + "grad_norm": 2.0646589283114603, + "learning_rate": 3.2337974615095134e-07, + "loss": 0.247, + "step": 12041 + }, + { + "epoch": 1.8426931905126245, + "grad_norm": 2.1398428401944214, + "learning_rate": 3.2275492092416536e-07, + "loss": 0.2873, + "step": 12042 + }, + { + "epoch": 1.8428462127008416, + "grad_norm": 2.289253406530182, + "learning_rate": 3.2213069001485797e-07, + "loss": 0.3365, + "step": 12043 + }, + { + "epoch": 1.842999234889059, + "grad_norm": 1.9642071977337394, + "learning_rate": 3.21507053461364e-07, + "loss": 0.2847, + "step": 12044 + }, + { + "epoch": 1.8431522570772763, + "grad_norm": 2.3167926844831674, + "learning_rate": 3.208840113019851e-07, + "loss": 0.2929, + "step": 12045 + }, + { + "epoch": 1.8433052792654934, + "grad_norm": 2.1338331581061585, + "learning_rate": 3.2026156357498284e-07, + "loss": 0.2996, + "step": 12046 + }, + { + "epoch": 1.843458301453711, + "grad_norm": 2.1716582602915993, + "learning_rate": 3.1963971031858664e-07, + "loss": 0.2735, + "step": 12047 + }, + { + "epoch": 1.843611323641928, + "grad_norm": 2.07267232853433, + "learning_rate": 3.1901845157098486e-07, + "loss": 0.2846, + "step": 12048 + }, + { + "epoch": 1.8437643458301454, + "grad_norm": 2.0413863077886303, + "learning_rate": 3.1839778737033256e-07, + "loss": 0.2788, + "step": 12049 + }, + { + "epoch": 1.8439173680183627, + "grad_norm": 2.132499243615788, + "learning_rate": 3.1777771775474697e-07, + "loss": 0.2613, + "step": 12050 + }, + { + "epoch": 1.8440703902065798, + "grad_norm": 2.012435426584495, + "learning_rate": 3.171582427623077e-07, + "loss": 0.2077, + "step": 12051 + }, + { + "epoch": 1.8442234123947974, + "grad_norm": 2.3338924545977537, + "learning_rate": 3.1653936243105974e-07, + "loss": 0.2969, + "step": 12052 + }, + { + "epoch": 1.8443764345830145, + "grad_norm": 2.439473361933731, + "learning_rate": 3.1592107679901393e-07, + "loss": 0.3477, + "step": 12053 + }, + { + "epoch": 1.8445294567712318, + "grad_norm": 1.9503200571517338, + "learning_rate": 3.1530338590413877e-07, + "loss": 0.247, + "step": 12054 + }, + { + "epoch": 1.8446824789594491, + "grad_norm": 1.9160994751283733, + "learning_rate": 3.1468628978436723e-07, + "loss": 0.2587, + "step": 12055 + }, + { + "epoch": 1.8448355011476663, + "grad_norm": 2.2006778123960355, + "learning_rate": 3.1406978847760226e-07, + "loss": 0.2572, + "step": 12056 + }, + { + "epoch": 1.8449885233358838, + "grad_norm": 2.298106954995458, + "learning_rate": 3.1345388202170034e-07, + "loss": 0.339, + "step": 12057 + }, + { + "epoch": 1.845141545524101, + "grad_norm": 2.2870522685072068, + "learning_rate": 3.128385704544912e-07, + "loss": 0.2774, + "step": 12058 + }, + { + "epoch": 1.8452945677123183, + "grad_norm": 2.0815855675565844, + "learning_rate": 3.122238538137634e-07, + "loss": 0.2914, + "step": 12059 + }, + { + "epoch": 1.8454475899005356, + "grad_norm": 2.0949484272761567, + "learning_rate": 
3.1160973213726466e-07, + "loss": 0.2838, + "step": 12060 + }, + { + "epoch": 1.845600612088753, + "grad_norm": 2.1367734350362744, + "learning_rate": 3.109962054627147e-07, + "loss": 0.2684, + "step": 12061 + }, + { + "epoch": 1.8457536342769703, + "grad_norm": 1.9203254480433196, + "learning_rate": 3.103832738277923e-07, + "loss": 0.2774, + "step": 12062 + }, + { + "epoch": 1.8459066564651874, + "grad_norm": 2.2174394172339484, + "learning_rate": 3.097709372701374e-07, + "loss": 0.3001, + "step": 12063 + }, + { + "epoch": 1.8460596786534047, + "grad_norm": 2.07364457958336, + "learning_rate": 3.0915919582735763e-07, + "loss": 0.2439, + "step": 12064 + }, + { + "epoch": 1.846212700841622, + "grad_norm": 2.4434845521963062, + "learning_rate": 3.085480495370241e-07, + "loss": 0.2569, + "step": 12065 + }, + { + "epoch": 1.8463657230298394, + "grad_norm": 2.155863352925435, + "learning_rate": 3.079374984366668e-07, + "loss": 0.3176, + "step": 12066 + }, + { + "epoch": 1.8465187452180567, + "grad_norm": 1.9860984079286215, + "learning_rate": 3.073275425637834e-07, + "loss": 0.2281, + "step": 12067 + }, + { + "epoch": 1.8466717674062738, + "grad_norm": 2.4004615498420803, + "learning_rate": 3.06718181955834e-07, + "loss": 0.2703, + "step": 12068 + }, + { + "epoch": 1.8468247895944914, + "grad_norm": 2.2840856637864255, + "learning_rate": 3.0610941665024097e-07, + "loss": 0.272, + "step": 12069 + }, + { + "epoch": 1.8469778117827085, + "grad_norm": 2.089748940200058, + "learning_rate": 3.0550124668439097e-07, + "loss": 0.2438, + "step": 12070 + }, + { + "epoch": 1.8471308339709258, + "grad_norm": 2.219418314806367, + "learning_rate": 3.0489367209563527e-07, + "loss": 0.2898, + "step": 12071 + }, + { + "epoch": 1.8472838561591431, + "grad_norm": 2.060183624027669, + "learning_rate": 3.042866929212862e-07, + "loss": 0.2405, + "step": 12072 + }, + { + "epoch": 1.8474368783473603, + "grad_norm": 2.010860439464339, + "learning_rate": 3.036803091986218e-07, + "loss": 0.2258, + "step": 12073 + }, + { + "epoch": 1.8475899005355778, + "grad_norm": 1.9942119712715967, + "learning_rate": 3.030745209648811e-07, + "loss": 0.3044, + "step": 12074 + }, + { + "epoch": 1.847742922723795, + "grad_norm": 2.232052662763698, + "learning_rate": 3.024693282572688e-07, + "loss": 0.285, + "step": 12075 + }, + { + "epoch": 1.8478959449120123, + "grad_norm": 2.0825006725065585, + "learning_rate": 3.01864731112953e-07, + "loss": 0.2628, + "step": 12076 + }, + { + "epoch": 1.8480489671002296, + "grad_norm": 1.9978220340953112, + "learning_rate": 3.012607295690617e-07, + "loss": 0.1955, + "step": 12077 + }, + { + "epoch": 1.8482019892884467, + "grad_norm": 2.0714850256582724, + "learning_rate": 3.0065732366269197e-07, + "loss": 0.2598, + "step": 12078 + }, + { + "epoch": 1.8483550114766643, + "grad_norm": 1.9595160794383788, + "learning_rate": 3.0005451343089964e-07, + "loss": 0.213, + "step": 12079 + }, + { + "epoch": 1.8485080336648814, + "grad_norm": 2.179653007063922, + "learning_rate": 2.9945229891070624e-07, + "loss": 0.2505, + "step": 12080 + }, + { + "epoch": 1.8486610558530987, + "grad_norm": 2.0967176533438967, + "learning_rate": 2.9885068013909444e-07, + "loss": 0.2826, + "step": 12081 + }, + { + "epoch": 1.848814078041316, + "grad_norm": 1.8612577442029008, + "learning_rate": 2.982496571530158e-07, + "loss": 0.2543, + "step": 12082 + }, + { + "epoch": 1.8489671002295331, + "grad_norm": 2.3138064490032746, + "learning_rate": 2.976492299893774e-07, + "loss": 0.2842, + "step": 12083 + }, + { + "epoch": 
1.8491201224177507, + "grad_norm": 2.426532374935647, + "learning_rate": 2.9704939868505533e-07, + "loss": 0.3227, + "step": 12084 + }, + { + "epoch": 1.8492731446059678, + "grad_norm": 1.9737808547398248, + "learning_rate": 2.9645016327689014e-07, + "loss": 0.3011, + "step": 12085 + }, + { + "epoch": 1.8494261667941851, + "grad_norm": 1.9982381311792963, + "learning_rate": 2.9585152380167906e-07, + "loss": 0.2686, + "step": 12086 + }, + { + "epoch": 1.8495791889824025, + "grad_norm": 1.985073678732461, + "learning_rate": 2.952534802961882e-07, + "loss": 0.2569, + "step": 12087 + }, + { + "epoch": 1.8497322111706196, + "grad_norm": 1.9793562344285673, + "learning_rate": 2.946560327971493e-07, + "loss": 0.2316, + "step": 12088 + }, + { + "epoch": 1.8498852333588371, + "grad_norm": 2.424299187829567, + "learning_rate": 2.9405918134124747e-07, + "loss": 0.2653, + "step": 12089 + }, + { + "epoch": 1.8500382555470543, + "grad_norm": 2.119611836762099, + "learning_rate": 2.9346292596514227e-07, + "loss": 0.2722, + "step": 12090 + }, + { + "epoch": 1.8501912777352716, + "grad_norm": 2.386035985833903, + "learning_rate": 2.928672667054533e-07, + "loss": 0.3155, + "step": 12091 + }, + { + "epoch": 1.850344299923489, + "grad_norm": 1.9398662606503192, + "learning_rate": 2.9227220359875685e-07, + "loss": 0.279, + "step": 12092 + }, + { + "epoch": 1.850497322111706, + "grad_norm": 2.569255627334306, + "learning_rate": 2.916777366816026e-07, + "loss": 0.2903, + "step": 12093 + }, + { + "epoch": 1.8506503442999236, + "grad_norm": 2.268604135723616, + "learning_rate": 2.9108386599049685e-07, + "loss": 0.2867, + "step": 12094 + }, + { + "epoch": 1.8508033664881407, + "grad_norm": 2.3828483785992374, + "learning_rate": 2.9049059156191385e-07, + "loss": 0.3234, + "step": 12095 + }, + { + "epoch": 1.850956388676358, + "grad_norm": 1.988877704957895, + "learning_rate": 2.8989791343228657e-07, + "loss": 0.2306, + "step": 12096 + }, + { + "epoch": 1.8511094108645754, + "grad_norm": 2.2265877027141383, + "learning_rate": 2.8930583163801487e-07, + "loss": 0.2704, + "step": 12097 + }, + { + "epoch": 1.8512624330527927, + "grad_norm": 2.0913282906056505, + "learning_rate": 2.887143462154596e-07, + "loss": 0.271, + "step": 12098 + }, + { + "epoch": 1.85141545524101, + "grad_norm": 2.172619994500396, + "learning_rate": 2.8812345720094836e-07, + "loss": 0.3294, + "step": 12099 + }, + { + "epoch": 1.8515684774292271, + "grad_norm": 2.136696908463675, + "learning_rate": 2.875331646307677e-07, + "loss": 0.2473, + "step": 12100 + }, + { + "epoch": 1.8517214996174445, + "grad_norm": 2.074684483541853, + "learning_rate": 2.869434685411709e-07, + "loss": 0.2527, + "step": 12101 + }, + { + "epoch": 1.8518745218056618, + "grad_norm": 2.133340003665217, + "learning_rate": 2.863543689683734e-07, + "loss": 0.2935, + "step": 12102 + }, + { + "epoch": 1.8520275439938791, + "grad_norm": 1.951720876586633, + "learning_rate": 2.8576586594855517e-07, + "loss": 0.2645, + "step": 12103 + }, + { + "epoch": 1.8521805661820965, + "grad_norm": 2.147938107651279, + "learning_rate": 2.851779595178561e-07, + "loss": 0.2582, + "step": 12104 + }, + { + "epoch": 1.8523335883703136, + "grad_norm": 2.226493399132335, + "learning_rate": 2.8459064971238293e-07, + "loss": 0.2827, + "step": 12105 + }, + { + "epoch": 1.8524866105585311, + "grad_norm": 2.2685272163662655, + "learning_rate": 2.840039365682068e-07, + "loss": 0.3398, + "step": 12106 + }, + { + "epoch": 1.8526396327467483, + "grad_norm": 2.313428395457457, + "learning_rate": 
2.8341782012135665e-07, + "loss": 0.2826, + "step": 12107 + }, + { + "epoch": 1.8527926549349656, + "grad_norm": 2.3340885234134943, + "learning_rate": 2.828323004078293e-07, + "loss": 0.2838, + "step": 12108 + }, + { + "epoch": 1.852945677123183, + "grad_norm": 1.784707937660604, + "learning_rate": 2.822473774635859e-07, + "loss": 0.2615, + "step": 12109 + }, + { + "epoch": 1.8530986993114, + "grad_norm": 1.9472104360411793, + "learning_rate": 2.8166305132454443e-07, + "loss": 0.2362, + "step": 12110 + }, + { + "epoch": 1.8532517214996176, + "grad_norm": 1.9800530360384525, + "learning_rate": 2.8107932202659504e-07, + "loss": 0.298, + "step": 12111 + }, + { + "epoch": 1.8534047436878347, + "grad_norm": 2.2978407370753695, + "learning_rate": 2.804961896055858e-07, + "loss": 0.325, + "step": 12112 + }, + { + "epoch": 1.853557765876052, + "grad_norm": 2.141324858232483, + "learning_rate": 2.7991365409732683e-07, + "loss": 0.2936, + "step": 12113 + }, + { + "epoch": 1.8537107880642694, + "grad_norm": 2.481993377344709, + "learning_rate": 2.7933171553759633e-07, + "loss": 0.3219, + "step": 12114 + }, + { + "epoch": 1.8538638102524865, + "grad_norm": 1.9074014250506492, + "learning_rate": 2.7875037396213335e-07, + "loss": 0.2674, + "step": 12115 + }, + { + "epoch": 1.854016832440704, + "grad_norm": 2.168713968336492, + "learning_rate": 2.7816962940663827e-07, + "loss": 0.2711, + "step": 12116 + }, + { + "epoch": 1.8541698546289211, + "grad_norm": 2.495180968740349, + "learning_rate": 2.7758948190677927e-07, + "loss": 0.3058, + "step": 12117 + }, + { + "epoch": 1.8543228768171385, + "grad_norm": 2.281648475117201, + "learning_rate": 2.7700993149818336e-07, + "loss": 0.2417, + "step": 12118 + }, + { + "epoch": 1.8544758990053558, + "grad_norm": 2.151961070143828, + "learning_rate": 2.764309782164454e-07, + "loss": 0.2624, + "step": 12119 + }, + { + "epoch": 1.854628921193573, + "grad_norm": 1.9512571981380695, + "learning_rate": 2.7585262209711916e-07, + "loss": 0.2198, + "step": 12120 + }, + { + "epoch": 1.8547819433817905, + "grad_norm": 2.072196470323757, + "learning_rate": 2.7527486317572515e-07, + "loss": 0.281, + "step": 12121 + }, + { + "epoch": 1.8549349655700076, + "grad_norm": 2.157528977725661, + "learning_rate": 2.746977014877439e-07, + "loss": 0.3241, + "step": 12122 + }, + { + "epoch": 1.855087987758225, + "grad_norm": 2.1224216420455, + "learning_rate": 2.7412113706862366e-07, + "loss": 0.2503, + "step": 12123 + }, + { + "epoch": 1.8552410099464423, + "grad_norm": 1.9186758271968076, + "learning_rate": 2.7354516995377165e-07, + "loss": 0.228, + "step": 12124 + }, + { + "epoch": 1.8553940321346594, + "grad_norm": 2.2800359932130942, + "learning_rate": 2.729698001785608e-07, + "loss": 0.2943, + "step": 12125 + }, + { + "epoch": 1.855547054322877, + "grad_norm": 2.138629020001359, + "learning_rate": 2.723950277783272e-07, + "loss": 0.2682, + "step": 12126 + }, + { + "epoch": 1.855700076511094, + "grad_norm": 2.059609498153823, + "learning_rate": 2.718208527883692e-07, + "loss": 0.2549, + "step": 12127 + }, + { + "epoch": 1.8558530986993114, + "grad_norm": 2.0336964526176806, + "learning_rate": 2.712472752439499e-07, + "loss": 0.2152, + "step": 12128 + }, + { + "epoch": 1.8560061208875287, + "grad_norm": 2.575196197121381, + "learning_rate": 2.706742951802943e-07, + "loss": 0.2578, + "step": 12129 + }, + { + "epoch": 1.856159143075746, + "grad_norm": 2.0950964294914676, + "learning_rate": 2.701019126325921e-07, + "loss": 0.2927, + "step": 12130 + }, + { + "epoch": 1.8563121652639634, + 
"grad_norm": 1.9792281674828598, + "learning_rate": 2.6953012763599293e-07, + "loss": 0.2385, + "step": 12131 + }, + { + "epoch": 1.8564651874521805, + "grad_norm": 2.45643230103461, + "learning_rate": 2.689589402256165e-07, + "loss": 0.3375, + "step": 12132 + }, + { + "epoch": 1.8566182096403978, + "grad_norm": 2.1003291220472655, + "learning_rate": 2.683883504365392e-07, + "loss": 0.2646, + "step": 12133 + }, + { + "epoch": 1.8567712318286151, + "grad_norm": 2.4439436285141083, + "learning_rate": 2.67818358303803e-07, + "loss": 0.3014, + "step": 12134 + }, + { + "epoch": 1.8569242540168325, + "grad_norm": 2.0784054999384582, + "learning_rate": 2.672489638624154e-07, + "loss": 0.2691, + "step": 12135 + }, + { + "epoch": 1.8570772762050498, + "grad_norm": 2.3457672844778803, + "learning_rate": 2.666801671473429e-07, + "loss": 0.2712, + "step": 12136 + }, + { + "epoch": 1.857230298393267, + "grad_norm": 2.2535684260502205, + "learning_rate": 2.6611196819351647e-07, + "loss": 0.2782, + "step": 12137 + }, + { + "epoch": 1.8573833205814845, + "grad_norm": 2.4045532560845486, + "learning_rate": 2.6554436703583595e-07, + "loss": 0.2972, + "step": 12138 + }, + { + "epoch": 1.8575363427697016, + "grad_norm": 2.6298059779139744, + "learning_rate": 2.6497736370915573e-07, + "loss": 0.2809, + "step": 12139 + }, + { + "epoch": 1.857689364957919, + "grad_norm": 2.332091100831672, + "learning_rate": 2.64410958248299e-07, + "loss": 0.2753, + "step": 12140 + }, + { + "epoch": 1.8578423871461363, + "grad_norm": 1.7920366458419108, + "learning_rate": 2.638451506880524e-07, + "loss": 0.2298, + "step": 12141 + }, + { + "epoch": 1.8579954093343534, + "grad_norm": 1.9857680513176472, + "learning_rate": 2.6327994106316146e-07, + "loss": 0.2223, + "step": 12142 + }, + { + "epoch": 1.858148431522571, + "grad_norm": 2.1848014681855763, + "learning_rate": 2.627153294083418e-07, + "loss": 0.2969, + "step": 12143 + }, + { + "epoch": 1.858301453710788, + "grad_norm": 1.9778201877077595, + "learning_rate": 2.621513157582656e-07, + "loss": 0.2965, + "step": 12144 + }, + { + "epoch": 1.8584544758990054, + "grad_norm": 2.229623695748292, + "learning_rate": 2.615879001475707e-07, + "loss": 0.2964, + "step": 12145 + }, + { + "epoch": 1.8586074980872227, + "grad_norm": 2.183425999238914, + "learning_rate": 2.6102508261085957e-07, + "loss": 0.3343, + "step": 12146 + }, + { + "epoch": 1.8587605202754398, + "grad_norm": 2.206930477766042, + "learning_rate": 2.6046286318269775e-07, + "loss": 0.2877, + "step": 12147 + }, + { + "epoch": 1.8589135424636574, + "grad_norm": 3.0938588466275148, + "learning_rate": 2.5990124189761325e-07, + "loss": 0.3017, + "step": 12148 + }, + { + "epoch": 1.8590665646518745, + "grad_norm": 2.059332216618004, + "learning_rate": 2.593402187900973e-07, + "loss": 0.2688, + "step": 12149 + }, + { + "epoch": 1.8592195868400918, + "grad_norm": 2.1161232443504594, + "learning_rate": 2.587797938946035e-07, + "loss": 0.3114, + "step": 12150 + }, + { + "epoch": 1.8593726090283091, + "grad_norm": 2.0546549960609592, + "learning_rate": 2.5821996724554985e-07, + "loss": 0.2921, + "step": 12151 + }, + { + "epoch": 1.8595256312165263, + "grad_norm": 2.1037161550199492, + "learning_rate": 2.576607388773189e-07, + "loss": 0.2607, + "step": 12152 + }, + { + "epoch": 1.8596786534047438, + "grad_norm": 2.0870667813048684, + "learning_rate": 2.5710210882425424e-07, + "loss": 0.2594, + "step": 12153 + }, + { + "epoch": 1.859831675592961, + "grad_norm": 2.2012957148422396, + "learning_rate": 2.5654407712066287e-07, + "loss": 
0.2806, + "step": 12154 + }, + { + "epoch": 1.8599846977811783, + "grad_norm": 2.1918892229646465, + "learning_rate": 2.559866438008174e-07, + "loss": 0.3092, + "step": 12155 + }, + { + "epoch": 1.8601377199693956, + "grad_norm": 2.2243548563257796, + "learning_rate": 2.554298088989504e-07, + "loss": 0.3184, + "step": 12156 + }, + { + "epoch": 1.8602907421576127, + "grad_norm": 2.3448776203557977, + "learning_rate": 2.54873572449259e-07, + "loss": 0.3585, + "step": 12157 + }, + { + "epoch": 1.8604437643458303, + "grad_norm": 2.063362643976679, + "learning_rate": 2.543179344859059e-07, + "loss": 0.2759, + "step": 12158 + }, + { + "epoch": 1.8605967865340474, + "grad_norm": 2.336058681945544, + "learning_rate": 2.5376289504301375e-07, + "loss": 0.3135, + "step": 12159 + }, + { + "epoch": 1.8607498087222647, + "grad_norm": 2.312197431308048, + "learning_rate": 2.5320845415466756e-07, + "loss": 0.2921, + "step": 12160 + }, + { + "epoch": 1.860902830910482, + "grad_norm": 2.0693817907130034, + "learning_rate": 2.5265461185492224e-07, + "loss": 0.2645, + "step": 12161 + }, + { + "epoch": 1.8610558530986994, + "grad_norm": 2.0482766219818296, + "learning_rate": 2.5210136817778843e-07, + "loss": 0.2503, + "step": 12162 + }, + { + "epoch": 1.8612088752869167, + "grad_norm": 2.391935771524626, + "learning_rate": 2.5154872315724333e-07, + "loss": 0.3501, + "step": 12163 + }, + { + "epoch": 1.8613618974751338, + "grad_norm": 2.236266464810737, + "learning_rate": 2.509966768272276e-07, + "loss": 0.2932, + "step": 12164 + }, + { + "epoch": 1.8615149196633511, + "grad_norm": 2.310119778373762, + "learning_rate": 2.5044522922164414e-07, + "loss": 0.3187, + "step": 12165 + }, + { + "epoch": 1.8616679418515685, + "grad_norm": 2.126907185113331, + "learning_rate": 2.498943803743592e-07, + "loss": 0.2129, + "step": 12166 + }, + { + "epoch": 1.8618209640397858, + "grad_norm": 2.048088304206397, + "learning_rate": 2.4934413031920456e-07, + "loss": 0.2373, + "step": 12167 + }, + { + "epoch": 1.8619739862280031, + "grad_norm": 1.9929259125015373, + "learning_rate": 2.487944790899688e-07, + "loss": 0.2448, + "step": 12168 + }, + { + "epoch": 1.8621270084162203, + "grad_norm": 1.934843136968899, + "learning_rate": 2.4824542672041263e-07, + "loss": 0.25, + "step": 12169 + }, + { + "epoch": 1.8622800306044378, + "grad_norm": 1.8995690929106952, + "learning_rate": 2.4769697324425577e-07, + "loss": 0.2292, + "step": 12170 + }, + { + "epoch": 1.862433052792655, + "grad_norm": 2.1530208013880423, + "learning_rate": 2.471491186951758e-07, + "loss": 0.2673, + "step": 12171 + }, + { + "epoch": 1.8625860749808723, + "grad_norm": 1.8712526951486876, + "learning_rate": 2.4660186310682343e-07, + "loss": 0.2554, + "step": 12172 + }, + { + "epoch": 1.8627390971690896, + "grad_norm": 2.2666491985656885, + "learning_rate": 2.4605520651280634e-07, + "loss": 0.314, + "step": 12173 + }, + { + "epoch": 1.8628921193573067, + "grad_norm": 1.7644271265516238, + "learning_rate": 2.4550914894669544e-07, + "loss": 0.2522, + "step": 12174 + }, + { + "epoch": 1.8630451415455243, + "grad_norm": 2.1397863046954284, + "learning_rate": 2.4496369044202826e-07, + "loss": 0.3065, + "step": 12175 + }, + { + "epoch": 1.8631981637337414, + "grad_norm": 2.2356898789015585, + "learning_rate": 2.4441883103230256e-07, + "loss": 0.2685, + "step": 12176 + }, + { + "epoch": 1.8633511859219587, + "grad_norm": 2.227907810018491, + "learning_rate": 2.438745707509815e-07, + "loss": 0.3018, + "step": 12177 + }, + { + "epoch": 1.863504208110176, + "grad_norm": 
2.307269133388442, + "learning_rate": 2.4333090963148843e-07, + "loss": 0.3108, + "step": 12178 + }, + { + "epoch": 1.8636572302983931, + "grad_norm": 2.2926041689812564, + "learning_rate": 2.427878477072121e-07, + "loss": 0.2948, + "step": 12179 + }, + { + "epoch": 1.8638102524866107, + "grad_norm": 2.2200065269776497, + "learning_rate": 2.4224538501150475e-07, + "loss": 0.2883, + "step": 12180 + }, + { + "epoch": 1.8639632746748278, + "grad_norm": 2.0355521766689733, + "learning_rate": 2.417035215776808e-07, + "loss": 0.2665, + "step": 12181 + }, + { + "epoch": 1.8641162968630451, + "grad_norm": 2.0094946627575867, + "learning_rate": 2.4116225743901935e-07, + "loss": 0.2702, + "step": 12182 + }, + { + "epoch": 1.8642693190512625, + "grad_norm": 2.172054629406562, + "learning_rate": 2.4062159262875917e-07, + "loss": 0.3103, + "step": 12183 + }, + { + "epoch": 1.8644223412394796, + "grad_norm": 2.2125272166267274, + "learning_rate": 2.4008152718010493e-07, + "loss": 0.2704, + "step": 12184 + }, + { + "epoch": 1.8645753634276971, + "grad_norm": 1.9717482284027534, + "learning_rate": 2.395420611262278e-07, + "loss": 0.2178, + "step": 12185 + }, + { + "epoch": 1.8647283856159143, + "grad_norm": 2.256922690173856, + "learning_rate": 2.3900319450025464e-07, + "loss": 0.2931, + "step": 12186 + }, + { + "epoch": 1.8648814078041316, + "grad_norm": 2.739839348840143, + "learning_rate": 2.3846492733527905e-07, + "loss": 0.2836, + "step": 12187 + }, + { + "epoch": 1.865034429992349, + "grad_norm": 1.700510993453522, + "learning_rate": 2.3792725966436224e-07, + "loss": 0.2278, + "step": 12188 + }, + { + "epoch": 1.865187452180566, + "grad_norm": 1.9545979397503868, + "learning_rate": 2.3739019152052013e-07, + "loss": 0.2552, + "step": 12189 + }, + { + "epoch": 1.8653404743687836, + "grad_norm": 2.195267570630192, + "learning_rate": 2.3685372293673737e-07, + "loss": 0.2948, + "step": 12190 + }, + { + "epoch": 1.8654934965570007, + "grad_norm": 1.8382213012672446, + "learning_rate": 2.3631785394596317e-07, + "loss": 0.2203, + "step": 12191 + }, + { + "epoch": 1.865646518745218, + "grad_norm": 1.8451009051414737, + "learning_rate": 2.3578258458110347e-07, + "loss": 0.2316, + "step": 12192 + }, + { + "epoch": 1.8657995409334354, + "grad_norm": 2.3210095540011624, + "learning_rate": 2.3524791487503418e-07, + "loss": 0.2854, + "step": 12193 + }, + { + "epoch": 1.8659525631216525, + "grad_norm": 2.146736780352666, + "learning_rate": 2.3471384486059123e-07, + "loss": 0.2932, + "step": 12194 + }, + { + "epoch": 1.86610558530987, + "grad_norm": 1.9464980080620138, + "learning_rate": 2.3418037457057063e-07, + "loss": 0.278, + "step": 12195 + }, + { + "epoch": 1.8662586074980871, + "grad_norm": 2.066963030143947, + "learning_rate": 2.336475040377384e-07, + "loss": 0.2585, + "step": 12196 + }, + { + "epoch": 1.8664116296863045, + "grad_norm": 2.311400102076601, + "learning_rate": 2.3311523329481943e-07, + "loss": 0.2815, + "step": 12197 + }, + { + "epoch": 1.8665646518745218, + "grad_norm": 2.067821743921238, + "learning_rate": 2.3258356237450208e-07, + "loss": 0.2692, + "step": 12198 + }, + { + "epoch": 1.8667176740627391, + "grad_norm": 3.1778578650717884, + "learning_rate": 2.3205249130943908e-07, + "loss": 0.3116, + "step": 12199 + }, + { + "epoch": 1.8668706962509565, + "grad_norm": 2.2301021426943506, + "learning_rate": 2.3152202013224434e-07, + "loss": 0.2504, + "step": 12200 + }, + { + "epoch": 1.8670237184391736, + "grad_norm": 1.783192454316723, + "learning_rate": 2.3099214887549626e-07, + "loss": 
0.1713, + "step": 12201 + }, + { + "epoch": 1.867176740627391, + "grad_norm": 2.0357835170166374, + "learning_rate": 2.3046287757173768e-07, + "loss": 0.2838, + "step": 12202 + }, + { + "epoch": 1.8673297628156083, + "grad_norm": 1.9008329193246671, + "learning_rate": 2.2993420625347263e-07, + "loss": 0.2363, + "step": 12203 + }, + { + "epoch": 1.8674827850038256, + "grad_norm": 2.164894992849952, + "learning_rate": 2.2940613495316844e-07, + "loss": 0.2903, + "step": 12204 + }, + { + "epoch": 1.867635807192043, + "grad_norm": 1.818748508477644, + "learning_rate": 2.2887866370325696e-07, + "loss": 0.1966, + "step": 12205 + }, + { + "epoch": 1.86778882938026, + "grad_norm": 2.5998851775941416, + "learning_rate": 2.2835179253613005e-07, + "loss": 0.2729, + "step": 12206 + }, + { + "epoch": 1.8679418515684776, + "grad_norm": 2.2297654340062523, + "learning_rate": 2.2782552148414738e-07, + "loss": 0.3441, + "step": 12207 + }, + { + "epoch": 1.8680948737566947, + "grad_norm": 2.147215993679389, + "learning_rate": 2.2729985057962866e-07, + "loss": 0.2219, + "step": 12208 + }, + { + "epoch": 1.868247895944912, + "grad_norm": 2.015334317509236, + "learning_rate": 2.2677477985485697e-07, + "loss": 0.2214, + "step": 12209 + }, + { + "epoch": 1.8684009181331294, + "grad_norm": 2.0696416363691137, + "learning_rate": 2.262503093420787e-07, + "loss": 0.2825, + "step": 12210 + }, + { + "epoch": 1.8685539403213465, + "grad_norm": 2.0538848905826192, + "learning_rate": 2.2572643907350477e-07, + "loss": 0.2514, + "step": 12211 + }, + { + "epoch": 1.868706962509564, + "grad_norm": 2.0587824354921422, + "learning_rate": 2.2520316908130612e-07, + "loss": 0.2369, + "step": 12212 + }, + { + "epoch": 1.8688599846977811, + "grad_norm": 2.2401753137094067, + "learning_rate": 2.2468049939762038e-07, + "loss": 0.3055, + "step": 12213 + }, + { + "epoch": 1.8690130068859985, + "grad_norm": 1.9765528689673304, + "learning_rate": 2.241584300545485e-07, + "loss": 0.2514, + "step": 12214 + }, + { + "epoch": 1.8691660290742158, + "grad_norm": 2.16610698984525, + "learning_rate": 2.2363696108414822e-07, + "loss": 0.2618, + "step": 12215 + }, + { + "epoch": 1.869319051262433, + "grad_norm": 2.272639274412879, + "learning_rate": 2.2311609251844834e-07, + "loss": 0.3037, + "step": 12216 + }, + { + "epoch": 1.8694720734506505, + "grad_norm": 2.2152036642185946, + "learning_rate": 2.2259582438943773e-07, + "loss": 0.3251, + "step": 12217 + }, + { + "epoch": 1.8696250956388676, + "grad_norm": 2.0843874546884247, + "learning_rate": 2.2207615672906523e-07, + "loss": 0.2705, + "step": 12218 + }, + { + "epoch": 1.869778117827085, + "grad_norm": 2.192065193696456, + "learning_rate": 2.215570895692476e-07, + "loss": 0.274, + "step": 12219 + }, + { + "epoch": 1.8699311400153023, + "grad_norm": 2.252514725359488, + "learning_rate": 2.2103862294186374e-07, + "loss": 0.3029, + "step": 12220 + }, + { + "epoch": 1.8700841622035194, + "grad_norm": 1.7304474836249897, + "learning_rate": 2.2052075687875262e-07, + "loss": 0.2527, + "step": 12221 + }, + { + "epoch": 1.870237184391737, + "grad_norm": 2.319453397662655, + "learning_rate": 2.2000349141171995e-07, + "loss": 0.3471, + "step": 12222 + }, + { + "epoch": 1.870390206579954, + "grad_norm": 2.096261818076652, + "learning_rate": 2.194868265725325e-07, + "loss": 0.2351, + "step": 12223 + }, + { + "epoch": 1.8705432287681714, + "grad_norm": 1.7582270248063583, + "learning_rate": 2.1897076239291936e-07, + "loss": 0.3032, + "step": 12224 + }, + { + "epoch": 1.8706962509563887, + "grad_norm": 
2.144383125659378, + "learning_rate": 2.184552989045763e-07, + "loss": 0.2726, + "step": 12225 + }, + { + "epoch": 1.8708492731446058, + "grad_norm": 4.532884435368886, + "learning_rate": 2.1794043613916015e-07, + "loss": 0.3022, + "step": 12226 + }, + { + "epoch": 1.8710022953328234, + "grad_norm": 2.559387386062656, + "learning_rate": 2.1742617412828682e-07, + "loss": 0.3715, + "step": 12227 + }, + { + "epoch": 1.8711553175210405, + "grad_norm": 2.162713822623037, + "learning_rate": 2.169125129035432e-07, + "loss": 0.2815, + "step": 12228 + }, + { + "epoch": 1.8713083397092578, + "grad_norm": 2.2515483930461526, + "learning_rate": 2.163994524964741e-07, + "loss": 0.3593, + "step": 12229 + }, + { + "epoch": 1.8714613618974751, + "grad_norm": 1.8777623529142353, + "learning_rate": 2.158869929385876e-07, + "loss": 0.21, + "step": 12230 + }, + { + "epoch": 1.8716143840856925, + "grad_norm": 2.0469775737816946, + "learning_rate": 2.1537513426135637e-07, + "loss": 0.2926, + "step": 12231 + }, + { + "epoch": 1.8717674062739098, + "grad_norm": 2.303687292526856, + "learning_rate": 2.1486387649621632e-07, + "loss": 0.324, + "step": 12232 + }, + { + "epoch": 1.871920428462127, + "grad_norm": 2.3834650957610792, + "learning_rate": 2.1435321967456567e-07, + "loss": 0.3276, + "step": 12233 + }, + { + "epoch": 1.8720734506503443, + "grad_norm": 2.2343221661881434, + "learning_rate": 2.1384316382776493e-07, + "loss": 0.3089, + "step": 12234 + }, + { + "epoch": 1.8722264728385616, + "grad_norm": 1.8172686458735308, + "learning_rate": 2.1333370898713902e-07, + "loss": 0.2393, + "step": 12235 + }, + { + "epoch": 1.872379495026779, + "grad_norm": 2.608854617407368, + "learning_rate": 2.1282485518397622e-07, + "loss": 0.3307, + "step": 12236 + }, + { + "epoch": 1.8725325172149963, + "grad_norm": 2.50025253282384, + "learning_rate": 2.1231660244952713e-07, + "loss": 0.2851, + "step": 12237 + }, + { + "epoch": 1.8726855394032134, + "grad_norm": 2.0332244215536095, + "learning_rate": 2.1180895081500453e-07, + "loss": 0.2816, + "step": 12238 + }, + { + "epoch": 1.872838561591431, + "grad_norm": 1.8344196972742979, + "learning_rate": 2.113019003115857e-07, + "loss": 0.2178, + "step": 12239 + }, + { + "epoch": 1.872991583779648, + "grad_norm": 2.1325275388576395, + "learning_rate": 2.107954509704102e-07, + "loss": 0.2919, + "step": 12240 + }, + { + "epoch": 1.8731446059678654, + "grad_norm": 2.03238396828965, + "learning_rate": 2.1028960282258204e-07, + "loss": 0.312, + "step": 12241 + }, + { + "epoch": 1.8732976281560827, + "grad_norm": 2.134605895045237, + "learning_rate": 2.0978435589916635e-07, + "loss": 0.2619, + "step": 12242 + }, + { + "epoch": 1.8734506503442998, + "grad_norm": 2.0790319221111764, + "learning_rate": 2.0927971023119498e-07, + "loss": 0.2806, + "step": 12243 + }, + { + "epoch": 1.8736036725325174, + "grad_norm": 2.1151547837451323, + "learning_rate": 2.0877566584965646e-07, + "loss": 0.2672, + "step": 12244 + }, + { + "epoch": 1.8737566947207345, + "grad_norm": 1.8800490838528072, + "learning_rate": 2.082722227855083e-07, + "loss": 0.2575, + "step": 12245 + }, + { + "epoch": 1.8739097169089518, + "grad_norm": 1.9156558849897791, + "learning_rate": 2.0776938106966903e-07, + "loss": 0.2848, + "step": 12246 + }, + { + "epoch": 1.8740627390971691, + "grad_norm": 2.1518764006308797, + "learning_rate": 2.0726714073301845e-07, + "loss": 0.278, + "step": 12247 + }, + { + "epoch": 1.8742157612853863, + "grad_norm": 2.115861447241316, + "learning_rate": 2.0676550180640187e-07, + "loss": 0.3185, + 
"step": 12248 + }, + { + "epoch": 1.8743687834736038, + "grad_norm": 2.426652595668358, + "learning_rate": 2.0626446432062798e-07, + "loss": 0.2452, + "step": 12249 + }, + { + "epoch": 1.874521805661821, + "grad_norm": 2.011114310109736, + "learning_rate": 2.0576402830646548e-07, + "loss": 0.2554, + "step": 12250 + }, + { + "epoch": 1.8746748278500383, + "grad_norm": 2.441039647615812, + "learning_rate": 2.052641937946509e-07, + "loss": 0.3226, + "step": 12251 + }, + { + "epoch": 1.8748278500382556, + "grad_norm": 2.2657634566140827, + "learning_rate": 2.0476496081587972e-07, + "loss": 0.2532, + "step": 12252 + }, + { + "epoch": 1.8749808722264727, + "grad_norm": 2.2793594074333727, + "learning_rate": 2.042663294008096e-07, + "loss": 0.3178, + "step": 12253 + }, + { + "epoch": 1.8751338944146902, + "grad_norm": 2.3583753493859168, + "learning_rate": 2.0376829958006606e-07, + "loss": 0.3027, + "step": 12254 + }, + { + "epoch": 1.8752869166029074, + "grad_norm": 2.3976898464771317, + "learning_rate": 2.0327087138423464e-07, + "loss": 0.3205, + "step": 12255 + }, + { + "epoch": 1.8754399387911247, + "grad_norm": 2.3833629536154044, + "learning_rate": 2.027740448438631e-07, + "loss": 0.2334, + "step": 12256 + }, + { + "epoch": 1.875592960979342, + "grad_norm": 2.5591541641816455, + "learning_rate": 2.0227781998946483e-07, + "loss": 0.2747, + "step": 12257 + }, + { + "epoch": 1.8757459831675591, + "grad_norm": 1.9460841714107107, + "learning_rate": 2.0178219685151544e-07, + "loss": 0.2954, + "step": 12258 + }, + { + "epoch": 1.8758990053557767, + "grad_norm": 2.130399857557917, + "learning_rate": 2.0128717546045174e-07, + "loss": 0.3298, + "step": 12259 + }, + { + "epoch": 1.8760520275439938, + "grad_norm": 2.054983131339847, + "learning_rate": 2.007927558466749e-07, + "loss": 0.2231, + "step": 12260 + }, + { + "epoch": 1.8762050497322111, + "grad_norm": 1.9846915653480426, + "learning_rate": 2.0029893804054956e-07, + "loss": 0.2199, + "step": 12261 + }, + { + "epoch": 1.8763580719204285, + "grad_norm": 2.1665049130134575, + "learning_rate": 1.9980572207240367e-07, + "loss": 0.3055, + "step": 12262 + }, + { + "epoch": 1.8765110941086458, + "grad_norm": 2.0578045313911857, + "learning_rate": 1.9931310797252635e-07, + "loss": 0.2991, + "step": 12263 + }, + { + "epoch": 1.8766641162968631, + "grad_norm": 1.7469439315722723, + "learning_rate": 1.9882109577117337e-07, + "loss": 0.2242, + "step": 12264 + }, + { + "epoch": 1.8768171384850802, + "grad_norm": 2.174488307923911, + "learning_rate": 1.9832968549855724e-07, + "loss": 0.3399, + "step": 12265 + }, + { + "epoch": 1.8769701606732976, + "grad_norm": 1.947263660138171, + "learning_rate": 1.978388771848594e-07, + "loss": 0.238, + "step": 12266 + }, + { + "epoch": 1.877123182861515, + "grad_norm": 2.155219460753691, + "learning_rate": 1.9734867086022458e-07, + "loss": 0.2746, + "step": 12267 + }, + { + "epoch": 1.8772762050497322, + "grad_norm": 2.042073767668411, + "learning_rate": 1.9685906655475428e-07, + "loss": 0.2658, + "step": 12268 + }, + { + "epoch": 1.8774292272379496, + "grad_norm": 1.8416151846022213, + "learning_rate": 1.9637006429851778e-07, + "loss": 0.214, + "step": 12269 + }, + { + "epoch": 1.8775822494261667, + "grad_norm": 2.3539053420151492, + "learning_rate": 1.9588166412154997e-07, + "loss": 0.3265, + "step": 12270 + }, + { + "epoch": 1.8777352716143842, + "grad_norm": 2.4376515005557637, + "learning_rate": 1.9539386605384125e-07, + "loss": 0.2822, + "step": 12271 + }, + { + "epoch": 1.8778882938026014, + "grad_norm": 
1.8886652401346042, + "learning_rate": 1.9490667012535215e-07, + "loss": 0.2249, + "step": 12272 + }, + { + "epoch": 1.8780413159908187, + "grad_norm": 2.329875267873853, + "learning_rate": 1.9442007636600312e-07, + "loss": 0.2711, + "step": 12273 + }, + { + "epoch": 1.878194338179036, + "grad_norm": 2.189273001021051, + "learning_rate": 1.9393408480567477e-07, + "loss": 0.2499, + "step": 12274 + }, + { + "epoch": 1.8783473603672531, + "grad_norm": 2.439968634578106, + "learning_rate": 1.934486954742165e-07, + "loss": 0.3041, + "step": 12275 + }, + { + "epoch": 1.8785003825554707, + "grad_norm": 1.9757854141283482, + "learning_rate": 1.9296390840143897e-07, + "loss": 0.2711, + "step": 12276 + }, + { + "epoch": 1.8786534047436878, + "grad_norm": 2.0689238149257476, + "learning_rate": 1.9247972361711053e-07, + "loss": 0.2598, + "step": 12277 + }, + { + "epoch": 1.8788064269319051, + "grad_norm": 2.109148148749415, + "learning_rate": 1.9199614115097076e-07, + "loss": 0.2545, + "step": 12278 + }, + { + "epoch": 1.8789594491201225, + "grad_norm": 2.4297663317572438, + "learning_rate": 1.91513161032717e-07, + "loss": 0.2645, + "step": 12279 + }, + { + "epoch": 1.8791124713083396, + "grad_norm": 2.1144787392422755, + "learning_rate": 1.9103078329201108e-07, + "loss": 0.2495, + "step": 12280 + }, + { + "epoch": 1.8792654934965571, + "grad_norm": 2.559522126569114, + "learning_rate": 1.905490079584782e-07, + "loss": 0.2995, + "step": 12281 + }, + { + "epoch": 1.8794185156847742, + "grad_norm": 2.186023542033774, + "learning_rate": 1.9006783506170467e-07, + "loss": 0.288, + "step": 12282 + }, + { + "epoch": 1.8795715378729916, + "grad_norm": 2.3879072385137405, + "learning_rate": 1.8958726463124243e-07, + "loss": 0.28, + "step": 12283 + }, + { + "epoch": 1.879724560061209, + "grad_norm": 2.342145125582227, + "learning_rate": 1.8910729669660454e-07, + "loss": 0.2903, + "step": 12284 + }, + { + "epoch": 1.879877582249426, + "grad_norm": 2.2675058883591985, + "learning_rate": 1.8862793128726853e-07, + "loss": 0.2908, + "step": 12285 + }, + { + "epoch": 1.8800306044376436, + "grad_norm": 2.1204252540865367, + "learning_rate": 1.8814916843267306e-07, + "loss": 0.3024, + "step": 12286 + }, + { + "epoch": 1.8801836266258607, + "grad_norm": 2.1179211787734027, + "learning_rate": 1.876710081622224e-07, + "loss": 0.2748, + "step": 12287 + }, + { + "epoch": 1.880336648814078, + "grad_norm": 2.3750767038120775, + "learning_rate": 1.871934505052797e-07, + "loss": 0.3196, + "step": 12288 + }, + { + "epoch": 1.8804896710022954, + "grad_norm": 2.156315708260614, + "learning_rate": 1.86716495491176e-07, + "loss": 0.2977, + "step": 12289 + }, + { + "epoch": 1.8806426931905125, + "grad_norm": 2.0621837306649367, + "learning_rate": 1.8624014314920114e-07, + "loss": 0.3236, + "step": 12290 + }, + { + "epoch": 1.88079571537873, + "grad_norm": 1.917989006367677, + "learning_rate": 1.8576439350861175e-07, + "loss": 0.2343, + "step": 12291 + }, + { + "epoch": 1.8809487375669471, + "grad_norm": 1.9783581142507576, + "learning_rate": 1.8528924659862447e-07, + "loss": 0.2416, + "step": 12292 + }, + { + "epoch": 1.8811017597551645, + "grad_norm": 2.0896830236966557, + "learning_rate": 1.8481470244841925e-07, + "loss": 0.2807, + "step": 12293 + }, + { + "epoch": 1.8812547819433818, + "grad_norm": 2.09622278574327, + "learning_rate": 1.843407610871406e-07, + "loss": 0.2848, + "step": 12294 + }, + { + "epoch": 1.881407804131599, + "grad_norm": 1.9323158480708773, + "learning_rate": 1.838674225438941e-07, + "loss": 0.2593, + 
"step": 12295 + }, + { + "epoch": 1.8815608263198165, + "grad_norm": 2.2310584238434843, + "learning_rate": 1.8339468684775209e-07, + "loss": 0.2542, + "step": 12296 + }, + { + "epoch": 1.8817138485080336, + "grad_norm": 2.0500256612721874, + "learning_rate": 1.8292255402774462e-07, + "loss": 0.2342, + "step": 12297 + }, + { + "epoch": 1.881866870696251, + "grad_norm": 2.115013908907418, + "learning_rate": 1.824510241128663e-07, + "loss": 0.2748, + "step": 12298 + }, + { + "epoch": 1.8820198928844682, + "grad_norm": 1.907028475113633, + "learning_rate": 1.8198009713207843e-07, + "loss": 0.2571, + "step": 12299 + }, + { + "epoch": 1.8821729150726856, + "grad_norm": 2.071888069560488, + "learning_rate": 1.8150977311430007e-07, + "loss": 0.3025, + "step": 12300 + }, + { + "epoch": 1.882325937260903, + "grad_norm": 1.9734825940094098, + "learning_rate": 1.8104005208841702e-07, + "loss": 0.296, + "step": 12301 + }, + { + "epoch": 1.88247895944912, + "grad_norm": 2.07524085707333, + "learning_rate": 1.805709340832773e-07, + "loss": 0.2904, + "step": 12302 + }, + { + "epoch": 1.8826319816373374, + "grad_norm": 2.006754654899638, + "learning_rate": 1.80102419127689e-07, + "loss": 0.2159, + "step": 12303 + }, + { + "epoch": 1.8827850038255547, + "grad_norm": 2.0742510517617077, + "learning_rate": 1.7963450725042687e-07, + "loss": 0.2474, + "step": 12304 + }, + { + "epoch": 1.882938026013772, + "grad_norm": 2.082631008210018, + "learning_rate": 1.7916719848022902e-07, + "loss": 0.2379, + "step": 12305 + }, + { + "epoch": 1.8830910482019894, + "grad_norm": 2.032130659886695, + "learning_rate": 1.787004928457903e-07, + "loss": 0.2687, + "step": 12306 + }, + { + "epoch": 1.8832440703902065, + "grad_norm": 2.1550406519384424, + "learning_rate": 1.782343903757766e-07, + "loss": 0.2834, + "step": 12307 + }, + { + "epoch": 1.883397092578424, + "grad_norm": 2.22732515632349, + "learning_rate": 1.777688910988118e-07, + "loss": 0.3246, + "step": 12308 + }, + { + "epoch": 1.8835501147666411, + "grad_norm": 2.4500515480082665, + "learning_rate": 1.7730399504348404e-07, + "loss": 0.3195, + "step": 12309 + }, + { + "epoch": 1.8837031369548585, + "grad_norm": 2.268660747881073, + "learning_rate": 1.76839702238345e-07, + "loss": 0.3028, + "step": 12310 + }, + { + "epoch": 1.8838561591430758, + "grad_norm": 1.6763232823881535, + "learning_rate": 1.7637601271190852e-07, + "loss": 0.213, + "step": 12311 + }, + { + "epoch": 1.884009181331293, + "grad_norm": 2.132550267954547, + "learning_rate": 1.7591292649265069e-07, + "loss": 0.2764, + "step": 12312 + }, + { + "epoch": 1.8841622035195105, + "grad_norm": 3.1774836443797185, + "learning_rate": 1.754504436090121e-07, + "loss": 0.3129, + "step": 12313 + }, + { + "epoch": 1.8843152257077276, + "grad_norm": 2.3320424100195085, + "learning_rate": 1.7498856408939558e-07, + "loss": 0.2989, + "step": 12314 + }, + { + "epoch": 1.884468247895945, + "grad_norm": 2.0604693493677138, + "learning_rate": 1.7452728796216732e-07, + "loss": 0.3438, + "step": 12315 + }, + { + "epoch": 1.8846212700841622, + "grad_norm": 2.090110244983448, + "learning_rate": 1.7406661525565582e-07, + "loss": 0.2364, + "step": 12316 + }, + { + "epoch": 1.8847742922723794, + "grad_norm": 2.235882951459802, + "learning_rate": 1.7360654599815285e-07, + "loss": 0.3374, + "step": 12317 + }, + { + "epoch": 1.884927314460597, + "grad_norm": 2.080730815047136, + "learning_rate": 1.731470802179125e-07, + "loss": 0.3256, + "step": 12318 + }, + { + "epoch": 1.885080336648814, + "grad_norm": 2.3867969064570334, + 
"learning_rate": 1.7268821794315328e-07, + "loss": 0.2551, + "step": 12319 + }, + { + "epoch": 1.8852333588370314, + "grad_norm": 2.2624261138843966, + "learning_rate": 1.7222995920205488e-07, + "loss": 0.3053, + "step": 12320 + }, + { + "epoch": 1.8853863810252487, + "grad_norm": 2.095280750448679, + "learning_rate": 1.7177230402276147e-07, + "loss": 0.2418, + "step": 12321 + }, + { + "epoch": 1.8855394032134658, + "grad_norm": 1.9204988322442775, + "learning_rate": 1.7131525243337944e-07, + "loss": 0.2802, + "step": 12322 + }, + { + "epoch": 1.8856924254016834, + "grad_norm": 2.3341667000082715, + "learning_rate": 1.7085880446197633e-07, + "loss": 0.2716, + "step": 12323 + }, + { + "epoch": 1.8858454475899005, + "grad_norm": 2.4746042589685717, + "learning_rate": 1.7040296013658643e-07, + "loss": 0.3252, + "step": 12324 + }, + { + "epoch": 1.8859984697781178, + "grad_norm": 2.1480370697284847, + "learning_rate": 1.6994771948520506e-07, + "loss": 0.2229, + "step": 12325 + }, + { + "epoch": 1.8861514919663351, + "grad_norm": 1.8615177634968916, + "learning_rate": 1.694930825357899e-07, + "loss": 0.2294, + "step": 12326 + }, + { + "epoch": 1.8863045141545522, + "grad_norm": 2.215497558948797, + "learning_rate": 1.6903904931625968e-07, + "loss": 0.2807, + "step": 12327 + }, + { + "epoch": 1.8864575363427698, + "grad_norm": 2.191961181905781, + "learning_rate": 1.6858561985450327e-07, + "loss": 0.2419, + "step": 12328 + }, + { + "epoch": 1.886610558530987, + "grad_norm": 1.9739018107667339, + "learning_rate": 1.681327941783628e-07, + "loss": 0.3075, + "step": 12329 + }, + { + "epoch": 1.8867635807192042, + "grad_norm": 1.7804060009453269, + "learning_rate": 1.6768057231564938e-07, + "loss": 0.2203, + "step": 12330 + }, + { + "epoch": 1.8869166029074216, + "grad_norm": 2.2020925173246333, + "learning_rate": 1.6722895429413856e-07, + "loss": 0.2131, + "step": 12331 + }, + { + "epoch": 1.887069625095639, + "grad_norm": 2.1911656545700677, + "learning_rate": 1.6677794014156145e-07, + "loss": 0.3261, + "step": 12332 + }, + { + "epoch": 1.8872226472838562, + "grad_norm": 2.007242826500715, + "learning_rate": 1.6632752988561929e-07, + "loss": 0.2273, + "step": 12333 + }, + { + "epoch": 1.8873756694720734, + "grad_norm": 1.9420679663669713, + "learning_rate": 1.6587772355397437e-07, + "loss": 0.2709, + "step": 12334 + }, + { + "epoch": 1.8875286916602907, + "grad_norm": 2.1440316856194244, + "learning_rate": 1.6542852117424791e-07, + "loss": 0.239, + "step": 12335 + }, + { + "epoch": 1.887681713848508, + "grad_norm": 2.155474547422099, + "learning_rate": 1.6497992277403008e-07, + "loss": 0.2677, + "step": 12336 + }, + { + "epoch": 1.8878347360367254, + "grad_norm": 1.772226946632601, + "learning_rate": 1.6453192838086996e-07, + "loss": 0.2311, + "step": 12337 + }, + { + "epoch": 1.8879877582249427, + "grad_norm": 2.0741420930815706, + "learning_rate": 1.6408453802227998e-07, + "loss": 0.301, + "step": 12338 + }, + { + "epoch": 1.8881407804131598, + "grad_norm": 2.3602693015766856, + "learning_rate": 1.6363775172573814e-07, + "loss": 0.3471, + "step": 12339 + }, + { + "epoch": 1.8882938026013774, + "grad_norm": 1.8958952695132465, + "learning_rate": 1.631915695186803e-07, + "loss": 0.2583, + "step": 12340 + }, + { + "epoch": 1.8884468247895945, + "grad_norm": 2.248151990745066, + "learning_rate": 1.6274599142851123e-07, + "loss": 0.3297, + "step": 12341 + }, + { + "epoch": 1.8885998469778118, + "grad_norm": 2.240236669309958, + "learning_rate": 1.6230101748259453e-07, + "loss": 0.2999, + "step": 12342 
+ }, + { + "epoch": 1.8887528691660291, + "grad_norm": 2.068997854305708, + "learning_rate": 1.6185664770825727e-07, + "loss": 0.2607, + "step": 12343 + }, + { + "epoch": 1.8889058913542462, + "grad_norm": 2.298467877444234, + "learning_rate": 1.6141288213278982e-07, + "loss": 0.3697, + "step": 12344 + }, + { + "epoch": 1.8890589135424638, + "grad_norm": 2.063704879522499, + "learning_rate": 1.6096972078344598e-07, + "loss": 0.3015, + "step": 12345 + }, + { + "epoch": 1.889211935730681, + "grad_norm": 2.368876065700203, + "learning_rate": 1.6052716368744392e-07, + "loss": 0.2898, + "step": 12346 + }, + { + "epoch": 1.8893649579188982, + "grad_norm": 2.07193482877275, + "learning_rate": 1.600852108719597e-07, + "loss": 0.2883, + "step": 12347 + }, + { + "epoch": 1.8895179801071156, + "grad_norm": 2.1398240131474298, + "learning_rate": 1.59643862364135e-07, + "loss": 0.297, + "step": 12348 + }, + { + "epoch": 1.8896710022953327, + "grad_norm": 2.2600179058392778, + "learning_rate": 1.5920311819107915e-07, + "loss": 0.328, + "step": 12349 + }, + { + "epoch": 1.8898240244835502, + "grad_norm": 2.0289016769988897, + "learning_rate": 1.587629783798561e-07, + "loss": 0.2667, + "step": 12350 + }, + { + "epoch": 1.8899770466717674, + "grad_norm": 1.9507615969456944, + "learning_rate": 1.5832344295749758e-07, + "loss": 0.2921, + "step": 12351 + }, + { + "epoch": 1.8901300688599847, + "grad_norm": 2.2770940001537148, + "learning_rate": 1.5788451195099752e-07, + "loss": 0.2753, + "step": 12352 + }, + { + "epoch": 1.890283091048202, + "grad_norm": 2.434599224167367, + "learning_rate": 1.5744618538731106e-07, + "loss": 0.3081, + "step": 12353 + }, + { + "epoch": 1.8904361132364191, + "grad_norm": 2.1500543081305867, + "learning_rate": 1.5700846329335995e-07, + "loss": 0.2885, + "step": 12354 + }, + { + "epoch": 1.8905891354246367, + "grad_norm": 1.920834570258339, + "learning_rate": 1.5657134569602604e-07, + "loss": 0.2705, + "step": 12355 + }, + { + "epoch": 1.8907421576128538, + "grad_norm": 2.1694939010872174, + "learning_rate": 1.5613483262215125e-07, + "loss": 0.2975, + "step": 12356 + }, + { + "epoch": 1.8908951798010711, + "grad_norm": 2.155350044486877, + "learning_rate": 1.5569892409854626e-07, + "loss": 0.2426, + "step": 12357 + }, + { + "epoch": 1.8910482019892885, + "grad_norm": 2.3352026536074475, + "learning_rate": 1.5526362015198305e-07, + "loss": 0.3415, + "step": 12358 + }, + { + "epoch": 1.8912012241775056, + "grad_norm": 2.425425876866039, + "learning_rate": 1.5482892080919132e-07, + "loss": 0.3196, + "step": 12359 + }, + { + "epoch": 1.8913542463657231, + "grad_norm": 2.1513830506914813, + "learning_rate": 1.5439482609687196e-07, + "loss": 0.2952, + "step": 12360 + }, + { + "epoch": 1.8915072685539402, + "grad_norm": 2.0535776550069884, + "learning_rate": 1.539613360416825e-07, + "loss": 0.2172, + "step": 12361 + }, + { + "epoch": 1.8916602907421576, + "grad_norm": 2.1660181727611514, + "learning_rate": 1.5352845067024392e-07, + "loss": 0.2655, + "step": 12362 + }, + { + "epoch": 1.891813312930375, + "grad_norm": 2.0220484053824195, + "learning_rate": 1.530961700091438e-07, + "loss": 0.235, + "step": 12363 + }, + { + "epoch": 1.8919663351185922, + "grad_norm": 2.0225572484268746, + "learning_rate": 1.5266449408492979e-07, + "loss": 0.2296, + "step": 12364 + }, + { + "epoch": 1.8921193573068096, + "grad_norm": 2.5943193506917366, + "learning_rate": 1.522334229241107e-07, + "loss": 0.3002, + "step": 12365 + }, + { + "epoch": 1.8922723794950267, + "grad_norm": 2.314365201977131, + 
"learning_rate": 1.5180295655316312e-07, + "loss": 0.3017, + "step": 12366 + }, + { + "epoch": 1.892425401683244, + "grad_norm": 2.0155411326429147, + "learning_rate": 1.513730949985215e-07, + "loss": 0.268, + "step": 12367 + }, + { + "epoch": 1.8925784238714614, + "grad_norm": 1.936628601404376, + "learning_rate": 1.5094383828658687e-07, + "loss": 0.2173, + "step": 12368 + }, + { + "epoch": 1.8927314460596787, + "grad_norm": 2.6655173289279808, + "learning_rate": 1.5051518644372044e-07, + "loss": 0.3635, + "step": 12369 + }, + { + "epoch": 1.892884468247896, + "grad_norm": 1.9786815581966082, + "learning_rate": 1.5008713949624777e-07, + "loss": 0.3102, + "step": 12370 + }, + { + "epoch": 1.8930374904361131, + "grad_norm": 2.1595530135445213, + "learning_rate": 1.4965969747045673e-07, + "loss": 0.3011, + "step": 12371 + }, + { + "epoch": 1.8931905126243307, + "grad_norm": 2.203393747451202, + "learning_rate": 1.4923286039259855e-07, + "loss": 0.2665, + "step": 12372 + }, + { + "epoch": 1.8933435348125478, + "grad_norm": 2.1359510563270736, + "learning_rate": 1.488066282888878e-07, + "loss": 0.3154, + "step": 12373 + }, + { + "epoch": 1.8934965570007651, + "grad_norm": 1.952958864492451, + "learning_rate": 1.4838100118549803e-07, + "loss": 0.2494, + "step": 12374 + }, + { + "epoch": 1.8936495791889825, + "grad_norm": 2.7907391263373866, + "learning_rate": 1.4795597910857274e-07, + "loss": 0.3871, + "step": 12375 + }, + { + "epoch": 1.8938026013771996, + "grad_norm": 2.601807768733684, + "learning_rate": 1.4753156208421104e-07, + "loss": 0.2742, + "step": 12376 + }, + { + "epoch": 1.8939556235654171, + "grad_norm": 2.0452811830427478, + "learning_rate": 1.4710775013847879e-07, + "loss": 0.2629, + "step": 12377 + }, + { + "epoch": 1.8941086457536342, + "grad_norm": 2.1949730369156284, + "learning_rate": 1.4668454329740621e-07, + "loss": 0.3188, + "step": 12378 + }, + { + "epoch": 1.8942616679418516, + "grad_norm": 2.1266212476482753, + "learning_rate": 1.4626194158698149e-07, + "loss": 0.293, + "step": 12379 + }, + { + "epoch": 1.894414690130069, + "grad_norm": 2.3910652112091997, + "learning_rate": 1.4583994503315713e-07, + "loss": 0.3309, + "step": 12380 + }, + { + "epoch": 1.894567712318286, + "grad_norm": 2.195165911640832, + "learning_rate": 1.4541855366185464e-07, + "loss": 0.261, + "step": 12381 + }, + { + "epoch": 1.8947207345065036, + "grad_norm": 2.2227897094612015, + "learning_rate": 1.4499776749894668e-07, + "loss": 0.2892, + "step": 12382 + }, + { + "epoch": 1.8948737566947207, + "grad_norm": 1.983371860106799, + "learning_rate": 1.4457758657028142e-07, + "loss": 0.2333, + "step": 12383 + }, + { + "epoch": 1.895026778882938, + "grad_norm": 2.166258470216104, + "learning_rate": 1.4415801090166048e-07, + "loss": 0.2943, + "step": 12384 + }, + { + "epoch": 1.8951798010711554, + "grad_norm": 2.421166045256063, + "learning_rate": 1.43739040518851e-07, + "loss": 0.3261, + "step": 12385 + }, + { + "epoch": 1.8953328232593725, + "grad_norm": 2.5040245258127927, + "learning_rate": 1.4332067544758688e-07, + "loss": 0.3423, + "step": 12386 + }, + { + "epoch": 1.89548584544759, + "grad_norm": 2.0306426946320775, + "learning_rate": 1.4290291571355975e-07, + "loss": 0.2614, + "step": 12387 + }, + { + "epoch": 1.8956388676358071, + "grad_norm": 1.902404848644786, + "learning_rate": 1.4248576134242353e-07, + "loss": 0.235, + "step": 12388 + }, + { + "epoch": 1.8957918898240245, + "grad_norm": 2.030986303913293, + "learning_rate": 1.420692123598011e-07, + "loss": 0.2736, + "step": 12389 + }, + { 
+ "epoch": 1.8959449120122418, + "grad_norm": 2.281148057551701, + "learning_rate": 1.4165326879127195e-07, + "loss": 0.357, + "step": 12390 + }, + { + "epoch": 1.896097934200459, + "grad_norm": 2.1204540996922163, + "learning_rate": 1.4123793066238233e-07, + "loss": 0.2949, + "step": 12391 + }, + { + "epoch": 1.8962509563886765, + "grad_norm": 2.023496130387062, + "learning_rate": 1.4082319799863963e-07, + "loss": 0.276, + "step": 12392 + }, + { + "epoch": 1.8964039785768936, + "grad_norm": 2.1823663321503886, + "learning_rate": 1.4040907082551237e-07, + "loss": 0.3208, + "step": 12393 + }, + { + "epoch": 1.896557000765111, + "grad_norm": 2.002947903538194, + "learning_rate": 1.3999554916843571e-07, + "loss": 0.2331, + "step": 12394 + }, + { + "epoch": 1.8967100229533282, + "grad_norm": 2.046209375144362, + "learning_rate": 1.3958263305280494e-07, + "loss": 0.2501, + "step": 12395 + }, + { + "epoch": 1.8968630451415456, + "grad_norm": 1.9425497013925002, + "learning_rate": 1.3917032250397867e-07, + "loss": 0.2359, + "step": 12396 + }, + { + "epoch": 1.897016067329763, + "grad_norm": 2.0608664471035745, + "learning_rate": 1.387586175472788e-07, + "loss": 0.2626, + "step": 12397 + }, + { + "epoch": 1.89716908951798, + "grad_norm": 2.1798385264217734, + "learning_rate": 1.3834751820798964e-07, + "loss": 0.2987, + "step": 12398 + }, + { + "epoch": 1.8973221117061974, + "grad_norm": 1.976167049765422, + "learning_rate": 1.3793702451135872e-07, + "loss": 0.2521, + "step": 12399 + }, + { + "epoch": 1.8974751338944147, + "grad_norm": 2.1064510108566052, + "learning_rate": 1.3752713648259475e-07, + "loss": 0.3585, + "step": 12400 + }, + { + "epoch": 1.897628156082632, + "grad_norm": 1.8467627262346928, + "learning_rate": 1.3711785414687207e-07, + "loss": 0.2776, + "step": 12401 + }, + { + "epoch": 1.8977811782708494, + "grad_norm": 2.0021520489916513, + "learning_rate": 1.367091775293261e-07, + "loss": 0.2253, + "step": 12402 + }, + { + "epoch": 1.8979342004590665, + "grad_norm": 2.004962053610836, + "learning_rate": 1.3630110665505347e-07, + "loss": 0.2494, + "step": 12403 + }, + { + "epoch": 1.898087222647284, + "grad_norm": 2.1024242844915, + "learning_rate": 1.3589364154911855e-07, + "loss": 0.2501, + "step": 12404 + }, + { + "epoch": 1.8982402448355011, + "grad_norm": 1.974976585706706, + "learning_rate": 1.3548678223654354e-07, + "loss": 0.2945, + "step": 12405 + }, + { + "epoch": 1.8983932670237185, + "grad_norm": 1.822674609909908, + "learning_rate": 1.3508052874231514e-07, + "loss": 0.2078, + "step": 12406 + }, + { + "epoch": 1.8985462892119358, + "grad_norm": 2.409596533644061, + "learning_rate": 1.346748810913834e-07, + "loss": 0.2808, + "step": 12407 + }, + { + "epoch": 1.898699311400153, + "grad_norm": 2.1866218250648752, + "learning_rate": 1.3426983930866055e-07, + "loss": 0.2917, + "step": 12408 + }, + { + "epoch": 1.8988523335883705, + "grad_norm": 2.2318137725230778, + "learning_rate": 1.338654034190212e-07, + "loss": 0.3389, + "step": 12409 + }, + { + "epoch": 1.8990053557765876, + "grad_norm": 2.1864107341649834, + "learning_rate": 1.3346157344730438e-07, + "loss": 0.2668, + "step": 12410 + }, + { + "epoch": 1.899158377964805, + "grad_norm": 2.0640684514509036, + "learning_rate": 1.330583494183102e-07, + "loss": 0.3253, + "step": 12411 + }, + { + "epoch": 1.8993114001530222, + "grad_norm": 2.061874070757145, + "learning_rate": 1.326557313568033e-07, + "loss": 0.2789, + "step": 12412 + }, + { + "epoch": 1.8994644223412394, + "grad_norm": 2.262601025711515, + "learning_rate": 
1.3225371928750842e-07, + "loss": 0.3414, + "step": 12413 + }, + { + "epoch": 1.899617444529457, + "grad_norm": 2.071223047182192, + "learning_rate": 1.318523132351157e-07, + "loss": 0.2732, + "step": 12414 + }, + { + "epoch": 1.899770466717674, + "grad_norm": 2.2037935773831743, + "learning_rate": 1.3145151322427663e-07, + "loss": 0.2175, + "step": 12415 + }, + { + "epoch": 1.8999234889058914, + "grad_norm": 1.9905111743598205, + "learning_rate": 1.3105131927960702e-07, + "loss": 0.257, + "step": 12416 + }, + { + "epoch": 1.9000765110941087, + "grad_norm": 2.0918754035359166, + "learning_rate": 1.3065173142568276e-07, + "loss": 0.2877, + "step": 12417 + }, + { + "epoch": 1.9002295332823258, + "grad_norm": 2.1951012882049237, + "learning_rate": 1.3025274968704426e-07, + "loss": 0.3017, + "step": 12418 + }, + { + "epoch": 1.9003825554705434, + "grad_norm": 2.047502157807223, + "learning_rate": 1.2985437408819524e-07, + "loss": 0.2675, + "step": 12419 + }, + { + "epoch": 1.9005355776587605, + "grad_norm": 2.345449255427472, + "learning_rate": 1.2945660465360166e-07, + "loss": 0.2968, + "step": 12420 + }, + { + "epoch": 1.9006885998469778, + "grad_norm": 2.2405813389873455, + "learning_rate": 1.2905944140769178e-07, + "loss": 0.2276, + "step": 12421 + }, + { + "epoch": 1.9008416220351951, + "grad_norm": 1.8624021115253389, + "learning_rate": 1.2866288437485718e-07, + "loss": 0.252, + "step": 12422 + }, + { + "epoch": 1.9009946442234122, + "grad_norm": 2.0359610897944154, + "learning_rate": 1.2826693357945176e-07, + "loss": 0.2366, + "step": 12423 + }, + { + "epoch": 1.9011476664116298, + "grad_norm": 2.2615454945887556, + "learning_rate": 1.2787158904579268e-07, + "loss": 0.2951, + "step": 12424 + }, + { + "epoch": 1.901300688599847, + "grad_norm": 2.738627055525505, + "learning_rate": 1.274768507981583e-07, + "loss": 0.3292, + "step": 12425 + }, + { + "epoch": 1.9014537107880642, + "grad_norm": 2.153882026555609, + "learning_rate": 1.270827188607926e-07, + "loss": 0.2416, + "step": 12426 + }, + { + "epoch": 1.9016067329762816, + "grad_norm": 2.000143651464614, + "learning_rate": 1.2668919325789953e-07, + "loss": 0.2669, + "step": 12427 + }, + { + "epoch": 1.9017597551644987, + "grad_norm": 2.1215493546758073, + "learning_rate": 1.2629627401364864e-07, + "loss": 0.3234, + "step": 12428 + }, + { + "epoch": 1.9019127773527162, + "grad_norm": 1.9597291534053758, + "learning_rate": 1.2590396115216953e-07, + "loss": 0.2631, + "step": 12429 + }, + { + "epoch": 1.9020657995409334, + "grad_norm": 2.429837051524621, + "learning_rate": 1.2551225469755512e-07, + "loss": 0.3115, + "step": 12430 + }, + { + "epoch": 1.9022188217291507, + "grad_norm": 2.089905167506059, + "learning_rate": 1.25121154673864e-07, + "loss": 0.2776, + "step": 12431 + }, + { + "epoch": 1.902371843917368, + "grad_norm": 1.9529949744574147, + "learning_rate": 1.2473066110511244e-07, + "loss": 0.2083, + "step": 12432 + }, + { + "epoch": 1.9025248661055854, + "grad_norm": 2.2045107277586258, + "learning_rate": 1.2434077401528245e-07, + "loss": 0.2174, + "step": 12433 + }, + { + "epoch": 1.9026778882938027, + "grad_norm": 2.1168068178123405, + "learning_rate": 1.2395149342832035e-07, + "loss": 0.2749, + "step": 12434 + }, + { + "epoch": 1.9028309104820198, + "grad_norm": 2.0035038229922337, + "learning_rate": 1.2356281936813152e-07, + "loss": 0.2764, + "step": 12435 + }, + { + "epoch": 1.9029839326702371, + "grad_norm": 1.8645531982106915, + "learning_rate": 1.2317475185858797e-07, + "loss": 0.2857, + "step": 12436 + }, + { + "epoch": 
1.9031369548584545, + "grad_norm": 1.9683962909981605, + "learning_rate": 1.227872909235206e-07, + "loss": 0.2156, + "step": 12437 + }, + { + "epoch": 1.9032899770466718, + "grad_norm": 2.5372871229932756, + "learning_rate": 1.2240043658672485e-07, + "loss": 0.3423, + "step": 12438 + }, + { + "epoch": 1.9034429992348891, + "grad_norm": 2.2487446160355384, + "learning_rate": 1.220141888719606e-07, + "loss": 0.3086, + "step": 12439 + }, + { + "epoch": 1.9035960214231062, + "grad_norm": 2.4433518403609487, + "learning_rate": 1.2162854780294775e-07, + "loss": 0.3025, + "step": 12440 + }, + { + "epoch": 1.9037490436113238, + "grad_norm": 2.285883593805717, + "learning_rate": 1.212435134033707e-07, + "loss": 0.3215, + "step": 12441 + }, + { + "epoch": 1.903902065799541, + "grad_norm": 2.07019078472154, + "learning_rate": 1.20859085696875e-07, + "loss": 0.2195, + "step": 12442 + }, + { + "epoch": 1.9040550879877582, + "grad_norm": 2.1764385332904204, + "learning_rate": 1.204752647070706e-07, + "loss": 0.2439, + "step": 12443 + }, + { + "epoch": 1.9042081101759756, + "grad_norm": 2.0657575704110194, + "learning_rate": 1.200920504575287e-07, + "loss": 0.3091, + "step": 12444 + }, + { + "epoch": 1.9043611323641927, + "grad_norm": 2.2142022624529867, + "learning_rate": 1.1970944297178377e-07, + "loss": 0.3519, + "step": 12445 + }, + { + "epoch": 1.9045141545524102, + "grad_norm": 2.2201081578650896, + "learning_rate": 1.1932744227333481e-07, + "loss": 0.2401, + "step": 12446 + }, + { + "epoch": 1.9046671767406274, + "grad_norm": 2.0766398494803036, + "learning_rate": 1.1894604838564083e-07, + "loss": 0.2747, + "step": 12447 + }, + { + "epoch": 1.9048201989288447, + "grad_norm": 1.8669964143468456, + "learning_rate": 1.1856526133212421e-07, + "loss": 0.2535, + "step": 12448 + }, + { + "epoch": 1.904973221117062, + "grad_norm": 2.1864682779631717, + "learning_rate": 1.1818508113617288e-07, + "loss": 0.2969, + "step": 12449 + }, + { + "epoch": 1.9051262433052791, + "grad_norm": 2.1671985516248315, + "learning_rate": 1.1780550782113154e-07, + "loss": 0.2791, + "step": 12450 + }, + { + "epoch": 1.9052792654934967, + "grad_norm": 1.7088933065196328, + "learning_rate": 1.1742654141031484e-07, + "loss": 0.1846, + "step": 12451 + }, + { + "epoch": 1.9054322876817138, + "grad_norm": 2.148970892648732, + "learning_rate": 1.1704818192699419e-07, + "loss": 0.2711, + "step": 12452 + }, + { + "epoch": 1.9055853098699311, + "grad_norm": 2.100215150528268, + "learning_rate": 1.1667042939440765e-07, + "loss": 0.252, + "step": 12453 + }, + { + "epoch": 1.9057383320581485, + "grad_norm": 2.360942987759934, + "learning_rate": 1.1629328383575444e-07, + "loss": 0.3214, + "step": 12454 + }, + { + "epoch": 1.9058913542463656, + "grad_norm": 2.1994757368737536, + "learning_rate": 1.1591674527419495e-07, + "loss": 0.2825, + "step": 12455 + }, + { + "epoch": 1.9060443764345831, + "grad_norm": 2.1044747531659933, + "learning_rate": 1.1554081373285398e-07, + "loss": 0.3447, + "step": 12456 + }, + { + "epoch": 1.9061973986228002, + "grad_norm": 1.779675570955133, + "learning_rate": 1.1516548923482196e-07, + "loss": 0.2186, + "step": 12457 + }, + { + "epoch": 1.9063504208110176, + "grad_norm": 2.09708548162683, + "learning_rate": 1.1479077180314601e-07, + "loss": 0.2884, + "step": 12458 + }, + { + "epoch": 1.906503442999235, + "grad_norm": 1.909809773435519, + "learning_rate": 1.1441666146083885e-07, + "loss": 0.2479, + "step": 12459 + }, + { + "epoch": 1.906656465187452, + "grad_norm": 1.806316291654644, + "learning_rate": 
1.1404315823087875e-07, + "loss": 0.2085, + "step": 12460 + }, + { + "epoch": 1.9068094873756696, + "grad_norm": 2.127004509384845, + "learning_rate": 1.1367026213620181e-07, + "loss": 0.2636, + "step": 12461 + }, + { + "epoch": 1.9069625095638867, + "grad_norm": 2.205530653360376, + "learning_rate": 1.1329797319970859e-07, + "loss": 0.2809, + "step": 12462 + }, + { + "epoch": 1.907115531752104, + "grad_norm": 2.41892985391259, + "learning_rate": 1.1292629144426637e-07, + "loss": 0.3434, + "step": 12463 + }, + { + "epoch": 1.9072685539403214, + "grad_norm": 2.0336010224200036, + "learning_rate": 1.1255521689269577e-07, + "loss": 0.2807, + "step": 12464 + }, + { + "epoch": 1.9074215761285387, + "grad_norm": 2.721142821132342, + "learning_rate": 1.1218474956779191e-07, + "loss": 0.3746, + "step": 12465 + }, + { + "epoch": 1.907574598316756, + "grad_norm": 1.8455778126082905, + "learning_rate": 1.1181488949230323e-07, + "loss": 0.2009, + "step": 12466 + }, + { + "epoch": 1.9077276205049731, + "grad_norm": 2.164500498682435, + "learning_rate": 1.1144563668894381e-07, + "loss": 0.3123, + "step": 12467 + }, + { + "epoch": 1.9078806426931905, + "grad_norm": 2.3175653893300705, + "learning_rate": 1.1107699118039328e-07, + "loss": 0.2817, + "step": 12468 + }, + { + "epoch": 1.9080336648814078, + "grad_norm": 2.1705603938031337, + "learning_rate": 1.1070895298929019e-07, + "loss": 0.2563, + "step": 12469 + }, + { + "epoch": 1.9081866870696251, + "grad_norm": 2.170225385518329, + "learning_rate": 1.1034152213823646e-07, + "loss": 0.3054, + "step": 12470 + }, + { + "epoch": 1.9083397092578425, + "grad_norm": 2.392108868688555, + "learning_rate": 1.099746986497996e-07, + "loss": 0.3374, + "step": 12471 + }, + { + "epoch": 1.9084927314460596, + "grad_norm": 1.943591082629844, + "learning_rate": 1.0960848254650603e-07, + "loss": 0.2415, + "step": 12472 + }, + { + "epoch": 1.9086457536342771, + "grad_norm": 2.144948863915394, + "learning_rate": 1.0924287385084776e-07, + "loss": 0.2933, + "step": 12473 + }, + { + "epoch": 1.9087987758224942, + "grad_norm": 1.9600270754442108, + "learning_rate": 1.0887787258527682e-07, + "loss": 0.27, + "step": 12474 + }, + { + "epoch": 1.9089517980107116, + "grad_norm": 2.29484241675231, + "learning_rate": 1.0851347877220975e-07, + "loss": 0.3108, + "step": 12475 + }, + { + "epoch": 1.909104820198929, + "grad_norm": 1.8828750051292165, + "learning_rate": 1.0814969243402529e-07, + "loss": 0.2352, + "step": 12476 + }, + { + "epoch": 1.909257842387146, + "grad_norm": 2.089128050877077, + "learning_rate": 1.0778651359306669e-07, + "loss": 0.2714, + "step": 12477 + }, + { + "epoch": 1.9094108645753636, + "grad_norm": 2.2924888911310894, + "learning_rate": 1.074239422716361e-07, + "loss": 0.2585, + "step": 12478 + }, + { + "epoch": 1.9095638867635807, + "grad_norm": 2.1199032370475708, + "learning_rate": 1.0706197849200128e-07, + "loss": 0.2535, + "step": 12479 + }, + { + "epoch": 1.909716908951798, + "grad_norm": 1.80239557608064, + "learning_rate": 1.0670062227639111e-07, + "loss": 0.265, + "step": 12480 + }, + { + "epoch": 1.9098699311400154, + "grad_norm": 2.147866520218618, + "learning_rate": 1.0633987364700004e-07, + "loss": 0.223, + "step": 12481 + }, + { + "epoch": 1.9100229533282325, + "grad_norm": 2.466143189763118, + "learning_rate": 1.0597973262598038e-07, + "loss": 0.3343, + "step": 12482 + }, + { + "epoch": 1.91017597551645, + "grad_norm": 2.3340145813910613, + "learning_rate": 1.0562019923545108e-07, + "loss": 0.3126, + "step": 12483 + }, + { + "epoch": 
1.9103289977046671, + "grad_norm": 2.1269179837487022, + "learning_rate": 1.0526127349749227e-07, + "loss": 0.2429, + "step": 12484 + }, + { + "epoch": 1.9104820198928845, + "grad_norm": 2.248127125499551, + "learning_rate": 1.049029554341463e-07, + "loss": 0.2539, + "step": 12485 + }, + { + "epoch": 1.9106350420811018, + "grad_norm": 2.0877173544121357, + "learning_rate": 1.0454524506742225e-07, + "loss": 0.2841, + "step": 12486 + }, + { + "epoch": 1.910788064269319, + "grad_norm": 2.1014344280122867, + "learning_rate": 1.0418814241928366e-07, + "loss": 0.2673, + "step": 12487 + }, + { + "epoch": 1.9109410864575365, + "grad_norm": 2.2504178395361425, + "learning_rate": 1.0383164751166409e-07, + "loss": 0.2564, + "step": 12488 + }, + { + "epoch": 1.9110941086457536, + "grad_norm": 1.9582453306000769, + "learning_rate": 1.0347576036645824e-07, + "loss": 0.2639, + "step": 12489 + }, + { + "epoch": 1.911247130833971, + "grad_norm": 2.0337728796022656, + "learning_rate": 1.0312048100552085e-07, + "loss": 0.2854, + "step": 12490 + }, + { + "epoch": 1.9114001530221882, + "grad_norm": 2.3114295530401257, + "learning_rate": 1.0276580945067116e-07, + "loss": 0.2843, + "step": 12491 + }, + { + "epoch": 1.9115531752104054, + "grad_norm": 1.7069199598118328, + "learning_rate": 1.0241174572369172e-07, + "loss": 0.2143, + "step": 12492 + }, + { + "epoch": 1.911706197398623, + "grad_norm": 2.099298087764273, + "learning_rate": 1.0205828984632626e-07, + "loss": 0.2617, + "step": 12493 + }, + { + "epoch": 1.91185921958684, + "grad_norm": 2.2982638891837803, + "learning_rate": 1.0170544184028186e-07, + "loss": 0.3266, + "step": 12494 + }, + { + "epoch": 1.9120122417750574, + "grad_norm": 1.921762199018458, + "learning_rate": 1.0135320172723007e-07, + "loss": 0.2517, + "step": 12495 + }, + { + "epoch": 1.9121652639632747, + "grad_norm": 2.0989247222133285, + "learning_rate": 1.0100156952880025e-07, + "loss": 0.2335, + "step": 12496 + }, + { + "epoch": 1.912318286151492, + "grad_norm": 2.2323329776738263, + "learning_rate": 1.0065054526658957e-07, + "loss": 0.2551, + "step": 12497 + }, + { + "epoch": 1.9124713083397094, + "grad_norm": 2.2590344295387066, + "learning_rate": 1.0030012896215635e-07, + "loss": 0.2943, + "step": 12498 + }, + { + "epoch": 1.9126243305279265, + "grad_norm": 2.2349113812770134, + "learning_rate": 9.995032063701781e-08, + "loss": 0.2179, + "step": 12499 + }, + { + "epoch": 1.9127773527161438, + "grad_norm": 2.086848029996208, + "learning_rate": 9.960112031266123e-08, + "loss": 0.2488, + "step": 12500 + }, + { + "epoch": 1.9129303749043611, + "grad_norm": 2.5167862029364088, + "learning_rate": 9.925252801052943e-08, + "loss": 0.2748, + "step": 12501 + }, + { + "epoch": 1.9130833970925785, + "grad_norm": 1.8074348221085181, + "learning_rate": 9.890454375203306e-08, + "loss": 0.2377, + "step": 12502 + }, + { + "epoch": 1.9132364192807958, + "grad_norm": 1.9891945424069828, + "learning_rate": 9.855716755854062e-08, + "loss": 0.2427, + "step": 12503 + }, + { + "epoch": 1.913389441469013, + "grad_norm": 1.9930912025356717, + "learning_rate": 9.821039945138833e-08, + "loss": 0.2679, + "step": 12504 + }, + { + "epoch": 1.9135424636572305, + "grad_norm": 1.9143467675625112, + "learning_rate": 9.78642394518714e-08, + "loss": 0.2671, + "step": 12505 + }, + { + "epoch": 1.9136954858454476, + "grad_norm": 2.3126000901167094, + "learning_rate": 9.751868758124839e-08, + "loss": 0.3273, + "step": 12506 + }, + { + "epoch": 1.913848508033665, + "grad_norm": 2.48212979320504, + "learning_rate": 
9.717374386074118e-08, + "loss": 0.3258, + "step": 12507 + }, + { + "epoch": 1.9140015302218822, + "grad_norm": 1.9366579694374044, + "learning_rate": 9.682940831153509e-08, + "loss": 0.2183, + "step": 12508 + }, + { + "epoch": 1.9141545524100994, + "grad_norm": 2.146378976268681, + "learning_rate": 9.648568095477539e-08, + "loss": 0.3257, + "step": 12509 + }, + { + "epoch": 1.914307574598317, + "grad_norm": 2.0975727968979614, + "learning_rate": 9.614256181157411e-08, + "loss": 0.2838, + "step": 12510 + }, + { + "epoch": 1.914460596786534, + "grad_norm": 2.3790386333219535, + "learning_rate": 9.580005090300104e-08, + "loss": 0.2916, + "step": 12511 + }, + { + "epoch": 1.9146136189747514, + "grad_norm": 2.063496598618312, + "learning_rate": 9.54581482500927e-08, + "loss": 0.2567, + "step": 12512 + }, + { + "epoch": 1.9147666411629687, + "grad_norm": 2.177710720193587, + "learning_rate": 9.511685387384673e-08, + "loss": 0.2677, + "step": 12513 + }, + { + "epoch": 1.9149196633511858, + "grad_norm": 2.3674245980586703, + "learning_rate": 9.477616779522191e-08, + "loss": 0.3017, + "step": 12514 + }, + { + "epoch": 1.9150726855394034, + "grad_norm": 2.3237512722434808, + "learning_rate": 9.443609003514376e-08, + "loss": 0.2724, + "step": 12515 + }, + { + "epoch": 1.9152257077276205, + "grad_norm": 2.1349857240907815, + "learning_rate": 9.409662061449553e-08, + "loss": 0.278, + "step": 12516 + }, + { + "epoch": 1.9153787299158378, + "grad_norm": 1.929247113089893, + "learning_rate": 9.375775955412502e-08, + "loss": 0.2658, + "step": 12517 + }, + { + "epoch": 1.9155317521040551, + "grad_norm": 2.078402299391299, + "learning_rate": 9.341950687484447e-08, + "loss": 0.2476, + "step": 12518 + }, + { + "epoch": 1.9156847742922722, + "grad_norm": 2.150314478366211, + "learning_rate": 9.308186259742724e-08, + "loss": 0.292, + "step": 12519 + }, + { + "epoch": 1.9158377964804898, + "grad_norm": 2.2735395559159626, + "learning_rate": 9.274482674260787e-08, + "loss": 0.295, + "step": 12520 + }, + { + "epoch": 1.915990818668707, + "grad_norm": 2.0485204164380066, + "learning_rate": 9.240839933108647e-08, + "loss": 0.2815, + "step": 12521 + }, + { + "epoch": 1.9161438408569242, + "grad_norm": 1.9152040534504782, + "learning_rate": 9.20725803835243e-08, + "loss": 0.2533, + "step": 12522 + }, + { + "epoch": 1.9162968630451416, + "grad_norm": 2.280007794782544, + "learning_rate": 9.173736992054483e-08, + "loss": 0.2678, + "step": 12523 + }, + { + "epoch": 1.9164498852333587, + "grad_norm": 2.5147731875289656, + "learning_rate": 9.140276796273495e-08, + "loss": 0.3013, + "step": 12524 + }, + { + "epoch": 1.9166029074215762, + "grad_norm": 2.1676092373292843, + "learning_rate": 9.106877453064267e-08, + "loss": 0.2655, + "step": 12525 + }, + { + "epoch": 1.9167559296097934, + "grad_norm": 2.3267045294522135, + "learning_rate": 9.073538964478267e-08, + "loss": 0.2656, + "step": 12526 + }, + { + "epoch": 1.9169089517980107, + "grad_norm": 2.089513835534899, + "learning_rate": 9.040261332562639e-08, + "loss": 0.2434, + "step": 12527 + }, + { + "epoch": 1.917061973986228, + "grad_norm": 2.163041959965656, + "learning_rate": 9.0070445593613e-08, + "loss": 0.2686, + "step": 12528 + }, + { + "epoch": 1.9172149961744451, + "grad_norm": 1.979792412059793, + "learning_rate": 8.973888646914064e-08, + "loss": 0.241, + "step": 12529 + }, + { + "epoch": 1.9173680183626627, + "grad_norm": 1.8840145007212765, + "learning_rate": 8.940793597257303e-08, + "loss": 0.2338, + "step": 12530 + }, + { + "epoch": 1.9175210405508798, + 
"grad_norm": 2.101688674121348, + "learning_rate": 8.90775941242339e-08, + "loss": 0.2335, + "step": 12531 + }, + { + "epoch": 1.9176740627390971, + "grad_norm": 2.200367918955295, + "learning_rate": 8.874786094441257e-08, + "loss": 0.2976, + "step": 12532 + }, + { + "epoch": 1.9178270849273145, + "grad_norm": 2.0360117662478583, + "learning_rate": 8.84187364533573e-08, + "loss": 0.2713, + "step": 12533 + }, + { + "epoch": 1.9179801071155318, + "grad_norm": 1.870087321117023, + "learning_rate": 8.809022067128193e-08, + "loss": 0.2147, + "step": 12534 + }, + { + "epoch": 1.9181331293037491, + "grad_norm": 1.9813617183509307, + "learning_rate": 8.776231361836141e-08, + "loss": 0.2583, + "step": 12535 + }, + { + "epoch": 1.9182861514919662, + "grad_norm": 2.285906135950109, + "learning_rate": 8.743501531473409e-08, + "loss": 0.2987, + "step": 12536 + }, + { + "epoch": 1.9184391736801836, + "grad_norm": 2.0323696479504765, + "learning_rate": 8.710832578050166e-08, + "loss": 0.2593, + "step": 12537 + }, + { + "epoch": 1.918592195868401, + "grad_norm": 2.1254409285361473, + "learning_rate": 8.678224503572474e-08, + "loss": 0.2387, + "step": 12538 + }, + { + "epoch": 1.9187452180566182, + "grad_norm": 2.023023101695638, + "learning_rate": 8.645677310043288e-08, + "loss": 0.2443, + "step": 12539 + }, + { + "epoch": 1.9188982402448356, + "grad_norm": 2.0102458000598773, + "learning_rate": 8.613190999461119e-08, + "loss": 0.2564, + "step": 12540 + }, + { + "epoch": 1.9190512624330527, + "grad_norm": 1.8525700822632671, + "learning_rate": 8.580765573821148e-08, + "loss": 0.2515, + "step": 12541 + }, + { + "epoch": 1.9192042846212702, + "grad_norm": 2.096632072259417, + "learning_rate": 8.548401035115006e-08, + "loss": 0.2639, + "step": 12542 + }, + { + "epoch": 1.9193573068094874, + "grad_norm": 2.1823118733419493, + "learning_rate": 8.516097385330102e-08, + "loss": 0.2719, + "step": 12543 + }, + { + "epoch": 1.9195103289977047, + "grad_norm": 2.217747952601633, + "learning_rate": 8.483854626450183e-08, + "loss": 0.2093, + "step": 12544 + }, + { + "epoch": 1.919663351185922, + "grad_norm": 2.109037847197852, + "learning_rate": 8.451672760455775e-08, + "loss": 0.3524, + "step": 12545 + }, + { + "epoch": 1.9198163733741391, + "grad_norm": 2.2662799826187023, + "learning_rate": 8.419551789322966e-08, + "loss": 0.293, + "step": 12546 + }, + { + "epoch": 1.9199693955623567, + "grad_norm": 2.123070326666157, + "learning_rate": 8.38749171502462e-08, + "loss": 0.2602, + "step": 12547 + }, + { + "epoch": 1.9201224177505738, + "grad_norm": 2.0546614358340296, + "learning_rate": 8.355492539529719e-08, + "loss": 0.305, + "step": 12548 + }, + { + "epoch": 1.9202754399387911, + "grad_norm": 2.072183050978695, + "learning_rate": 8.323554264803247e-08, + "loss": 0.2799, + "step": 12549 + }, + { + "epoch": 1.9204284621270085, + "grad_norm": 2.3442769629175904, + "learning_rate": 8.291676892806743e-08, + "loss": 0.3164, + "step": 12550 + }, + { + "epoch": 1.9205814843152256, + "grad_norm": 2.035394827803196, + "learning_rate": 8.25986042549809e-08, + "loss": 0.2435, + "step": 12551 + }, + { + "epoch": 1.9207345065034431, + "grad_norm": 2.1027386412155615, + "learning_rate": 8.228104864830943e-08, + "loss": 0.3025, + "step": 12552 + }, + { + "epoch": 1.9208875286916602, + "grad_norm": 2.094493796125041, + "learning_rate": 8.196410212755856e-08, + "loss": 0.2693, + "step": 12553 + }, + { + "epoch": 1.9210405508798776, + "grad_norm": 2.4802220142511264, + "learning_rate": 8.164776471219049e-08, + "loss": 0.3032, + 
"step": 12554 + }, + { + "epoch": 1.921193573068095, + "grad_norm": 1.9498766283903592, + "learning_rate": 8.133203642163523e-08, + "loss": 0.237, + "step": 12555 + }, + { + "epoch": 1.921346595256312, + "grad_norm": 2.2455347228710574, + "learning_rate": 8.10169172752806e-08, + "loss": 0.2767, + "step": 12556 + }, + { + "epoch": 1.9214996174445296, + "grad_norm": 1.957315184740743, + "learning_rate": 8.070240729248114e-08, + "loss": 0.2109, + "step": 12557 + }, + { + "epoch": 1.9216526396327467, + "grad_norm": 1.965643467306098, + "learning_rate": 8.038850649255137e-08, + "loss": 0.2379, + "step": 12558 + }, + { + "epoch": 1.921805661820964, + "grad_norm": 2.19852476658279, + "learning_rate": 8.00752148947681e-08, + "loss": 0.2581, + "step": 12559 + }, + { + "epoch": 1.9219586840091814, + "grad_norm": 2.004743748037509, + "learning_rate": 7.976253251837374e-08, + "loss": 0.301, + "step": 12560 + }, + { + "epoch": 1.9221117061973985, + "grad_norm": 2.2203783236108947, + "learning_rate": 7.945045938257067e-08, + "loss": 0.2943, + "step": 12561 + }, + { + "epoch": 1.922264728385616, + "grad_norm": 2.355138775293594, + "learning_rate": 7.91389955065236e-08, + "loss": 0.5327, + "step": 12562 + }, + { + "epoch": 1.9224177505738331, + "grad_norm": 2.2869360368009946, + "learning_rate": 7.882814090936164e-08, + "loss": 0.3156, + "step": 12563 + }, + { + "epoch": 1.9225707727620505, + "grad_norm": 2.269187238957002, + "learning_rate": 7.85178956101762e-08, + "loss": 0.3369, + "step": 12564 + }, + { + "epoch": 1.9227237949502678, + "grad_norm": 2.260139178110384, + "learning_rate": 7.82082596280187e-08, + "loss": 0.2689, + "step": 12565 + }, + { + "epoch": 1.9228768171384851, + "grad_norm": 2.307748324586521, + "learning_rate": 7.789923298190616e-08, + "loss": 0.317, + "step": 12566 + }, + { + "epoch": 1.9230298393267025, + "grad_norm": 1.878965997966544, + "learning_rate": 7.759081569081784e-08, + "loss": 0.2762, + "step": 12567 + }, + { + "epoch": 1.9231828615149196, + "grad_norm": 2.121729343823374, + "learning_rate": 7.728300777369412e-08, + "loss": 0.2658, + "step": 12568 + }, + { + "epoch": 1.923335883703137, + "grad_norm": 2.1638380361755796, + "learning_rate": 7.697580924943881e-08, + "loss": 0.2773, + "step": 12569 + }, + { + "epoch": 1.9234889058913542, + "grad_norm": 2.1895033980428797, + "learning_rate": 7.666922013691791e-08, + "loss": 0.1938, + "step": 12570 + }, + { + "epoch": 1.9236419280795716, + "grad_norm": 2.3956553016334556, + "learning_rate": 7.63632404549608e-08, + "loss": 0.3345, + "step": 12571 + }, + { + "epoch": 1.923794950267789, + "grad_norm": 1.7449932799207868, + "learning_rate": 7.605787022235912e-08, + "loss": 0.2398, + "step": 12572 + }, + { + "epoch": 1.923947972456006, + "grad_norm": 2.2376418795698387, + "learning_rate": 7.575310945786452e-08, + "loss": 0.284, + "step": 12573 + }, + { + "epoch": 1.9241009946442236, + "grad_norm": 1.9035723603217567, + "learning_rate": 7.54489581801976e-08, + "loss": 0.2773, + "step": 12574 + }, + { + "epoch": 1.9242540168324407, + "grad_norm": 2.2575554720495394, + "learning_rate": 7.514541640803342e-08, + "loss": 0.2984, + "step": 12575 + }, + { + "epoch": 1.924407039020658, + "grad_norm": 1.981363852999967, + "learning_rate": 7.484248416001594e-08, + "loss": 0.2508, + "step": 12576 + }, + { + "epoch": 1.9245600612088754, + "grad_norm": 1.9614331502072087, + "learning_rate": 7.454016145474918e-08, + "loss": 0.2651, + "step": 12577 + }, + { + "epoch": 1.9247130833970925, + "grad_norm": 2.1497780699564566, + "learning_rate": 
7.423844831079941e-08, + "loss": 0.347, + "step": 12578 + }, + { + "epoch": 1.92486610558531, + "grad_norm": 2.240514112168765, + "learning_rate": 7.393734474669623e-08, + "loss": 0.2748, + "step": 12579 + }, + { + "epoch": 1.9250191277735271, + "grad_norm": 1.8984880370754438, + "learning_rate": 7.363685078093264e-08, + "loss": 0.2359, + "step": 12580 + }, + { + "epoch": 1.9251721499617445, + "grad_norm": 1.9864328339745811, + "learning_rate": 7.333696643196164e-08, + "loss": 0.236, + "step": 12581 + }, + { + "epoch": 1.9253251721499618, + "grad_norm": 2.234747741359613, + "learning_rate": 7.303769171820074e-08, + "loss": 0.252, + "step": 12582 + }, + { + "epoch": 1.925478194338179, + "grad_norm": 1.9754636671622134, + "learning_rate": 7.273902665802967e-08, + "loss": 0.2453, + "step": 12583 + }, + { + "epoch": 1.9256312165263965, + "grad_norm": 2.146107414003439, + "learning_rate": 7.244097126979044e-08, + "loss": 0.2792, + "step": 12584 + }, + { + "epoch": 1.9257842387146136, + "grad_norm": 1.9724035356572298, + "learning_rate": 7.214352557178838e-08, + "loss": 0.2872, + "step": 12585 + }, + { + "epoch": 1.925937260902831, + "grad_norm": 1.757835085743604, + "learning_rate": 7.184668958229003e-08, + "loss": 0.2815, + "step": 12586 + }, + { + "epoch": 1.9260902830910482, + "grad_norm": 2.1502075931636013, + "learning_rate": 7.15504633195263e-08, + "loss": 0.2656, + "step": 12587 + }, + { + "epoch": 1.9262433052792653, + "grad_norm": 2.0155875426810343, + "learning_rate": 7.125484680168826e-08, + "loss": 0.27, + "step": 12588 + }, + { + "epoch": 1.926396327467483, + "grad_norm": 1.8428482707965728, + "learning_rate": 7.095984004693246e-08, + "loss": 0.2136, + "step": 12589 + }, + { + "epoch": 1.9265493496557, + "grad_norm": 2.2304850376404213, + "learning_rate": 7.066544307337442e-08, + "loss": 0.2676, + "step": 12590 + }, + { + "epoch": 1.9267023718439173, + "grad_norm": 2.5671786865674706, + "learning_rate": 7.037165589909523e-08, + "loss": 0.316, + "step": 12591 + }, + { + "epoch": 1.9268553940321347, + "grad_norm": 2.2924701688919007, + "learning_rate": 7.007847854213822e-08, + "loss": 0.2505, + "step": 12592 + }, + { + "epoch": 1.9270084162203518, + "grad_norm": 2.0974154895266355, + "learning_rate": 6.978591102050791e-08, + "loss": 0.309, + "step": 12593 + }, + { + "epoch": 1.9271614384085693, + "grad_norm": 2.1726512258841764, + "learning_rate": 6.949395335217102e-08, + "loss": 0.3529, + "step": 12594 + }, + { + "epoch": 1.9273144605967865, + "grad_norm": 2.4202430555156775, + "learning_rate": 6.920260555506098e-08, + "loss": 0.3122, + "step": 12595 + }, + { + "epoch": 1.9274674827850038, + "grad_norm": 2.542246552788122, + "learning_rate": 6.891186764706681e-08, + "loss": 0.2913, + "step": 12596 + }, + { + "epoch": 1.9276205049732211, + "grad_norm": 2.3066907503970624, + "learning_rate": 6.862173964604535e-08, + "loss": 0.277, + "step": 12597 + }, + { + "epoch": 1.9277735271614385, + "grad_norm": 2.4028158038821417, + "learning_rate": 6.833222156981567e-08, + "loss": 0.2989, + "step": 12598 + }, + { + "epoch": 1.9279265493496558, + "grad_norm": 2.4353789679467313, + "learning_rate": 6.804331343615577e-08, + "loss": 0.2893, + "step": 12599 + }, + { + "epoch": 1.928079571537873, + "grad_norm": 2.2241183604402632, + "learning_rate": 6.775501526281037e-08, + "loss": 0.272, + "step": 12600 + }, + { + "epoch": 1.9282325937260902, + "grad_norm": 2.430754290427599, + "learning_rate": 6.746732706748637e-08, + "loss": 0.2673, + "step": 12601 + }, + { + "epoch": 1.9283856159143076, + 
"grad_norm": 2.281050405523614, + "learning_rate": 6.718024886784858e-08, + "loss": 0.2809, + "step": 12602 + }, + { + "epoch": 1.928538638102525, + "grad_norm": 2.1745371935601066, + "learning_rate": 6.689378068152841e-08, + "loss": 0.2604, + "step": 12603 + }, + { + "epoch": 1.9286916602907422, + "grad_norm": 2.1098243004454025, + "learning_rate": 6.660792252612069e-08, + "loss": 0.2782, + "step": 12604 + }, + { + "epoch": 1.9288446824789593, + "grad_norm": 2.298117415399452, + "learning_rate": 6.632267441918027e-08, + "loss": 0.2785, + "step": 12605 + }, + { + "epoch": 1.928997704667177, + "grad_norm": 2.486226541978198, + "learning_rate": 6.603803637822537e-08, + "loss": 0.3168, + "step": 12606 + }, + { + "epoch": 1.929150726855394, + "grad_norm": 2.236276600745981, + "learning_rate": 6.575400842073643e-08, + "loss": 0.293, + "step": 12607 + }, + { + "epoch": 1.9293037490436113, + "grad_norm": 2.3638793794792217, + "learning_rate": 6.547059056415617e-08, + "loss": 0.2954, + "step": 12608 + }, + { + "epoch": 1.9294567712318287, + "grad_norm": 2.2428113757322494, + "learning_rate": 6.518778282589067e-08, + "loss": 0.2923, + "step": 12609 + }, + { + "epoch": 1.9296097934200458, + "grad_norm": 2.5389227577644284, + "learning_rate": 6.490558522330936e-08, + "loss": 0.3074, + "step": 12610 + }, + { + "epoch": 1.9297628156082633, + "grad_norm": 2.090509434099971, + "learning_rate": 6.462399777374284e-08, + "loss": 0.2688, + "step": 12611 + }, + { + "epoch": 1.9299158377964805, + "grad_norm": 2.1129234004043957, + "learning_rate": 6.434302049448393e-08, + "loss": 0.2829, + "step": 12612 + }, + { + "epoch": 1.9300688599846978, + "grad_norm": 1.8011178532916772, + "learning_rate": 6.406265340278772e-08, + "loss": 0.2099, + "step": 12613 + }, + { + "epoch": 1.9302218821729151, + "grad_norm": 2.264044979485212, + "learning_rate": 6.378289651587377e-08, + "loss": 0.2486, + "step": 12614 + }, + { + "epoch": 1.9303749043611322, + "grad_norm": 2.1444665625136534, + "learning_rate": 6.350374985092389e-08, + "loss": 0.2625, + "step": 12615 + }, + { + "epoch": 1.9305279265493498, + "grad_norm": 1.8452951812548426, + "learning_rate": 6.322521342507993e-08, + "loss": 0.2091, + "step": 12616 + }, + { + "epoch": 1.930680948737567, + "grad_norm": 2.2619235464051566, + "learning_rate": 6.294728725544708e-08, + "loss": 0.2916, + "step": 12617 + }, + { + "epoch": 1.9308339709257842, + "grad_norm": 1.8293153143516894, + "learning_rate": 6.266997135909725e-08, + "loss": 0.2575, + "step": 12618 + }, + { + "epoch": 1.9309869931140016, + "grad_norm": 1.9826796507956632, + "learning_rate": 6.239326575305904e-08, + "loss": 0.2715, + "step": 12619 + }, + { + "epoch": 1.9311400153022187, + "grad_norm": 2.3014999251705257, + "learning_rate": 6.211717045432553e-08, + "loss": 0.3369, + "step": 12620 + }, + { + "epoch": 1.9312930374904362, + "grad_norm": 2.2258466809607436, + "learning_rate": 6.184168547985536e-08, + "loss": 0.299, + "step": 12621 + }, + { + "epoch": 1.9314460596786533, + "grad_norm": 1.994257024190829, + "learning_rate": 6.156681084656501e-08, + "loss": 0.2588, + "step": 12622 + }, + { + "epoch": 1.9315990818668707, + "grad_norm": 2.1592430726452148, + "learning_rate": 6.129254657133544e-08, + "loss": 0.2516, + "step": 12623 + }, + { + "epoch": 1.931752104055088, + "grad_norm": 1.931950663977846, + "learning_rate": 6.101889267101201e-08, + "loss": 0.2307, + "step": 12624 + }, + { + "epoch": 1.9319051262433051, + "grad_norm": 2.3942192745722406, + "learning_rate": 6.074584916239911e-08, + "loss": 0.3507, + 
"step": 12625 + }, + { + "epoch": 1.9320581484315227, + "grad_norm": 2.098354148192397, + "learning_rate": 6.047341606226664e-08, + "loss": 0.3061, + "step": 12626 + }, + { + "epoch": 1.9322111706197398, + "grad_norm": 2.163742519632644, + "learning_rate": 6.020159338734566e-08, + "loss": 0.2322, + "step": 12627 + }, + { + "epoch": 1.9323641928079571, + "grad_norm": 2.3218334894180708, + "learning_rate": 5.993038115432837e-08, + "loss": 0.3174, + "step": 12628 + }, + { + "epoch": 1.9325172149961745, + "grad_norm": 1.7732066918572658, + "learning_rate": 5.965977937987366e-08, + "loss": 0.2302, + "step": 12629 + }, + { + "epoch": 1.9326702371843916, + "grad_norm": 2.000109135862685, + "learning_rate": 5.938978808059825e-08, + "loss": 0.2409, + "step": 12630 + }, + { + "epoch": 1.9328232593726091, + "grad_norm": 2.333789338782402, + "learning_rate": 5.9120407273082215e-08, + "loss": 0.345, + "step": 12631 + }, + { + "epoch": 1.9329762815608262, + "grad_norm": 1.90728146018191, + "learning_rate": 5.8851636973872306e-08, + "loss": 0.2432, + "step": 12632 + }, + { + "epoch": 1.9331293037490436, + "grad_norm": 2.208001178052822, + "learning_rate": 5.8583477199474213e-08, + "loss": 0.294, + "step": 12633 + }, + { + "epoch": 1.933282325937261, + "grad_norm": 2.354523824473684, + "learning_rate": 5.831592796635477e-08, + "loss": 0.3385, + "step": 12634 + }, + { + "epoch": 1.9334353481254782, + "grad_norm": 1.8312645826200542, + "learning_rate": 5.804898929094638e-08, + "loss": 0.2109, + "step": 12635 + }, + { + "epoch": 1.9335883703136956, + "grad_norm": 2.0830635861639366, + "learning_rate": 5.77826611896426e-08, + "loss": 0.2663, + "step": 12636 + }, + { + "epoch": 1.9337413925019127, + "grad_norm": 2.419482269865649, + "learning_rate": 5.751694367879923e-08, + "loss": 0.2717, + "step": 12637 + }, + { + "epoch": 1.93389441469013, + "grad_norm": 2.0657515526693464, + "learning_rate": 5.725183677473656e-08, + "loss": 0.2888, + "step": 12638 + }, + { + "epoch": 1.9340474368783473, + "grad_norm": 2.0596978612467565, + "learning_rate": 5.698734049373489e-08, + "loss": 0.2544, + "step": 12639 + }, + { + "epoch": 1.9342004590665647, + "grad_norm": 2.5250076265493955, + "learning_rate": 5.6723454852036785e-08, + "loss": 0.3538, + "step": 12640 + }, + { + "epoch": 1.934353481254782, + "grad_norm": 2.2943276103623536, + "learning_rate": 5.64601798658504e-08, + "loss": 0.2966, + "step": 12641 + }, + { + "epoch": 1.9345065034429991, + "grad_norm": 2.1423583068445087, + "learning_rate": 5.6197515551343895e-08, + "loss": 0.3187, + "step": 12642 + }, + { + "epoch": 1.9346595256312167, + "grad_norm": 2.123351982906415, + "learning_rate": 5.593546192464883e-08, + "loss": 0.2805, + "step": 12643 + }, + { + "epoch": 1.9348125478194338, + "grad_norm": 2.1385231757290257, + "learning_rate": 5.567401900185787e-08, + "loss": 0.2985, + "step": 12644 + }, + { + "epoch": 1.9349655700076511, + "grad_norm": 2.283449728780997, + "learning_rate": 5.541318679902707e-08, + "loss": 0.3114, + "step": 12645 + }, + { + "epoch": 1.9351185921958685, + "grad_norm": 1.9670750492646343, + "learning_rate": 5.515296533217696e-08, + "loss": 0.2487, + "step": 12646 + }, + { + "epoch": 1.9352716143840856, + "grad_norm": 2.095455321897977, + "learning_rate": 5.4893354617286954e-08, + "loss": 0.2793, + "step": 12647 + }, + { + "epoch": 1.9354246365723031, + "grad_norm": 1.9765462462707317, + "learning_rate": 5.4634354670302095e-08, + "loss": 0.3064, + "step": 12648 + }, + { + "epoch": 1.9355776587605202, + "grad_norm": 1.8820560851270636, + 
"learning_rate": 5.437596550712743e-08, + "loss": 0.2657, + "step": 12649 + }, + { + "epoch": 1.9357306809487376, + "grad_norm": 2.3788353271474634, + "learning_rate": 5.411818714363248e-08, + "loss": 0.3056, + "step": 12650 + }, + { + "epoch": 1.935883703136955, + "grad_norm": 2.2261586061883953, + "learning_rate": 5.3861019595649045e-08, + "loss": 0.3066, + "step": 12651 + }, + { + "epoch": 1.936036725325172, + "grad_norm": 1.785826281369035, + "learning_rate": 5.360446287896781e-08, + "loss": 0.2866, + "step": 12652 + }, + { + "epoch": 1.9361897475133896, + "grad_norm": 2.331191024955345, + "learning_rate": 5.334851700934951e-08, + "loss": 0.3063, + "step": 12653 + }, + { + "epoch": 1.9363427697016067, + "grad_norm": 1.8719935808142152, + "learning_rate": 5.309318200250824e-08, + "loss": 0.2523, + "step": 12654 + }, + { + "epoch": 1.936495791889824, + "grad_norm": 2.223253250317612, + "learning_rate": 5.283845787412811e-08, + "loss": 0.3068, + "step": 12655 + }, + { + "epoch": 1.9366488140780413, + "grad_norm": 2.1951752055550853, + "learning_rate": 5.258434463985218e-08, + "loss": 0.2619, + "step": 12656 + }, + { + "epoch": 1.9368018362662585, + "grad_norm": 2.059501912533121, + "learning_rate": 5.233084231528574e-08, + "loss": 0.2819, + "step": 12657 + }, + { + "epoch": 1.936954858454476, + "grad_norm": 2.2992385321273896, + "learning_rate": 5.207795091599743e-08, + "loss": 0.2655, + "step": 12658 + }, + { + "epoch": 1.9371078806426931, + "grad_norm": 2.034712783110768, + "learning_rate": 5.182567045751929e-08, + "loss": 0.2648, + "step": 12659 + }, + { + "epoch": 1.9372609028309105, + "grad_norm": 2.0115061960293747, + "learning_rate": 5.157400095534337e-08, + "loss": 0.2186, + "step": 12660 + }, + { + "epoch": 1.9374139250191278, + "grad_norm": 1.9064111987400696, + "learning_rate": 5.1322942424927303e-08, + "loss": 0.2377, + "step": 12661 + }, + { + "epoch": 1.937566947207345, + "grad_norm": 2.1764256108666236, + "learning_rate": 5.107249488168875e-08, + "loss": 0.247, + "step": 12662 + }, + { + "epoch": 1.9377199693955625, + "grad_norm": 1.7123673286799752, + "learning_rate": 5.082265834100875e-08, + "loss": 0.2426, + "step": 12663 + }, + { + "epoch": 1.9378729915837796, + "grad_norm": 2.3757105271033483, + "learning_rate": 5.0573432818229464e-08, + "loss": 0.2783, + "step": 12664 + }, + { + "epoch": 1.938026013771997, + "grad_norm": 2.0579111088781223, + "learning_rate": 5.0324818328659766e-08, + "loss": 0.2608, + "step": 12665 + }, + { + "epoch": 1.9381790359602142, + "grad_norm": 2.160905591204374, + "learning_rate": 5.007681488756522e-08, + "loss": 0.2706, + "step": 12666 + }, + { + "epoch": 1.9383320581484316, + "grad_norm": 2.2877271140280393, + "learning_rate": 4.982942251017808e-08, + "loss": 0.2357, + "step": 12667 + }, + { + "epoch": 1.938485080336649, + "grad_norm": 2.1542809755239216, + "learning_rate": 4.958264121169176e-08, + "loss": 0.2625, + "step": 12668 + }, + { + "epoch": 1.938638102524866, + "grad_norm": 2.059582513808786, + "learning_rate": 4.93364710072608e-08, + "loss": 0.2883, + "step": 12669 + }, + { + "epoch": 1.9387911247130833, + "grad_norm": 2.1724063331417636, + "learning_rate": 4.9090911912005325e-08, + "loss": 0.2804, + "step": 12670 + }, + { + "epoch": 1.9389441469013007, + "grad_norm": 2.076507363841404, + "learning_rate": 4.884596394100438e-08, + "loss": 0.2798, + "step": 12671 + }, + { + "epoch": 1.939097169089518, + "grad_norm": 2.1280474655352286, + "learning_rate": 4.8601627109302605e-08, + "loss": 0.3037, + "step": 12672 + }, + { + "epoch": 
1.9392501912777353, + "grad_norm": 2.191630607477236, + "learning_rate": 4.835790143190466e-08, + "loss": 0.2652, + "step": 12673 + }, + { + "epoch": 1.9394032134659525, + "grad_norm": 2.017617923200027, + "learning_rate": 4.8114786923778575e-08, + "loss": 0.2702, + "step": 12674 + }, + { + "epoch": 1.93955623565417, + "grad_norm": 2.1656108843312754, + "learning_rate": 4.787228359985685e-08, + "loss": 0.2703, + "step": 12675 + }, + { + "epoch": 1.9397092578423871, + "grad_norm": 2.056305398421697, + "learning_rate": 4.7630391475029794e-08, + "loss": 0.2748, + "step": 12676 + }, + { + "epoch": 1.9398622800306045, + "grad_norm": 2.0212531782535934, + "learning_rate": 4.7389110564154407e-08, + "loss": 0.2648, + "step": 12677 + }, + { + "epoch": 1.9400153022188218, + "grad_norm": 2.183826196568856, + "learning_rate": 4.714844088204884e-08, + "loss": 0.2861, + "step": 12678 + }, + { + "epoch": 1.940168324407039, + "grad_norm": 1.9595429105828446, + "learning_rate": 4.6908382443493496e-08, + "loss": 0.2806, + "step": 12679 + }, + { + "epoch": 1.9403213465952565, + "grad_norm": 2.104117403711713, + "learning_rate": 4.666893526323102e-08, + "loss": 0.3612, + "step": 12680 + }, + { + "epoch": 1.9404743687834736, + "grad_norm": 2.3085008583768447, + "learning_rate": 4.643009935596632e-08, + "loss": 0.3073, + "step": 12681 + }, + { + "epoch": 1.940627390971691, + "grad_norm": 2.146451918561183, + "learning_rate": 4.6191874736367656e-08, + "loss": 0.2558, + "step": 12682 + }, + { + "epoch": 1.9407804131599082, + "grad_norm": 2.3192462362711734, + "learning_rate": 4.595426141906667e-08, + "loss": 0.3116, + "step": 12683 + }, + { + "epoch": 1.9409334353481253, + "grad_norm": 2.4574745152024793, + "learning_rate": 4.57172594186539e-08, + "loss": 0.2959, + "step": 12684 + }, + { + "epoch": 1.941086457536343, + "grad_norm": 1.9916261660219228, + "learning_rate": 4.54808687496866e-08, + "loss": 0.2328, + "step": 12685 + }, + { + "epoch": 1.94123947972456, + "grad_norm": 1.8992522773265608, + "learning_rate": 4.5245089426680934e-08, + "loss": 0.2541, + "step": 12686 + }, + { + "epoch": 1.9413925019127773, + "grad_norm": 1.8079036077768846, + "learning_rate": 4.500992146411753e-08, + "loss": 0.2135, + "step": 12687 + }, + { + "epoch": 1.9415455241009947, + "grad_norm": 1.9889511801750943, + "learning_rate": 4.4775364876438185e-08, + "loss": 0.2427, + "step": 12688 + }, + { + "epoch": 1.9416985462892118, + "grad_norm": 1.751479263129326, + "learning_rate": 4.454141967805026e-08, + "loss": 0.2189, + "step": 12689 + }, + { + "epoch": 1.9418515684774293, + "grad_norm": 2.0423292553879957, + "learning_rate": 4.430808588331892e-08, + "loss": 0.257, + "step": 12690 + }, + { + "epoch": 1.9420045906656465, + "grad_norm": 2.0041499310977504, + "learning_rate": 4.407536350657493e-08, + "loss": 0.2035, + "step": 12691 + }, + { + "epoch": 1.9421576128538638, + "grad_norm": 2.312755751955661, + "learning_rate": 4.3843252562110193e-08, + "loss": 0.2828, + "step": 12692 + }, + { + "epoch": 1.9423106350420811, + "grad_norm": 1.971954947422424, + "learning_rate": 4.361175306418109e-08, + "loss": 0.2417, + "step": 12693 + }, + { + "epoch": 1.9424636572302982, + "grad_norm": 1.831074912801939, + "learning_rate": 4.33808650270029e-08, + "loss": 0.2528, + "step": 12694 + }, + { + "epoch": 1.9426166794185158, + "grad_norm": 2.0578830783806525, + "learning_rate": 4.3150588464756504e-08, + "loss": 0.2771, + "step": 12695 + }, + { + "epoch": 1.942769701606733, + "grad_norm": 1.961835942990468, + "learning_rate": 
4.2920923391583937e-08, + "loss": 0.221, + "step": 12696 + }, + { + "epoch": 1.9429227237949502, + "grad_norm": 1.9705490121987508, + "learning_rate": 4.269186982159057e-08, + "loss": 0.2571, + "step": 12697 + }, + { + "epoch": 1.9430757459831676, + "grad_norm": 2.0673965559846312, + "learning_rate": 4.246342776884182e-08, + "loss": 0.2342, + "step": 12698 + }, + { + "epoch": 1.943228768171385, + "grad_norm": 2.027833565441114, + "learning_rate": 4.223559724736759e-08, + "loss": 0.2193, + "step": 12699 + }, + { + "epoch": 1.9433817903596022, + "grad_norm": 1.928042773055958, + "learning_rate": 4.200837827116111e-08, + "loss": 0.2249, + "step": 12700 + }, + { + "epoch": 1.9435348125478193, + "grad_norm": 2.0145095411771186, + "learning_rate": 4.1781770854175674e-08, + "loss": 0.2194, + "step": 12701 + }, + { + "epoch": 1.9436878347360367, + "grad_norm": 2.0755925996216606, + "learning_rate": 4.1555775010329034e-08, + "loss": 0.2577, + "step": 12702 + }, + { + "epoch": 1.943840856924254, + "grad_norm": 2.1088247690311714, + "learning_rate": 4.1330390753500094e-08, + "loss": 0.28, + "step": 12703 + }, + { + "epoch": 1.9439938791124713, + "grad_norm": 2.070960100346576, + "learning_rate": 4.110561809752889e-08, + "loss": 0.2509, + "step": 12704 + }, + { + "epoch": 1.9441469013006887, + "grad_norm": 2.0891100512185603, + "learning_rate": 4.0881457056222153e-08, + "loss": 0.2513, + "step": 12705 + }, + { + "epoch": 1.9442999234889058, + "grad_norm": 2.5962353577847566, + "learning_rate": 4.065790764334554e-08, + "loss": 0.3419, + "step": 12706 + }, + { + "epoch": 1.9444529456771233, + "grad_norm": 1.9640071319967864, + "learning_rate": 4.043496987262696e-08, + "loss": 0.2708, + "step": 12707 + }, + { + "epoch": 1.9446059678653405, + "grad_norm": 2.179001565244998, + "learning_rate": 4.021264375775991e-08, + "loss": 0.3385, + "step": 12708 + }, + { + "epoch": 1.9447589900535578, + "grad_norm": 2.1426220088913954, + "learning_rate": 3.99909293123979e-08, + "loss": 0.2687, + "step": 12709 + }, + { + "epoch": 1.9449120122417751, + "grad_norm": 1.8754076871641312, + "learning_rate": 3.97698265501556e-08, + "loss": 0.2513, + "step": 12710 + }, + { + "epoch": 1.9450650344299922, + "grad_norm": 2.042815422922247, + "learning_rate": 3.954933548461326e-08, + "loss": 0.2162, + "step": 12711 + }, + { + "epoch": 1.9452180566182098, + "grad_norm": 1.8552967583474373, + "learning_rate": 3.932945612931227e-08, + "loss": 0.2369, + "step": 12712 + }, + { + "epoch": 1.945371078806427, + "grad_norm": 1.907696643956236, + "learning_rate": 3.911018849775405e-08, + "loss": 0.2624, + "step": 12713 + }, + { + "epoch": 1.9455241009946442, + "grad_norm": 2.0798771441829116, + "learning_rate": 3.8891532603407834e-08, + "loss": 0.2829, + "step": 12714 + }, + { + "epoch": 1.9456771231828616, + "grad_norm": 2.3790723531215257, + "learning_rate": 3.867348845970065e-08, + "loss": 0.3033, + "step": 12715 + }, + { + "epoch": 1.9458301453710787, + "grad_norm": 1.8852530500177367, + "learning_rate": 3.84560560800229e-08, + "loss": 0.228, + "step": 12716 + }, + { + "epoch": 1.9459831675592962, + "grad_norm": 2.1820151584706613, + "learning_rate": 3.8239235477728344e-08, + "loss": 0.3082, + "step": 12717 + }, + { + "epoch": 1.9461361897475133, + "grad_norm": 2.2140014660667933, + "learning_rate": 3.8023026666133e-08, + "loss": 0.2537, + "step": 12718 + }, + { + "epoch": 1.9462892119357307, + "grad_norm": 2.2659188032539195, + "learning_rate": 3.7807429658516246e-08, + "loss": 0.2632, + "step": 12719 + }, + { + "epoch": 
1.946442234123948, + "grad_norm": 2.5293218796462464, + "learning_rate": 3.7592444468116384e-08, + "loss": 0.2743, + "step": 12720 + }, + { + "epoch": 1.9465952563121651, + "grad_norm": 1.8671996561905655, + "learning_rate": 3.7378071108138405e-08, + "loss": 0.2409, + "step": 12721 + }, + { + "epoch": 1.9467482785003827, + "grad_norm": 2.239141860501856, + "learning_rate": 3.716430959174622e-08, + "loss": 0.3049, + "step": 12722 + }, + { + "epoch": 1.9469013006885998, + "grad_norm": 1.909309810127325, + "learning_rate": 3.6951159932069323e-08, + "loss": 0.2702, + "step": 12723 + }, + { + "epoch": 1.9470543228768171, + "grad_norm": 2.1027549057800097, + "learning_rate": 3.673862214219726e-08, + "loss": 0.2468, + "step": 12724 + }, + { + "epoch": 1.9472073450650345, + "grad_norm": 1.864832404974843, + "learning_rate": 3.65266962351829e-08, + "loss": 0.223, + "step": 12725 + }, + { + "epoch": 1.9473603672532516, + "grad_norm": 1.7866644661828868, + "learning_rate": 3.6315382224041406e-08, + "loss": 0.2208, + "step": 12726 + }, + { + "epoch": 1.9475133894414691, + "grad_norm": 2.125342586374641, + "learning_rate": 3.610468012175017e-08, + "loss": 0.3016, + "step": 12727 + }, + { + "epoch": 1.9476664116296862, + "grad_norm": 2.676246515950065, + "learning_rate": 3.5894589941248835e-08, + "loss": 0.3001, + "step": 12728 + }, + { + "epoch": 1.9478194338179036, + "grad_norm": 2.1133835150212024, + "learning_rate": 3.568511169544153e-08, + "loss": 0.3017, + "step": 12729 + }, + { + "epoch": 1.947972456006121, + "grad_norm": 1.9531071847013737, + "learning_rate": 3.547624539719241e-08, + "loss": 0.2712, + "step": 12730 + }, + { + "epoch": 1.9481254781943382, + "grad_norm": 2.3816136624101403, + "learning_rate": 3.526799105932788e-08, + "loss": 0.3098, + "step": 12731 + }, + { + "epoch": 1.9482785003825556, + "grad_norm": 2.0472889114981436, + "learning_rate": 3.506034869463881e-08, + "loss": 0.2275, + "step": 12732 + }, + { + "epoch": 1.9484315225707727, + "grad_norm": 2.005149594082394, + "learning_rate": 3.4853318315876126e-08, + "loss": 0.28, + "step": 12733 + }, + { + "epoch": 1.94858454475899, + "grad_norm": 2.307715143977197, + "learning_rate": 3.4646899935755205e-08, + "loss": 0.3267, + "step": 12734 + }, + { + "epoch": 1.9487375669472073, + "grad_norm": 2.6018657290133205, + "learning_rate": 3.4441093566953685e-08, + "loss": 0.2484, + "step": 12735 + }, + { + "epoch": 1.9488905891354247, + "grad_norm": 2.2228026058532255, + "learning_rate": 3.4235899222108126e-08, + "loss": 0.2843, + "step": 12736 + }, + { + "epoch": 1.949043611323642, + "grad_norm": 2.278609420116441, + "learning_rate": 3.4031316913824e-08, + "loss": 0.2884, + "step": 12737 + }, + { + "epoch": 1.9491966335118591, + "grad_norm": 2.3154914775864954, + "learning_rate": 3.3827346654663473e-08, + "loss": 0.324, + "step": 12738 + }, + { + "epoch": 1.9493496557000765, + "grad_norm": 2.6375485637022438, + "learning_rate": 3.36239884571532e-08, + "loss": 0.2916, + "step": 12739 + }, + { + "epoch": 1.9495026778882938, + "grad_norm": 1.9118207499010522, + "learning_rate": 3.342124233378319e-08, + "loss": 0.2729, + "step": 12740 + }, + { + "epoch": 1.9496557000765111, + "grad_norm": 1.9701216325844941, + "learning_rate": 3.3219108297003476e-08, + "loss": 0.2535, + "step": 12741 + }, + { + "epoch": 1.9498087222647285, + "grad_norm": 2.2188220603731286, + "learning_rate": 3.3017586359228584e-08, + "loss": 0.2926, + "step": 12742 + }, + { + "epoch": 1.9499617444529456, + "grad_norm": 1.828588995912328, + "learning_rate": 
3.281667653283416e-08, + "loss": 0.2301, + "step": 12743 + }, + { + "epoch": 1.9501147666411631, + "grad_norm": 2.072114708886476, + "learning_rate": 3.2616378830160336e-08, + "loss": 0.2664, + "step": 12744 + }, + { + "epoch": 1.9502677888293802, + "grad_norm": 2.09207507253338, + "learning_rate": 3.241669326350727e-08, + "loss": 0.2392, + "step": 12745 + }, + { + "epoch": 1.9504208110175976, + "grad_norm": 2.222014245372224, + "learning_rate": 3.221761984513849e-08, + "loss": 0.3022, + "step": 12746 + }, + { + "epoch": 1.950573833205815, + "grad_norm": 2.4473576328151823, + "learning_rate": 3.2019158587279773e-08, + "loss": 0.3092, + "step": 12747 + }, + { + "epoch": 1.950726855394032, + "grad_norm": 2.1372539044169256, + "learning_rate": 3.1821309502119146e-08, + "loss": 0.2586, + "step": 12748 + }, + { + "epoch": 1.9508798775822496, + "grad_norm": 2.2224469014528383, + "learning_rate": 3.1624072601808e-08, + "loss": 0.3213, + "step": 12749 + }, + { + "epoch": 1.9510328997704667, + "grad_norm": 2.268220714282305, + "learning_rate": 3.142744789845997e-08, + "loss": 0.2969, + "step": 12750 + }, + { + "epoch": 1.951185921958684, + "grad_norm": 2.2497531369255426, + "learning_rate": 3.123143540414875e-08, + "loss": 0.2996, + "step": 12751 + }, + { + "epoch": 1.9513389441469013, + "grad_norm": 2.0618539697503935, + "learning_rate": 3.1036035130912464e-08, + "loss": 0.3029, + "step": 12752 + }, + { + "epoch": 1.9514919663351185, + "grad_norm": 1.9194248768655344, + "learning_rate": 3.0841247090752646e-08, + "loss": 0.2567, + "step": 12753 + }, + { + "epoch": 1.951644988523336, + "grad_norm": 2.2237292380811384, + "learning_rate": 3.064707129563194e-08, + "loss": 0.2751, + "step": 12754 + }, + { + "epoch": 1.9517980107115531, + "grad_norm": 2.1877555347738062, + "learning_rate": 3.045350775747413e-08, + "loss": 0.2889, + "step": 12755 + }, + { + "epoch": 1.9519510328997705, + "grad_norm": 2.1896793457465074, + "learning_rate": 3.02605564881675e-08, + "loss": 0.2728, + "step": 12756 + }, + { + "epoch": 1.9521040550879878, + "grad_norm": 1.6954979190860753, + "learning_rate": 3.006821749956146e-08, + "loss": 0.2552, + "step": 12757 + }, + { + "epoch": 1.952257077276205, + "grad_norm": 2.1512376385993712, + "learning_rate": 2.9876490803469884e-08, + "loss": 0.2612, + "step": 12758 + }, + { + "epoch": 1.9524100994644225, + "grad_norm": 2.4529291510609856, + "learning_rate": 2.968537641166669e-08, + "loss": 0.3468, + "step": 12759 + }, + { + "epoch": 1.9525631216526396, + "grad_norm": 2.25080392769247, + "learning_rate": 2.9494874335886935e-08, + "loss": 0.2853, + "step": 12760 + }, + { + "epoch": 1.952716143840857, + "grad_norm": 2.520938479932384, + "learning_rate": 2.9304984587833485e-08, + "loss": 0.3053, + "step": 12761 + }, + { + "epoch": 1.9528691660290742, + "grad_norm": 2.014803782393137, + "learning_rate": 2.9115707179165898e-08, + "loss": 0.2684, + "step": 12762 + }, + { + "epoch": 1.9530221882172913, + "grad_norm": 2.2036033789059104, + "learning_rate": 2.8927042121508208e-08, + "loss": 0.2638, + "step": 12763 + }, + { + "epoch": 1.953175210405509, + "grad_norm": 2.2003394728145618, + "learning_rate": 2.8738989426448928e-08, + "loss": 0.3004, + "step": 12764 + }, + { + "epoch": 1.953328232593726, + "grad_norm": 1.9303823675832934, + "learning_rate": 2.855154910553548e-08, + "loss": 0.2712, + "step": 12765 + }, + { + "epoch": 1.9534812547819433, + "grad_norm": 2.1463555044250775, + "learning_rate": 2.8364721170280883e-08, + "loss": 0.2339, + "step": 12766 + }, + { + "epoch": 
1.9536342769701607, + "grad_norm": 2.2150266187516934, + "learning_rate": 2.817850563215707e-08, + "loss": 0.3033, + "step": 12767 + }, + { + "epoch": 1.953787299158378, + "grad_norm": 2.258512569394117, + "learning_rate": 2.799290250260156e-08, + "loss": 0.2643, + "step": 12768 + }, + { + "epoch": 1.9539403213465953, + "grad_norm": 2.300631573365148, + "learning_rate": 2.780791179301301e-08, + "loss": 0.3314, + "step": 12769 + }, + { + "epoch": 1.9540933435348125, + "grad_norm": 2.148802915378045, + "learning_rate": 2.7623533514751223e-08, + "loss": 0.2894, + "step": 12770 + }, + { + "epoch": 1.9542463657230298, + "grad_norm": 2.5611381404597, + "learning_rate": 2.7439767679140474e-08, + "loss": 0.2613, + "step": 12771 + }, + { + "epoch": 1.9543993879112471, + "grad_norm": 1.7155427881312075, + "learning_rate": 2.7256614297467287e-08, + "loss": 0.2885, + "step": 12772 + }, + { + "epoch": 1.9545524100994645, + "grad_norm": 1.8547576958237793, + "learning_rate": 2.707407338097823e-08, + "loss": 0.253, + "step": 12773 + }, + { + "epoch": 1.9547054322876818, + "grad_norm": 2.401444734532838, + "learning_rate": 2.6892144940885435e-08, + "loss": 0.3416, + "step": 12774 + }, + { + "epoch": 1.954858454475899, + "grad_norm": 2.3311045666651102, + "learning_rate": 2.6710828988359973e-08, + "loss": 0.3475, + "step": 12775 + }, + { + "epoch": 1.9550114766641165, + "grad_norm": 2.233725116048879, + "learning_rate": 2.653012553453849e-08, + "loss": 0.2838, + "step": 12776 + }, + { + "epoch": 1.9551644988523336, + "grad_norm": 2.06475531213535, + "learning_rate": 2.635003459051877e-08, + "loss": 0.3182, + "step": 12777 + }, + { + "epoch": 1.955317521040551, + "grad_norm": 2.2259592026875135, + "learning_rate": 2.617055616736086e-08, + "loss": 0.2861, + "step": 12778 + }, + { + "epoch": 1.9554705432287682, + "grad_norm": 2.3905401922864216, + "learning_rate": 2.5991690276087056e-08, + "loss": 0.297, + "step": 12779 + }, + { + "epoch": 1.9556235654169853, + "grad_norm": 1.7548109646120047, + "learning_rate": 2.581343692768079e-08, + "loss": 0.1689, + "step": 12780 + }, + { + "epoch": 1.955776587605203, + "grad_norm": 2.3205852966806804, + "learning_rate": 2.5635796133091084e-08, + "loss": 0.2712, + "step": 12781 + }, + { + "epoch": 1.95592960979342, + "grad_norm": 2.076512690272171, + "learning_rate": 2.54587679032281e-08, + "loss": 0.2737, + "step": 12782 + }, + { + "epoch": 1.9560826319816373, + "grad_norm": 1.9197513713440644, + "learning_rate": 2.528235224896203e-08, + "loss": 0.2378, + "step": 12783 + }, + { + "epoch": 1.9562356541698547, + "grad_norm": 2.137377531971493, + "learning_rate": 2.5106549181127536e-08, + "loss": 0.2342, + "step": 12784 + }, + { + "epoch": 1.9563886763580718, + "grad_norm": 2.4352962597543413, + "learning_rate": 2.4931358710522657e-08, + "loss": 0.2878, + "step": 12785 + }, + { + "epoch": 1.9565416985462893, + "grad_norm": 2.244106347035568, + "learning_rate": 2.4756780847905448e-08, + "loss": 0.2981, + "step": 12786 + }, + { + "epoch": 1.9566947207345065, + "grad_norm": 2.212890440083552, + "learning_rate": 2.4582815603998445e-08, + "loss": 0.2592, + "step": 12787 + }, + { + "epoch": 1.9568477429227238, + "grad_norm": 2.0677355320744435, + "learning_rate": 2.440946298948421e-08, + "loss": 0.2614, + "step": 12788 + }, + { + "epoch": 1.9570007651109411, + "grad_norm": 2.185133311922832, + "learning_rate": 2.4236723015008678e-08, + "loss": 0.2937, + "step": 12789 + }, + { + "epoch": 1.9571537872991582, + "grad_norm": 2.16851766484783, + "learning_rate": 
2.4064595691182247e-08, + "loss": 0.2739, + "step": 12790 + }, + { + "epoch": 1.9573068094873758, + "grad_norm": 2.0968905865267, + "learning_rate": 2.3893081028575348e-08, + "loss": 0.2871, + "step": 12791 + }, + { + "epoch": 1.957459831675593, + "grad_norm": 2.1826172797738552, + "learning_rate": 2.3722179037720673e-08, + "loss": 0.287, + "step": 12792 + }, + { + "epoch": 1.9576128538638102, + "grad_norm": 2.0634184984657478, + "learning_rate": 2.3551889729114264e-08, + "loss": 0.2339, + "step": 12793 + }, + { + "epoch": 1.9577658760520276, + "grad_norm": 2.3855698367985756, + "learning_rate": 2.3382213113214426e-08, + "loss": 0.2683, + "step": 12794 + }, + { + "epoch": 1.9579188982402447, + "grad_norm": 2.2611003903495996, + "learning_rate": 2.321314920044171e-08, + "loss": 0.2936, + "step": 12795 + }, + { + "epoch": 1.9580719204284622, + "grad_norm": 2.1631865336468046, + "learning_rate": 2.3044698001178923e-08, + "loss": 0.295, + "step": 12796 + }, + { + "epoch": 1.9582249426166793, + "grad_norm": 2.012992323941021, + "learning_rate": 2.287685952577223e-08, + "loss": 0.3247, + "step": 12797 + }, + { + "epoch": 1.9583779648048967, + "grad_norm": 2.2715497490794054, + "learning_rate": 2.270963378452673e-08, + "loss": 0.2826, + "step": 12798 + }, + { + "epoch": 1.958530986993114, + "grad_norm": 2.3168577682957023, + "learning_rate": 2.2543020787715308e-08, + "loss": 0.3328, + "step": 12799 + }, + { + "epoch": 1.9586840091813313, + "grad_norm": 2.0418637103355786, + "learning_rate": 2.2377020545568673e-08, + "loss": 0.2509, + "step": 12800 + }, + { + "epoch": 1.9588370313695487, + "grad_norm": 2.1613693720849323, + "learning_rate": 2.221163306828089e-08, + "loss": 0.2569, + "step": 12801 + }, + { + "epoch": 1.9589900535577658, + "grad_norm": 1.972935734881763, + "learning_rate": 2.2046858366010503e-08, + "loss": 0.2469, + "step": 12802 + }, + { + "epoch": 1.9591430757459831, + "grad_norm": 1.8252344132384086, + "learning_rate": 2.1882696448877193e-08, + "loss": 0.2564, + "step": 12803 + }, + { + "epoch": 1.9592960979342005, + "grad_norm": 1.8533980617714614, + "learning_rate": 2.1719147326961786e-08, + "loss": 0.2677, + "step": 12804 + }, + { + "epoch": 1.9594491201224178, + "grad_norm": 1.9106183949752555, + "learning_rate": 2.155621101030847e-08, + "loss": 0.3246, + "step": 12805 + }, + { + "epoch": 1.9596021423106351, + "grad_norm": 2.4116686758652293, + "learning_rate": 2.1393887508924794e-08, + "loss": 0.2986, + "step": 12806 + }, + { + "epoch": 1.9597551644988522, + "grad_norm": 2.0370580101783755, + "learning_rate": 2.123217683277834e-08, + "loss": 0.2389, + "step": 12807 + }, + { + "epoch": 1.9599081866870698, + "grad_norm": 2.3374821126533636, + "learning_rate": 2.1071078991801164e-08, + "loss": 0.2636, + "step": 12808 + }, + { + "epoch": 1.960061208875287, + "grad_norm": 1.7950419140596388, + "learning_rate": 2.0910593995887573e-08, + "loss": 0.2105, + "step": 12809 + }, + { + "epoch": 1.9602142310635042, + "grad_norm": 2.073353178793678, + "learning_rate": 2.075072185489191e-08, + "loss": 0.2332, + "step": 12810 + }, + { + "epoch": 1.9603672532517216, + "grad_norm": 2.4863571935424273, + "learning_rate": 2.059146257863409e-08, + "loss": 0.308, + "step": 12811 + }, + { + "epoch": 1.9605202754399387, + "grad_norm": 2.2423145973838268, + "learning_rate": 2.0432816176894075e-08, + "loss": 0.2668, + "step": 12812 + }, + { + "epoch": 1.9606732976281562, + "grad_norm": 2.2613131124583283, + "learning_rate": 2.0274782659414072e-08, + "loss": 0.2899, + "step": 12813 + }, + { + "epoch": 
1.9608263198163733, + "grad_norm": 2.2129547501462046, + "learning_rate": 2.0117362035901866e-08, + "loss": 0.2971, + "step": 12814 + }, + { + "epoch": 1.9609793420045907, + "grad_norm": 2.096062489754385, + "learning_rate": 1.9960554316024174e-08, + "loss": 0.2741, + "step": 12815 + }, + { + "epoch": 1.961132364192808, + "grad_norm": 1.853363747759997, + "learning_rate": 1.9804359509409954e-08, + "loss": 0.2461, + "step": 12816 + }, + { + "epoch": 1.9612853863810251, + "grad_norm": 2.1127207539609905, + "learning_rate": 1.9648777625653757e-08, + "loss": 0.2984, + "step": 12817 + }, + { + "epoch": 1.9614384085692427, + "grad_norm": 1.8244916883822468, + "learning_rate": 1.9493808674307945e-08, + "loss": 0.2208, + "step": 12818 + }, + { + "epoch": 1.9615914307574598, + "grad_norm": 1.9379785767665714, + "learning_rate": 1.9339452664892677e-08, + "loss": 0.2565, + "step": 12819 + }, + { + "epoch": 1.9617444529456771, + "grad_norm": 1.971258507742363, + "learning_rate": 1.918570960688593e-08, + "loss": 0.2639, + "step": 12820 + }, + { + "epoch": 1.9618974751338945, + "grad_norm": 2.206834865901176, + "learning_rate": 1.9032579509729033e-08, + "loss": 0.2336, + "step": 12821 + }, + { + "epoch": 1.9620504973221116, + "grad_norm": 2.2371951125913694, + "learning_rate": 1.88800623828278e-08, + "loss": 0.2632, + "step": 12822 + }, + { + "epoch": 1.9622035195103291, + "grad_norm": 2.214750729742328, + "learning_rate": 1.8728158235549187e-08, + "loss": 0.3168, + "step": 12823 + }, + { + "epoch": 1.9623565416985462, + "grad_norm": 1.8243923581833847, + "learning_rate": 1.8576867077220174e-08, + "loss": 0.2238, + "step": 12824 + }, + { + "epoch": 1.9625095638867636, + "grad_norm": 2.2820739863423527, + "learning_rate": 1.8426188917133324e-08, + "loss": 0.2798, + "step": 12825 + }, + { + "epoch": 1.962662586074981, + "grad_norm": 2.3473441828274417, + "learning_rate": 1.8276123764543463e-08, + "loss": 0.3383, + "step": 12826 + }, + { + "epoch": 1.962815608263198, + "grad_norm": 1.810468033827546, + "learning_rate": 1.8126671628664328e-08, + "loss": 0.2646, + "step": 12827 + }, + { + "epoch": 1.9629686304514156, + "grad_norm": 2.0161550530630774, + "learning_rate": 1.7977832518676354e-08, + "loss": 0.2554, + "step": 12828 + }, + { + "epoch": 1.9631216526396327, + "grad_norm": 2.0057193127755526, + "learning_rate": 1.7829606443718893e-08, + "loss": 0.2195, + "step": 12829 + }, + { + "epoch": 1.96327467482785, + "grad_norm": 1.9895992190324854, + "learning_rate": 1.7681993412895782e-08, + "loss": 0.1972, + "step": 12830 + }, + { + "epoch": 1.9634276970160673, + "grad_norm": 2.424438500859323, + "learning_rate": 1.7534993435273095e-08, + "loss": 0.284, + "step": 12831 + }, + { + "epoch": 1.9635807192042847, + "grad_norm": 2.342021551362485, + "learning_rate": 1.738860651987806e-08, + "loss": 0.2584, + "step": 12832 + }, + { + "epoch": 1.963733741392502, + "grad_norm": 1.952229890512349, + "learning_rate": 1.724283267570126e-08, + "loss": 0.3085, + "step": 12833 + }, + { + "epoch": 1.9638867635807191, + "grad_norm": 2.042895088129427, + "learning_rate": 1.7097671911693315e-08, + "loss": 0.236, + "step": 12834 + }, + { + "epoch": 1.9640397857689365, + "grad_norm": 2.325198304496035, + "learning_rate": 1.6953124236772645e-08, + "loss": 0.2503, + "step": 12835 + }, + { + "epoch": 1.9641928079571538, + "grad_norm": 2.1434028335453803, + "learning_rate": 1.6809189659813264e-08, + "loss": 0.2563, + "step": 12836 + }, + { + "epoch": 1.9643458301453711, + "grad_norm": 2.7341650968690705, + "learning_rate": 
1.6665868189655878e-08, + "loss": 0.2641, + "step": 12837 + }, + { + "epoch": 1.9644988523335885, + "grad_norm": 2.1110716721025806, + "learning_rate": 1.652315983510344e-08, + "loss": 0.2664, + "step": 12838 + }, + { + "epoch": 1.9646518745218056, + "grad_norm": 2.4763953852189524, + "learning_rate": 1.638106460491895e-08, + "loss": 0.3259, + "step": 12839 + }, + { + "epoch": 1.9648048967100231, + "grad_norm": 1.9515477357591082, + "learning_rate": 1.6239582507828754e-08, + "loss": 0.2274, + "step": 12840 + }, + { + "epoch": 1.9649579188982402, + "grad_norm": 2.1874600207771144, + "learning_rate": 1.6098713552523683e-08, + "loss": 0.3048, + "step": 12841 + }, + { + "epoch": 1.9651109410864576, + "grad_norm": 2.163087834030164, + "learning_rate": 1.5958457747652368e-08, + "loss": 0.2767, + "step": 12842 + }, + { + "epoch": 1.965263963274675, + "grad_norm": 2.405722917761059, + "learning_rate": 1.5818815101831252e-08, + "loss": 0.2629, + "step": 12843 + }, + { + "epoch": 1.965416985462892, + "grad_norm": 2.3389774387611086, + "learning_rate": 1.567978562363459e-08, + "loss": 0.2692, + "step": 12844 + }, + { + "epoch": 1.9655700076511096, + "grad_norm": 1.774637055198869, + "learning_rate": 1.5541369321601107e-08, + "loss": 0.2302, + "step": 12845 + }, + { + "epoch": 1.9657230298393267, + "grad_norm": 2.2203580711233815, + "learning_rate": 1.5403566204231777e-08, + "loss": 0.2742, + "step": 12846 + }, + { + "epoch": 1.965876052027544, + "grad_norm": 2.0022244855679183, + "learning_rate": 1.5266376279988726e-08, + "loss": 0.2747, + "step": 12847 + }, + { + "epoch": 1.9660290742157613, + "grad_norm": 2.086556860314529, + "learning_rate": 1.512979955729854e-08, + "loss": 0.2604, + "step": 12848 + }, + { + "epoch": 1.9661820964039785, + "grad_norm": 2.044756387848911, + "learning_rate": 1.499383604454896e-08, + "loss": 0.2365, + "step": 12849 + }, + { + "epoch": 1.966335118592196, + "grad_norm": 2.2031325687200654, + "learning_rate": 1.4858485750088857e-08, + "loss": 0.2612, + "step": 12850 + }, + { + "epoch": 1.9664881407804131, + "grad_norm": 2.0442598118479873, + "learning_rate": 1.4723748682231587e-08, + "loss": 0.2223, + "step": 12851 + }, + { + "epoch": 1.9666411629686305, + "grad_norm": 2.024031388837172, + "learning_rate": 1.4589624849250527e-08, + "loss": 0.2676, + "step": 12852 + }, + { + "epoch": 1.9667941851568478, + "grad_norm": 2.042948797618122, + "learning_rate": 1.4456114259384647e-08, + "loss": 0.2578, + "step": 12853 + }, + { + "epoch": 1.966947207345065, + "grad_norm": 1.9906287211778895, + "learning_rate": 1.4323216920831829e-08, + "loss": 0.2132, + "step": 12854 + }, + { + "epoch": 1.9671002295332825, + "grad_norm": 2.482263149556431, + "learning_rate": 1.4190932841755544e-08, + "loss": 0.333, + "step": 12855 + }, + { + "epoch": 1.9672532517214996, + "grad_norm": 1.7243344603142814, + "learning_rate": 1.4059262030278186e-08, + "loss": 0.2489, + "step": 12856 + }, + { + "epoch": 1.967406273909717, + "grad_norm": 2.451829415489958, + "learning_rate": 1.392820449448662e-08, + "loss": 0.2961, + "step": 12857 + }, + { + "epoch": 1.9675592960979342, + "grad_norm": 2.3167822602608132, + "learning_rate": 1.3797760242429958e-08, + "loss": 0.2894, + "step": 12858 + }, + { + "epoch": 1.9677123182861513, + "grad_norm": 2.163481464374622, + "learning_rate": 1.3667929282118464e-08, + "loss": 0.2569, + "step": 12859 + }, + { + "epoch": 1.967865340474369, + "grad_norm": 1.8496510780178912, + "learning_rate": 1.3538711621526868e-08, + "loss": 0.2437, + "step": 12860 + }, + { + "epoch": 
1.968018362662586, + "grad_norm": 2.229870372063878, + "learning_rate": 1.3410107268589934e-08, + "loss": 0.3158, + "step": 12861 + }, + { + "epoch": 1.9681713848508033, + "grad_norm": 2.30265422085295, + "learning_rate": 1.3282116231205789e-08, + "loss": 0.2171, + "step": 12862 + }, + { + "epoch": 1.9683244070390207, + "grad_norm": 2.053435870710879, + "learning_rate": 1.3154738517235921e-08, + "loss": 0.2806, + "step": 12863 + }, + { + "epoch": 1.9684774292272378, + "grad_norm": 2.1300685488173254, + "learning_rate": 1.3027974134501853e-08, + "loss": 0.3023, + "step": 12864 + }, + { + "epoch": 1.9686304514154553, + "grad_norm": 2.094110877470264, + "learning_rate": 1.2901823090789577e-08, + "loss": 0.2508, + "step": 12865 + }, + { + "epoch": 1.9687834736036725, + "grad_norm": 2.003064762109647, + "learning_rate": 1.2776285393845122e-08, + "loss": 0.258, + "step": 12866 + }, + { + "epoch": 1.9689364957918898, + "grad_norm": 2.4322997253204144, + "learning_rate": 1.2651361051380095e-08, + "loss": 0.3001, + "step": 12867 + }, + { + "epoch": 1.9690895179801071, + "grad_norm": 2.0065478541875463, + "learning_rate": 1.2527050071065028e-08, + "loss": 0.3071, + "step": 12868 + }, + { + "epoch": 1.9692425401683245, + "grad_norm": 1.9145659667385493, + "learning_rate": 1.2403352460536034e-08, + "loss": 0.267, + "step": 12869 + }, + { + "epoch": 1.9693955623565418, + "grad_norm": 1.998726280086352, + "learning_rate": 1.2280268227388148e-08, + "loss": 0.2625, + "step": 12870 + }, + { + "epoch": 1.969548584544759, + "grad_norm": 1.7810985547272813, + "learning_rate": 1.215779737918088e-08, + "loss": 0.2553, + "step": 12871 + }, + { + "epoch": 1.9697016067329762, + "grad_norm": 2.0506737515397133, + "learning_rate": 1.20359399234371e-08, + "loss": 0.2376, + "step": 12872 + }, + { + "epoch": 1.9698546289211936, + "grad_norm": 2.4243875323116053, + "learning_rate": 1.1914695867638603e-08, + "loss": 0.3341, + "step": 12873 + }, + { + "epoch": 1.970007651109411, + "grad_norm": 1.930622018969895, + "learning_rate": 1.1794065219231654e-08, + "loss": 0.2257, + "step": 12874 + }, + { + "epoch": 1.9701606732976282, + "grad_norm": 1.864437419947269, + "learning_rate": 1.167404798562588e-08, + "loss": 0.2139, + "step": 12875 + }, + { + "epoch": 1.9703136954858453, + "grad_norm": 1.897571077812364, + "learning_rate": 1.1554644174192053e-08, + "loss": 0.2248, + "step": 12876 + }, + { + "epoch": 1.970466717674063, + "grad_norm": 1.9202972446689073, + "learning_rate": 1.1435853792260976e-08, + "loss": 0.2011, + "step": 12877 + }, + { + "epoch": 1.97061973986228, + "grad_norm": 2.110629293594774, + "learning_rate": 1.1317676847131254e-08, + "loss": 0.3001, + "step": 12878 + }, + { + "epoch": 1.9707727620504973, + "grad_norm": 2.392523432977483, + "learning_rate": 1.1200113346058195e-08, + "loss": 0.3097, + "step": 12879 + }, + { + "epoch": 1.9709257842387147, + "grad_norm": 2.2066538540809435, + "learning_rate": 1.1083163296262689e-08, + "loss": 0.2743, + "step": 12880 + }, + { + "epoch": 1.9710788064269318, + "grad_norm": 1.9974484118386584, + "learning_rate": 1.096682670492677e-08, + "loss": 0.2149, + "step": 12881 + }, + { + "epoch": 1.9712318286151493, + "grad_norm": 2.0342333461670425, + "learning_rate": 1.0851103579194722e-08, + "loss": 0.2746, + "step": 12882 + }, + { + "epoch": 1.9713848508033665, + "grad_norm": 2.3163031751434295, + "learning_rate": 1.0735993926175304e-08, + "loss": 0.2656, + "step": 12883 + }, + { + "epoch": 1.9715378729915838, + "grad_norm": 1.9829201798149283, + "learning_rate": 
1.0621497752936194e-08, + "loss": 0.2601, + "step": 12884 + }, + { + "epoch": 1.9716908951798011, + "grad_norm": 2.1571377906044735, + "learning_rate": 1.0507615066509547e-08, + "loss": 0.2833, + "step": 12885 + }, + { + "epoch": 1.9718439173680182, + "grad_norm": 2.07040945084194, + "learning_rate": 1.0394345873889766e-08, + "loss": 0.2879, + "step": 12886 + }, + { + "epoch": 1.9719969395562358, + "grad_norm": 2.2778466632041887, + "learning_rate": 1.0281690182032399e-08, + "loss": 0.2694, + "step": 12887 + }, + { + "epoch": 1.972149961744453, + "grad_norm": 1.9727267744304013, + "learning_rate": 1.0169647997856358e-08, + "loss": 0.2375, + "step": 12888 + }, + { + "epoch": 1.9723029839326702, + "grad_norm": 2.0960160436743003, + "learning_rate": 1.0058219328242802e-08, + "loss": 0.3329, + "step": 12889 + }, + { + "epoch": 1.9724560061208876, + "grad_norm": 2.046906516879214, + "learning_rate": 9.947404180035147e-09, + "loss": 0.2837, + "step": 12890 + }, + { + "epoch": 1.9726090283091047, + "grad_norm": 2.3133831775980847, + "learning_rate": 9.83720256003795e-09, + "loss": 0.3173, + "step": 12891 + }, + { + "epoch": 1.9727620504973222, + "grad_norm": 2.025923377837509, + "learning_rate": 9.727614475020241e-09, + "loss": 0.2715, + "step": 12892 + }, + { + "epoch": 1.9729150726855393, + "grad_norm": 2.1217931982552334, + "learning_rate": 9.618639931712193e-09, + "loss": 0.2897, + "step": 12893 + }, + { + "epoch": 1.9730680948737567, + "grad_norm": 2.3348882198976333, + "learning_rate": 9.510278936806227e-09, + "loss": 0.3275, + "step": 12894 + }, + { + "epoch": 1.973221117061974, + "grad_norm": 2.5591447995277403, + "learning_rate": 9.402531496957024e-09, + "loss": 0.3593, + "step": 12895 + }, + { + "epoch": 1.9733741392501911, + "grad_norm": 2.0794411023520243, + "learning_rate": 9.29539761878151e-09, + "loss": 0.2514, + "step": 12896 + }, + { + "epoch": 1.9735271614384087, + "grad_norm": 2.097465611753309, + "learning_rate": 9.188877308858867e-09, + "loss": 0.2895, + "step": 12897 + }, + { + "epoch": 1.9736801836266258, + "grad_norm": 2.0471184159575007, + "learning_rate": 9.082970573732752e-09, + "loss": 0.2243, + "step": 12898 + }, + { + "epoch": 1.9738332058148431, + "grad_norm": 2.1026098669748086, + "learning_rate": 8.97767741990574e-09, + "loss": 0.2623, + "step": 12899 + }, + { + "epoch": 1.9739862280030605, + "grad_norm": 2.007922658885538, + "learning_rate": 8.872997853843767e-09, + "loss": 0.2379, + "step": 12900 + }, + { + "epoch": 1.9741392501912778, + "grad_norm": 2.1522852184706345, + "learning_rate": 8.76893188197725e-09, + "loss": 0.2646, + "step": 12901 + }, + { + "epoch": 1.9742922723794951, + "grad_norm": 1.7457113137200364, + "learning_rate": 8.665479510696629e-09, + "loss": 0.2275, + "step": 12902 + }, + { + "epoch": 1.9744452945677122, + "grad_norm": 2.3852903678590494, + "learning_rate": 8.562640746354601e-09, + "loss": 0.2778, + "step": 12903 + }, + { + "epoch": 1.9745983167559296, + "grad_norm": 2.0897669244676216, + "learning_rate": 8.460415595268334e-09, + "loss": 0.2541, + "step": 12904 + }, + { + "epoch": 1.974751338944147, + "grad_norm": 2.21857340447905, + "learning_rate": 8.358804063715032e-09, + "loss": 0.2525, + "step": 12905 + }, + { + "epoch": 1.9749043611323642, + "grad_norm": 1.9060225156510942, + "learning_rate": 8.257806157934145e-09, + "loss": 0.276, + "step": 12906 + }, + { + "epoch": 1.9750573833205816, + "grad_norm": 2.062959820646989, + "learning_rate": 8.157421884129602e-09, + "loss": 0.2851, + "step": 12907 + }, + { + "epoch": 
1.9752104055087987, + "grad_norm": 1.7379375669518808, + "learning_rate": 8.057651248466469e-09, + "loss": 0.2096, + "step": 12908 + }, + { + "epoch": 1.9753634276970162, + "grad_norm": 1.8775977832890811, + "learning_rate": 7.958494257072069e-09, + "loss": 0.2281, + "step": 12909 + }, + { + "epoch": 1.9755164498852333, + "grad_norm": 1.8697635311100709, + "learning_rate": 7.859950916034865e-09, + "loss": 0.2855, + "step": 12910 + }, + { + "epoch": 1.9756694720734507, + "grad_norm": 2.3981445826317906, + "learning_rate": 7.76202123140779e-09, + "loss": 0.3044, + "step": 12911 + }, + { + "epoch": 1.975822494261668, + "grad_norm": 2.168622460349219, + "learning_rate": 7.664705209204925e-09, + "loss": 0.2939, + "step": 12912 + }, + { + "epoch": 1.9759755164498851, + "grad_norm": 2.210540309755076, + "learning_rate": 7.568002855402602e-09, + "loss": 0.3456, + "step": 12913 + }, + { + "epoch": 1.9761285386381027, + "grad_norm": 2.47623544445302, + "learning_rate": 7.471914175940509e-09, + "loss": 0.3503, + "step": 12914 + }, + { + "epoch": 1.9762815608263198, + "grad_norm": 2.0110291162646146, + "learning_rate": 7.376439176718375e-09, + "loss": 0.2296, + "step": 12915 + }, + { + "epoch": 1.9764345830145371, + "grad_norm": 2.059626415844223, + "learning_rate": 7.2815778636003975e-09, + "loss": 0.261, + "step": 12916 + }, + { + "epoch": 1.9765876052027544, + "grad_norm": 1.9309744737462933, + "learning_rate": 7.187330242413026e-09, + "loss": 0.2504, + "step": 12917 + }, + { + "epoch": 1.9767406273909716, + "grad_norm": 2.153764527156966, + "learning_rate": 7.093696318943854e-09, + "loss": 0.2581, + "step": 12918 + }, + { + "epoch": 1.9768936495791891, + "grad_norm": 2.184851336719344, + "learning_rate": 7.000676098942727e-09, + "loss": 0.3208, + "step": 12919 + }, + { + "epoch": 1.9770466717674062, + "grad_norm": 2.1162936254329954, + "learning_rate": 6.9082695881228515e-09, + "loss": 0.2874, + "step": 12920 + }, + { + "epoch": 1.9771996939556236, + "grad_norm": 1.8457304539215134, + "learning_rate": 6.816476792159687e-09, + "loss": 0.2402, + "step": 12921 + }, + { + "epoch": 1.977352716143841, + "grad_norm": 2.2107074116766294, + "learning_rate": 6.725297716689838e-09, + "loss": 0.3502, + "step": 12922 + }, + { + "epoch": 1.977505738332058, + "grad_norm": 2.312117402985523, + "learning_rate": 6.6347323673143786e-09, + "loss": 0.3004, + "step": 12923 + }, + { + "epoch": 1.9776587605202756, + "grad_norm": 1.916658673354669, + "learning_rate": 6.544780749593305e-09, + "loss": 0.2303, + "step": 12924 + }, + { + "epoch": 1.9778117827084927, + "grad_norm": 1.8504219486404065, + "learning_rate": 6.455442869052197e-09, + "loss": 0.2086, + "step": 12925 + }, + { + "epoch": 1.97796480489671, + "grad_norm": 2.252796300111889, + "learning_rate": 6.366718731177779e-09, + "loss": 0.3257, + "step": 12926 + }, + { + "epoch": 1.9781178270849273, + "grad_norm": 2.0448647401610414, + "learning_rate": 6.278608341416803e-09, + "loss": 0.2517, + "step": 12927 + }, + { + "epoch": 1.9782708492731444, + "grad_norm": 1.9816261463855467, + "learning_rate": 6.1911117051838274e-09, + "loss": 0.2709, + "step": 12928 + }, + { + "epoch": 1.978423871461362, + "grad_norm": 2.1873499122980555, + "learning_rate": 6.104228827850111e-09, + "loss": 0.287, + "step": 12929 + }, + { + "epoch": 1.9785768936495791, + "grad_norm": 2.1254405287529763, + "learning_rate": 6.0179597147524975e-09, + "loss": 0.2381, + "step": 12930 + }, + { + "epoch": 1.9787299158377964, + "grad_norm": 2.0765132192816216, + "learning_rate": 
5.932304371187858e-09, + "loss": 0.2851, + "step": 12931 + }, + { + "epoch": 1.9788829380260138, + "grad_norm": 2.2452308925752194, + "learning_rate": 5.847262802417542e-09, + "loss": 0.2512, + "step": 12932 + }, + { + "epoch": 1.9790359602142311, + "grad_norm": 2.1688896379609885, + "learning_rate": 5.7628350136640365e-09, + "loss": 0.3165, + "step": 12933 + }, + { + "epoch": 1.9791889824024484, + "grad_norm": 2.1543318719909053, + "learning_rate": 5.6790210101131945e-09, + "loss": 0.2452, + "step": 12934 + }, + { + "epoch": 1.9793420045906656, + "grad_norm": 1.9687252404414846, + "learning_rate": 5.595820796912011e-09, + "loss": 0.2407, + "step": 12935 + }, + { + "epoch": 1.979495026778883, + "grad_norm": 2.2367361565308825, + "learning_rate": 5.513234379168619e-09, + "loss": 0.2738, + "step": 12936 + }, + { + "epoch": 1.9796480489671002, + "grad_norm": 2.0907816123855185, + "learning_rate": 5.431261761956741e-09, + "loss": 0.2742, + "step": 12937 + }, + { + "epoch": 1.9798010711553176, + "grad_norm": 1.9706136989908423, + "learning_rate": 5.349902950310126e-09, + "loss": 0.2708, + "step": 12938 + }, + { + "epoch": 1.979954093343535, + "grad_norm": 1.9392410273061564, + "learning_rate": 5.269157949224779e-09, + "loss": 0.2548, + "step": 12939 + }, + { + "epoch": 1.980107115531752, + "grad_norm": 2.3816947627575873, + "learning_rate": 5.189026763661176e-09, + "loss": 0.2669, + "step": 12940 + }, + { + "epoch": 1.9802601377199696, + "grad_norm": 2.3186158240362045, + "learning_rate": 5.109509398538714e-09, + "loss": 0.3214, + "step": 12941 + }, + { + "epoch": 1.9804131599081867, + "grad_norm": 1.9554878445595376, + "learning_rate": 5.030605858740156e-09, + "loss": 0.2202, + "step": 12942 + }, + { + "epoch": 1.980566182096404, + "grad_norm": 2.0322729150228134, + "learning_rate": 4.952316149114955e-09, + "loss": 0.2662, + "step": 12943 + }, + { + "epoch": 1.9807192042846213, + "grad_norm": 1.8278177150486519, + "learning_rate": 4.874640274467046e-09, + "loss": 0.2244, + "step": 12944 + }, + { + "epoch": 1.9808722264728384, + "grad_norm": 2.174806576540863, + "learning_rate": 4.797578239569278e-09, + "loss": 0.2741, + "step": 12945 + }, + { + "epoch": 1.981025248661056, + "grad_norm": 2.2415653534967217, + "learning_rate": 4.721130049154532e-09, + "loss": 0.2785, + "step": 12946 + }, + { + "epoch": 1.9811782708492731, + "grad_norm": 2.166734196094784, + "learning_rate": 4.64529570791572e-09, + "loss": 0.2677, + "step": 12947 + }, + { + "epoch": 1.9813312930374904, + "grad_norm": 2.0414327018784006, + "learning_rate": 4.5700752205113385e-09, + "loss": 0.2732, + "step": 12948 + }, + { + "epoch": 1.9814843152257078, + "grad_norm": 2.256044758470271, + "learning_rate": 4.495468591562135e-09, + "loss": 0.3323, + "step": 12949 + }, + { + "epoch": 1.981637337413925, + "grad_norm": 2.1976314322826114, + "learning_rate": 4.421475825647781e-09, + "loss": 0.2598, + "step": 12950 + }, + { + "epoch": 1.9817903596021424, + "grad_norm": 1.7322213738511125, + "learning_rate": 4.3480969273135276e-09, + "loss": 0.2153, + "step": 12951 + }, + { + "epoch": 1.9819433817903596, + "grad_norm": 1.9892742007837072, + "learning_rate": 4.275331901066881e-09, + "loss": 0.2685, + "step": 12952 + }, + { + "epoch": 1.982096403978577, + "grad_norm": 2.201534831571064, + "learning_rate": 4.20318075137427e-09, + "loss": 0.2703, + "step": 12953 + }, + { + "epoch": 1.9822494261667942, + "grad_norm": 2.3807635297799283, + "learning_rate": 4.1316434826688124e-09, + "loss": 0.243, + "step": 12954 + }, + { + "epoch": 
1.9824024483550113, + "grad_norm": 2.2659288950701346, + "learning_rate": 4.060720099343662e-09, + "loss": 0.3149, + "step": 12955 + }, + { + "epoch": 1.982555470543229, + "grad_norm": 2.1254319890729083, + "learning_rate": 3.990410605753115e-09, + "loss": 0.2547, + "step": 12956 + }, + { + "epoch": 1.982708492731446, + "grad_norm": 1.7746557192110035, + "learning_rate": 3.920715006217047e-09, + "loss": 0.2284, + "step": 12957 + }, + { + "epoch": 1.9828615149196633, + "grad_norm": 2.070971432899368, + "learning_rate": 3.851633305014257e-09, + "loss": 0.2593, + "step": 12958 + }, + { + "epoch": 1.9830145371078807, + "grad_norm": 2.2118533165855534, + "learning_rate": 3.78316550638691e-09, + "loss": 0.3016, + "step": 12959 + }, + { + "epoch": 1.9831675592960978, + "grad_norm": 1.9279899584450073, + "learning_rate": 3.715311614541639e-09, + "loss": 0.2188, + "step": 12960 + }, + { + "epoch": 1.9833205814843153, + "grad_norm": 2.071591635449214, + "learning_rate": 3.648071633645112e-09, + "loss": 0.2602, + "step": 12961 + }, + { + "epoch": 1.9834736036725324, + "grad_norm": 2.147092374012748, + "learning_rate": 3.5814455678262473e-09, + "loss": 0.3022, + "step": 12962 + }, + { + "epoch": 1.9836266258607498, + "grad_norm": 2.022467934629971, + "learning_rate": 3.5154334211762174e-09, + "loss": 0.2448, + "step": 12963 + }, + { + "epoch": 1.9837796480489671, + "grad_norm": 2.130643029150222, + "learning_rate": 3.4500351977506675e-09, + "loss": 0.2165, + "step": 12964 + }, + { + "epoch": 1.9839326702371842, + "grad_norm": 2.0960930753843914, + "learning_rate": 3.3852509015652734e-09, + "loss": 0.2506, + "step": 12965 + }, + { + "epoch": 1.9840856924254018, + "grad_norm": 1.9855073543678754, + "learning_rate": 3.3210805365979648e-09, + "loss": 0.2233, + "step": 12966 + }, + { + "epoch": 1.984238714613619, + "grad_norm": 2.051055830940307, + "learning_rate": 3.2575241067911435e-09, + "loss": 0.2452, + "step": 12967 + }, + { + "epoch": 1.9843917368018362, + "grad_norm": 2.0931128703850708, + "learning_rate": 3.194581616046133e-09, + "loss": 0.2692, + "step": 12968 + }, + { + "epoch": 1.9845447589900536, + "grad_norm": 1.736009426122614, + "learning_rate": 3.1322530682309506e-09, + "loss": 0.1924, + "step": 12969 + }, + { + "epoch": 1.984697781178271, + "grad_norm": 2.124895379549314, + "learning_rate": 3.0705384671714245e-09, + "loss": 0.2864, + "step": 12970 + }, + { + "epoch": 1.9848508033664882, + "grad_norm": 2.0882242357227283, + "learning_rate": 3.0094378166578563e-09, + "loss": 0.2996, + "step": 12971 + }, + { + "epoch": 1.9850038255547053, + "grad_norm": 1.9487330157379934, + "learning_rate": 2.9489511204439105e-09, + "loss": 0.2617, + "step": 12972 + }, + { + "epoch": 1.9851568477429227, + "grad_norm": 2.2667445577756444, + "learning_rate": 2.8890783822432823e-09, + "loss": 0.2701, + "step": 12973 + }, + { + "epoch": 1.98530986993114, + "grad_norm": 2.3116383396322986, + "learning_rate": 2.8298196057330308e-09, + "loss": 0.2981, + "step": 12974 + }, + { + "epoch": 1.9854628921193573, + "grad_norm": 1.7453561995500706, + "learning_rate": 2.7711747945524668e-09, + "loss": 0.1991, + "step": 12975 + }, + { + "epoch": 1.9856159143075747, + "grad_norm": 1.9390207186144572, + "learning_rate": 2.7131439523042646e-09, + "loss": 0.2393, + "step": 12976 + }, + { + "epoch": 1.9857689364957918, + "grad_norm": 2.3644792875455125, + "learning_rate": 2.6557270825511293e-09, + "loss": 0.2784, + "step": 12977 + }, + { + "epoch": 1.9859219586840093, + "grad_norm": 2.178932135966078, + "learning_rate": 
2.5989241888191296e-09, + "loss": 0.343, + "step": 12978 + }, + { + "epoch": 1.9860749808722264, + "grad_norm": 2.1307936669410132, + "learning_rate": 2.542735274597696e-09, + "loss": 0.2718, + "step": 12979 + }, + { + "epoch": 1.9862280030604438, + "grad_norm": 1.9874821763775943, + "learning_rate": 2.4871603433374026e-09, + "loss": 0.2545, + "step": 12980 + }, + { + "epoch": 1.9863810252486611, + "grad_norm": 1.790519718630377, + "learning_rate": 2.432199398451074e-09, + "loss": 0.188, + "step": 12981 + }, + { + "epoch": 1.9865340474368782, + "grad_norm": 2.3174362383429723, + "learning_rate": 2.3778524433137882e-09, + "loss": 0.2746, + "step": 12982 + }, + { + "epoch": 1.9866870696250958, + "grad_norm": 2.346253570180665, + "learning_rate": 2.3241194812639868e-09, + "loss": 0.3198, + "step": 12983 + }, + { + "epoch": 1.986840091813313, + "grad_norm": 2.1172501130386716, + "learning_rate": 2.2710005156001415e-09, + "loss": 0.2755, + "step": 12984 + }, + { + "epoch": 1.9869931140015302, + "grad_norm": 2.2663121503499677, + "learning_rate": 2.218495549586308e-09, + "loss": 0.2343, + "step": 12985 + }, + { + "epoch": 1.9871461361897476, + "grad_norm": 1.9793839185682316, + "learning_rate": 2.166604586445464e-09, + "loss": 0.2497, + "step": 12986 + }, + { + "epoch": 1.9872991583779647, + "grad_norm": 2.2530991162584217, + "learning_rate": 2.1153276293661708e-09, + "loss": 0.2837, + "step": 12987 + }, + { + "epoch": 1.9874521805661822, + "grad_norm": 2.3052278983511703, + "learning_rate": 2.0646646814959093e-09, + "loss": 0.3059, + "step": 12988 + }, + { + "epoch": 1.9876052027543993, + "grad_norm": 2.0974839786486728, + "learning_rate": 2.014615745946635e-09, + "loss": 0.3217, + "step": 12989 + }, + { + "epoch": 1.9877582249426167, + "grad_norm": 2.233579314139941, + "learning_rate": 1.9651808257925564e-09, + "loss": 0.2597, + "step": 12990 + }, + { + "epoch": 1.987911247130834, + "grad_norm": 1.969583542552495, + "learning_rate": 1.9163599240690225e-09, + "loss": 0.2457, + "step": 12991 + }, + { + "epoch": 1.9880642693190511, + "grad_norm": 2.2150331863004236, + "learning_rate": 1.868153043774745e-09, + "loss": 0.2915, + "step": 12992 + }, + { + "epoch": 1.9882172915072687, + "grad_norm": 2.0676187007120923, + "learning_rate": 1.82056018786958e-09, + "loss": 0.2538, + "step": 12993 + }, + { + "epoch": 1.9883703136954858, + "grad_norm": 1.9421580439547903, + "learning_rate": 1.773581359277854e-09, + "loss": 0.2766, + "step": 12994 + }, + { + "epoch": 1.9885233358837031, + "grad_norm": 1.913701865466778, + "learning_rate": 1.727216560882816e-09, + "loss": 0.2257, + "step": 12995 + }, + { + "epoch": 1.9886763580719204, + "grad_norm": 2.220818373435268, + "learning_rate": 1.6814657955332992e-09, + "loss": 0.3052, + "step": 12996 + }, + { + "epoch": 1.9888293802601376, + "grad_norm": 2.179646529566567, + "learning_rate": 1.6363290660392773e-09, + "loss": 0.3342, + "step": 12997 + }, + { + "epoch": 1.9889824024483551, + "grad_norm": 2.435209578355301, + "learning_rate": 1.591806375170757e-09, + "loss": 0.3043, + "step": 12998 + }, + { + "epoch": 1.9891354246365722, + "grad_norm": 2.353359008708445, + "learning_rate": 1.5478977256644379e-09, + "loss": 0.2977, + "step": 12999 + }, + { + "epoch": 1.9892884468247896, + "grad_norm": 2.2126170930853903, + "learning_rate": 1.5046031202159417e-09, + "loss": 0.2601, + "step": 13000 + }, + { + "epoch": 1.989441469013007, + "grad_norm": 2.1203776595014348, + "learning_rate": 1.4619225614831424e-09, + "loss": 0.2572, + "step": 13001 + }, + { + "epoch": 
1.9895944912012242, + "grad_norm": 2.253613767274853, + "learning_rate": 1.4198560520883865e-09, + "loss": 0.2622, + "step": 13002 + }, + { + "epoch": 1.9897475133894416, + "grad_norm": 2.11290047824741, + "learning_rate": 1.378403594615163e-09, + "loss": 0.3005, + "step": 13003 + }, + { + "epoch": 1.9899005355776587, + "grad_norm": 2.1312196286465244, + "learning_rate": 1.3375651916092135e-09, + "loss": 0.2949, + "step": 13004 + }, + { + "epoch": 1.990053557765876, + "grad_norm": 2.3139439421090287, + "learning_rate": 1.297340845578532e-09, + "loss": 0.2974, + "step": 13005 + }, + { + "epoch": 1.9902065799540933, + "grad_norm": 1.9727633886230713, + "learning_rate": 1.2577305589933642e-09, + "loss": 0.2459, + "step": 13006 + }, + { + "epoch": 1.9903596021423107, + "grad_norm": 2.0438809630472794, + "learning_rate": 1.218734334286209e-09, + "loss": 0.2784, + "step": 13007 + }, + { + "epoch": 1.990512624330528, + "grad_norm": 1.9332274761646762, + "learning_rate": 1.1803521738507072e-09, + "loss": 0.3422, + "step": 13008 + }, + { + "epoch": 1.9906656465187451, + "grad_norm": 1.9007967207990857, + "learning_rate": 1.1425840800471933e-09, + "loss": 0.254, + "step": 13009 + }, + { + "epoch": 1.9908186687069627, + "grad_norm": 2.1458707099063057, + "learning_rate": 1.1054300551927022e-09, + "loss": 0.2593, + "step": 13010 + }, + { + "epoch": 1.9909716908951798, + "grad_norm": 1.9056536998552065, + "learning_rate": 1.0688901015687425e-09, + "loss": 0.233, + "step": 13011 + }, + { + "epoch": 1.9911247130833971, + "grad_norm": 2.3772144145607546, + "learning_rate": 1.0329642214212953e-09, + "loss": 0.313, + "step": 13012 + }, + { + "epoch": 1.9912777352716144, + "grad_norm": 2.0814580331247927, + "learning_rate": 9.976524169552638e-10, + "loss": 0.2907, + "step": 13013 + }, + { + "epoch": 1.9914307574598316, + "grad_norm": 2.0239695611574064, + "learning_rate": 9.629546903400232e-10, + "loss": 0.2643, + "step": 13014 + }, + { + "epoch": 1.9915837796480491, + "grad_norm": 2.5593845320658444, + "learning_rate": 9.288710437060922e-10, + "loss": 0.3326, + "step": 13015 + }, + { + "epoch": 1.9917368018362662, + "grad_norm": 2.48462633721508, + "learning_rate": 8.954014791473509e-10, + "loss": 0.2874, + "step": 13016 + }, + { + "epoch": 1.9918898240244836, + "grad_norm": 1.883559842168018, + "learning_rate": 8.62545998717712e-10, + "loss": 0.1955, + "step": 13017 + }, + { + "epoch": 1.992042846212701, + "grad_norm": 1.7267492856950604, + "learning_rate": 8.303046044366713e-10, + "loss": 0.2448, + "step": 13018 + }, + { + "epoch": 1.992195868400918, + "grad_norm": 2.033471506716017, + "learning_rate": 7.986772982826463e-10, + "loss": 0.2754, + "step": 13019 + }, + { + "epoch": 1.9923488905891356, + "grad_norm": 1.8562161515444404, + "learning_rate": 7.676640821996373e-10, + "loss": 0.2387, + "step": 13020 + }, + { + "epoch": 1.9925019127773527, + "grad_norm": 2.150187407301561, + "learning_rate": 7.372649580916768e-10, + "loss": 0.2375, + "step": 13021 + }, + { + "epoch": 1.99265493496557, + "grad_norm": 1.9946054431849656, + "learning_rate": 7.074799278261602e-10, + "loss": 0.2818, + "step": 13022 + }, + { + "epoch": 1.9928079571537873, + "grad_norm": 1.7498242022545936, + "learning_rate": 6.783089932305142e-10, + "loss": 0.207, + "step": 13023 + }, + { + "epoch": 1.9929609793420044, + "grad_norm": 2.1900542288629032, + "learning_rate": 6.497521560977494e-10, + "loss": 0.278, + "step": 13024 + }, + { + "epoch": 1.993114001530222, + "grad_norm": 2.1975617817398447, + "learning_rate": 
6.218094181820177e-10, + "loss": 0.237, + "step": 13025 + }, + { + "epoch": 1.9932670237184391, + "grad_norm": 1.8776794834042783, + "learning_rate": 5.944807811986142e-10, + "loss": 0.2505, + "step": 13026 + }, + { + "epoch": 1.9934200459066564, + "grad_norm": 1.9385495670411852, + "learning_rate": 5.677662468250855e-10, + "loss": 0.2213, + "step": 13027 + }, + { + "epoch": 1.9935730680948738, + "grad_norm": 2.0533741177883713, + "learning_rate": 5.416658167045619e-10, + "loss": 0.285, + "step": 13028 + }, + { + "epoch": 1.993726090283091, + "grad_norm": 2.1770054768883202, + "learning_rate": 5.161794924368746e-10, + "loss": 0.2512, + "step": 13029 + }, + { + "epoch": 1.9938791124713084, + "grad_norm": 2.305230017581347, + "learning_rate": 4.913072755896586e-10, + "loss": 0.3359, + "step": 13030 + }, + { + "epoch": 1.9940321346595256, + "grad_norm": 2.002526031764342, + "learning_rate": 4.670491676894706e-10, + "loss": 0.2198, + "step": 13031 + }, + { + "epoch": 1.994185156847743, + "grad_norm": 2.0269799838385905, + "learning_rate": 4.434051702262299e-10, + "loss": 0.2523, + "step": 13032 + }, + { + "epoch": 1.9943381790359602, + "grad_norm": 1.9888908831604968, + "learning_rate": 4.203752846521081e-10, + "loss": 0.2865, + "step": 13033 + }, + { + "epoch": 1.9944912012241776, + "grad_norm": 1.9144641539035228, + "learning_rate": 3.979595123815294e-10, + "loss": 0.2994, + "step": 13034 + }, + { + "epoch": 1.994644223412395, + "grad_norm": 1.776279721281176, + "learning_rate": 3.761578547900602e-10, + "loss": 0.1904, + "step": 13035 + }, + { + "epoch": 1.994797245600612, + "grad_norm": 2.298937100217351, + "learning_rate": 3.549703132188498e-10, + "loss": 0.2974, + "step": 13036 + }, + { + "epoch": 1.9949502677888293, + "grad_norm": 1.919240065983227, + "learning_rate": 3.343968889668592e-10, + "loss": 0.2379, + "step": 13037 + }, + { + "epoch": 1.9951032899770467, + "grad_norm": 2.0107139718990346, + "learning_rate": 3.1443758329752214e-10, + "loss": 0.2555, + "step": 13038 + }, + { + "epoch": 1.995256312165264, + "grad_norm": 2.1252763337912985, + "learning_rate": 2.950923974387454e-10, + "loss": 0.2751, + "step": 13039 + }, + { + "epoch": 1.9954093343534813, + "grad_norm": 2.411812539166475, + "learning_rate": 2.763613325773573e-10, + "loss": 0.3436, + "step": 13040 + }, + { + "epoch": 1.9955623565416984, + "grad_norm": 1.9107329649484788, + "learning_rate": 2.5824438986354895e-10, + "loss": 0.2166, + "step": 13041 + }, + { + "epoch": 1.995715378729916, + "grad_norm": 2.358934465004862, + "learning_rate": 2.4074157040976377e-10, + "loss": 0.2485, + "step": 13042 + }, + { + "epoch": 1.9958684009181331, + "grad_norm": 2.215031190581882, + "learning_rate": 2.2385287529180788e-10, + "loss": 0.2784, + "step": 13043 + }, + { + "epoch": 1.9960214231063504, + "grad_norm": 2.236157444646783, + "learning_rate": 2.075783055466296e-10, + "loss": 0.4084, + "step": 13044 + }, + { + "epoch": 1.9961744452945678, + "grad_norm": 2.0773031268943996, + "learning_rate": 1.9191786217342967e-10, + "loss": 0.2735, + "step": 13045 + }, + { + "epoch": 1.996327467482785, + "grad_norm": 1.783273240684617, + "learning_rate": 1.768715461336612e-10, + "loss": 0.2927, + "step": 13046 + }, + { + "epoch": 1.9964804896710024, + "grad_norm": 1.9397137676314884, + "learning_rate": 1.6243935835213998e-10, + "loss": 0.2428, + "step": 13047 + }, + { + "epoch": 1.9966335118592196, + "grad_norm": 1.8943175353851476, + "learning_rate": 1.4862129971371375e-10, + "loss": 0.2924, + "step": 13048 + }, + { + "epoch": 
1.996786534047437, + "grad_norm": 2.1136284004737766, + "learning_rate": 1.3541737106881336e-10, + "loss": 0.2756, + "step": 13049 + }, + { + "epoch": 1.9969395562356542, + "grad_norm": 1.8973543054141213, + "learning_rate": 1.2282757322790162e-10, + "loss": 0.2196, + "step": 13050 + }, + { + "epoch": 1.9970925784238713, + "grad_norm": 2.0657394153839554, + "learning_rate": 1.1085190696369374e-10, + "loss": 0.2822, + "step": 13051 + }, + { + "epoch": 1.997245600612089, + "grad_norm": 1.8308499398834328, + "learning_rate": 9.94903730122676e-11, + "loss": 0.2253, + "step": 13052 + }, + { + "epoch": 1.997398622800306, + "grad_norm": 1.8283706378478568, + "learning_rate": 8.874297207084326e-11, + "loss": 0.2385, + "step": 13053 + }, + { + "epoch": 1.9975516449885233, + "grad_norm": 1.9678986030662173, + "learning_rate": 7.860970479889318e-11, + "loss": 0.3014, + "step": 13054 + }, + { + "epoch": 1.9977046671767407, + "grad_norm": 1.8653265671577366, + "learning_rate": 6.909057181925249e-11, + "loss": 0.2535, + "step": 13055 + }, + { + "epoch": 1.9978576893649578, + "grad_norm": 2.581058041189355, + "learning_rate": 6.018557371811895e-11, + "loss": 0.3192, + "step": 13056 + }, + { + "epoch": 1.9980107115531753, + "grad_norm": 2.005465358101594, + "learning_rate": 5.1894711039501836e-11, + "loss": 0.219, + "step": 13057 + }, + { + "epoch": 1.9981637337413924, + "grad_norm": 2.0811354665325625, + "learning_rate": 4.421798429521396e-11, + "loss": 0.2814, + "step": 13058 + }, + { + "epoch": 1.9983167559296098, + "grad_norm": 2.1460837784604236, + "learning_rate": 3.715539395487966e-11, + "loss": 0.2241, + "step": 13059 + }, + { + "epoch": 1.9984697781178271, + "grad_norm": 2.081856996888235, + "learning_rate": 3.0706940452596145e-11, + "loss": 0.242, + "step": 13060 + }, + { + "epoch": 1.9986228003060442, + "grad_norm": 2.227922939830152, + "learning_rate": 2.4872624185823258e-11, + "loss": 0.2824, + "step": 13061 + }, + { + "epoch": 1.9987758224942618, + "grad_norm": 2.3071203921932977, + "learning_rate": 1.9652445510942586e-11, + "loss": 0.2726, + "step": 13062 + }, + { + "epoch": 1.998928844682479, + "grad_norm": 2.4540772967670152, + "learning_rate": 1.5046404748808585e-11, + "loss": 0.2448, + "step": 13063 + }, + { + "epoch": 1.9990818668706962, + "grad_norm": 1.9558988952277159, + "learning_rate": 1.1054502183638349e-11, + "loss": 0.2816, + "step": 13064 + }, + { + "epoch": 1.9992348890589136, + "grad_norm": 2.437269378289497, + "learning_rate": 7.676738059680944e-12, + "loss": 0.366, + "step": 13065 + }, + { + "epoch": 1.999387911247131, + "grad_norm": 2.309948760100383, + "learning_rate": 4.913112584548074e-12, + "loss": 0.2803, + "step": 13066 + }, + { + "epoch": 1.9995409334353482, + "grad_norm": 2.4610462328848994, + "learning_rate": 2.7636259281038637e-12, + "loss": 0.3342, + "step": 13067 + }, + { + "epoch": 1.9996939556235653, + "grad_norm": 2.4319473298190433, + "learning_rate": 1.2282782213546284e-12, + "loss": 0.2665, + "step": 13068 + }, + { + "epoch": 1.9998469778117827, + "grad_norm": 1.9865301756122895, + "learning_rate": 3.070695597795492e-13, + "loss": 0.2713, + "step": 13069 + }, + { + "epoch": 2.0, + "grad_norm": 2.437039631382003, + "learning_rate": 0.0, + "loss": 0.3371, + "step": 13070 + }, + { + "epoch": 2.0, + "step": 13070, + "total_flos": 2.021711277939753e+19, + "train_loss": 0.5611064869115256, + "train_runtime": 74118.7755, + "train_samples_per_second": 39.019, + "train_steps_per_second": 0.176 + } + ], + "logging_steps": 1.0, + "max_steps": 13070, + 
"num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.021711277939753e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}