diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14253 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.48204386599180526, + "eval_steps": 200, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00024102193299590263, + "grad_norm": 42.72804641723633, + "learning_rate": 3.2e-07, + "loss": 65.4564, + "step": 1 + }, + { + "epoch": 0.00048204386599180526, + "grad_norm": 56.71217346191406, + "learning_rate": 6.4e-07, + "loss": 66.0975, + "step": 2 + }, + { + "epoch": 0.0007230657989877079, + "grad_norm": 64.2192153930664, + "learning_rate": 9.600000000000001e-07, + "loss": 66.4252, + "step": 3 + }, + { + "epoch": 0.0009640877319836105, + "grad_norm": 56.814781188964844, + "learning_rate": 1.28e-06, + "loss": 66.3293, + "step": 4 + }, + { + "epoch": 0.001205109664979513, + "grad_norm": 60.4368896484375, + "learning_rate": 1.6000000000000001e-06, + "loss": 66.1336, + "step": 5 + }, + { + "epoch": 0.0014461315979754157, + "grad_norm": 62.0732536315918, + "learning_rate": 1.9200000000000003e-06, + "loss": 65.8064, + "step": 6 + }, + { + "epoch": 0.0016871535309713184, + "grad_norm": 58.37015151977539, + "learning_rate": 2.24e-06, + "loss": 66.0365, + "step": 7 + }, + { + "epoch": 0.001928175463967221, + "grad_norm": 63.92992401123047, + "learning_rate": 2.56e-06, + "loss": 66.0123, + "step": 8 + }, + { + "epoch": 0.0021691973969631237, + "grad_norm": 59.743309020996094, + "learning_rate": 2.88e-06, + "loss": 66.3628, + "step": 9 + }, + { + "epoch": 0.002410219329959026, + "grad_norm": 55.556434631347656, + "learning_rate": 3.2000000000000003e-06, + "loss": 65.4957, + "step": 10 + }, + { + "epoch": 0.002651241262954929, + "grad_norm": 62.14134979248047, + "learning_rate": 3.52e-06, + "loss": 65.5067, + "step": 11 + }, + { + "epoch": 0.0028922631959508315, + "grad_norm": 76.36312103271484, + "learning_rate": 3.8400000000000005e-06, + "loss": 65.8513, + "step": 12 + }, + { + "epoch": 0.0031332851289467343, + "grad_norm": 85.70427703857422, + "learning_rate": 4.16e-06, + "loss": 65.7141, + "step": 13 + }, + { + "epoch": 0.003374307061942637, + "grad_norm": 77.02534484863281, + "learning_rate": 4.48e-06, + "loss": 66.0758, + "step": 14 + }, + { + "epoch": 0.0036153289949385392, + "grad_norm": 95.97369384765625, + "learning_rate": 4.800000000000001e-06, + "loss": 65.6901, + "step": 15 + }, + { + "epoch": 0.003856350927934442, + "grad_norm": 86.85653686523438, + "learning_rate": 5.12e-06, + "loss": 65.7116, + "step": 16 + }, + { + "epoch": 0.0040973728609303445, + "grad_norm": 70.26445770263672, + "learning_rate": 5.4400000000000004e-06, + "loss": 65.9554, + "step": 17 + }, + { + "epoch": 0.004338394793926247, + "grad_norm": 36.44063949584961, + "learning_rate": 5.76e-06, + "loss": 65.646, + "step": 18 + }, + { + "epoch": 0.00457941672692215, + "grad_norm": 50.508514404296875, + "learning_rate": 6.08e-06, + "loss": 65.4548, + "step": 19 + }, + { + "epoch": 0.004820438659918052, + "grad_norm": 56.767398834228516, + "learning_rate": 6.4000000000000006e-06, + "loss": 65.6105, + "step": 20 + }, + { + "epoch": 0.005061460592913955, + "grad_norm": 46.89820098876953, + "learning_rate": 6.720000000000001e-06, + "loss": 65.5333, + "step": 21 + }, + { + "epoch": 0.005302482525909858, + "grad_norm": 30.504426956176758, + "learning_rate": 7.04e-06, + "loss": 65.6336, + "step": 22 + }, + { + "epoch": 0.00554350445890576, + "grad_norm": 47.7249870300293, + "learning_rate": 7.360000000000001e-06, + "loss": 65.5996, + "step": 23 + }, + { + "epoch": 0.005784526391901663, + "grad_norm": 53.1598014831543, + "learning_rate": 7.680000000000001e-06, + "loss": 65.5955, + "step": 24 + }, + { + "epoch": 0.006025548324897566, + "grad_norm": 38.94886779785156, + "learning_rate": 8.000000000000001e-06, + "loss": 65.3947, + "step": 25 + }, + { + "epoch": 0.006266570257893469, + "grad_norm": 15.044118881225586, + "learning_rate": 8.32e-06, + "loss": 65.4503, + "step": 26 + }, + { + "epoch": 0.006507592190889371, + "grad_norm": 21.987150192260742, + "learning_rate": 8.64e-06, + "loss": 65.301, + "step": 27 + }, + { + "epoch": 0.006748614123885274, + "grad_norm": 11.280169486999512, + "learning_rate": 8.96e-06, + "loss": 65.5754, + "step": 28 + }, + { + "epoch": 0.0069896360568811764, + "grad_norm": 10.873299598693848, + "learning_rate": 9.280000000000001e-06, + "loss": 65.478, + "step": 29 + }, + { + "epoch": 0.0072306579898770785, + "grad_norm": 10.589656829833984, + "learning_rate": 9.600000000000001e-06, + "loss": 65.455, + "step": 30 + }, + { + "epoch": 0.007471679922872981, + "grad_norm": 15.558884620666504, + "learning_rate": 9.920000000000002e-06, + "loss": 65.3145, + "step": 31 + }, + { + "epoch": 0.007712701855868884, + "grad_norm": 9.434968948364258, + "learning_rate": 1.024e-05, + "loss": 65.256, + "step": 32 + }, + { + "epoch": 0.007953723788864787, + "grad_norm": 9.530153274536133, + "learning_rate": 1.056e-05, + "loss": 64.9717, + "step": 33 + }, + { + "epoch": 0.008194745721860689, + "grad_norm": 7.609200954437256, + "learning_rate": 1.0880000000000001e-05, + "loss": 64.8197, + "step": 34 + }, + { + "epoch": 0.008435767654856591, + "grad_norm": 9.002848625183105, + "learning_rate": 1.1200000000000001e-05, + "loss": 64.6697, + "step": 35 + }, + { + "epoch": 0.008676789587852495, + "grad_norm": 7.137924671173096, + "learning_rate": 1.152e-05, + "loss": 64.5697, + "step": 36 + }, + { + "epoch": 0.008917811520848397, + "grad_norm": 10.054763793945312, + "learning_rate": 1.184e-05, + "loss": 64.047, + "step": 37 + }, + { + "epoch": 0.0091588334538443, + "grad_norm": 8.42515754699707, + "learning_rate": 1.216e-05, + "loss": 63.8343, + "step": 38 + }, + { + "epoch": 0.009399855386840203, + "grad_norm": 8.234342575073242, + "learning_rate": 1.248e-05, + "loss": 63.2305, + "step": 39 + }, + { + "epoch": 0.009640877319836105, + "grad_norm": 8.490630149841309, + "learning_rate": 1.2800000000000001e-05, + "loss": 62.4364, + "step": 40 + }, + { + "epoch": 0.009881899252832008, + "grad_norm": 9.7130708694458, + "learning_rate": 1.3120000000000001e-05, + "loss": 61.5853, + "step": 41 + }, + { + "epoch": 0.01012292118582791, + "grad_norm": 11.698753356933594, + "learning_rate": 1.3440000000000002e-05, + "loss": 60.3393, + "step": 42 + }, + { + "epoch": 0.010363943118823812, + "grad_norm": 13.491514205932617, + "learning_rate": 1.376e-05, + "loss": 58.7705, + "step": 43 + }, + { + "epoch": 0.010604965051819716, + "grad_norm": 15.559304237365723, + "learning_rate": 1.408e-05, + "loss": 57.2696, + "step": 44 + }, + { + "epoch": 0.010845986984815618, + "grad_norm": 17.024511337280273, + "learning_rate": 1.4400000000000001e-05, + "loss": 55.8881, + "step": 45 + }, + { + "epoch": 0.01108700891781152, + "grad_norm": 18.470815658569336, + "learning_rate": 1.4720000000000001e-05, + "loss": 54.1749, + "step": 46 + }, + { + "epoch": 0.011328030850807424, + "grad_norm": 20.235261917114258, + "learning_rate": 1.5040000000000002e-05, + "loss": 52.3142, + "step": 47 + }, + { + "epoch": 0.011569052783803326, + "grad_norm": 21.192331314086914, + "learning_rate": 1.5360000000000002e-05, + "loss": 50.6883, + "step": 48 + }, + { + "epoch": 0.011810074716799228, + "grad_norm": 21.096004486083984, + "learning_rate": 1.5680000000000002e-05, + "loss": 49.5085, + "step": 49 + }, + { + "epoch": 0.012051096649795132, + "grad_norm": 19.711599349975586, + "learning_rate": 1.6000000000000003e-05, + "loss": 48.9813, + "step": 50 + }, + { + "epoch": 0.012292118582791034, + "grad_norm": 19.09426498413086, + "learning_rate": 1.632e-05, + "loss": 48.1431, + "step": 51 + }, + { + "epoch": 0.012533140515786937, + "grad_norm": 18.691404342651367, + "learning_rate": 1.664e-05, + "loss": 46.8445, + "step": 52 + }, + { + "epoch": 0.01277416244878284, + "grad_norm": 18.45130729675293, + "learning_rate": 1.696e-05, + "loss": 45.9947, + "step": 53 + }, + { + "epoch": 0.013015184381778741, + "grad_norm": 18.915958404541016, + "learning_rate": 1.728e-05, + "loss": 44.8987, + "step": 54 + }, + { + "epoch": 0.013256206314774645, + "grad_norm": 17.18844223022461, + "learning_rate": 1.76e-05, + "loss": 44.2361, + "step": 55 + }, + { + "epoch": 0.013497228247770547, + "grad_norm": 16.911972045898438, + "learning_rate": 1.792e-05, + "loss": 43.5362, + "step": 56 + }, + { + "epoch": 0.01373825018076645, + "grad_norm": 16.539419174194336, + "learning_rate": 1.824e-05, + "loss": 42.9404, + "step": 57 + }, + { + "epoch": 0.013979272113762353, + "grad_norm": 14.957817077636719, + "learning_rate": 1.8560000000000002e-05, + "loss": 43.2938, + "step": 58 + }, + { + "epoch": 0.014220294046758255, + "grad_norm": 16.026010513305664, + "learning_rate": 1.8880000000000002e-05, + "loss": 41.2015, + "step": 59 + }, + { + "epoch": 0.014461315979754157, + "grad_norm": 16.654521942138672, + "learning_rate": 1.9200000000000003e-05, + "loss": 40.8632, + "step": 60 + }, + { + "epoch": 0.01470233791275006, + "grad_norm": 15.298394203186035, + "learning_rate": 1.9520000000000003e-05, + "loss": 41.6137, + "step": 61 + }, + { + "epoch": 0.014943359845745963, + "grad_norm": 16.35810661315918, + "learning_rate": 1.9840000000000003e-05, + "loss": 40.601, + "step": 62 + }, + { + "epoch": 0.015184381778741865, + "grad_norm": 17.227214813232422, + "learning_rate": 2.016e-05, + "loss": 40.9833, + "step": 63 + }, + { + "epoch": 0.015425403711737768, + "grad_norm": 15.655694961547852, + "learning_rate": 2.048e-05, + "loss": 41.0024, + "step": 64 + }, + { + "epoch": 0.01566642564473367, + "grad_norm": 18.70063591003418, + "learning_rate": 2.08e-05, + "loss": 39.2855, + "step": 65 + }, + { + "epoch": 0.015907447577729574, + "grad_norm": 17.024856567382812, + "learning_rate": 2.112e-05, + "loss": 40.2739, + "step": 66 + }, + { + "epoch": 0.016148469510725474, + "grad_norm": 17.809558868408203, + "learning_rate": 2.144e-05, + "loss": 39.317, + "step": 67 + }, + { + "epoch": 0.016389491443721378, + "grad_norm": 17.542661666870117, + "learning_rate": 2.1760000000000002e-05, + "loss": 39.5904, + "step": 68 + }, + { + "epoch": 0.016630513376717282, + "grad_norm": 18.485530853271484, + "learning_rate": 2.2080000000000002e-05, + "loss": 38.3617, + "step": 69 + }, + { + "epoch": 0.016871535309713182, + "grad_norm": 18.264142990112305, + "learning_rate": 2.2400000000000002e-05, + "loss": 38.5905, + "step": 70 + }, + { + "epoch": 0.017112557242709086, + "grad_norm": 18.51151466369629, + "learning_rate": 2.272e-05, + "loss": 38.5741, + "step": 71 + }, + { + "epoch": 0.01735357917570499, + "grad_norm": 18.297344207763672, + "learning_rate": 2.304e-05, + "loss": 38.54, + "step": 72 + }, + { + "epoch": 0.017594601108700893, + "grad_norm": 18.055278778076172, + "learning_rate": 2.336e-05, + "loss": 38.9258, + "step": 73 + }, + { + "epoch": 0.017835623041696794, + "grad_norm": 19.29004669189453, + "learning_rate": 2.368e-05, + "loss": 37.9942, + "step": 74 + }, + { + "epoch": 0.018076644974692697, + "grad_norm": 21.222789764404297, + "learning_rate": 2.4e-05, + "loss": 36.9128, + "step": 75 + }, + { + "epoch": 0.0183176669076886, + "grad_norm": 19.091541290283203, + "learning_rate": 2.432e-05, + "loss": 38.0379, + "step": 76 + }, + { + "epoch": 0.0185586888406845, + "grad_norm": 19.868362426757812, + "learning_rate": 2.464e-05, + "loss": 37.2547, + "step": 77 + }, + { + "epoch": 0.018799710773680405, + "grad_norm": 20.644834518432617, + "learning_rate": 2.496e-05, + "loss": 37.2592, + "step": 78 + }, + { + "epoch": 0.01904073270667631, + "grad_norm": 19.1152286529541, + "learning_rate": 2.5280000000000002e-05, + "loss": 36.8179, + "step": 79 + }, + { + "epoch": 0.01928175463967221, + "grad_norm": 19.392606735229492, + "learning_rate": 2.5600000000000002e-05, + "loss": 36.9811, + "step": 80 + }, + { + "epoch": 0.019522776572668113, + "grad_norm": 17.07343101501465, + "learning_rate": 2.5920000000000003e-05, + "loss": 37.3406, + "step": 81 + }, + { + "epoch": 0.019763798505664017, + "grad_norm": 20.794109344482422, + "learning_rate": 2.6240000000000003e-05, + "loss": 35.9503, + "step": 82 + }, + { + "epoch": 0.020004820438659917, + "grad_norm": 18.083938598632812, + "learning_rate": 2.6560000000000003e-05, + "loss": 35.8583, + "step": 83 + }, + { + "epoch": 0.02024584237165582, + "grad_norm": 18.34483528137207, + "learning_rate": 2.6880000000000004e-05, + "loss": 35.7889, + "step": 84 + }, + { + "epoch": 0.020486864304651724, + "grad_norm": 18.733062744140625, + "learning_rate": 2.7200000000000004e-05, + "loss": 34.992, + "step": 85 + }, + { + "epoch": 0.020727886237647625, + "grad_norm": 18.474979400634766, + "learning_rate": 2.752e-05, + "loss": 35.4855, + "step": 86 + }, + { + "epoch": 0.02096890817064353, + "grad_norm": 18.55208396911621, + "learning_rate": 2.784e-05, + "loss": 35.2044, + "step": 87 + }, + { + "epoch": 0.021209930103639432, + "grad_norm": 17.742801666259766, + "learning_rate": 2.816e-05, + "loss": 35.6894, + "step": 88 + }, + { + "epoch": 0.021450952036635332, + "grad_norm": 18.13235855102539, + "learning_rate": 2.8480000000000002e-05, + "loss": 36.0043, + "step": 89 + }, + { + "epoch": 0.021691973969631236, + "grad_norm": 17.610307693481445, + "learning_rate": 2.8800000000000002e-05, + "loss": 36.136, + "step": 90 + }, + { + "epoch": 0.02193299590262714, + "grad_norm": 18.641040802001953, + "learning_rate": 2.9120000000000002e-05, + "loss": 34.1837, + "step": 91 + }, + { + "epoch": 0.02217401783562304, + "grad_norm": 19.180612564086914, + "learning_rate": 2.9440000000000003e-05, + "loss": 33.6604, + "step": 92 + }, + { + "epoch": 0.022415039768618944, + "grad_norm": 16.752777099609375, + "learning_rate": 2.9760000000000003e-05, + "loss": 34.6888, + "step": 93 + }, + { + "epoch": 0.022656061701614848, + "grad_norm": 20.233341217041016, + "learning_rate": 3.0080000000000003e-05, + "loss": 33.1623, + "step": 94 + }, + { + "epoch": 0.022897083634610748, + "grad_norm": 18.2009334564209, + "learning_rate": 3.0400000000000004e-05, + "loss": 33.1372, + "step": 95 + }, + { + "epoch": 0.023138105567606652, + "grad_norm": 16.896150588989258, + "learning_rate": 3.0720000000000004e-05, + "loss": 33.6226, + "step": 96 + }, + { + "epoch": 0.023379127500602555, + "grad_norm": 16.449636459350586, + "learning_rate": 3.104e-05, + "loss": 33.2728, + "step": 97 + }, + { + "epoch": 0.023620149433598456, + "grad_norm": 16.56513023376465, + "learning_rate": 3.1360000000000005e-05, + "loss": 32.7007, + "step": 98 + }, + { + "epoch": 0.02386117136659436, + "grad_norm": 15.720605850219727, + "learning_rate": 3.168e-05, + "loss": 34.0864, + "step": 99 + }, + { + "epoch": 0.024102193299590263, + "grad_norm": 16.957950592041016, + "learning_rate": 3.2000000000000005e-05, + "loss": 33.1375, + "step": 100 + }, + { + "epoch": 0.024343215232586167, + "grad_norm": 18.224735260009766, + "learning_rate": 3.232e-05, + "loss": 31.7851, + "step": 101 + }, + { + "epoch": 0.024584237165582067, + "grad_norm": 17.478137969970703, + "learning_rate": 3.264e-05, + "loss": 31.5386, + "step": 102 + }, + { + "epoch": 0.02482525909857797, + "grad_norm": 15.252176284790039, + "learning_rate": 3.296e-05, + "loss": 33.3645, + "step": 103 + }, + { + "epoch": 0.025066281031573875, + "grad_norm": 15.67715072631836, + "learning_rate": 3.328e-05, + "loss": 32.5826, + "step": 104 + }, + { + "epoch": 0.025307302964569775, + "grad_norm": 17.961702346801758, + "learning_rate": 3.3600000000000004e-05, + "loss": 32.7181, + "step": 105 + }, + { + "epoch": 0.02554832489756568, + "grad_norm": 14.762301445007324, + "learning_rate": 3.392e-05, + "loss": 33.7716, + "step": 106 + }, + { + "epoch": 0.025789346830561583, + "grad_norm": 15.59712028503418, + "learning_rate": 3.4240000000000004e-05, + "loss": 32.3544, + "step": 107 + }, + { + "epoch": 0.026030368763557483, + "grad_norm": 15.950053215026855, + "learning_rate": 3.456e-05, + "loss": 31.6144, + "step": 108 + }, + { + "epoch": 0.026271390696553387, + "grad_norm": 17.807863235473633, + "learning_rate": 3.4880000000000005e-05, + "loss": 31.0034, + "step": 109 + }, + { + "epoch": 0.02651241262954929, + "grad_norm": 17.99403953552246, + "learning_rate": 3.52e-05, + "loss": 31.1804, + "step": 110 + }, + { + "epoch": 0.02675343456254519, + "grad_norm": 19.826875686645508, + "learning_rate": 3.5520000000000006e-05, + "loss": 30.8011, + "step": 111 + }, + { + "epoch": 0.026994456495541094, + "grad_norm": 19.535627365112305, + "learning_rate": 3.584e-05, + "loss": 31.2203, + "step": 112 + }, + { + "epoch": 0.027235478428536998, + "grad_norm": 21.132902145385742, + "learning_rate": 3.6160000000000006e-05, + "loss": 31.1572, + "step": 113 + }, + { + "epoch": 0.0274765003615329, + "grad_norm": 18.71970558166504, + "learning_rate": 3.648e-05, + "loss": 32.1978, + "step": 114 + }, + { + "epoch": 0.027717522294528802, + "grad_norm": 19.573566436767578, + "learning_rate": 3.680000000000001e-05, + "loss": 31.2665, + "step": 115 + }, + { + "epoch": 0.027958544227524706, + "grad_norm": 18.028156280517578, + "learning_rate": 3.7120000000000004e-05, + "loss": 31.5384, + "step": 116 + }, + { + "epoch": 0.028199566160520606, + "grad_norm": 17.05816650390625, + "learning_rate": 3.744000000000001e-05, + "loss": 30.4499, + "step": 117 + }, + { + "epoch": 0.02844058809351651, + "grad_norm": 15.874523162841797, + "learning_rate": 3.7760000000000004e-05, + "loss": 30.3916, + "step": 118 + }, + { + "epoch": 0.028681610026512414, + "grad_norm": 14.838794708251953, + "learning_rate": 3.808e-05, + "loss": 29.9008, + "step": 119 + }, + { + "epoch": 0.028922631959508314, + "grad_norm": 14.200540542602539, + "learning_rate": 3.8400000000000005e-05, + "loss": 29.5332, + "step": 120 + }, + { + "epoch": 0.029163653892504218, + "grad_norm": 12.608623504638672, + "learning_rate": 3.872e-05, + "loss": 30.4684, + "step": 121 + }, + { + "epoch": 0.02940467582550012, + "grad_norm": 11.970538139343262, + "learning_rate": 3.9040000000000006e-05, + "loss": 30.2579, + "step": 122 + }, + { + "epoch": 0.02964569775849602, + "grad_norm": 13.8192720413208, + "learning_rate": 3.936e-05, + "loss": 29.5375, + "step": 123 + }, + { + "epoch": 0.029886719691491925, + "grad_norm": 12.825424194335938, + "learning_rate": 3.9680000000000006e-05, + "loss": 29.5506, + "step": 124 + }, + { + "epoch": 0.03012774162448783, + "grad_norm": 12.498834609985352, + "learning_rate": 4e-05, + "loss": 30.9679, + "step": 125 + }, + { + "epoch": 0.03036876355748373, + "grad_norm": 16.673967361450195, + "learning_rate": 3.9999993904858685e-05, + "loss": 29.4312, + "step": 126 + }, + { + "epoch": 0.030609785490479633, + "grad_norm": 16.957353591918945, + "learning_rate": 3.999997561943845e-05, + "loss": 29.9051, + "step": 127 + }, + { + "epoch": 0.030850807423475537, + "grad_norm": 20.182907104492188, + "learning_rate": 3.999994514375044e-05, + "loss": 29.0388, + "step": 128 + }, + { + "epoch": 0.03109182935647144, + "grad_norm": 19.965667724609375, + "learning_rate": 3.999990247781324e-05, + "loss": 31.1998, + "step": 129 + }, + { + "epoch": 0.03133285128946734, + "grad_norm": 20.278669357299805, + "learning_rate": 3.9999847621652847e-05, + "loss": 31.7223, + "step": 130 + }, + { + "epoch": 0.031573873222463245, + "grad_norm": 19.572418212890625, + "learning_rate": 3.999978057530269e-05, + "loss": 30.295, + "step": 131 + }, + { + "epoch": 0.03181489515545915, + "grad_norm": 18.772329330444336, + "learning_rate": 3.999970133880365e-05, + "loss": 28.7684, + "step": 132 + }, + { + "epoch": 0.03205591708845505, + "grad_norm": 16.830596923828125, + "learning_rate": 3.999960991220401e-05, + "loss": 29.1153, + "step": 133 + }, + { + "epoch": 0.03229693902145095, + "grad_norm": 13.721373558044434, + "learning_rate": 3.999950629555951e-05, + "loss": 29.6169, + "step": 134 + }, + { + "epoch": 0.03253796095444685, + "grad_norm": 12.899377822875977, + "learning_rate": 3.9999390488933283e-05, + "loss": 29.8593, + "step": 135 + }, + { + "epoch": 0.032778982887442756, + "grad_norm": 13.411840438842773, + "learning_rate": 3.9999262492395944e-05, + "loss": 29.4892, + "step": 136 + }, + { + "epoch": 0.03302000482043866, + "grad_norm": 13.636588096618652, + "learning_rate": 3.9999122306025484e-05, + "loss": 29.1233, + "step": 137 + }, + { + "epoch": 0.033261026753434564, + "grad_norm": 14.977879524230957, + "learning_rate": 3.999896992990736e-05, + "loss": 28.4617, + "step": 138 + }, + { + "epoch": 0.03350204868643047, + "grad_norm": 14.625834465026855, + "learning_rate": 3.999880536413444e-05, + "loss": 29.6623, + "step": 139 + }, + { + "epoch": 0.033743070619426364, + "grad_norm": 15.034134864807129, + "learning_rate": 3.999862860880704e-05, + "loss": 28.5823, + "step": 140 + }, + { + "epoch": 0.03398409255242227, + "grad_norm": 15.436853408813477, + "learning_rate": 3.999843966403289e-05, + "loss": 28.2604, + "step": 141 + }, + { + "epoch": 0.03422511448541817, + "grad_norm": 13.822096824645996, + "learning_rate": 3.999823852992715e-05, + "loss": 28.4912, + "step": 142 + }, + { + "epoch": 0.034466136418414076, + "grad_norm": 11.686617851257324, + "learning_rate": 3.9998025206612424e-05, + "loss": 28.7861, + "step": 143 + }, + { + "epoch": 0.03470715835140998, + "grad_norm": 14.374902725219727, + "learning_rate": 3.999779969421872e-05, + "loss": 28.3655, + "step": 144 + }, + { + "epoch": 0.03494818028440588, + "grad_norm": 14.3497953414917, + "learning_rate": 3.9997561992883515e-05, + "loss": 27.9743, + "step": 145 + }, + { + "epoch": 0.03518920221740179, + "grad_norm": 16.043867111206055, + "learning_rate": 3.999731210275166e-05, + "loss": 28.4248, + "step": 146 + }, + { + "epoch": 0.035430224150397684, + "grad_norm": 15.994876861572266, + "learning_rate": 3.999705002397549e-05, + "loss": 28.6456, + "step": 147 + }, + { + "epoch": 0.03567124608339359, + "grad_norm": 18.017837524414062, + "learning_rate": 3.999677575671475e-05, + "loss": 28.3539, + "step": 148 + }, + { + "epoch": 0.03591226801638949, + "grad_norm": 17.25051498413086, + "learning_rate": 3.999648930113658e-05, + "loss": 27.5037, + "step": 149 + }, + { + "epoch": 0.036153289949385395, + "grad_norm": 18.328096389770508, + "learning_rate": 3.999619065741561e-05, + "loss": 27.466, + "step": 150 + }, + { + "epoch": 0.0363943118823813, + "grad_norm": 19.160415649414062, + "learning_rate": 3.9995879825733853e-05, + "loss": 27.2118, + "step": 151 + }, + { + "epoch": 0.0366353338153772, + "grad_norm": 17.57060432434082, + "learning_rate": 3.999555680628077e-05, + "loss": 27.5424, + "step": 152 + }, + { + "epoch": 0.0368763557483731, + "grad_norm": 17.778331756591797, + "learning_rate": 3.9995221599253235e-05, + "loss": 27.8953, + "step": 153 + }, + { + "epoch": 0.037117377681369, + "grad_norm": 18.411108016967773, + "learning_rate": 3.999487420485558e-05, + "loss": 27.5028, + "step": 154 + }, + { + "epoch": 0.03735839961436491, + "grad_norm": 15.218093872070312, + "learning_rate": 3.999451462329953e-05, + "loss": 28.3763, + "step": 155 + }, + { + "epoch": 0.03759942154736081, + "grad_norm": 16.500682830810547, + "learning_rate": 3.999414285480426e-05, + "loss": 28.3188, + "step": 156 + }, + { + "epoch": 0.037840443480356714, + "grad_norm": 15.774834632873535, + "learning_rate": 3.999375889959638e-05, + "loss": 28.3748, + "step": 157 + }, + { + "epoch": 0.03808146541335262, + "grad_norm": 12.487804412841797, + "learning_rate": 3.999336275790989e-05, + "loss": 27.9211, + "step": 158 + }, + { + "epoch": 0.038322487346348515, + "grad_norm": 12.656122207641602, + "learning_rate": 3.9992954429986276e-05, + "loss": 27.5348, + "step": 159 + }, + { + "epoch": 0.03856350927934442, + "grad_norm": 15.520419120788574, + "learning_rate": 3.999253391607439e-05, + "loss": 27.3367, + "step": 160 + }, + { + "epoch": 0.03880453121234032, + "grad_norm": 13.615877151489258, + "learning_rate": 3.999210121643057e-05, + "loss": 27.1147, + "step": 161 + }, + { + "epoch": 0.039045553145336226, + "grad_norm": 14.329619407653809, + "learning_rate": 3.999165633131852e-05, + "loss": 27.135, + "step": 162 + }, + { + "epoch": 0.03928657507833213, + "grad_norm": 14.85840129852295, + "learning_rate": 3.999119926100943e-05, + "loss": 27.5826, + "step": 163 + }, + { + "epoch": 0.03952759701132803, + "grad_norm": 15.83386516571045, + "learning_rate": 3.999073000578188e-05, + "loss": 26.9138, + "step": 164 + }, + { + "epoch": 0.03976861894432393, + "grad_norm": 14.653356552124023, + "learning_rate": 3.999024856592189e-05, + "loss": 27.9365, + "step": 165 + }, + { + "epoch": 0.040009640877319834, + "grad_norm": 15.570621490478516, + "learning_rate": 3.9989754941722905e-05, + "loss": 27.321, + "step": 166 + }, + { + "epoch": 0.04025066281031574, + "grad_norm": 13.136109352111816, + "learning_rate": 3.99892491334858e-05, + "loss": 27.4931, + "step": 167 + }, + { + "epoch": 0.04049168474331164, + "grad_norm": 13.216486930847168, + "learning_rate": 3.998873114151886e-05, + "loss": 26.7786, + "step": 168 + }, + { + "epoch": 0.040732706676307545, + "grad_norm": 14.872537612915039, + "learning_rate": 3.998820096613782e-05, + "loss": 26.5462, + "step": 169 + }, + { + "epoch": 0.04097372860930345, + "grad_norm": 13.691864013671875, + "learning_rate": 3.9987658607665816e-05, + "loss": 26.8023, + "step": 170 + }, + { + "epoch": 0.04121475054229935, + "grad_norm": 12.447335243225098, + "learning_rate": 3.9987104066433444e-05, + "loss": 27.4209, + "step": 171 + }, + { + "epoch": 0.04145577247529525, + "grad_norm": 13.265443801879883, + "learning_rate": 3.9986537342778684e-05, + "loss": 27.1379, + "step": 172 + }, + { + "epoch": 0.04169679440829115, + "grad_norm": 14.530879020690918, + "learning_rate": 3.9985958437046976e-05, + "loss": 28.0222, + "step": 173 + }, + { + "epoch": 0.04193781634128706, + "grad_norm": 14.69826602935791, + "learning_rate": 3.998536734959116e-05, + "loss": 27.2624, + "step": 174 + }, + { + "epoch": 0.04217883827428296, + "grad_norm": 17.18415641784668, + "learning_rate": 3.998476408077152e-05, + "loss": 27.5388, + "step": 175 + }, + { + "epoch": 0.042419860207278864, + "grad_norm": 16.87925148010254, + "learning_rate": 3.9984148630955764e-05, + "loss": 27.0561, + "step": 176 + }, + { + "epoch": 0.04266088214027477, + "grad_norm": 17.0130672454834, + "learning_rate": 3.9983521000519e-05, + "loss": 28.0316, + "step": 177 + }, + { + "epoch": 0.042901904073270665, + "grad_norm": 17.169607162475586, + "learning_rate": 3.998288118984379e-05, + "loss": 26.3547, + "step": 178 + }, + { + "epoch": 0.04314292600626657, + "grad_norm": 17.935922622680664, + "learning_rate": 3.99822291993201e-05, + "loss": 27.0158, + "step": 179 + }, + { + "epoch": 0.04338394793926247, + "grad_norm": 16.146257400512695, + "learning_rate": 3.998156502934533e-05, + "loss": 26.8296, + "step": 180 + }, + { + "epoch": 0.043624969872258376, + "grad_norm": 13.183793067932129, + "learning_rate": 3.99808886803243e-05, + "loss": 26.1367, + "step": 181 + }, + { + "epoch": 0.04386599180525428, + "grad_norm": 13.617413520812988, + "learning_rate": 3.9980200152669266e-05, + "loss": 26.2579, + "step": 182 + }, + { + "epoch": 0.044107013738250184, + "grad_norm": 12.817903518676758, + "learning_rate": 3.997949944679988e-05, + "loss": 25.3615, + "step": 183 + }, + { + "epoch": 0.04434803567124608, + "grad_norm": 13.391742706298828, + "learning_rate": 3.997878656314324e-05, + "loss": 25.4919, + "step": 184 + }, + { + "epoch": 0.044589057604241984, + "grad_norm": 13.557109832763672, + "learning_rate": 3.997806150213386e-05, + "loss": 27.3591, + "step": 185 + }, + { + "epoch": 0.04483007953723789, + "grad_norm": 14.877585411071777, + "learning_rate": 3.997732426421366e-05, + "loss": 25.9182, + "step": 186 + }, + { + "epoch": 0.04507110147023379, + "grad_norm": 15.414568901062012, + "learning_rate": 3.997657484983202e-05, + "loss": 26.0983, + "step": 187 + }, + { + "epoch": 0.045312123403229695, + "grad_norm": 15.658160209655762, + "learning_rate": 3.9975813259445704e-05, + "loss": 26.1566, + "step": 188 + }, + { + "epoch": 0.0455531453362256, + "grad_norm": 14.442419052124023, + "learning_rate": 3.997503949351891e-05, + "loss": 26.397, + "step": 189 + }, + { + "epoch": 0.045794167269221496, + "grad_norm": 14.517131805419922, + "learning_rate": 3.997425355252327e-05, + "loss": 26.7809, + "step": 190 + }, + { + "epoch": 0.0460351892022174, + "grad_norm": 15.4188814163208, + "learning_rate": 3.997345543693782e-05, + "loss": 25.9127, + "step": 191 + }, + { + "epoch": 0.046276211135213303, + "grad_norm": 15.157994270324707, + "learning_rate": 3.997264514724902e-05, + "loss": 26.0175, + "step": 192 + }, + { + "epoch": 0.04651723306820921, + "grad_norm": 14.02359676361084, + "learning_rate": 3.9971822683950756e-05, + "loss": 26.23, + "step": 193 + }, + { + "epoch": 0.04675825500120511, + "grad_norm": 13.925785064697266, + "learning_rate": 3.997098804754433e-05, + "loss": 25.227, + "step": 194 + }, + { + "epoch": 0.046999276934201015, + "grad_norm": 14.640610694885254, + "learning_rate": 3.9970141238538475e-05, + "loss": 26.6932, + "step": 195 + }, + { + "epoch": 0.04724029886719691, + "grad_norm": 15.388662338256836, + "learning_rate": 3.996928225744932e-05, + "loss": 25.6808, + "step": 196 + }, + { + "epoch": 0.047481320800192815, + "grad_norm": 12.78682804107666, + "learning_rate": 3.996841110480043e-05, + "loss": 26.4827, + "step": 197 + }, + { + "epoch": 0.04772234273318872, + "grad_norm": 13.525049209594727, + "learning_rate": 3.9967527781122786e-05, + "loss": 26.5223, + "step": 198 + }, + { + "epoch": 0.04796336466618462, + "grad_norm": 13.552980422973633, + "learning_rate": 3.9966632286954785e-05, + "loss": 25.8618, + "step": 199 + }, + { + "epoch": 0.048204386599180526, + "grad_norm": 14.936887741088867, + "learning_rate": 3.996572462284224e-05, + "loss": 26.2941, + "step": 200 + }, + { + "epoch": 0.048204386599180526, + "eval_cc_pretrain_accuracy": 0.7899999618530273, + "eval_cc_pretrain_loss": 2.311469793319702, + "eval_cc_pretrain_num_cand": 800.0, + "eval_cc_pretrain_runtime": 16.9777, + "eval_cc_pretrain_samples_per_second": 5.89, + "eval_cc_pretrain_steps_per_second": 0.059, + "eval_cc_pretrain_temperature": 0.06982421875, + "step": 200 + }, + { + "epoch": 0.048204386599180526, + "eval_mscoco_pretrain_accuracy": 0.7999999523162842, + "eval_mscoco_pretrain_loss": 2.395975112915039, + "eval_mscoco_pretrain_num_cand": 800.0, + "eval_mscoco_pretrain_runtime": 13.6212, + "eval_mscoco_pretrain_samples_per_second": 7.341, + "eval_mscoco_pretrain_steps_per_second": 0.073, + "eval_mscoco_pretrain_temperature": 0.06982421875, + "step": 200 + }, + { + "epoch": 0.04844540853217643, + "grad_norm": 13.414412498474121, + "learning_rate": 3.9964804789338394e-05, + "loss": 26.0554, + "step": 201 + }, + { + "epoch": 0.048686430465172334, + "grad_norm": 14.821207046508789, + "learning_rate": 3.996387278700389e-05, + "loss": 25.6164, + "step": 202 + }, + { + "epoch": 0.04892745239816823, + "grad_norm": 13.152141571044922, + "learning_rate": 3.99629286164068e-05, + "loss": 26.0036, + "step": 203 + }, + { + "epoch": 0.049168474331164135, + "grad_norm": 13.421805381774902, + "learning_rate": 3.996197227812261e-05, + "loss": 25.5057, + "step": 204 + }, + { + "epoch": 0.04940949626416004, + "grad_norm": 13.401187896728516, + "learning_rate": 3.996100377273422e-05, + "loss": 25.8152, + "step": 205 + }, + { + "epoch": 0.04965051819715594, + "grad_norm": 15.327949523925781, + "learning_rate": 3.996002310083195e-05, + "loss": 25.0105, + "step": 206 + }, + { + "epoch": 0.049891540130151846, + "grad_norm": 15.262767791748047, + "learning_rate": 3.9959030263013525e-05, + "loss": 25.7281, + "step": 207 + }, + { + "epoch": 0.05013256206314775, + "grad_norm": 15.069193840026855, + "learning_rate": 3.9958025259884116e-05, + "loss": 26.6116, + "step": 208 + }, + { + "epoch": 0.050373583996143646, + "grad_norm": 15.514892578125, + "learning_rate": 3.995700809205626e-05, + "loss": 25.8614, + "step": 209 + }, + { + "epoch": 0.05061460592913955, + "grad_norm": 14.814537048339844, + "learning_rate": 3.995597876014995e-05, + "loss": 25.6065, + "step": 210 + }, + { + "epoch": 0.050855627862135454, + "grad_norm": 14.24145793914795, + "learning_rate": 3.995493726479257e-05, + "loss": 24.3171, + "step": 211 + }, + { + "epoch": 0.05109664979513136, + "grad_norm": 12.01323413848877, + "learning_rate": 3.995388360661894e-05, + "loss": 25.8361, + "step": 212 + }, + { + "epoch": 0.05133767172812726, + "grad_norm": 13.413566589355469, + "learning_rate": 3.9952817786271264e-05, + "loss": 25.1902, + "step": 213 + }, + { + "epoch": 0.051578693661123165, + "grad_norm": 13.360679626464844, + "learning_rate": 3.9951739804399176e-05, + "loss": 23.994, + "step": 214 + }, + { + "epoch": 0.05181971559411906, + "grad_norm": 14.619837760925293, + "learning_rate": 3.995064966165974e-05, + "loss": 25.1717, + "step": 215 + }, + { + "epoch": 0.052060737527114966, + "grad_norm": 15.737160682678223, + "learning_rate": 3.9949547358717385e-05, + "loss": 25.2489, + "step": 216 + }, + { + "epoch": 0.05230175946011087, + "grad_norm": 15.4391450881958, + "learning_rate": 3.994843289624401e-05, + "loss": 24.5718, + "step": 217 + }, + { + "epoch": 0.05254278139310677, + "grad_norm": 13.87382698059082, + "learning_rate": 3.994730627491887e-05, + "loss": 25.2503, + "step": 218 + }, + { + "epoch": 0.05278380332610268, + "grad_norm": 14.796367645263672, + "learning_rate": 3.9946167495428666e-05, + "loss": 24.2357, + "step": 219 + }, + { + "epoch": 0.05302482525909858, + "grad_norm": 12.805338859558105, + "learning_rate": 3.9945016558467506e-05, + "loss": 24.8181, + "step": 220 + }, + { + "epoch": 0.05326584719209448, + "grad_norm": 10.51312255859375, + "learning_rate": 3.994385346473689e-05, + "loss": 26.6583, + "step": 221 + }, + { + "epoch": 0.05350686912509038, + "grad_norm": 13.083112716674805, + "learning_rate": 3.9942678214945764e-05, + "loss": 24.4384, + "step": 222 + }, + { + "epoch": 0.053747891058086285, + "grad_norm": 10.181856155395508, + "learning_rate": 3.9941490809810426e-05, + "loss": 24.9412, + "step": 223 + }, + { + "epoch": 0.05398891299108219, + "grad_norm": 10.954217910766602, + "learning_rate": 3.9940291250054646e-05, + "loss": 24.6963, + "step": 224 + }, + { + "epoch": 0.05422993492407809, + "grad_norm": 10.894026756286621, + "learning_rate": 3.993907953640955e-05, + "loss": 26.2943, + "step": 225 + }, + { + "epoch": 0.054470956857073996, + "grad_norm": 13.06076717376709, + "learning_rate": 3.993785566961372e-05, + "loss": 25.7481, + "step": 226 + }, + { + "epoch": 0.0547119787900699, + "grad_norm": 11.821154594421387, + "learning_rate": 3.993661965041309e-05, + "loss": 24.6256, + "step": 227 + }, + { + "epoch": 0.0549530007230658, + "grad_norm": 12.034224510192871, + "learning_rate": 3.993537147956104e-05, + "loss": 24.5503, + "step": 228 + }, + { + "epoch": 0.0551940226560617, + "grad_norm": 11.258167266845703, + "learning_rate": 3.993411115781837e-05, + "loss": 25.0687, + "step": 229 + }, + { + "epoch": 0.055435044589057604, + "grad_norm": 13.486194610595703, + "learning_rate": 3.9932838685953236e-05, + "loss": 25.1615, + "step": 230 + }, + { + "epoch": 0.05567606652205351, + "grad_norm": 14.006240844726562, + "learning_rate": 3.9931554064741245e-05, + "loss": 24.3381, + "step": 231 + }, + { + "epoch": 0.05591708845504941, + "grad_norm": 12.44616985321045, + "learning_rate": 3.993025729496538e-05, + "loss": 25.0097, + "step": 232 + }, + { + "epoch": 0.056158110388045315, + "grad_norm": 17.08452033996582, + "learning_rate": 3.9928948377416056e-05, + "loss": 25.0683, + "step": 233 + }, + { + "epoch": 0.05639913232104121, + "grad_norm": 15.792698860168457, + "learning_rate": 3.992762731289106e-05, + "loss": 24.2425, + "step": 234 + }, + { + "epoch": 0.056640154254037116, + "grad_norm": 14.505329132080078, + "learning_rate": 3.992629410219561e-05, + "loss": 25.8084, + "step": 235 + }, + { + "epoch": 0.05688117618703302, + "grad_norm": 15.732985496520996, + "learning_rate": 3.992494874614232e-05, + "loss": 25.9162, + "step": 236 + }, + { + "epoch": 0.05712219812002892, + "grad_norm": 14.045899391174316, + "learning_rate": 3.992359124555118e-05, + "loss": 24.0995, + "step": 237 + }, + { + "epoch": 0.05736322005302483, + "grad_norm": 13.147685050964355, + "learning_rate": 3.9922221601249634e-05, + "loss": 25.3903, + "step": 238 + }, + { + "epoch": 0.05760424198602073, + "grad_norm": 12.343921661376953, + "learning_rate": 3.992083981407248e-05, + "loss": 23.5627, + "step": 239 + }, + { + "epoch": 0.05784526391901663, + "grad_norm": 13.649635314941406, + "learning_rate": 3.991944588486195e-05, + "loss": 24.047, + "step": 240 + }, + { + "epoch": 0.05808628585201253, + "grad_norm": 12.454002380371094, + "learning_rate": 3.991803981446766e-05, + "loss": 24.6633, + "step": 241 + }, + { + "epoch": 0.058327307785008435, + "grad_norm": 11.128416061401367, + "learning_rate": 3.991662160374663e-05, + "loss": 25.2441, + "step": 242 + }, + { + "epoch": 0.05856832971800434, + "grad_norm": 11.012476921081543, + "learning_rate": 3.991519125356328e-05, + "loss": 24.6249, + "step": 243 + }, + { + "epoch": 0.05880935165100024, + "grad_norm": 11.606329917907715, + "learning_rate": 3.991374876478942e-05, + "loss": 24.6021, + "step": 244 + }, + { + "epoch": 0.059050373583996146, + "grad_norm": 11.752840042114258, + "learning_rate": 3.9912294138304276e-05, + "loss": 24.2769, + "step": 245 + }, + { + "epoch": 0.05929139551699204, + "grad_norm": 11.832447052001953, + "learning_rate": 3.991082737499446e-05, + "loss": 24.8649, + "step": 246 + }, + { + "epoch": 0.05953241744998795, + "grad_norm": 13.297032356262207, + "learning_rate": 3.990934847575399e-05, + "loss": 24.5315, + "step": 247 + }, + { + "epoch": 0.05977343938298385, + "grad_norm": 13.912251472473145, + "learning_rate": 3.9907857441484265e-05, + "loss": 24.2015, + "step": 248 + }, + { + "epoch": 0.060014461315979754, + "grad_norm": 14.264988899230957, + "learning_rate": 3.9906354273094104e-05, + "loss": 24.3201, + "step": 249 + }, + { + "epoch": 0.06025548324897566, + "grad_norm": 14.634635925292969, + "learning_rate": 3.99048389714997e-05, + "loss": 23.7596, + "step": 250 + }, + { + "epoch": 0.06049650518197156, + "grad_norm": 14.219120979309082, + "learning_rate": 3.990331153762466e-05, + "loss": 24.2854, + "step": 251 + }, + { + "epoch": 0.06073752711496746, + "grad_norm": 14.509066581726074, + "learning_rate": 3.990177197239996e-05, + "loss": 23.7568, + "step": 252 + }, + { + "epoch": 0.06097854904796336, + "grad_norm": 14.387089729309082, + "learning_rate": 3.9900220276764013e-05, + "loss": 24.9065, + "step": 253 + }, + { + "epoch": 0.061219570980959266, + "grad_norm": 15.730871200561523, + "learning_rate": 3.9898656451662576e-05, + "loss": 24.7954, + "step": 254 + }, + { + "epoch": 0.06146059291395517, + "grad_norm": 15.183639526367188, + "learning_rate": 3.9897080498048834e-05, + "loss": 23.5938, + "step": 255 + }, + { + "epoch": 0.061701614846951074, + "grad_norm": 15.3148832321167, + "learning_rate": 3.989549241688335e-05, + "loss": 23.8184, + "step": 256 + }, + { + "epoch": 0.06194263677994698, + "grad_norm": 12.688843727111816, + "learning_rate": 3.989389220913408e-05, + "loss": 24.3221, + "step": 257 + }, + { + "epoch": 0.06218365871294288, + "grad_norm": 11.817489624023438, + "learning_rate": 3.9892279875776373e-05, + "loss": 24.2426, + "step": 258 + }, + { + "epoch": 0.06242468064593878, + "grad_norm": 9.916736602783203, + "learning_rate": 3.9890655417792975e-05, + "loss": 23.2282, + "step": 259 + }, + { + "epoch": 0.06266570257893468, + "grad_norm": 7.669207572937012, + "learning_rate": 3.988901883617401e-05, + "loss": 23.9778, + "step": 260 + }, + { + "epoch": 0.06290672451193059, + "grad_norm": 9.680610656738281, + "learning_rate": 3.9887370131917e-05, + "loss": 22.9789, + "step": 261 + }, + { + "epoch": 0.06314774644492649, + "grad_norm": 10.612390518188477, + "learning_rate": 3.988570930602685e-05, + "loss": 23.5804, + "step": 262 + }, + { + "epoch": 0.06338876837792239, + "grad_norm": 10.746492385864258, + "learning_rate": 3.988403635951587e-05, + "loss": 23.8435, + "step": 263 + }, + { + "epoch": 0.0636297903109183, + "grad_norm": 11.980363845825195, + "learning_rate": 3.988235129340373e-05, + "loss": 22.814, + "step": 264 + }, + { + "epoch": 0.0638708122439142, + "grad_norm": 11.366969108581543, + "learning_rate": 3.98806541087175e-05, + "loss": 24.4343, + "step": 265 + }, + { + "epoch": 0.0641118341769101, + "grad_norm": 15.055169105529785, + "learning_rate": 3.987894480649165e-05, + "loss": 24.0406, + "step": 266 + }, + { + "epoch": 0.064352856109906, + "grad_norm": 13.967504501342773, + "learning_rate": 3.987722338776802e-05, + "loss": 24.0474, + "step": 267 + }, + { + "epoch": 0.0645938780429019, + "grad_norm": 13.483022689819336, + "learning_rate": 3.987548985359583e-05, + "loss": 24.7477, + "step": 268 + }, + { + "epoch": 0.06483489997589781, + "grad_norm": 15.369538307189941, + "learning_rate": 3.987374420503171e-05, + "loss": 23.3102, + "step": 269 + }, + { + "epoch": 0.0650759219088937, + "grad_norm": 14.985185623168945, + "learning_rate": 3.987198644313964e-05, + "loss": 23.6755, + "step": 270 + }, + { + "epoch": 0.06531694384188962, + "grad_norm": 15.454153060913086, + "learning_rate": 3.9870216568991015e-05, + "loss": 23.4524, + "step": 271 + }, + { + "epoch": 0.06555796577488551, + "grad_norm": 15.49449634552002, + "learning_rate": 3.9868434583664585e-05, + "loss": 24.468, + "step": 272 + }, + { + "epoch": 0.06579898770788142, + "grad_norm": 14.017498970031738, + "learning_rate": 3.986664048824651e-05, + "loss": 22.9448, + "step": 273 + }, + { + "epoch": 0.06604000964087732, + "grad_norm": 11.01783561706543, + "learning_rate": 3.98648342838303e-05, + "loss": 23.2619, + "step": 274 + }, + { + "epoch": 0.06628103157387322, + "grad_norm": 10.586702346801758, + "learning_rate": 3.986301597151688e-05, + "loss": 22.9029, + "step": 275 + }, + { + "epoch": 0.06652205350686913, + "grad_norm": 12.61590576171875, + "learning_rate": 3.986118555241453e-05, + "loss": 22.7675, + "step": 276 + }, + { + "epoch": 0.06676307543986502, + "grad_norm": 10.1846923828125, + "learning_rate": 3.985934302763891e-05, + "loss": 23.2185, + "step": 277 + }, + { + "epoch": 0.06700409737286094, + "grad_norm": 9.819538116455078, + "learning_rate": 3.985748839831307e-05, + "loss": 22.196, + "step": 278 + }, + { + "epoch": 0.06724511930585683, + "grad_norm": 8.765714645385742, + "learning_rate": 3.985562166556743e-05, + "loss": 22.8264, + "step": 279 + }, + { + "epoch": 0.06748614123885273, + "grad_norm": 9.656952857971191, + "learning_rate": 3.98537428305398e-05, + "loss": 23.3721, + "step": 280 + }, + { + "epoch": 0.06772716317184864, + "grad_norm": 10.143611907958984, + "learning_rate": 3.985185189437534e-05, + "loss": 24.3637, + "step": 281 + }, + { + "epoch": 0.06796818510484454, + "grad_norm": 9.379727363586426, + "learning_rate": 3.9849948858226614e-05, + "loss": 24.1918, + "step": 282 + }, + { + "epoch": 0.06820920703784045, + "grad_norm": 11.473142623901367, + "learning_rate": 3.984803372325354e-05, + "loss": 24.5326, + "step": 283 + }, + { + "epoch": 0.06845022897083634, + "grad_norm": 12.223827362060547, + "learning_rate": 3.984610649062344e-05, + "loss": 23.9957, + "step": 284 + }, + { + "epoch": 0.06869125090383225, + "grad_norm": 13.45671558380127, + "learning_rate": 3.9844167161510965e-05, + "loss": 24.2994, + "step": 285 + }, + { + "epoch": 0.06893227283682815, + "grad_norm": 13.718855857849121, + "learning_rate": 3.984221573709818e-05, + "loss": 24.486, + "step": 286 + }, + { + "epoch": 0.06917329476982405, + "grad_norm": 13.333930969238281, + "learning_rate": 3.9840252218574495e-05, + "loss": 23.9028, + "step": 287 + }, + { + "epoch": 0.06941431670281996, + "grad_norm": 13.308971405029297, + "learning_rate": 3.983827660713671e-05, + "loss": 23.1011, + "step": 288 + }, + { + "epoch": 0.06965533863581586, + "grad_norm": 12.980266571044922, + "learning_rate": 3.9836288903988985e-05, + "loss": 23.3169, + "step": 289 + }, + { + "epoch": 0.06989636056881177, + "grad_norm": 12.549450874328613, + "learning_rate": 3.9834289110342856e-05, + "loss": 24.0511, + "step": 290 + }, + { + "epoch": 0.07013738250180766, + "grad_norm": 12.811077117919922, + "learning_rate": 3.9832277227417225e-05, + "loss": 23.1615, + "step": 291 + }, + { + "epoch": 0.07037840443480357, + "grad_norm": 13.172994613647461, + "learning_rate": 3.983025325643835e-05, + "loss": 23.26, + "step": 292 + }, + { + "epoch": 0.07061942636779947, + "grad_norm": 12.895195007324219, + "learning_rate": 3.9828217198639884e-05, + "loss": 23.6596, + "step": 293 + }, + { + "epoch": 0.07086044830079537, + "grad_norm": 12.642952919006348, + "learning_rate": 3.982616905526284e-05, + "loss": 22.9567, + "step": 294 + }, + { + "epoch": 0.07110147023379128, + "grad_norm": 11.653997421264648, + "learning_rate": 3.982410882755558e-05, + "loss": 23.4526, + "step": 295 + }, + { + "epoch": 0.07134249216678717, + "grad_norm": 11.522258758544922, + "learning_rate": 3.982203651677383e-05, + "loss": 23.4217, + "step": 296 + }, + { + "epoch": 0.07158351409978309, + "grad_norm": 11.652542114257812, + "learning_rate": 3.981995212418071e-05, + "loss": 23.0352, + "step": 297 + }, + { + "epoch": 0.07182453603277898, + "grad_norm": 12.772671699523926, + "learning_rate": 3.981785565104669e-05, + "loss": 24.0736, + "step": 298 + }, + { + "epoch": 0.07206555796577488, + "grad_norm": 10.755510330200195, + "learning_rate": 3.9815747098649585e-05, + "loss": 23.1737, + "step": 299 + }, + { + "epoch": 0.07230657989877079, + "grad_norm": 11.35267162322998, + "learning_rate": 3.9813626468274595e-05, + "loss": 22.6304, + "step": 300 + }, + { + "epoch": 0.07254760183176669, + "grad_norm": 10.722809791564941, + "learning_rate": 3.981149376121427e-05, + "loss": 23.256, + "step": 301 + }, + { + "epoch": 0.0727886237647626, + "grad_norm": 8.690351486206055, + "learning_rate": 3.980934897876853e-05, + "loss": 22.6456, + "step": 302 + }, + { + "epoch": 0.0730296456977585, + "grad_norm": 10.150249481201172, + "learning_rate": 3.980719212224465e-05, + "loss": 23.3892, + "step": 303 + }, + { + "epoch": 0.0732706676307544, + "grad_norm": 9.339855194091797, + "learning_rate": 3.980502319295727e-05, + "loss": 23.5813, + "step": 304 + }, + { + "epoch": 0.0735116895637503, + "grad_norm": 11.799015045166016, + "learning_rate": 3.980284219222836e-05, + "loss": 23.2126, + "step": 305 + }, + { + "epoch": 0.0737527114967462, + "grad_norm": 11.219136238098145, + "learning_rate": 3.9800649121387296e-05, + "loss": 22.7697, + "step": 306 + }, + { + "epoch": 0.07399373342974211, + "grad_norm": 13.062129974365234, + "learning_rate": 3.979844398177078e-05, + "loss": 23.7915, + "step": 307 + }, + { + "epoch": 0.074234755362738, + "grad_norm": 13.509801864624023, + "learning_rate": 3.979622677472287e-05, + "loss": 22.6744, + "step": 308 + }, + { + "epoch": 0.07447577729573392, + "grad_norm": 13.75188159942627, + "learning_rate": 3.9793997501594985e-05, + "loss": 22.7697, + "step": 309 + }, + { + "epoch": 0.07471679922872981, + "grad_norm": 11.820225715637207, + "learning_rate": 3.97917561637459e-05, + "loss": 22.9635, + "step": 310 + }, + { + "epoch": 0.07495782116172572, + "grad_norm": 12.03287124633789, + "learning_rate": 3.978950276254175e-05, + "loss": 22.808, + "step": 311 + }, + { + "epoch": 0.07519884309472162, + "grad_norm": 9.880515098571777, + "learning_rate": 3.9787237299356004e-05, + "loss": 23.2821, + "step": 312 + }, + { + "epoch": 0.07543986502771752, + "grad_norm": 7.323331356048584, + "learning_rate": 3.97849597755695e-05, + "loss": 23.4699, + "step": 313 + }, + { + "epoch": 0.07568088696071343, + "grad_norm": 8.071771621704102, + "learning_rate": 3.978267019257041e-05, + "loss": 22.6763, + "step": 314 + }, + { + "epoch": 0.07592190889370933, + "grad_norm": 8.995155334472656, + "learning_rate": 3.978036855175429e-05, + "loss": 22.9958, + "step": 315 + }, + { + "epoch": 0.07616293082670524, + "grad_norm": 10.754206657409668, + "learning_rate": 3.9778054854524e-05, + "loss": 22.7704, + "step": 316 + }, + { + "epoch": 0.07640395275970113, + "grad_norm": 11.264116287231445, + "learning_rate": 3.9775729102289784e-05, + "loss": 22.8144, + "step": 317 + }, + { + "epoch": 0.07664497469269703, + "grad_norm": 12.6996431350708, + "learning_rate": 3.9773391296469205e-05, + "loss": 22.9947, + "step": 318 + }, + { + "epoch": 0.07688599662569294, + "grad_norm": 12.943449020385742, + "learning_rate": 3.977104143848721e-05, + "loss": 23.1905, + "step": 319 + }, + { + "epoch": 0.07712701855868884, + "grad_norm": 14.961601257324219, + "learning_rate": 3.976867952977606e-05, + "loss": 23.2969, + "step": 320 + }, + { + "epoch": 0.07736804049168475, + "grad_norm": 14.906577110290527, + "learning_rate": 3.9766305571775374e-05, + "loss": 24.4058, + "step": 321 + }, + { + "epoch": 0.07760906242468064, + "grad_norm": 15.248797416687012, + "learning_rate": 3.976391956593211e-05, + "loss": 22.904, + "step": 322 + }, + { + "epoch": 0.07785008435767655, + "grad_norm": 13.901290893554688, + "learning_rate": 3.976152151370057e-05, + "loss": 23.3243, + "step": 323 + }, + { + "epoch": 0.07809110629067245, + "grad_norm": 14.363203048706055, + "learning_rate": 3.975911141654241e-05, + "loss": 23.379, + "step": 324 + }, + { + "epoch": 0.07833212822366835, + "grad_norm": 13.350197792053223, + "learning_rate": 3.975668927592661e-05, + "loss": 22.7161, + "step": 325 + }, + { + "epoch": 0.07857315015666426, + "grad_norm": 11.422146797180176, + "learning_rate": 3.97542550933295e-05, + "loss": 20.8181, + "step": 326 + }, + { + "epoch": 0.07881417208966016, + "grad_norm": 9.758254051208496, + "learning_rate": 3.9751808870234755e-05, + "loss": 22.5681, + "step": 327 + }, + { + "epoch": 0.07905519402265607, + "grad_norm": 9.507939338684082, + "learning_rate": 3.9749350608133375e-05, + "loss": 22.9906, + "step": 328 + }, + { + "epoch": 0.07929621595565196, + "grad_norm": 10.832972526550293, + "learning_rate": 3.974688030852371e-05, + "loss": 21.9435, + "step": 329 + }, + { + "epoch": 0.07953723788864786, + "grad_norm": 9.748761177062988, + "learning_rate": 3.974439797291144e-05, + "loss": 22.974, + "step": 330 + }, + { + "epoch": 0.07977825982164377, + "grad_norm": 9.270492553710938, + "learning_rate": 3.9741903602809586e-05, + "loss": 22.9647, + "step": 331 + }, + { + "epoch": 0.08001928175463967, + "grad_norm": 11.954438209533691, + "learning_rate": 3.97393971997385e-05, + "loss": 22.4362, + "step": 332 + }, + { + "epoch": 0.08026030368763558, + "grad_norm": 10.028216361999512, + "learning_rate": 3.973687876522587e-05, + "loss": 22.4707, + "step": 333 + }, + { + "epoch": 0.08050132562063148, + "grad_norm": 11.204946517944336, + "learning_rate": 3.973434830080671e-05, + "loss": 23.4629, + "step": 334 + }, + { + "epoch": 0.08074234755362739, + "grad_norm": 11.43058967590332, + "learning_rate": 3.97318058080234e-05, + "loss": 22.7034, + "step": 335 + }, + { + "epoch": 0.08098336948662328, + "grad_norm": 11.052437782287598, + "learning_rate": 3.972925128842559e-05, + "loss": 22.9194, + "step": 336 + }, + { + "epoch": 0.08122439141961918, + "grad_norm": 11.88228702545166, + "learning_rate": 3.9726684743570315e-05, + "loss": 21.9981, + "step": 337 + }, + { + "epoch": 0.08146541335261509, + "grad_norm": 10.540339469909668, + "learning_rate": 3.972410617502193e-05, + "loss": 23.3471, + "step": 338 + }, + { + "epoch": 0.08170643528561099, + "grad_norm": 10.546243667602539, + "learning_rate": 3.972151558435209e-05, + "loss": 22.1096, + "step": 339 + }, + { + "epoch": 0.0819474572186069, + "grad_norm": 7.988236904144287, + "learning_rate": 3.9718912973139805e-05, + "loss": 23.0158, + "step": 340 + }, + { + "epoch": 0.0821884791516028, + "grad_norm": 9.042425155639648, + "learning_rate": 3.97162983429714e-05, + "loss": 23.1129, + "step": 341 + }, + { + "epoch": 0.0824295010845987, + "grad_norm": 10.429259300231934, + "learning_rate": 3.971367169544053e-05, + "loss": 22.6241, + "step": 342 + }, + { + "epoch": 0.0826705230175946, + "grad_norm": 10.065314292907715, + "learning_rate": 3.971103303214818e-05, + "loss": 22.4852, + "step": 343 + }, + { + "epoch": 0.0829115449505905, + "grad_norm": 8.6956148147583, + "learning_rate": 3.970838235470264e-05, + "loss": 22.1078, + "step": 344 + }, + { + "epoch": 0.08315256688358641, + "grad_norm": 12.21730899810791, + "learning_rate": 3.970571966471955e-05, + "loss": 22.9591, + "step": 345 + }, + { + "epoch": 0.0833935888165823, + "grad_norm": 9.874788284301758, + "learning_rate": 3.9703044963821846e-05, + "loss": 22.8317, + "step": 346 + }, + { + "epoch": 0.08363461074957822, + "grad_norm": 11.182596206665039, + "learning_rate": 3.97003582536398e-05, + "loss": 22.7306, + "step": 347 + }, + { + "epoch": 0.08387563268257411, + "grad_norm": 12.13841438293457, + "learning_rate": 3.9697659535811005e-05, + "loss": 23.5083, + "step": 348 + }, + { + "epoch": 0.08411665461557001, + "grad_norm": 10.653072357177734, + "learning_rate": 3.9694948811980355e-05, + "loss": 22.4855, + "step": 349 + }, + { + "epoch": 0.08435767654856592, + "grad_norm": 13.815437316894531, + "learning_rate": 3.969222608380009e-05, + "loss": 22.5161, + "step": 350 + }, + { + "epoch": 0.08459869848156182, + "grad_norm": 13.977157592773438, + "learning_rate": 3.968949135292974e-05, + "loss": 22.5366, + "step": 351 + }, + { + "epoch": 0.08483972041455773, + "grad_norm": 13.869813919067383, + "learning_rate": 3.968674462103616e-05, + "loss": 23.1443, + "step": 352 + }, + { + "epoch": 0.08508074234755363, + "grad_norm": 10.584354400634766, + "learning_rate": 3.9683985889793536e-05, + "loss": 23.2354, + "step": 353 + }, + { + "epoch": 0.08532176428054954, + "grad_norm": 12.57889175415039, + "learning_rate": 3.968121516088334e-05, + "loss": 23.0108, + "step": 354 + }, + { + "epoch": 0.08556278621354543, + "grad_norm": 10.132689476013184, + "learning_rate": 3.967843243599438e-05, + "loss": 22.5093, + "step": 355 + }, + { + "epoch": 0.08580380814654133, + "grad_norm": 10.902914047241211, + "learning_rate": 3.967563771682276e-05, + "loss": 21.8914, + "step": 356 + }, + { + "epoch": 0.08604483007953724, + "grad_norm": 10.431751251220703, + "learning_rate": 3.96728310050719e-05, + "loss": 22.4964, + "step": 357 + }, + { + "epoch": 0.08628585201253314, + "grad_norm": 8.59974193572998, + "learning_rate": 3.9670012302452525e-05, + "loss": 22.417, + "step": 358 + }, + { + "epoch": 0.08652687394552905, + "grad_norm": 9.722860336303711, + "learning_rate": 3.9667181610682694e-05, + "loss": 23.1527, + "step": 359 + }, + { + "epoch": 0.08676789587852494, + "grad_norm": 10.46375846862793, + "learning_rate": 3.966433893148774e-05, + "loss": 22.1092, + "step": 360 + }, + { + "epoch": 0.08700891781152084, + "grad_norm": 10.541440963745117, + "learning_rate": 3.966148426660031e-05, + "loss": 23.1821, + "step": 361 + }, + { + "epoch": 0.08724993974451675, + "grad_norm": 13.440645217895508, + "learning_rate": 3.965861761776038e-05, + "loss": 22.4957, + "step": 362 + }, + { + "epoch": 0.08749096167751265, + "grad_norm": 13.69030475616455, + "learning_rate": 3.96557389867152e-05, + "loss": 23.1681, + "step": 363 + }, + { + "epoch": 0.08773198361050856, + "grad_norm": 13.352996826171875, + "learning_rate": 3.965284837521934e-05, + "loss": 23.3508, + "step": 364 + }, + { + "epoch": 0.08797300554350446, + "grad_norm": 12.190091133117676, + "learning_rate": 3.9649945785034667e-05, + "loss": 22.9784, + "step": 365 + }, + { + "epoch": 0.08821402747650037, + "grad_norm": 12.382320404052734, + "learning_rate": 3.964703121793035e-05, + "loss": 21.8373, + "step": 366 + }, + { + "epoch": 0.08845504940949626, + "grad_norm": 13.245556831359863, + "learning_rate": 3.9644104675682864e-05, + "loss": 22.4079, + "step": 367 + }, + { + "epoch": 0.08869607134249216, + "grad_norm": 12.94504451751709, + "learning_rate": 3.964116616007597e-05, + "loss": 21.8986, + "step": 368 + }, + { + "epoch": 0.08893709327548807, + "grad_norm": 13.991476058959961, + "learning_rate": 3.9638215672900744e-05, + "loss": 23.0818, + "step": 369 + }, + { + "epoch": 0.08917811520848397, + "grad_norm": 12.205080032348633, + "learning_rate": 3.9635253215955544e-05, + "loss": 22.359, + "step": 370 + }, + { + "epoch": 0.08941913714147988, + "grad_norm": 10.968507766723633, + "learning_rate": 3.963227879104603e-05, + "loss": 22.8247, + "step": 371 + }, + { + "epoch": 0.08966015907447578, + "grad_norm": 9.406290054321289, + "learning_rate": 3.9629292399985155e-05, + "loss": 22.7223, + "step": 372 + }, + { + "epoch": 0.08990118100747169, + "grad_norm": 9.069541931152344, + "learning_rate": 3.962629404459317e-05, + "loss": 22.6706, + "step": 373 + }, + { + "epoch": 0.09014220294046758, + "grad_norm": 10.338103294372559, + "learning_rate": 3.9623283726697617e-05, + "loss": 22.1665, + "step": 374 + }, + { + "epoch": 0.09038322487346348, + "grad_norm": 10.909104347229004, + "learning_rate": 3.9620261448133315e-05, + "loss": 21.4342, + "step": 375 + }, + { + "epoch": 0.09062424680645939, + "grad_norm": 11.047188758850098, + "learning_rate": 3.96172272107424e-05, + "loss": 22.044, + "step": 376 + }, + { + "epoch": 0.09086526873945529, + "grad_norm": 10.113716125488281, + "learning_rate": 3.961418101637427e-05, + "loss": 22.8166, + "step": 377 + }, + { + "epoch": 0.0911062906724512, + "grad_norm": 10.778718948364258, + "learning_rate": 3.9611122866885635e-05, + "loss": 22.3409, + "step": 378 + }, + { + "epoch": 0.0913473126054471, + "grad_norm": 10.477892875671387, + "learning_rate": 3.960805276414047e-05, + "loss": 22.2415, + "step": 379 + }, + { + "epoch": 0.09158833453844299, + "grad_norm": 9.995824813842773, + "learning_rate": 3.9604970710010056e-05, + "loss": 23.2694, + "step": 380 + }, + { + "epoch": 0.0918293564714389, + "grad_norm": 10.541510581970215, + "learning_rate": 3.960187670637294e-05, + "loss": 22.7109, + "step": 381 + }, + { + "epoch": 0.0920703784044348, + "grad_norm": 10.985747337341309, + "learning_rate": 3.9598770755114974e-05, + "loss": 23.3363, + "step": 382 + }, + { + "epoch": 0.09231140033743071, + "grad_norm": 8.48354721069336, + "learning_rate": 3.959565285812926e-05, + "loss": 22.9229, + "step": 383 + }, + { + "epoch": 0.09255242227042661, + "grad_norm": 8.193015098571777, + "learning_rate": 3.9592523017316213e-05, + "loss": 22.7472, + "step": 384 + }, + { + "epoch": 0.09279344420342252, + "grad_norm": 10.08864688873291, + "learning_rate": 3.958938123458351e-05, + "loss": 22.0423, + "step": 385 + }, + { + "epoch": 0.09303446613641841, + "grad_norm": 9.457194328308105, + "learning_rate": 3.958622751184612e-05, + "loss": 22.1312, + "step": 386 + }, + { + "epoch": 0.09327548806941431, + "grad_norm": 9.769213676452637, + "learning_rate": 3.958306185102627e-05, + "loss": 21.2084, + "step": 387 + }, + { + "epoch": 0.09351651000241022, + "grad_norm": 8.528018951416016, + "learning_rate": 3.957988425405349e-05, + "loss": 22.5847, + "step": 388 + }, + { + "epoch": 0.09375753193540612, + "grad_norm": 7.872664451599121, + "learning_rate": 3.957669472286455e-05, + "loss": 22.7421, + "step": 389 + }, + { + "epoch": 0.09399855386840203, + "grad_norm": 10.458906173706055, + "learning_rate": 3.957349325940353e-05, + "loss": 22.5017, + "step": 390 + }, + { + "epoch": 0.09423957580139793, + "grad_norm": 9.055620193481445, + "learning_rate": 3.957027986562176e-05, + "loss": 22.9132, + "step": 391 + }, + { + "epoch": 0.09448059773439382, + "grad_norm": 10.941350936889648, + "learning_rate": 3.956705454347786e-05, + "loss": 22.3697, + "step": 392 + }, + { + "epoch": 0.09472161966738973, + "grad_norm": 13.55471134185791, + "learning_rate": 3.9563817294937694e-05, + "loss": 22.5943, + "step": 393 + }, + { + "epoch": 0.09496264160038563, + "grad_norm": 12.076177597045898, + "learning_rate": 3.956056812197441e-05, + "loss": 22.413, + "step": 394 + }, + { + "epoch": 0.09520366353338154, + "grad_norm": 13.134650230407715, + "learning_rate": 3.955730702656845e-05, + "loss": 23.1823, + "step": 395 + }, + { + "epoch": 0.09544468546637744, + "grad_norm": 15.475574493408203, + "learning_rate": 3.955403401070747e-05, + "loss": 22.2643, + "step": 396 + }, + { + "epoch": 0.09568570739937335, + "grad_norm": 16.610166549682617, + "learning_rate": 3.9550749076386435e-05, + "loss": 22.7239, + "step": 397 + }, + { + "epoch": 0.09592672933236925, + "grad_norm": 16.209754943847656, + "learning_rate": 3.954745222560755e-05, + "loss": 22.3757, + "step": 398 + }, + { + "epoch": 0.09616775126536514, + "grad_norm": 15.284822463989258, + "learning_rate": 3.9544143460380293e-05, + "loss": 22.2776, + "step": 399 + }, + { + "epoch": 0.09640877319836105, + "grad_norm": 13.668973922729492, + "learning_rate": 3.9540822782721415e-05, + "loss": 23.569, + "step": 400 + }, + { + "epoch": 0.09640877319836105, + "eval_cc_pretrain_accuracy": 0.8499999642372131, + "eval_cc_pretrain_loss": 2.1780176162719727, + "eval_cc_pretrain_num_cand": 800.0, + "eval_cc_pretrain_runtime": 16.8375, + "eval_cc_pretrain_samples_per_second": 5.939, + "eval_cc_pretrain_steps_per_second": 0.059, + "eval_cc_pretrain_temperature": 0.06982421875, + "step": 400 + }, + { + "epoch": 0.09640877319836105, + "eval_mscoco_pretrain_accuracy": 0.7999999523162842, + "eval_mscoco_pretrain_loss": 2.345801830291748, + "eval_mscoco_pretrain_num_cand": 800.0, + "eval_mscoco_pretrain_runtime": 19.4324, + "eval_mscoco_pretrain_samples_per_second": 5.146, + "eval_mscoco_pretrain_steps_per_second": 0.051, + "eval_mscoco_pretrain_temperature": 0.06982421875, + "step": 400 + }, + { + "epoch": 0.09664979513135695, + "grad_norm": 13.379549980163574, + "learning_rate": 3.95374901946549e-05, + "loss": 23.5596, + "step": 401 + }, + { + "epoch": 0.09689081706435286, + "grad_norm": 11.586038589477539, + "learning_rate": 3.953414569821201e-05, + "loss": 22.6281, + "step": 402 + }, + { + "epoch": 0.09713183899734876, + "grad_norm": 7.799354076385498, + "learning_rate": 3.9530789295431276e-05, + "loss": 22.7018, + "step": 403 + }, + { + "epoch": 0.09737286093034467, + "grad_norm": 9.083105087280273, + "learning_rate": 3.952742098835846e-05, + "loss": 22.568, + "step": 404 + }, + { + "epoch": 0.09761388286334056, + "grad_norm": 5.899194240570068, + "learning_rate": 3.952404077904659e-05, + "loss": 22.8146, + "step": 405 + }, + { + "epoch": 0.09785490479633646, + "grad_norm": 9.04647159576416, + "learning_rate": 3.952064866955598e-05, + "loss": 22.1363, + "step": 406 + }, + { + "epoch": 0.09809592672933237, + "grad_norm": 10.218022346496582, + "learning_rate": 3.951724466195413e-05, + "loss": 21.9077, + "step": 407 + }, + { + "epoch": 0.09833694866232827, + "grad_norm": 11.489651679992676, + "learning_rate": 3.9513828758315846e-05, + "loss": 22.8743, + "step": 408 + }, + { + "epoch": 0.09857797059532418, + "grad_norm": 12.635907173156738, + "learning_rate": 3.9510400960723173e-05, + "loss": 22.2284, + "step": 409 + }, + { + "epoch": 0.09881899252832008, + "grad_norm": 11.998997688293457, + "learning_rate": 3.9506961271265405e-05, + "loss": 22.611, + "step": 410 + }, + { + "epoch": 0.09906001446131597, + "grad_norm": 11.449655532836914, + "learning_rate": 3.950350969203908e-05, + "loss": 23.0263, + "step": 411 + }, + { + "epoch": 0.09930103639431188, + "grad_norm": 12.183481216430664, + "learning_rate": 3.9500046225147975e-05, + "loss": 23.2646, + "step": 412 + }, + { + "epoch": 0.09954205832730778, + "grad_norm": 11.85231876373291, + "learning_rate": 3.949657087270313e-05, + "loss": 22.1627, + "step": 413 + }, + { + "epoch": 0.09978308026030369, + "grad_norm": 8.755213737487793, + "learning_rate": 3.949308363682281e-05, + "loss": 22.6307, + "step": 414 + }, + { + "epoch": 0.10002410219329959, + "grad_norm": 11.325800895690918, + "learning_rate": 3.948958451963256e-05, + "loss": 22.6506, + "step": 415 + }, + { + "epoch": 0.1002651241262955, + "grad_norm": 11.558366775512695, + "learning_rate": 3.9486073523265114e-05, + "loss": 22.3977, + "step": 416 + }, + { + "epoch": 0.1005061460592914, + "grad_norm": 10.094829559326172, + "learning_rate": 3.948255064986049e-05, + "loss": 22.0375, + "step": 417 + }, + { + "epoch": 0.10074716799228729, + "grad_norm": 8.251361846923828, + "learning_rate": 3.9479015901565925e-05, + "loss": 22.0151, + "step": 418 + }, + { + "epoch": 0.1009881899252832, + "grad_norm": 10.153083801269531, + "learning_rate": 3.9475469280535896e-05, + "loss": 22.2874, + "step": 419 + }, + { + "epoch": 0.1012292118582791, + "grad_norm": 10.91353988647461, + "learning_rate": 3.9471910788932116e-05, + "loss": 21.7383, + "step": 420 + }, + { + "epoch": 0.10147023379127501, + "grad_norm": 8.614728927612305, + "learning_rate": 3.946834042892355e-05, + "loss": 21.8587, + "step": 421 + }, + { + "epoch": 0.10171125572427091, + "grad_norm": 9.482758522033691, + "learning_rate": 3.946475820268636e-05, + "loss": 21.8862, + "step": 422 + }, + { + "epoch": 0.1019522776572668, + "grad_norm": 11.208257675170898, + "learning_rate": 3.9461164112403985e-05, + "loss": 22.2993, + "step": 423 + }, + { + "epoch": 0.10219329959026272, + "grad_norm": 13.730256080627441, + "learning_rate": 3.945755816026706e-05, + "loss": 22.3244, + "step": 424 + }, + { + "epoch": 0.10243432152325861, + "grad_norm": 10.94399642944336, + "learning_rate": 3.9453940348473476e-05, + "loss": 21.6446, + "step": 425 + }, + { + "epoch": 0.10267534345625452, + "grad_norm": 10.929226875305176, + "learning_rate": 3.945031067922833e-05, + "loss": 22.1835, + "step": 426 + }, + { + "epoch": 0.10291636538925042, + "grad_norm": 10.430953979492188, + "learning_rate": 3.944666915474396e-05, + "loss": 22.5608, + "step": 427 + }, + { + "epoch": 0.10315738732224633, + "grad_norm": 9.904389381408691, + "learning_rate": 3.944301577723992e-05, + "loss": 22.3006, + "step": 428 + }, + { + "epoch": 0.10339840925524223, + "grad_norm": 9.77066707611084, + "learning_rate": 3.943935054894301e-05, + "loss": 21.4552, + "step": 429 + }, + { + "epoch": 0.10363943118823812, + "grad_norm": 9.526613235473633, + "learning_rate": 3.943567347208723e-05, + "loss": 22.468, + "step": 430 + }, + { + "epoch": 0.10388045312123403, + "grad_norm": 7.911604881286621, + "learning_rate": 3.943198454891381e-05, + "loss": 21.2037, + "step": 431 + }, + { + "epoch": 0.10412147505422993, + "grad_norm": 5.855437755584717, + "learning_rate": 3.94282837816712e-05, + "loss": 21.307, + "step": 432 + }, + { + "epoch": 0.10436249698722584, + "grad_norm": 7.362118244171143, + "learning_rate": 3.942457117261507e-05, + "loss": 21.0598, + "step": 433 + }, + { + "epoch": 0.10460351892022174, + "grad_norm": 9.960506439208984, + "learning_rate": 3.942084672400831e-05, + "loss": 22.0369, + "step": 434 + }, + { + "epoch": 0.10484454085321765, + "grad_norm": 7.77851676940918, + "learning_rate": 3.941711043812102e-05, + "loss": 21.9701, + "step": 435 + }, + { + "epoch": 0.10508556278621355, + "grad_norm": 9.95388412475586, + "learning_rate": 3.941336231723053e-05, + "loss": 22.1024, + "step": 436 + }, + { + "epoch": 0.10532658471920944, + "grad_norm": 9.937992095947266, + "learning_rate": 3.9409602363621366e-05, + "loss": 21.7559, + "step": 437 + }, + { + "epoch": 0.10556760665220535, + "grad_norm": 10.694732666015625, + "learning_rate": 3.9405830579585266e-05, + "loss": 21.637, + "step": 438 + }, + { + "epoch": 0.10580862858520125, + "grad_norm": 12.747437477111816, + "learning_rate": 3.940204696742119e-05, + "loss": 22.1587, + "step": 439 + }, + { + "epoch": 0.10604965051819716, + "grad_norm": 10.867719650268555, + "learning_rate": 3.939825152943531e-05, + "loss": 21.688, + "step": 440 + }, + { + "epoch": 0.10629067245119306, + "grad_norm": 9.086636543273926, + "learning_rate": 3.939444426794099e-05, + "loss": 22.9395, + "step": 441 + }, + { + "epoch": 0.10653169438418895, + "grad_norm": 9.418134689331055, + "learning_rate": 3.939062518525881e-05, + "loss": 22.5349, + "step": 442 + }, + { + "epoch": 0.10677271631718487, + "grad_norm": 9.460171699523926, + "learning_rate": 3.938679428371656e-05, + "loss": 21.1641, + "step": 443 + }, + { + "epoch": 0.10701373825018076, + "grad_norm": 7.826321125030518, + "learning_rate": 3.938295156564923e-05, + "loss": 21.7996, + "step": 444 + }, + { + "epoch": 0.10725476018317667, + "grad_norm": 7.240124702453613, + "learning_rate": 3.9379097033399006e-05, + "loss": 22.2239, + "step": 445 + }, + { + "epoch": 0.10749578211617257, + "grad_norm": 8.141326904296875, + "learning_rate": 3.937523068931529e-05, + "loss": 21.8351, + "step": 446 + }, + { + "epoch": 0.10773680404916848, + "grad_norm": 8.412803649902344, + "learning_rate": 3.937135253575465e-05, + "loss": 22.1016, + "step": 447 + }, + { + "epoch": 0.10797782598216438, + "grad_norm": 9.307853698730469, + "learning_rate": 3.93674625750809e-05, + "loss": 21.962, + "step": 448 + }, + { + "epoch": 0.10821884791516027, + "grad_norm": 11.437176704406738, + "learning_rate": 3.936356080966501e-05, + "loss": 22.0635, + "step": 449 + }, + { + "epoch": 0.10845986984815618, + "grad_norm": 11.843949317932129, + "learning_rate": 3.935964724188517e-05, + "loss": 21.8725, + "step": 450 + }, + { + "epoch": 0.10870089178115208, + "grad_norm": 14.003984451293945, + "learning_rate": 3.935572187412676e-05, + "loss": 22.7161, + "step": 451 + }, + { + "epoch": 0.10894191371414799, + "grad_norm": 16.23893928527832, + "learning_rate": 3.9351784708782325e-05, + "loss": 22.3885, + "step": 452 + }, + { + "epoch": 0.10918293564714389, + "grad_norm": 17.094051361083984, + "learning_rate": 3.9347835748251645e-05, + "loss": 22.3562, + "step": 453 + }, + { + "epoch": 0.1094239575801398, + "grad_norm": 16.95478057861328, + "learning_rate": 3.934387499494166e-05, + "loss": 22.7326, + "step": 454 + }, + { + "epoch": 0.1096649795131357, + "grad_norm": 12.497323036193848, + "learning_rate": 3.9339902451266504e-05, + "loss": 22.3813, + "step": 455 + }, + { + "epoch": 0.1099060014461316, + "grad_norm": 10.132357597351074, + "learning_rate": 3.9335918119647496e-05, + "loss": 22.619, + "step": 456 + }, + { + "epoch": 0.1101470233791275, + "grad_norm": 8.256294250488281, + "learning_rate": 3.9331922002513156e-05, + "loss": 21.1849, + "step": 457 + }, + { + "epoch": 0.1103880453121234, + "grad_norm": 5.878814220428467, + "learning_rate": 3.932791410229915e-05, + "loss": 22.2518, + "step": 458 + }, + { + "epoch": 0.11062906724511931, + "grad_norm": 6.375014305114746, + "learning_rate": 3.932389442144837e-05, + "loss": 20.9422, + "step": 459 + }, + { + "epoch": 0.11087008917811521, + "grad_norm": 7.980260372161865, + "learning_rate": 3.931986296241086e-05, + "loss": 21.5022, + "step": 460 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 6.094304084777832, + "learning_rate": 3.931581972764386e-05, + "loss": 21.3144, + "step": 461 + }, + { + "epoch": 0.11135213304410702, + "grad_norm": 8.260781288146973, + "learning_rate": 3.931176471961177e-05, + "loss": 22.4251, + "step": 462 + }, + { + "epoch": 0.11159315497710291, + "grad_norm": 8.966259956359863, + "learning_rate": 3.9307697940786174e-05, + "loss": 21.3008, + "step": 463 + }, + { + "epoch": 0.11183417691009882, + "grad_norm": 12.75920295715332, + "learning_rate": 3.930361939364583e-05, + "loss": 21.2157, + "step": 464 + }, + { + "epoch": 0.11207519884309472, + "grad_norm": 13.75034236907959, + "learning_rate": 3.9299529080676674e-05, + "loss": 22.2074, + "step": 465 + }, + { + "epoch": 0.11231622077609063, + "grad_norm": 13.067601203918457, + "learning_rate": 3.929542700437181e-05, + "loss": 22.5306, + "step": 466 + }, + { + "epoch": 0.11255724270908653, + "grad_norm": 13.94778823852539, + "learning_rate": 3.9291313167231514e-05, + "loss": 21.5038, + "step": 467 + }, + { + "epoch": 0.11279826464208242, + "grad_norm": 13.549283981323242, + "learning_rate": 3.928718757176322e-05, + "loss": 21.623, + "step": 468 + }, + { + "epoch": 0.11303928657507833, + "grad_norm": 11.18710994720459, + "learning_rate": 3.9283050220481544e-05, + "loss": 22.3632, + "step": 469 + }, + { + "epoch": 0.11328030850807423, + "grad_norm": 11.911925315856934, + "learning_rate": 3.9278901115908257e-05, + "loss": 22.1007, + "step": 470 + }, + { + "epoch": 0.11352133044107014, + "grad_norm": 10.06760025024414, + "learning_rate": 3.92747402605723e-05, + "loss": 21.7768, + "step": 471 + }, + { + "epoch": 0.11376235237406604, + "grad_norm": 8.147149085998535, + "learning_rate": 3.927056765700976e-05, + "loss": 21.8302, + "step": 472 + }, + { + "epoch": 0.11400337430706194, + "grad_norm": 8.985754013061523, + "learning_rate": 3.926638330776392e-05, + "loss": 21.0359, + "step": 473 + }, + { + "epoch": 0.11424439624005785, + "grad_norm": 5.256037712097168, + "learning_rate": 3.926218721538519e-05, + "loss": 21.8908, + "step": 474 + }, + { + "epoch": 0.11448541817305374, + "grad_norm": 4.832596778869629, + "learning_rate": 3.925797938243113e-05, + "loss": 22.2011, + "step": 475 + }, + { + "epoch": 0.11472644010604965, + "grad_norm": 8.159590721130371, + "learning_rate": 3.92537598114665e-05, + "loss": 22.1288, + "step": 476 + }, + { + "epoch": 0.11496746203904555, + "grad_norm": 10.811864852905273, + "learning_rate": 3.924952850506318e-05, + "loss": 21.6604, + "step": 477 + }, + { + "epoch": 0.11520848397204146, + "grad_norm": 13.48187255859375, + "learning_rate": 3.92452854658002e-05, + "loss": 22.5564, + "step": 478 + }, + { + "epoch": 0.11544950590503736, + "grad_norm": 15.646048545837402, + "learning_rate": 3.924103069626377e-05, + "loss": 22.1639, + "step": 479 + }, + { + "epoch": 0.11569052783803326, + "grad_norm": 14.907588005065918, + "learning_rate": 3.923676419904722e-05, + "loss": 23.1264, + "step": 480 + }, + { + "epoch": 0.11593154977102917, + "grad_norm": 17.359418869018555, + "learning_rate": 3.923248597675104e-05, + "loss": 23.5774, + "step": 481 + }, + { + "epoch": 0.11617257170402506, + "grad_norm": 14.601907730102539, + "learning_rate": 3.9228196031982874e-05, + "loss": 23.4639, + "step": 482 + }, + { + "epoch": 0.11641359363702097, + "grad_norm": 14.22500228881836, + "learning_rate": 3.92238943673575e-05, + "loss": 23.2863, + "step": 483 + }, + { + "epoch": 0.11665461557001687, + "grad_norm": 16.113901138305664, + "learning_rate": 3.9219580985496845e-05, + "loss": 22.2958, + "step": 484 + }, + { + "epoch": 0.11689563750301278, + "grad_norm": 12.689380645751953, + "learning_rate": 3.9215255889029966e-05, + "loss": 22.5405, + "step": 485 + }, + { + "epoch": 0.11713665943600868, + "grad_norm": 10.15064525604248, + "learning_rate": 3.92109190805931e-05, + "loss": 22.6491, + "step": 486 + }, + { + "epoch": 0.11737768136900457, + "grad_norm": 7.255312442779541, + "learning_rate": 3.920657056282955e-05, + "loss": 21.8345, + "step": 487 + }, + { + "epoch": 0.11761870330200049, + "grad_norm": 9.178385734558105, + "learning_rate": 3.9202210338389835e-05, + "loss": 21.424, + "step": 488 + }, + { + "epoch": 0.11785972523499638, + "grad_norm": 8.011574745178223, + "learning_rate": 3.919783840993155e-05, + "loss": 21.7573, + "step": 489 + }, + { + "epoch": 0.11810074716799229, + "grad_norm": 6.61487340927124, + "learning_rate": 3.919345478011947e-05, + "loss": 21.8062, + "step": 490 + }, + { + "epoch": 0.11834176910098819, + "grad_norm": 8.556478500366211, + "learning_rate": 3.9189059451625453e-05, + "loss": 21.7122, + "step": 491 + }, + { + "epoch": 0.11858279103398409, + "grad_norm": 10.846809387207031, + "learning_rate": 3.9184652427128534e-05, + "loss": 22.0324, + "step": 492 + }, + { + "epoch": 0.11882381296698, + "grad_norm": 11.312963485717773, + "learning_rate": 3.918023370931485e-05, + "loss": 22.0826, + "step": 493 + }, + { + "epoch": 0.1190648348999759, + "grad_norm": 12.452693939208984, + "learning_rate": 3.917580330087767e-05, + "loss": 22.816, + "step": 494 + }, + { + "epoch": 0.1193058568329718, + "grad_norm": 9.950858116149902, + "learning_rate": 3.9171361204517395e-05, + "loss": 22.0896, + "step": 495 + }, + { + "epoch": 0.1195468787659677, + "grad_norm": 13.052523612976074, + "learning_rate": 3.916690742294153e-05, + "loss": 20.9235, + "step": 496 + }, + { + "epoch": 0.11978790069896361, + "grad_norm": 11.522905349731445, + "learning_rate": 3.916244195886475e-05, + "loss": 22.4712, + "step": 497 + }, + { + "epoch": 0.12002892263195951, + "grad_norm": 10.524886131286621, + "learning_rate": 3.915796481500879e-05, + "loss": 21.4048, + "step": 498 + }, + { + "epoch": 0.1202699445649554, + "grad_norm": 7.521170139312744, + "learning_rate": 3.915347599410254e-05, + "loss": 22.1557, + "step": 499 + }, + { + "epoch": 0.12051096649795132, + "grad_norm": 10.937572479248047, + "learning_rate": 3.914897549888201e-05, + "loss": 22.2604, + "step": 500 + }, + { + "epoch": 0.12075198843094721, + "grad_norm": 9.491349220275879, + "learning_rate": 3.91444633320903e-05, + "loss": 21.7504, + "step": 501 + }, + { + "epoch": 0.12099301036394312, + "grad_norm": 8.36571979522705, + "learning_rate": 3.913993949647765e-05, + "loss": 22.5198, + "step": 502 + }, + { + "epoch": 0.12123403229693902, + "grad_norm": 6.904485702514648, + "learning_rate": 3.9135403994801395e-05, + "loss": 22.0297, + "step": 503 + }, + { + "epoch": 0.12147505422993492, + "grad_norm": 5.0824294090271, + "learning_rate": 3.913085682982598e-05, + "loss": 22.8742, + "step": 504 + }, + { + "epoch": 0.12171607616293083, + "grad_norm": 4.6728835105896, + "learning_rate": 3.912629800432299e-05, + "loss": 21.8646, + "step": 505 + }, + { + "epoch": 0.12195709809592672, + "grad_norm": 7.301826477050781, + "learning_rate": 3.912172752107108e-05, + "loss": 21.9154, + "step": 506 + }, + { + "epoch": 0.12219812002892264, + "grad_norm": 10.076762199401855, + "learning_rate": 3.911714538285602e-05, + "loss": 22.073, + "step": 507 + }, + { + "epoch": 0.12243914196191853, + "grad_norm": 12.620722770690918, + "learning_rate": 3.9112551592470695e-05, + "loss": 22.3477, + "step": 508 + }, + { + "epoch": 0.12268016389491444, + "grad_norm": 14.048612594604492, + "learning_rate": 3.910794615271509e-05, + "loss": 21.6949, + "step": 509 + }, + { + "epoch": 0.12292118582791034, + "grad_norm": 15.230833053588867, + "learning_rate": 3.9103329066396265e-05, + "loss": 22.3893, + "step": 510 + }, + { + "epoch": 0.12316220776090624, + "grad_norm": 15.11613941192627, + "learning_rate": 3.909870033632842e-05, + "loss": 22.9214, + "step": 511 + }, + { + "epoch": 0.12340322969390215, + "grad_norm": 14.820746421813965, + "learning_rate": 3.909405996533282e-05, + "loss": 22.0398, + "step": 512 + }, + { + "epoch": 0.12364425162689804, + "grad_norm": 15.581934928894043, + "learning_rate": 3.908940795623785e-05, + "loss": 22.9037, + "step": 513 + }, + { + "epoch": 0.12388527355989395, + "grad_norm": 10.478958129882812, + "learning_rate": 3.908474431187894e-05, + "loss": 22.2769, + "step": 514 + }, + { + "epoch": 0.12412629549288985, + "grad_norm": 10.074332237243652, + "learning_rate": 3.90800690350987e-05, + "loss": 21.318, + "step": 515 + }, + { + "epoch": 0.12436731742588576, + "grad_norm": 6.275788307189941, + "learning_rate": 3.9075382128746744e-05, + "loss": 21.418, + "step": 516 + }, + { + "epoch": 0.12460833935888166, + "grad_norm": 4.447282791137695, + "learning_rate": 3.907068359567981e-05, + "loss": 22.0653, + "step": 517 + }, + { + "epoch": 0.12484936129187756, + "grad_norm": 5.489899635314941, + "learning_rate": 3.9065973438761726e-05, + "loss": 21.4963, + "step": 518 + }, + { + "epoch": 0.12509038322487345, + "grad_norm": 4.442500591278076, + "learning_rate": 3.90612516608634e-05, + "loss": 20.9282, + "step": 519 + }, + { + "epoch": 0.12533140515786936, + "grad_norm": 4.0666351318359375, + "learning_rate": 3.9056518264862815e-05, + "loss": 21.6399, + "step": 520 + }, + { + "epoch": 0.12557242709086527, + "grad_norm": 9.05472183227539, + "learning_rate": 3.905177325364505e-05, + "loss": 21.088, + "step": 521 + }, + { + "epoch": 0.12581344902386118, + "grad_norm": 10.958237648010254, + "learning_rate": 3.904701663010225e-05, + "loss": 20.8591, + "step": 522 + }, + { + "epoch": 0.12605447095685707, + "grad_norm": 15.23047924041748, + "learning_rate": 3.904224839713365e-05, + "loss": 21.8693, + "step": 523 + }, + { + "epoch": 0.12629549288985298, + "grad_norm": 15.778684616088867, + "learning_rate": 3.9037468557645556e-05, + "loss": 21.5912, + "step": 524 + }, + { + "epoch": 0.1265365148228489, + "grad_norm": 15.298891067504883, + "learning_rate": 3.903267711455134e-05, + "loss": 21.3939, + "step": 525 + }, + { + "epoch": 0.12677753675584477, + "grad_norm": 14.867764472961426, + "learning_rate": 3.902787407077147e-05, + "loss": 21.3977, + "step": 526 + }, + { + "epoch": 0.12701855868884068, + "grad_norm": 13.478158950805664, + "learning_rate": 3.9023059429233455e-05, + "loss": 21.2618, + "step": 527 + }, + { + "epoch": 0.1272595806218366, + "grad_norm": 9.661888122558594, + "learning_rate": 3.9018233192871884e-05, + "loss": 22.3503, + "step": 528 + }, + { + "epoch": 0.12750060255483248, + "grad_norm": 8.945877075195312, + "learning_rate": 3.901339536462843e-05, + "loss": 21.0095, + "step": 529 + }, + { + "epoch": 0.1277416244878284, + "grad_norm": 7.521519184112549, + "learning_rate": 3.90085459474518e-05, + "loss": 21.9681, + "step": 530 + }, + { + "epoch": 0.1279826464208243, + "grad_norm": 5.751709938049316, + "learning_rate": 3.9003684944297805e-05, + "loss": 21.9724, + "step": 531 + }, + { + "epoch": 0.1282236683538202, + "grad_norm": 7.471375465393066, + "learning_rate": 3.899881235812928e-05, + "loss": 21.4449, + "step": 532 + }, + { + "epoch": 0.1284646902868161, + "grad_norm": 8.867966651916504, + "learning_rate": 3.8993928191916134e-05, + "loss": 22.6054, + "step": 533 + }, + { + "epoch": 0.128705712219812, + "grad_norm": 10.351669311523438, + "learning_rate": 3.898903244863535e-05, + "loss": 22.135, + "step": 534 + }, + { + "epoch": 0.1289467341528079, + "grad_norm": 12.625319480895996, + "learning_rate": 3.898412513127093e-05, + "loss": 22.0304, + "step": 535 + }, + { + "epoch": 0.1291877560858038, + "grad_norm": 19.714723587036133, + "learning_rate": 3.8979206242813965e-05, + "loss": 22.1691, + "step": 536 + }, + { + "epoch": 0.1294287780187997, + "grad_norm": 19.829652786254883, + "learning_rate": 3.89742757862626e-05, + "loss": 21.8765, + "step": 537 + }, + { + "epoch": 0.12966979995179562, + "grad_norm": 22.835166931152344, + "learning_rate": 3.896933376462199e-05, + "loss": 22.3908, + "step": 538 + }, + { + "epoch": 0.12991082188479153, + "grad_norm": 23.219650268554688, + "learning_rate": 3.896438018090439e-05, + "loss": 22.5259, + "step": 539 + }, + { + "epoch": 0.1301518438177874, + "grad_norm": 21.852384567260742, + "learning_rate": 3.895941503812908e-05, + "loss": 22.3726, + "step": 540 + }, + { + "epoch": 0.13039286575078332, + "grad_norm": 19.84891128540039, + "learning_rate": 3.8954438339322366e-05, + "loss": 22.3488, + "step": 541 + }, + { + "epoch": 0.13063388768377923, + "grad_norm": 14.865405082702637, + "learning_rate": 3.894945008751763e-05, + "loss": 21.6344, + "step": 542 + }, + { + "epoch": 0.13087490961677511, + "grad_norm": 11.22243881225586, + "learning_rate": 3.894445028575528e-05, + "loss": 21.8559, + "step": 543 + }, + { + "epoch": 0.13111593154977103, + "grad_norm": 8.131175994873047, + "learning_rate": 3.893943893708277e-05, + "loss": 21.7106, + "step": 544 + }, + { + "epoch": 0.13135695348276694, + "grad_norm": 5.71256160736084, + "learning_rate": 3.893441604455457e-05, + "loss": 21.5667, + "step": 545 + }, + { + "epoch": 0.13159797541576285, + "grad_norm": 2.1056246757507324, + "learning_rate": 3.8929381611232224e-05, + "loss": 21.4564, + "step": 546 + }, + { + "epoch": 0.13183899734875873, + "grad_norm": 2.289968729019165, + "learning_rate": 3.892433564018428e-05, + "loss": 21.2226, + "step": 547 + }, + { + "epoch": 0.13208001928175464, + "grad_norm": 4.993649482727051, + "learning_rate": 3.891927813448633e-05, + "loss": 21.6159, + "step": 548 + }, + { + "epoch": 0.13232104121475055, + "grad_norm": 13.52151107788086, + "learning_rate": 3.8914209097221e-05, + "loss": 21.5745, + "step": 549 + }, + { + "epoch": 0.13256206314774643, + "grad_norm": 19.642656326293945, + "learning_rate": 3.890912853147793e-05, + "loss": 21.6464, + "step": 550 + }, + { + "epoch": 0.13280308508074234, + "grad_norm": 26.89068603515625, + "learning_rate": 3.890403644035381e-05, + "loss": 23.2728, + "step": 551 + }, + { + "epoch": 0.13304410701373826, + "grad_norm": 33.88911437988281, + "learning_rate": 3.889893282695233e-05, + "loss": 24.1991, + "step": 552 + }, + { + "epoch": 0.13328512894673417, + "grad_norm": 33.358001708984375, + "learning_rate": 3.889381769438422e-05, + "loss": 24.3387, + "step": 553 + }, + { + "epoch": 0.13352615087973005, + "grad_norm": 34.86769485473633, + "learning_rate": 3.8888691045767224e-05, + "loss": 24.0235, + "step": 554 + }, + { + "epoch": 0.13376717281272596, + "grad_norm": 33.8707389831543, + "learning_rate": 3.888355288422611e-05, + "loss": 24.227, + "step": 555 + }, + { + "epoch": 0.13400819474572187, + "grad_norm": 33.83941650390625, + "learning_rate": 3.8878403212892656e-05, + "loss": 23.7426, + "step": 556 + }, + { + "epoch": 0.13424921667871775, + "grad_norm": 29.475250244140625, + "learning_rate": 3.8873242034905666e-05, + "loss": 23.6188, + "step": 557 + }, + { + "epoch": 0.13449023861171366, + "grad_norm": 28.465063095092773, + "learning_rate": 3.8868069353410935e-05, + "loss": 22.7059, + "step": 558 + }, + { + "epoch": 0.13473126054470957, + "grad_norm": 19.842187881469727, + "learning_rate": 3.88628851715613e-05, + "loss": 22.9965, + "step": 559 + }, + { + "epoch": 0.13497228247770546, + "grad_norm": 17.471546173095703, + "learning_rate": 3.885768949251659e-05, + "loss": 22.558, + "step": 560 + }, + { + "epoch": 0.13521330441070137, + "grad_norm": 13.237724304199219, + "learning_rate": 3.885248231944365e-05, + "loss": 21.4785, + "step": 561 + }, + { + "epoch": 0.13545432634369728, + "grad_norm": 5.605689525604248, + "learning_rate": 3.884726365551631e-05, + "loss": 23.4421, + "step": 562 + }, + { + "epoch": 0.1356953482766932, + "grad_norm": 5.003852844238281, + "learning_rate": 3.8842033503915434e-05, + "loss": 22.7822, + "step": 563 + }, + { + "epoch": 0.13593637020968907, + "grad_norm": 4.622612953186035, + "learning_rate": 3.883679186782887e-05, + "loss": 21.1717, + "step": 564 + }, + { + "epoch": 0.13617739214268498, + "grad_norm": 8.356499671936035, + "learning_rate": 3.8831538750451465e-05, + "loss": 21.7221, + "step": 565 + }, + { + "epoch": 0.1364184140756809, + "grad_norm": 4.788625240325928, + "learning_rate": 3.8826274154985074e-05, + "loss": 22.4566, + "step": 566 + }, + { + "epoch": 0.13665943600867678, + "grad_norm": 6.349889755249023, + "learning_rate": 3.8820998084638535e-05, + "loss": 21.3812, + "step": 567 + }, + { + "epoch": 0.1369004579416727, + "grad_norm": 7.6886305809021, + "learning_rate": 3.881571054262769e-05, + "loss": 21.3398, + "step": 568 + }, + { + "epoch": 0.1371414798746686, + "grad_norm": 8.041238784790039, + "learning_rate": 3.881041153217538e-05, + "loss": 22.035, + "step": 569 + }, + { + "epoch": 0.1373825018076645, + "grad_norm": 14.17418384552002, + "learning_rate": 3.880510105651142e-05, + "loss": 21.4034, + "step": 570 + }, + { + "epoch": 0.1376235237406604, + "grad_norm": 17.104995727539062, + "learning_rate": 3.879977911887261e-05, + "loss": 21.9409, + "step": 571 + }, + { + "epoch": 0.1378645456736563, + "grad_norm": 20.9821834564209, + "learning_rate": 3.879444572250275e-05, + "loss": 22.5435, + "step": 572 + }, + { + "epoch": 0.1381055676066522, + "grad_norm": 29.34518051147461, + "learning_rate": 3.878910087065264e-05, + "loss": 24.1344, + "step": 573 + }, + { + "epoch": 0.1383465895396481, + "grad_norm": 31.948137283325195, + "learning_rate": 3.8783744566580026e-05, + "loss": 23.0683, + "step": 574 + }, + { + "epoch": 0.138587611472644, + "grad_norm": 31.45174789428711, + "learning_rate": 3.877837681354965e-05, + "loss": 23.6142, + "step": 575 + }, + { + "epoch": 0.13882863340563992, + "grad_norm": 34.57921600341797, + "learning_rate": 3.877299761483324e-05, + "loss": 23.8382, + "step": 576 + }, + { + "epoch": 0.13906965533863583, + "grad_norm": 33.83427429199219, + "learning_rate": 3.876760697370948e-05, + "loss": 23.1375, + "step": 577 + }, + { + "epoch": 0.1393106772716317, + "grad_norm": 27.464536666870117, + "learning_rate": 3.8762204893464065e-05, + "loss": 23.2053, + "step": 578 + }, + { + "epoch": 0.13955169920462762, + "grad_norm": 26.839323043823242, + "learning_rate": 3.875679137738962e-05, + "loss": 23.0933, + "step": 579 + }, + { + "epoch": 0.13979272113762353, + "grad_norm": 17.85625457763672, + "learning_rate": 3.875136642878577e-05, + "loss": 23.1029, + "step": 580 + }, + { + "epoch": 0.14003374307061942, + "grad_norm": 13.169098854064941, + "learning_rate": 3.874593005095909e-05, + "loss": 22.3406, + "step": 581 + }, + { + "epoch": 0.14027476500361533, + "grad_norm": 9.713306427001953, + "learning_rate": 3.874048224722313e-05, + "loss": 22.8308, + "step": 582 + }, + { + "epoch": 0.14051578693661124, + "grad_norm": 3.4887795448303223, + "learning_rate": 3.8735023020898414e-05, + "loss": 22.3075, + "step": 583 + }, + { + "epoch": 0.14075680886960715, + "grad_norm": 3.9921114444732666, + "learning_rate": 3.87295523753124e-05, + "loss": 21.1378, + "step": 584 + }, + { + "epoch": 0.14099783080260303, + "grad_norm": 3.8990042209625244, + "learning_rate": 3.8724070313799546e-05, + "loss": 21.7478, + "step": 585 + }, + { + "epoch": 0.14123885273559894, + "grad_norm": 7.221833229064941, + "learning_rate": 3.8718576839701227e-05, + "loss": 21.4554, + "step": 586 + }, + { + "epoch": 0.14147987466859485, + "grad_norm": 12.468117713928223, + "learning_rate": 3.87130719563658e-05, + "loss": 21.8418, + "step": 587 + }, + { + "epoch": 0.14172089660159073, + "grad_norm": 14.471100807189941, + "learning_rate": 3.870755566714857e-05, + "loss": 22.4303, + "step": 588 + }, + { + "epoch": 0.14196191853458665, + "grad_norm": 14.915914535522461, + "learning_rate": 3.87020279754118e-05, + "loss": 21.8491, + "step": 589 + }, + { + "epoch": 0.14220294046758256, + "grad_norm": 15.8550443649292, + "learning_rate": 3.869648888452468e-05, + "loss": 22.0597, + "step": 590 + }, + { + "epoch": 0.14244396240057844, + "grad_norm": 15.378199577331543, + "learning_rate": 3.869093839786337e-05, + "loss": 22.1102, + "step": 591 + }, + { + "epoch": 0.14268498433357435, + "grad_norm": 12.721076011657715, + "learning_rate": 3.8685376518810975e-05, + "loss": 21.808, + "step": 592 + }, + { + "epoch": 0.14292600626657026, + "grad_norm": 11.106048583984375, + "learning_rate": 3.867980325075754e-05, + "loss": 21.6663, + "step": 593 + }, + { + "epoch": 0.14316702819956617, + "grad_norm": 9.952249526977539, + "learning_rate": 3.8674218597100046e-05, + "loss": 22.0667, + "step": 594 + }, + { + "epoch": 0.14340805013256205, + "grad_norm": 6.7364983558654785, + "learning_rate": 3.8668622561242416e-05, + "loss": 21.3717, + "step": 595 + }, + { + "epoch": 0.14364907206555796, + "grad_norm": 2.3522963523864746, + "learning_rate": 3.8663015146595525e-05, + "loss": 21.4651, + "step": 596 + }, + { + "epoch": 0.14389009399855388, + "grad_norm": 3.6684651374816895, + "learning_rate": 3.8657396356577157e-05, + "loss": 21.2508, + "step": 597 + }, + { + "epoch": 0.14413111593154976, + "grad_norm": 7.883078575134277, + "learning_rate": 3.865176619461205e-05, + "loss": 21.4703, + "step": 598 + }, + { + "epoch": 0.14437213786454567, + "grad_norm": 5.80167818069458, + "learning_rate": 3.864612466413187e-05, + "loss": 21.4093, + "step": 599 + }, + { + "epoch": 0.14461315979754158, + "grad_norm": 12.900911331176758, + "learning_rate": 3.8640471768575204e-05, + "loss": 21.2774, + "step": 600 + }, + { + "epoch": 0.14461315979754158, + "eval_cc_pretrain_accuracy": 0.8899999856948853, + "eval_cc_pretrain_loss": 2.0896100997924805, + "eval_cc_pretrain_num_cand": 800.0, + "eval_cc_pretrain_runtime": 16.7204, + "eval_cc_pretrain_samples_per_second": 5.981, + "eval_cc_pretrain_steps_per_second": 0.06, + "eval_cc_pretrain_temperature": 0.06982421875, + "step": 600 + }, + { + "epoch": 0.14461315979754158, + "eval_mscoco_pretrain_accuracy": 0.8100000023841858, + "eval_mscoco_pretrain_loss": 2.315831184387207, + "eval_mscoco_pretrain_num_cand": 800.0, + "eval_mscoco_pretrain_runtime": 13.9919, + "eval_mscoco_pretrain_samples_per_second": 7.147, + "eval_mscoco_pretrain_steps_per_second": 0.071, + "eval_mscoco_pretrain_temperature": 0.06982421875, + "step": 600 + }, + { + "epoch": 0.1448541817305375, + "grad_norm": 12.599310874938965, + "learning_rate": 3.863480751138757e-05, + "loss": 22.0231, + "step": 601 + }, + { + "epoch": 0.14509520366353337, + "grad_norm": 14.662084579467773, + "learning_rate": 3.862913189602142e-05, + "loss": 21.7869, + "step": 602 + }, + { + "epoch": 0.14533622559652928, + "grad_norm": 18.457077026367188, + "learning_rate": 3.862344492593613e-05, + "loss": 21.6534, + "step": 603 + }, + { + "epoch": 0.1455772475295252, + "grad_norm": 19.209644317626953, + "learning_rate": 3.861774660459796e-05, + "loss": 22.0139, + "step": 604 + }, + { + "epoch": 0.14581826946252108, + "grad_norm": 17.5800838470459, + "learning_rate": 3.861203693548015e-05, + "loss": 22.0524, + "step": 605 + }, + { + "epoch": 0.146059291395517, + "grad_norm": 18.583377838134766, + "learning_rate": 3.86063159220628e-05, + "loss": 22.1585, + "step": 606 + }, + { + "epoch": 0.1463003133285129, + "grad_norm": 14.852075576782227, + "learning_rate": 3.860058356783296e-05, + "loss": 21.6924, + "step": 607 + }, + { + "epoch": 0.1465413352615088, + "grad_norm": 12.50161075592041, + "learning_rate": 3.859483987628458e-05, + "loss": 22.6061, + "step": 608 + }, + { + "epoch": 0.1467823571945047, + "grad_norm": 10.579334259033203, + "learning_rate": 3.858908485091852e-05, + "loss": 21.887, + "step": 609 + }, + { + "epoch": 0.1470233791275006, + "grad_norm": 8.113744735717773, + "learning_rate": 3.858331849524254e-05, + "loss": 22.2629, + "step": 610 + }, + { + "epoch": 0.1472644010604965, + "grad_norm": 6.5386528968811035, + "learning_rate": 3.8577540812771336e-05, + "loss": 22.4119, + "step": 611 + }, + { + "epoch": 0.1475054229934924, + "grad_norm": 6.3505706787109375, + "learning_rate": 3.857175180702647e-05, + "loss": 21.3283, + "step": 612 + }, + { + "epoch": 0.1477464449264883, + "grad_norm": 4.471867561340332, + "learning_rate": 3.856595148153643e-05, + "loss": 22.4743, + "step": 613 + }, + { + "epoch": 0.14798746685948422, + "grad_norm": 5.3676652908325195, + "learning_rate": 3.85601398398366e-05, + "loss": 22.0862, + "step": 614 + }, + { + "epoch": 0.14822848879248013, + "grad_norm": 9.74562931060791, + "learning_rate": 3.8554316885469244e-05, + "loss": 21.859, + "step": 615 + }, + { + "epoch": 0.148469510725476, + "grad_norm": 10.633652687072754, + "learning_rate": 3.8548482621983547e-05, + "loss": 23.2318, + "step": 616 + }, + { + "epoch": 0.14871053265847192, + "grad_norm": 9.811300277709961, + "learning_rate": 3.854263705293557e-05, + "loss": 21.2163, + "step": 617 + }, + { + "epoch": 0.14895155459146783, + "grad_norm": 11.784741401672363, + "learning_rate": 3.853678018188827e-05, + "loss": 21.1137, + "step": 618 + }, + { + "epoch": 0.14919257652446372, + "grad_norm": 12.203115463256836, + "learning_rate": 3.853091201241149e-05, + "loss": 21.6678, + "step": 619 + }, + { + "epoch": 0.14943359845745963, + "grad_norm": 12.984305381774902, + "learning_rate": 3.8525032548081975e-05, + "loss": 21.3522, + "step": 620 + }, + { + "epoch": 0.14967462039045554, + "grad_norm": 11.12298583984375, + "learning_rate": 3.851914179248333e-05, + "loss": 21.4171, + "step": 621 + }, + { + "epoch": 0.14991564232345145, + "grad_norm": 8.890546798706055, + "learning_rate": 3.851323974920605e-05, + "loss": 21.1866, + "step": 622 + }, + { + "epoch": 0.15015666425644733, + "grad_norm": 7.889697551727295, + "learning_rate": 3.850732642184753e-05, + "loss": 21.4095, + "step": 623 + }, + { + "epoch": 0.15039768618944324, + "grad_norm": 7.1496171951293945, + "learning_rate": 3.8501401814012005e-05, + "loss": 21.9061, + "step": 624 + }, + { + "epoch": 0.15063870812243915, + "grad_norm": 8.294866561889648, + "learning_rate": 3.8495465929310626e-05, + "loss": 20.9717, + "step": 625 + }, + { + "epoch": 0.15087973005543504, + "grad_norm": 7.808030128479004, + "learning_rate": 3.8489518771361396e-05, + "loss": 21.5356, + "step": 626 + }, + { + "epoch": 0.15112075198843095, + "grad_norm": 9.875743865966797, + "learning_rate": 3.848356034378919e-05, + "loss": 21.1634, + "step": 627 + }, + { + "epoch": 0.15136177392142686, + "grad_norm": 11.983543395996094, + "learning_rate": 3.8477590650225735e-05, + "loss": 21.9262, + "step": 628 + }, + { + "epoch": 0.15160279585442274, + "grad_norm": 21.233596801757812, + "learning_rate": 3.847160969430967e-05, + "loss": 22.2113, + "step": 629 + }, + { + "epoch": 0.15184381778741865, + "grad_norm": 34.35773849487305, + "learning_rate": 3.846561747968647e-05, + "loss": 23.0444, + "step": 630 + }, + { + "epoch": 0.15208483972041456, + "grad_norm": 44.660301208496094, + "learning_rate": 3.845961401000845e-05, + "loss": 24.3232, + "step": 631 + }, + { + "epoch": 0.15232586165341047, + "grad_norm": 47.99455642700195, + "learning_rate": 3.8453599288934835e-05, + "loss": 24.4091, + "step": 632 + }, + { + "epoch": 0.15256688358640635, + "grad_norm": 48.05781936645508, + "learning_rate": 3.844757332013168e-05, + "loss": 24.8714, + "step": 633 + }, + { + "epoch": 0.15280790551940227, + "grad_norm": 40.0009765625, + "learning_rate": 3.844153610727188e-05, + "loss": 25.1844, + "step": 634 + }, + { + "epoch": 0.15304892745239818, + "grad_norm": 28.70438003540039, + "learning_rate": 3.8435487654035216e-05, + "loss": 22.4725, + "step": 635 + }, + { + "epoch": 0.15328994938539406, + "grad_norm": 17.191543579101562, + "learning_rate": 3.8429427964108305e-05, + "loss": 23.1657, + "step": 636 + }, + { + "epoch": 0.15353097131838997, + "grad_norm": 5.469162464141846, + "learning_rate": 3.842335704118462e-05, + "loss": 21.647, + "step": 637 + }, + { + "epoch": 0.15377199325138588, + "grad_norm": 3.01867938041687, + "learning_rate": 3.841727488896445e-05, + "loss": 21.7468, + "step": 638 + }, + { + "epoch": 0.1540130151843818, + "grad_norm": 4.443301200866699, + "learning_rate": 3.841118151115498e-05, + "loss": 21.7737, + "step": 639 + }, + { + "epoch": 0.15425403711737767, + "grad_norm": 1.8203190565109253, + "learning_rate": 3.840507691147019e-05, + "loss": 21.076, + "step": 640 + }, + { + "epoch": 0.15449505905037358, + "grad_norm": 5.784672737121582, + "learning_rate": 3.839896109363094e-05, + "loss": 21.4373, + "step": 641 + }, + { + "epoch": 0.1547360809833695, + "grad_norm": 11.623886108398438, + "learning_rate": 3.839283406136488e-05, + "loss": 21.9011, + "step": 642 + }, + { + "epoch": 0.15497710291636538, + "grad_norm": 20.83624839782715, + "learning_rate": 3.838669581840656e-05, + "loss": 22.213, + "step": 643 + }, + { + "epoch": 0.1552181248493613, + "grad_norm": 25.33501434326172, + "learning_rate": 3.838054636849728e-05, + "loss": 22.9755, + "step": 644 + }, + { + "epoch": 0.1554591467823572, + "grad_norm": 29.716983795166016, + "learning_rate": 3.837438571538526e-05, + "loss": 23.318, + "step": 645 + }, + { + "epoch": 0.1557001687153531, + "grad_norm": 31.23537254333496, + "learning_rate": 3.836821386282548e-05, + "loss": 23.5088, + "step": 646 + }, + { + "epoch": 0.155941190648349, + "grad_norm": 30.987245559692383, + "learning_rate": 3.836203081457978e-05, + "loss": 23.9883, + "step": 647 + }, + { + "epoch": 0.1561822125813449, + "grad_norm": 30.853906631469727, + "learning_rate": 3.8355836574416804e-05, + "loss": 23.6864, + "step": 648 + }, + { + "epoch": 0.15642323451434081, + "grad_norm": 27.379552841186523, + "learning_rate": 3.8349631146112046e-05, + "loss": 21.7464, + "step": 649 + }, + { + "epoch": 0.1566642564473367, + "grad_norm": 24.8726863861084, + "learning_rate": 3.834341453344779e-05, + "loss": 22.644, + "step": 650 + }, + { + "epoch": 0.1569052783803326, + "grad_norm": 23.09463882446289, + "learning_rate": 3.833718674021315e-05, + "loss": 21.5383, + "step": 651 + }, + { + "epoch": 0.15714630031332852, + "grad_norm": 19.36989974975586, + "learning_rate": 3.833094777020407e-05, + "loss": 22.6058, + "step": 652 + }, + { + "epoch": 0.15738732224632443, + "grad_norm": 14.383389472961426, + "learning_rate": 3.8324697627223263e-05, + "loss": 21.8231, + "step": 653 + }, + { + "epoch": 0.1576283441793203, + "grad_norm": 11.495000839233398, + "learning_rate": 3.8318436315080306e-05, + "loss": 20.8076, + "step": 654 + }, + { + "epoch": 0.15786936611231622, + "grad_norm": 9.896902084350586, + "learning_rate": 3.831216383759153e-05, + "loss": 21.2496, + "step": 655 + }, + { + "epoch": 0.15811038804531213, + "grad_norm": 4.407585144042969, + "learning_rate": 3.830588019858013e-05, + "loss": 21.1061, + "step": 656 + }, + { + "epoch": 0.15835140997830802, + "grad_norm": 1.6661744117736816, + "learning_rate": 3.829958540187605e-05, + "loss": 21.5146, + "step": 657 + }, + { + "epoch": 0.15859243191130393, + "grad_norm": 1.6136219501495361, + "learning_rate": 3.8293279451316065e-05, + "loss": 20.709, + "step": 658 + }, + { + "epoch": 0.15883345384429984, + "grad_norm": 1.7246195077896118, + "learning_rate": 3.828696235074374e-05, + "loss": 21.3059, + "step": 659 + }, + { + "epoch": 0.15907447577729572, + "grad_norm": 4.420144081115723, + "learning_rate": 3.828063410400943e-05, + "loss": 21.4689, + "step": 660 + }, + { + "epoch": 0.15931549771029163, + "grad_norm": 11.138218879699707, + "learning_rate": 3.82742947149703e-05, + "loss": 23.0402, + "step": 661 + }, + { + "epoch": 0.15955651964328754, + "grad_norm": 20.072616577148438, + "learning_rate": 3.826794418749031e-05, + "loss": 22.4638, + "step": 662 + }, + { + "epoch": 0.15979754157628345, + "grad_norm": 24.446651458740234, + "learning_rate": 3.826158252544017e-05, + "loss": 23.453, + "step": 663 + }, + { + "epoch": 0.16003856350927934, + "grad_norm": 28.699077606201172, + "learning_rate": 3.8255209732697415e-05, + "loss": 23.5261, + "step": 664 + }, + { + "epoch": 0.16027958544227525, + "grad_norm": 33.8259162902832, + "learning_rate": 3.824882581314636e-05, + "loss": 22.7255, + "step": 665 + }, + { + "epoch": 0.16052060737527116, + "grad_norm": 36.82908630371094, + "learning_rate": 3.824243077067807e-05, + "loss": 22.7934, + "step": 666 + }, + { + "epoch": 0.16076162930826704, + "grad_norm": 34.831321716308594, + "learning_rate": 3.8236024609190436e-05, + "loss": 23.9019, + "step": 667 + }, + { + "epoch": 0.16100265124126295, + "grad_norm": 31.9533748626709, + "learning_rate": 3.8229607332588107e-05, + "loss": 24.5216, + "step": 668 + }, + { + "epoch": 0.16124367317425886, + "grad_norm": 28.82454490661621, + "learning_rate": 3.8223178944782484e-05, + "loss": 23.0535, + "step": 669 + }, + { + "epoch": 0.16148469510725477, + "grad_norm": 27.838937759399414, + "learning_rate": 3.821673944969177e-05, + "loss": 23.7334, + "step": 670 + }, + { + "epoch": 0.16172571704025065, + "grad_norm": 20.085338592529297, + "learning_rate": 3.821028885124094e-05, + "loss": 22.8132, + "step": 671 + }, + { + "epoch": 0.16196673897324657, + "grad_norm": 11.792143821716309, + "learning_rate": 3.820382715336171e-05, + "loss": 22.1957, + "step": 672 + }, + { + "epoch": 0.16220776090624248, + "grad_norm": 8.066630363464355, + "learning_rate": 3.819735435999257e-05, + "loss": 21.7722, + "step": 673 + }, + { + "epoch": 0.16244878283923836, + "grad_norm": 6.283244609832764, + "learning_rate": 3.8190870475078796e-05, + "loss": 21.8004, + "step": 674 + }, + { + "epoch": 0.16268980477223427, + "grad_norm": 2.8234148025512695, + "learning_rate": 3.81843755025724e-05, + "loss": 21.6177, + "step": 675 + }, + { + "epoch": 0.16293082670523018, + "grad_norm": 2.503181219100952, + "learning_rate": 3.8177869446432165e-05, + "loss": 22.5374, + "step": 676 + }, + { + "epoch": 0.1631718486382261, + "grad_norm": 2.4417335987091064, + "learning_rate": 3.817135231062361e-05, + "loss": 22.1042, + "step": 677 + }, + { + "epoch": 0.16341287057122197, + "grad_norm": 8.5792818069458, + "learning_rate": 3.816482409911903e-05, + "loss": 22.3949, + "step": 678 + }, + { + "epoch": 0.16365389250421788, + "grad_norm": 19.23118019104004, + "learning_rate": 3.815828481589748e-05, + "loss": 21.7052, + "step": 679 + }, + { + "epoch": 0.1638949144372138, + "grad_norm": 22.190013885498047, + "learning_rate": 3.815173446494471e-05, + "loss": 22.5652, + "step": 680 + }, + { + "epoch": 0.16413593637020968, + "grad_norm": 24.93637466430664, + "learning_rate": 3.8145173050253274e-05, + "loss": 22.8322, + "step": 681 + }, + { + "epoch": 0.1643769583032056, + "grad_norm": 26.690996170043945, + "learning_rate": 3.8138600575822445e-05, + "loss": 22.9349, + "step": 682 + }, + { + "epoch": 0.1646179802362015, + "grad_norm": 30.068904876708984, + "learning_rate": 3.8132017045658234e-05, + "loss": 22.7489, + "step": 683 + }, + { + "epoch": 0.1648590021691974, + "grad_norm": 30.451396942138672, + "learning_rate": 3.81254224637734e-05, + "loss": 23.02, + "step": 684 + }, + { + "epoch": 0.1651000241021933, + "grad_norm": 25.083675384521484, + "learning_rate": 3.811881683418744e-05, + "loss": 21.4762, + "step": 685 + }, + { + "epoch": 0.1653410460351892, + "grad_norm": 24.713106155395508, + "learning_rate": 3.811220016092656e-05, + "loss": 22.7285, + "step": 686 + }, + { + "epoch": 0.16558206796818511, + "grad_norm": 23.58387565612793, + "learning_rate": 3.8105572448023725e-05, + "loss": 21.8002, + "step": 687 + }, + { + "epoch": 0.165823089901181, + "grad_norm": 19.85442352294922, + "learning_rate": 3.809893369951863e-05, + "loss": 22.7327, + "step": 688 + }, + { + "epoch": 0.1660641118341769, + "grad_norm": 13.570194244384766, + "learning_rate": 3.8092283919457674e-05, + "loss": 22.2369, + "step": 689 + }, + { + "epoch": 0.16630513376717282, + "grad_norm": 8.647356033325195, + "learning_rate": 3.808562311189399e-05, + "loss": 22.1105, + "step": 690 + }, + { + "epoch": 0.1665461557001687, + "grad_norm": 6.450450897216797, + "learning_rate": 3.8078951280887454e-05, + "loss": 21.2693, + "step": 691 + }, + { + "epoch": 0.1667871776331646, + "grad_norm": 7.959214687347412, + "learning_rate": 3.8072268430504615e-05, + "loss": 21.733, + "step": 692 + }, + { + "epoch": 0.16702819956616052, + "grad_norm": 10.291971206665039, + "learning_rate": 3.806557456481878e-05, + "loss": 22.5987, + "step": 693 + }, + { + "epoch": 0.16726922149915643, + "grad_norm": 16.46646499633789, + "learning_rate": 3.805886968790995e-05, + "loss": 22.0142, + "step": 694 + }, + { + "epoch": 0.16751024343215232, + "grad_norm": 19.71506118774414, + "learning_rate": 3.805215380386484e-05, + "loss": 21.934, + "step": 695 + }, + { + "epoch": 0.16775126536514823, + "grad_norm": 22.317052841186523, + "learning_rate": 3.8045426916776886e-05, + "loss": 21.2411, + "step": 696 + }, + { + "epoch": 0.16799228729814414, + "grad_norm": 24.694547653198242, + "learning_rate": 3.8038689030746214e-05, + "loss": 22.6728, + "step": 697 + }, + { + "epoch": 0.16823330923114002, + "grad_norm": 26.834840774536133, + "learning_rate": 3.803194014987966e-05, + "loss": 23.2732, + "step": 698 + }, + { + "epoch": 0.16847433116413593, + "grad_norm": 29.96207046508789, + "learning_rate": 3.8025180278290766e-05, + "loss": 24.0627, + "step": 699 + }, + { + "epoch": 0.16871535309713184, + "grad_norm": 32.08803939819336, + "learning_rate": 3.8018409420099766e-05, + "loss": 23.7305, + "step": 700 + }, + { + "epoch": 0.16895637503012775, + "grad_norm": 30.34366226196289, + "learning_rate": 3.801162757943359e-05, + "loss": 22.7223, + "step": 701 + }, + { + "epoch": 0.16919739696312364, + "grad_norm": 28.463268280029297, + "learning_rate": 3.8004834760425874e-05, + "loss": 21.8991, + "step": 702 + }, + { + "epoch": 0.16943841889611955, + "grad_norm": 24.844825744628906, + "learning_rate": 3.799803096721693e-05, + "loss": 23.0329, + "step": 703 + }, + { + "epoch": 0.16967944082911546, + "grad_norm": 19.602275848388672, + "learning_rate": 3.799121620395378e-05, + "loss": 22.1545, + "step": 704 + }, + { + "epoch": 0.16992046276211134, + "grad_norm": 11.079865455627441, + "learning_rate": 3.7984390474790096e-05, + "loss": 22.2949, + "step": 705 + }, + { + "epoch": 0.17016148469510725, + "grad_norm": 10.393208503723145, + "learning_rate": 3.7977553783886274e-05, + "loss": 21.748, + "step": 706 + }, + { + "epoch": 0.17040250662810316, + "grad_norm": 3.69429087638855, + "learning_rate": 3.7970706135409366e-05, + "loss": 22.5099, + "step": 707 + }, + { + "epoch": 0.17064352856109907, + "grad_norm": 1.814676284790039, + "learning_rate": 3.796384753353312e-05, + "loss": 22.905, + "step": 708 + }, + { + "epoch": 0.17088455049409496, + "grad_norm": 5.8936872482299805, + "learning_rate": 3.795697798243793e-05, + "loss": 22.0416, + "step": 709 + }, + { + "epoch": 0.17112557242709087, + "grad_norm": 11.424642562866211, + "learning_rate": 3.795009748631091e-05, + "loss": 22.1946, + "step": 710 + }, + { + "epoch": 0.17136659436008678, + "grad_norm": 16.60214614868164, + "learning_rate": 3.79432060493458e-05, + "loss": 22.0025, + "step": 711 + }, + { + "epoch": 0.17160761629308266, + "grad_norm": 19.453977584838867, + "learning_rate": 3.793630367574304e-05, + "loss": 22.8748, + "step": 712 + }, + { + "epoch": 0.17184863822607857, + "grad_norm": 22.570951461791992, + "learning_rate": 3.792939036970972e-05, + "loss": 21.4096, + "step": 713 + }, + { + "epoch": 0.17208966015907448, + "grad_norm": 22.32449722290039, + "learning_rate": 3.79224661354596e-05, + "loss": 22.8406, + "step": 714 + }, + { + "epoch": 0.1723306820920704, + "grad_norm": 21.94466209411621, + "learning_rate": 3.791553097721309e-05, + "loss": 22.7606, + "step": 715 + }, + { + "epoch": 0.17257170402506627, + "grad_norm": 23.948270797729492, + "learning_rate": 3.790858489919728e-05, + "loss": 23.071, + "step": 716 + }, + { + "epoch": 0.17281272595806219, + "grad_norm": 19.707521438598633, + "learning_rate": 3.7901627905645896e-05, + "loss": 23.26, + "step": 717 + }, + { + "epoch": 0.1730537478910581, + "grad_norm": 16.9522705078125, + "learning_rate": 3.789466000079932e-05, + "loss": 22.1513, + "step": 718 + }, + { + "epoch": 0.17329476982405398, + "grad_norm": 17.568666458129883, + "learning_rate": 3.78876811889046e-05, + "loss": 22.2813, + "step": 719 + }, + { + "epoch": 0.1735357917570499, + "grad_norm": 11.588979721069336, + "learning_rate": 3.78806914742154e-05, + "loss": 21.4454, + "step": 720 + }, + { + "epoch": 0.1737768136900458, + "grad_norm": 10.664037704467773, + "learning_rate": 3.787369086099207e-05, + "loss": 22.0272, + "step": 721 + }, + { + "epoch": 0.17401783562304168, + "grad_norm": 7.009206295013428, + "learning_rate": 3.7866679353501574e-05, + "loss": 22.1129, + "step": 722 + }, + { + "epoch": 0.1742588575560376, + "grad_norm": 6.033772945404053, + "learning_rate": 3.785965695601753e-05, + "loss": 22.0184, + "step": 723 + }, + { + "epoch": 0.1744998794890335, + "grad_norm": 6.637179851531982, + "learning_rate": 3.785262367282018e-05, + "loss": 21.5298, + "step": 724 + }, + { + "epoch": 0.17474090142202942, + "grad_norm": 6.603823184967041, + "learning_rate": 3.784557950819642e-05, + "loss": 22.0998, + "step": 725 + }, + { + "epoch": 0.1749819233550253, + "grad_norm": 6.819653511047363, + "learning_rate": 3.783852446643976e-05, + "loss": 22.5907, + "step": 726 + }, + { + "epoch": 0.1752229452880212, + "grad_norm": 8.614082336425781, + "learning_rate": 3.783145855185035e-05, + "loss": 21.1546, + "step": 727 + }, + { + "epoch": 0.17546396722101712, + "grad_norm": 6.990750789642334, + "learning_rate": 3.782438176873496e-05, + "loss": 22.1254, + "step": 728 + }, + { + "epoch": 0.175704989154013, + "grad_norm": 6.768123626708984, + "learning_rate": 3.7817294121407e-05, + "loss": 20.4966, + "step": 729 + }, + { + "epoch": 0.1759460110870089, + "grad_norm": 3.214696168899536, + "learning_rate": 3.781019561418648e-05, + "loss": 20.8156, + "step": 730 + }, + { + "epoch": 0.17618703302000482, + "grad_norm": 1.9351860284805298, + "learning_rate": 3.780308625140005e-05, + "loss": 21.0712, + "step": 731 + }, + { + "epoch": 0.17642805495300073, + "grad_norm": 1.7574968338012695, + "learning_rate": 3.779596603738096e-05, + "loss": 21.0934, + "step": 732 + }, + { + "epoch": 0.17666907688599662, + "grad_norm": 1.7227331399917603, + "learning_rate": 3.7788834976469095e-05, + "loss": 20.1353, + "step": 733 + }, + { + "epoch": 0.17691009881899253, + "grad_norm": 3.3782894611358643, + "learning_rate": 3.778169307301091e-05, + "loss": 21.2254, + "step": 734 + }, + { + "epoch": 0.17715112075198844, + "grad_norm": 8.633805274963379, + "learning_rate": 3.7774540331359524e-05, + "loss": 21.3641, + "step": 735 + }, + { + "epoch": 0.17739214268498432, + "grad_norm": 12.667562484741211, + "learning_rate": 3.776737675587462e-05, + "loss": 21.0232, + "step": 736 + }, + { + "epoch": 0.17763316461798023, + "grad_norm": 17.941787719726562, + "learning_rate": 3.7760202350922494e-05, + "loss": 20.6507, + "step": 737 + }, + { + "epoch": 0.17787418655097614, + "grad_norm": 20.819538116455078, + "learning_rate": 3.7753017120876056e-05, + "loss": 21.2887, + "step": 738 + }, + { + "epoch": 0.17811520848397205, + "grad_norm": 24.191875457763672, + "learning_rate": 3.774582107011481e-05, + "loss": 21.6007, + "step": 739 + }, + { + "epoch": 0.17835623041696794, + "grad_norm": 27.19976234436035, + "learning_rate": 3.773861420302484e-05, + "loss": 22.5691, + "step": 740 + }, + { + "epoch": 0.17859725234996385, + "grad_norm": 26.437135696411133, + "learning_rate": 3.773139652399884e-05, + "loss": 22.0529, + "step": 741 + }, + { + "epoch": 0.17883827428295976, + "grad_norm": 29.443490982055664, + "learning_rate": 3.7724168037436084e-05, + "loss": 22.8792, + "step": 742 + }, + { + "epoch": 0.17907929621595564, + "grad_norm": 25.06574058532715, + "learning_rate": 3.771692874774243e-05, + "loss": 22.4343, + "step": 743 + }, + { + "epoch": 0.17932031814895155, + "grad_norm": 23.953807830810547, + "learning_rate": 3.770967865933035e-05, + "loss": 22.4813, + "step": 744 + }, + { + "epoch": 0.17956134008194746, + "grad_norm": 19.56885528564453, + "learning_rate": 3.770241777661885e-05, + "loss": 21.2266, + "step": 745 + }, + { + "epoch": 0.17980236201494337, + "grad_norm": 14.970714569091797, + "learning_rate": 3.7695146104033546e-05, + "loss": 21.6633, + "step": 746 + }, + { + "epoch": 0.18004338394793926, + "grad_norm": 11.121596336364746, + "learning_rate": 3.768786364600664e-05, + "loss": 20.838, + "step": 747 + }, + { + "epoch": 0.18028440588093517, + "grad_norm": 6.158586502075195, + "learning_rate": 3.768057040697688e-05, + "loss": 21.3234, + "step": 748 + }, + { + "epoch": 0.18052542781393108, + "grad_norm": 1.594093680381775, + "learning_rate": 3.7673266391389596e-05, + "loss": 21.2172, + "step": 749 + }, + { + "epoch": 0.18076644974692696, + "grad_norm": 2.2882637977600098, + "learning_rate": 3.76659516036967e-05, + "loss": 21.1584, + "step": 750 + }, + { + "epoch": 0.18100747167992287, + "grad_norm": 1.9874308109283447, + "learning_rate": 3.7658626048356654e-05, + "loss": 20.7031, + "step": 751 + }, + { + "epoch": 0.18124849361291878, + "grad_norm": 11.824000358581543, + "learning_rate": 3.765128972983448e-05, + "loss": 21.1796, + "step": 752 + }, + { + "epoch": 0.18148951554591466, + "grad_norm": 25.087800979614258, + "learning_rate": 3.764394265260178e-05, + "loss": 22.8578, + "step": 753 + }, + { + "epoch": 0.18173053747891058, + "grad_norm": 48.39779281616211, + "learning_rate": 3.763658482113669e-05, + "loss": 24.4768, + "step": 754 + }, + { + "epoch": 0.18197155941190649, + "grad_norm": 65.24878692626953, + "learning_rate": 3.762921623992392e-05, + "loss": 27.5446, + "step": 755 + }, + { + "epoch": 0.1822125813449024, + "grad_norm": 71.31907653808594, + "learning_rate": 3.762183691345472e-05, + "loss": 29.0114, + "step": 756 + }, + { + "epoch": 0.18245360327789828, + "grad_norm": 70.3539810180664, + "learning_rate": 3.76144468462269e-05, + "loss": 31.3894, + "step": 757 + }, + { + "epoch": 0.1826946252108942, + "grad_norm": 71.35221099853516, + "learning_rate": 3.76070460427448e-05, + "loss": 29.4727, + "step": 758 + }, + { + "epoch": 0.1829356471438901, + "grad_norm": 66.83491516113281, + "learning_rate": 3.759963450751933e-05, + "loss": 28.762, + "step": 759 + }, + { + "epoch": 0.18317666907688598, + "grad_norm": 60.034515380859375, + "learning_rate": 3.7592212245067904e-05, + "loss": 26.1163, + "step": 760 + }, + { + "epoch": 0.1834176910098819, + "grad_norm": 50.22133255004883, + "learning_rate": 3.758477925991452e-05, + "loss": 25.8671, + "step": 761 + }, + { + "epoch": 0.1836587129428778, + "grad_norm": 38.70512390136719, + "learning_rate": 3.7577335556589665e-05, + "loss": 24.1653, + "step": 762 + }, + { + "epoch": 0.18389973487587372, + "grad_norm": 27.275699615478516, + "learning_rate": 3.7569881139630395e-05, + "loss": 22.776, + "step": 763 + }, + { + "epoch": 0.1841407568088696, + "grad_norm": 15.525052070617676, + "learning_rate": 3.756241601358028e-05, + "loss": 23.0677, + "step": 764 + }, + { + "epoch": 0.1843817787418655, + "grad_norm": 10.72882080078125, + "learning_rate": 3.755494018298942e-05, + "loss": 22.5465, + "step": 765 + }, + { + "epoch": 0.18462280067486142, + "grad_norm": 2.634600877761841, + "learning_rate": 3.754745365241443e-05, + "loss": 22.2115, + "step": 766 + }, + { + "epoch": 0.1848638226078573, + "grad_norm": 1.9046841859817505, + "learning_rate": 3.753995642641847e-05, + "loss": 21.9572, + "step": 767 + }, + { + "epoch": 0.18510484454085321, + "grad_norm": 4.206916809082031, + "learning_rate": 3.7532448509571204e-05, + "loss": 20.6759, + "step": 768 + }, + { + "epoch": 0.18534586647384912, + "grad_norm": 8.363510131835938, + "learning_rate": 3.75249299064488e-05, + "loss": 21.7709, + "step": 769 + }, + { + "epoch": 0.18558688840684504, + "grad_norm": 11.81187915802002, + "learning_rate": 3.751740062163397e-05, + "loss": 21.8204, + "step": 770 + }, + { + "epoch": 0.18582791033984092, + "grad_norm": 17.604190826416016, + "learning_rate": 3.7509860659715916e-05, + "loss": 21.7737, + "step": 771 + }, + { + "epoch": 0.18606893227283683, + "grad_norm": 16.235673904418945, + "learning_rate": 3.750231002529033e-05, + "loss": 22.1272, + "step": 772 + }, + { + "epoch": 0.18630995420583274, + "grad_norm": 19.86736297607422, + "learning_rate": 3.749474872295946e-05, + "loss": 22.1305, + "step": 773 + }, + { + "epoch": 0.18655097613882862, + "grad_norm": 22.33451271057129, + "learning_rate": 3.7487176757332006e-05, + "loss": 21.9478, + "step": 774 + }, + { + "epoch": 0.18679199807182453, + "grad_norm": 23.71294593811035, + "learning_rate": 3.747959413302321e-05, + "loss": 21.7048, + "step": 775 + }, + { + "epoch": 0.18703302000482044, + "grad_norm": 18.09027099609375, + "learning_rate": 3.7472000854654756e-05, + "loss": 21.3417, + "step": 776 + }, + { + "epoch": 0.18727404193781635, + "grad_norm": 16.114736557006836, + "learning_rate": 3.746439692685488e-05, + "loss": 22.2453, + "step": 777 + }, + { + "epoch": 0.18751506387081224, + "grad_norm": 13.427240371704102, + "learning_rate": 3.7456782354258284e-05, + "loss": 21.1073, + "step": 778 + }, + { + "epoch": 0.18775608580380815, + "grad_norm": 14.862345695495605, + "learning_rate": 3.744915714150614e-05, + "loss": 21.3811, + "step": 779 + }, + { + "epoch": 0.18799710773680406, + "grad_norm": 6.597433567047119, + "learning_rate": 3.7441521293246135e-05, + "loss": 21.7575, + "step": 780 + }, + { + "epoch": 0.18823812966979994, + "grad_norm": 6.084556579589844, + "learning_rate": 3.743387481413243e-05, + "loss": 21.6616, + "step": 781 + }, + { + "epoch": 0.18847915160279585, + "grad_norm": 4.534776210784912, + "learning_rate": 3.742621770882565e-05, + "loss": 21.0127, + "step": 782 + }, + { + "epoch": 0.18872017353579176, + "grad_norm": 3.0846433639526367, + "learning_rate": 3.741854998199292e-05, + "loss": 20.5795, + "step": 783 + }, + { + "epoch": 0.18896119546878765, + "grad_norm": 2.5708165168762207, + "learning_rate": 3.741087163830781e-05, + "loss": 22.2097, + "step": 784 + }, + { + "epoch": 0.18920221740178356, + "grad_norm": 14.750458717346191, + "learning_rate": 3.740318268245041e-05, + "loss": 21.3423, + "step": 785 + }, + { + "epoch": 0.18944323933477947, + "grad_norm": 4.944022178649902, + "learning_rate": 3.739548311910722e-05, + "loss": 22.1949, + "step": 786 + }, + { + "epoch": 0.18968426126777538, + "grad_norm": 21.416227340698242, + "learning_rate": 3.738777295297125e-05, + "loss": 22.3334, + "step": 787 + }, + { + "epoch": 0.18992528320077126, + "grad_norm": 28.992021560668945, + "learning_rate": 3.738005218874194e-05, + "loss": 22.699, + "step": 788 + }, + { + "epoch": 0.19016630513376717, + "grad_norm": 44.88636016845703, + "learning_rate": 3.737232083112521e-05, + "loss": 23.655, + "step": 789 + }, + { + "epoch": 0.19040732706676308, + "grad_norm": 51.933109283447266, + "learning_rate": 3.736457888483344e-05, + "loss": 25.7663, + "step": 790 + }, + { + "epoch": 0.19064834899975897, + "grad_norm": 61.288856506347656, + "learning_rate": 3.7356826354585446e-05, + "loss": 26.5929, + "step": 791 + }, + { + "epoch": 0.19088937093275488, + "grad_norm": 63.34414291381836, + "learning_rate": 3.734906324510651e-05, + "loss": 27.2456, + "step": 792 + }, + { + "epoch": 0.1911303928657508, + "grad_norm": 59.83623504638672, + "learning_rate": 3.734128956112836e-05, + "loss": 27.1968, + "step": 793 + }, + { + "epoch": 0.1913714147987467, + "grad_norm": 50.08164596557617, + "learning_rate": 3.733350530738915e-05, + "loss": 24.7509, + "step": 794 + }, + { + "epoch": 0.19161243673174258, + "grad_norm": 43.74742126464844, + "learning_rate": 3.732571048863351e-05, + "loss": 25.0161, + "step": 795 + }, + { + "epoch": 0.1918534586647385, + "grad_norm": 33.93825912475586, + "learning_rate": 3.731790510961249e-05, + "loss": 24.5673, + "step": 796 + }, + { + "epoch": 0.1920944805977344, + "grad_norm": 22.395761489868164, + "learning_rate": 3.731008917508357e-05, + "loss": 23.084, + "step": 797 + }, + { + "epoch": 0.19233550253073028, + "grad_norm": 16.649518966674805, + "learning_rate": 3.730226268981068e-05, + "loss": 22.9752, + "step": 798 + }, + { + "epoch": 0.1925765244637262, + "grad_norm": 8.958707809448242, + "learning_rate": 3.729442565856417e-05, + "loss": 22.1614, + "step": 799 + }, + { + "epoch": 0.1928175463967221, + "grad_norm": 5.250572204589844, + "learning_rate": 3.7286578086120824e-05, + "loss": 22.027, + "step": 800 + }, + { + "epoch": 0.1928175463967221, + "eval_cc_pretrain_accuracy": 0.8799999952316284, + "eval_cc_pretrain_loss": 2.166408061981201, + "eval_cc_pretrain_num_cand": 800.0, + "eval_cc_pretrain_runtime": 89.1473, + "eval_cc_pretrain_samples_per_second": 1.122, + "eval_cc_pretrain_steps_per_second": 0.011, + "eval_cc_pretrain_temperature": 0.06982421875, + "step": 800 + }, + { + "epoch": 0.1928175463967221, + "eval_mscoco_pretrain_accuracy": 0.8100000023841858, + "eval_mscoco_pretrain_loss": 2.4056732654571533, + "eval_mscoco_pretrain_num_cand": 800.0, + "eval_mscoco_pretrain_runtime": 14.0606, + "eval_mscoco_pretrain_samples_per_second": 7.112, + "eval_mscoco_pretrain_steps_per_second": 0.071, + "eval_mscoco_pretrain_temperature": 0.06982421875, + "step": 800 + }, + { + "epoch": 0.19305856832971802, + "grad_norm": 5.869163513183594, + "learning_rate": 3.727871997726385e-05, + "loss": 21.7932, + "step": 801 + }, + { + "epoch": 0.1932995902627139, + "grad_norm": 4.520666122436523, + "learning_rate": 3.727085133678286e-05, + "loss": 21.6059, + "step": 802 + }, + { + "epoch": 0.1935406121957098, + "grad_norm": 5.400993824005127, + "learning_rate": 3.726297216947393e-05, + "loss": 21.844, + "step": 803 + }, + { + "epoch": 0.19378163412870572, + "grad_norm": 10.349540710449219, + "learning_rate": 3.72550824801395e-05, + "loss": 21.048, + "step": 804 + }, + { + "epoch": 0.1940226560617016, + "grad_norm": 14.205220222473145, + "learning_rate": 3.724718227358845e-05, + "loss": 21.849, + "step": 805 + }, + { + "epoch": 0.19426367799469751, + "grad_norm": 17.43669891357422, + "learning_rate": 3.7239271554636085e-05, + "loss": 21.7727, + "step": 806 + }, + { + "epoch": 0.19450469992769343, + "grad_norm": 19.928180694580078, + "learning_rate": 3.723135032810409e-05, + "loss": 22.3003, + "step": 807 + }, + { + "epoch": 0.19474572186068934, + "grad_norm": 20.42397689819336, + "learning_rate": 3.722341859882055e-05, + "loss": 22.2613, + "step": 808 + }, + { + "epoch": 0.19498674379368522, + "grad_norm": 22.04918098449707, + "learning_rate": 3.721547637161999e-05, + "loss": 21.3907, + "step": 809 + }, + { + "epoch": 0.19522776572668113, + "grad_norm": 21.427087783813477, + "learning_rate": 3.7207523651343293e-05, + "loss": 21.8909, + "step": 810 + }, + { + "epoch": 0.19546878765967704, + "grad_norm": 21.42592430114746, + "learning_rate": 3.719956044283776e-05, + "loss": 21.6939, + "step": 811 + }, + { + "epoch": 0.19570980959267292, + "grad_norm": 16.814210891723633, + "learning_rate": 3.7191586750957076e-05, + "loss": 22.3231, + "step": 812 + }, + { + "epoch": 0.19595083152566883, + "grad_norm": 16.694990158081055, + "learning_rate": 3.718360258056133e-05, + "loss": 22.1439, + "step": 813 + }, + { + "epoch": 0.19619185345866474, + "grad_norm": 10.851543426513672, + "learning_rate": 3.7175607936516967e-05, + "loss": 21.5501, + "step": 814 + }, + { + "epoch": 0.19643287539166063, + "grad_norm": 7.846954822540283, + "learning_rate": 3.7167602823696856e-05, + "loss": 22.0426, + "step": 815 + }, + { + "epoch": 0.19667389732465654, + "grad_norm": 4.274360179901123, + "learning_rate": 3.715958724698022e-05, + "loss": 22.1751, + "step": 816 + }, + { + "epoch": 0.19691491925765245, + "grad_norm": 2.3689184188842773, + "learning_rate": 3.715156121125265e-05, + "loss": 22.0063, + "step": 817 + }, + { + "epoch": 0.19715594119064836, + "grad_norm": 3.5172295570373535, + "learning_rate": 3.714352472140616e-05, + "loss": 20.6182, + "step": 818 + }, + { + "epoch": 0.19739696312364424, + "grad_norm": 2.32663893699646, + "learning_rate": 3.713547778233907e-05, + "loss": 21.3083, + "step": 819 + }, + { + "epoch": 0.19763798505664015, + "grad_norm": 6.056518077850342, + "learning_rate": 3.712742039895613e-05, + "loss": 21.1689, + "step": 820 + }, + { + "epoch": 0.19787900698963606, + "grad_norm": 3.4681448936462402, + "learning_rate": 3.711935257616842e-05, + "loss": 20.9934, + "step": 821 + }, + { + "epoch": 0.19812002892263195, + "grad_norm": 8.75568675994873, + "learning_rate": 3.711127431889337e-05, + "loss": 21.7797, + "step": 822 + }, + { + "epoch": 0.19836105085562786, + "grad_norm": 10.47391414642334, + "learning_rate": 3.710318563205483e-05, + "loss": 21.3937, + "step": 823 + }, + { + "epoch": 0.19860207278862377, + "grad_norm": 18.809553146362305, + "learning_rate": 3.709508652058295e-05, + "loss": 20.6569, + "step": 824 + }, + { + "epoch": 0.19884309472161968, + "grad_norm": 17.185203552246094, + "learning_rate": 3.708697698941425e-05, + "loss": 20.5954, + "step": 825 + }, + { + "epoch": 0.19908411665461556, + "grad_norm": 21.055828094482422, + "learning_rate": 3.707885704349161e-05, + "loss": 21.6438, + "step": 826 + }, + { + "epoch": 0.19932513858761147, + "grad_norm": 22.80304718017578, + "learning_rate": 3.7070726687764246e-05, + "loss": 21.1143, + "step": 827 + }, + { + "epoch": 0.19956616052060738, + "grad_norm": 22.047523498535156, + "learning_rate": 3.7062585927187735e-05, + "loss": 20.916, + "step": 828 + }, + { + "epoch": 0.19980718245360327, + "grad_norm": 24.065034866333008, + "learning_rate": 3.705443476672398e-05, + "loss": 22.2895, + "step": 829 + }, + { + "epoch": 0.20004820438659918, + "grad_norm": 22.560169219970703, + "learning_rate": 3.7046273211341216e-05, + "loss": 21.8009, + "step": 830 + }, + { + "epoch": 0.2002892263195951, + "grad_norm": 20.060279846191406, + "learning_rate": 3.703810126601405e-05, + "loss": 21.0667, + "step": 831 + }, + { + "epoch": 0.200530248252591, + "grad_norm": 17.945438385009766, + "learning_rate": 3.7029918935723374e-05, + "loss": 22.4379, + "step": 832 + }, + { + "epoch": 0.20077127018558688, + "grad_norm": 14.320589065551758, + "learning_rate": 3.7021726225456456e-05, + "loss": 21.1856, + "step": 833 + }, + { + "epoch": 0.2010122921185828, + "grad_norm": 12.097529411315918, + "learning_rate": 3.701352314020685e-05, + "loss": 20.5788, + "step": 834 + }, + { + "epoch": 0.2012533140515787, + "grad_norm": 5.979852199554443, + "learning_rate": 3.7005309684974464e-05, + "loss": 21.3946, + "step": 835 + }, + { + "epoch": 0.20149433598457459, + "grad_norm": 4.836956977844238, + "learning_rate": 3.699708586476551e-05, + "loss": 20.9766, + "step": 836 + }, + { + "epoch": 0.2017353579175705, + "grad_norm": 1.612532377243042, + "learning_rate": 3.698885168459254e-05, + "loss": 21.2638, + "step": 837 + }, + { + "epoch": 0.2019763798505664, + "grad_norm": 1.7711461782455444, + "learning_rate": 3.698060714947437e-05, + "loss": 20.6942, + "step": 838 + }, + { + "epoch": 0.20221740178356232, + "grad_norm": 1.9883495569229126, + "learning_rate": 3.6972352264436185e-05, + "loss": 20.8827, + "step": 839 + }, + { + "epoch": 0.2024584237165582, + "grad_norm": 7.466677188873291, + "learning_rate": 3.696408703450945e-05, + "loss": 21.0814, + "step": 840 + }, + { + "epoch": 0.2026994456495541, + "grad_norm": 9.373282432556152, + "learning_rate": 3.695581146473193e-05, + "loss": 22.2259, + "step": 841 + }, + { + "epoch": 0.20294046758255002, + "grad_norm": 14.839093208312988, + "learning_rate": 3.694752556014771e-05, + "loss": 21.0646, + "step": 842 + }, + { + "epoch": 0.2031814895155459, + "grad_norm": 23.69793128967285, + "learning_rate": 3.6939229325807164e-05, + "loss": 21.9857, + "step": 843 + }, + { + "epoch": 0.20342251144854182, + "grad_norm": 29.56319236755371, + "learning_rate": 3.6930922766766976e-05, + "loss": 22.1703, + "step": 844 + }, + { + "epoch": 0.20366353338153773, + "grad_norm": 33.31260299682617, + "learning_rate": 3.692260588809009e-05, + "loss": 22.6728, + "step": 845 + }, + { + "epoch": 0.2039045553145336, + "grad_norm": 37.3075065612793, + "learning_rate": 3.691427869484577e-05, + "loss": 22.0923, + "step": 846 + }, + { + "epoch": 0.20414557724752952, + "grad_norm": 37.30816650390625, + "learning_rate": 3.690594119210957e-05, + "loss": 23.3346, + "step": 847 + }, + { + "epoch": 0.20438659918052543, + "grad_norm": 37.56706237792969, + "learning_rate": 3.6897593384963286e-05, + "loss": 22.8104, + "step": 848 + }, + { + "epoch": 0.20462762111352134, + "grad_norm": 35.06133270263672, + "learning_rate": 3.6889235278495056e-05, + "loss": 23.152, + "step": 849 + }, + { + "epoch": 0.20486864304651722, + "grad_norm": 34.81103515625, + "learning_rate": 3.688086687779925e-05, + "loss": 22.0126, + "step": 850 + }, + { + "epoch": 0.20510966497951313, + "grad_norm": 30.684978485107422, + "learning_rate": 3.687248818797652e-05, + "loss": 22.722, + "step": 851 + }, + { + "epoch": 0.20535068691250905, + "grad_norm": 24.325437545776367, + "learning_rate": 3.6864099214133814e-05, + "loss": 22.1306, + "step": 852 + }, + { + "epoch": 0.20559170884550493, + "grad_norm": 20.07086753845215, + "learning_rate": 3.685569996138431e-05, + "loss": 21.4741, + "step": 853 + }, + { + "epoch": 0.20583273077850084, + "grad_norm": 12.597243309020996, + "learning_rate": 3.6847290434847485e-05, + "loss": 21.9686, + "step": 854 + }, + { + "epoch": 0.20607375271149675, + "grad_norm": 11.477411270141602, + "learning_rate": 3.683887063964906e-05, + "loss": 21.386, + "step": 855 + }, + { + "epoch": 0.20631477464449266, + "grad_norm": 7.649277687072754, + "learning_rate": 3.683044058092102e-05, + "loss": 21.9083, + "step": 856 + }, + { + "epoch": 0.20655579657748854, + "grad_norm": 1.776198148727417, + "learning_rate": 3.68220002638016e-05, + "loss": 21.3838, + "step": 857 + }, + { + "epoch": 0.20679681851048445, + "grad_norm": 2.56734299659729, + "learning_rate": 3.68135496934353e-05, + "loss": 22.3454, + "step": 858 + }, + { + "epoch": 0.20703784044348036, + "grad_norm": 6.8171706199646, + "learning_rate": 3.680508887497286e-05, + "loss": 20.9641, + "step": 859 + }, + { + "epoch": 0.20727886237647625, + "grad_norm": 9.168807983398438, + "learning_rate": 3.679661781357127e-05, + "loss": 20.5635, + "step": 860 + }, + { + "epoch": 0.20751988430947216, + "grad_norm": 9.37685489654541, + "learning_rate": 3.678813651439376e-05, + "loss": 21.6594, + "step": 861 + }, + { + "epoch": 0.20776090624246807, + "grad_norm": 17.446382522583008, + "learning_rate": 3.677964498260979e-05, + "loss": 21.531, + "step": 862 + }, + { + "epoch": 0.20800192817546398, + "grad_norm": 21.31409454345703, + "learning_rate": 3.6771143223395076e-05, + "loss": 21.5732, + "step": 863 + }, + { + "epoch": 0.20824295010845986, + "grad_norm": 20.07172203063965, + "learning_rate": 3.676263124193158e-05, + "loss": 21.5875, + "step": 864 + }, + { + "epoch": 0.20848397204145577, + "grad_norm": 20.935775756835938, + "learning_rate": 3.675410904340745e-05, + "loss": 21.2708, + "step": 865 + }, + { + "epoch": 0.20872499397445168, + "grad_norm": 24.310409545898438, + "learning_rate": 3.6745576633017094e-05, + "loss": 21.9907, + "step": 866 + }, + { + "epoch": 0.20896601590744757, + "grad_norm": 22.56197166442871, + "learning_rate": 3.673703401596114e-05, + "loss": 21.8748, + "step": 867 + }, + { + "epoch": 0.20920703784044348, + "grad_norm": 18.69454574584961, + "learning_rate": 3.672848119744642e-05, + "loss": 21.6945, + "step": 868 + }, + { + "epoch": 0.2094480597734394, + "grad_norm": 16.82071304321289, + "learning_rate": 3.671991818268602e-05, + "loss": 21.7458, + "step": 869 + }, + { + "epoch": 0.2096890817064353, + "grad_norm": 14.270866394042969, + "learning_rate": 3.671134497689921e-05, + "loss": 21.8084, + "step": 870 + }, + { + "epoch": 0.20993010363943118, + "grad_norm": 9.36831283569336, + "learning_rate": 3.670276158531147e-05, + "loss": 22.0947, + "step": 871 + }, + { + "epoch": 0.2101711255724271, + "grad_norm": 8.695694923400879, + "learning_rate": 3.6694168013154514e-05, + "loss": 21.2832, + "step": 872 + }, + { + "epoch": 0.210412147505423, + "grad_norm": 5.749298572540283, + "learning_rate": 3.668556426566624e-05, + "loss": 20.6506, + "step": 873 + }, + { + "epoch": 0.21065316943841889, + "grad_norm": 5.087028503417969, + "learning_rate": 3.667695034809074e-05, + "loss": 20.4068, + "step": 874 + }, + { + "epoch": 0.2108941913714148, + "grad_norm": 1.6642216444015503, + "learning_rate": 3.6668326265678335e-05, + "loss": 20.5114, + "step": 875 + }, + { + "epoch": 0.2111352133044107, + "grad_norm": 2.0640783309936523, + "learning_rate": 3.6659692023685524e-05, + "loss": 20.9862, + "step": 876 + }, + { + "epoch": 0.21137623523740662, + "grad_norm": 1.5853240489959717, + "learning_rate": 3.665104762737499e-05, + "loss": 20.8021, + "step": 877 + }, + { + "epoch": 0.2116172571704025, + "grad_norm": 3.6520843505859375, + "learning_rate": 3.6642393082015626e-05, + "loss": 21.3604, + "step": 878 + }, + { + "epoch": 0.2118582791033984, + "grad_norm": 14.956293106079102, + "learning_rate": 3.663372839288249e-05, + "loss": 21.1431, + "step": 879 + }, + { + "epoch": 0.21209930103639432, + "grad_norm": 18.310239791870117, + "learning_rate": 3.662505356525683e-05, + "loss": 20.5168, + "step": 880 + }, + { + "epoch": 0.2123403229693902, + "grad_norm": 27.563657760620117, + "learning_rate": 3.66163686044261e-05, + "loss": 21.9958, + "step": 881 + }, + { + "epoch": 0.21258134490238612, + "grad_norm": 40.807979583740234, + "learning_rate": 3.6607673515683875e-05, + "loss": 22.8068, + "step": 882 + }, + { + "epoch": 0.21282236683538203, + "grad_norm": 45.815093994140625, + "learning_rate": 3.659896830432995e-05, + "loss": 22.8018, + "step": 883 + }, + { + "epoch": 0.2130633887683779, + "grad_norm": 48.81721496582031, + "learning_rate": 3.659025297567028e-05, + "loss": 23.1674, + "step": 884 + }, + { + "epoch": 0.21330441070137382, + "grad_norm": 50.32554244995117, + "learning_rate": 3.6581527535016966e-05, + "loss": 23.3149, + "step": 885 + }, + { + "epoch": 0.21354543263436973, + "grad_norm": 52.82823181152344, + "learning_rate": 3.65727919876883e-05, + "loss": 24.2589, + "step": 886 + }, + { + "epoch": 0.21378645456736564, + "grad_norm": 49.58094024658203, + "learning_rate": 3.656404633900871e-05, + "loss": 23.8482, + "step": 887 + }, + { + "epoch": 0.21402747650036152, + "grad_norm": 51.08318328857422, + "learning_rate": 3.655529059430881e-05, + "loss": 22.9739, + "step": 888 + }, + { + "epoch": 0.21426849843335743, + "grad_norm": 43.06972885131836, + "learning_rate": 3.654652475892533e-05, + "loss": 24.0088, + "step": 889 + }, + { + "epoch": 0.21450952036635335, + "grad_norm": 37.5738410949707, + "learning_rate": 3.6537748838201184e-05, + "loss": 22.765, + "step": 890 + }, + { + "epoch": 0.21475054229934923, + "grad_norm": 32.31671905517578, + "learning_rate": 3.6528962837485414e-05, + "loss": 22.8902, + "step": 891 + }, + { + "epoch": 0.21499156423234514, + "grad_norm": 22.824617385864258, + "learning_rate": 3.652016676213321e-05, + "loss": 21.5107, + "step": 892 + }, + { + "epoch": 0.21523258616534105, + "grad_norm": 15.791105270385742, + "learning_rate": 3.651136061750592e-05, + "loss": 21.6994, + "step": 893 + }, + { + "epoch": 0.21547360809833696, + "grad_norm": 9.220160484313965, + "learning_rate": 3.6502544408970985e-05, + "loss": 21.4072, + "step": 894 + }, + { + "epoch": 0.21571463003133284, + "grad_norm": 1.815979242324829, + "learning_rate": 3.649371814190203e-05, + "loss": 21.1899, + "step": 895 + }, + { + "epoch": 0.21595565196432875, + "grad_norm": 6.142794609069824, + "learning_rate": 3.6484881821678786e-05, + "loss": 21.8995, + "step": 896 + }, + { + "epoch": 0.21619667389732466, + "grad_norm": 7.2052202224731445, + "learning_rate": 3.647603545368711e-05, + "loss": 21.4858, + "step": 897 + }, + { + "epoch": 0.21643769583032055, + "grad_norm": 3.2237374782562256, + "learning_rate": 3.6467179043319e-05, + "loss": 21.9591, + "step": 898 + }, + { + "epoch": 0.21667871776331646, + "grad_norm": 1.4357439279556274, + "learning_rate": 3.645831259597255e-05, + "loss": 20.8669, + "step": 899 + }, + { + "epoch": 0.21691973969631237, + "grad_norm": 8.675867080688477, + "learning_rate": 3.644943611705199e-05, + "loss": 20.5105, + "step": 900 + }, + { + "epoch": 0.21716076162930828, + "grad_norm": 19.192522048950195, + "learning_rate": 3.6440549611967656e-05, + "loss": 20.9417, + "step": 901 + }, + { + "epoch": 0.21740178356230416, + "grad_norm": 25.062986373901367, + "learning_rate": 3.6431653086136e-05, + "loss": 21.4274, + "step": 902 + }, + { + "epoch": 0.21764280549530007, + "grad_norm": 32.06208801269531, + "learning_rate": 3.642274654497958e-05, + "loss": 21.9505, + "step": 903 + }, + { + "epoch": 0.21788382742829598, + "grad_norm": 37.31444549560547, + "learning_rate": 3.641382999392707e-05, + "loss": 23.0661, + "step": 904 + }, + { + "epoch": 0.21812484936129187, + "grad_norm": 42.57569122314453, + "learning_rate": 3.640490343841322e-05, + "loss": 22.3597, + "step": 905 + }, + { + "epoch": 0.21836587129428778, + "grad_norm": 41.07797622680664, + "learning_rate": 3.639596688387889e-05, + "loss": 22.4857, + "step": 906 + }, + { + "epoch": 0.2186068932272837, + "grad_norm": 37.33319854736328, + "learning_rate": 3.638702033577105e-05, + "loss": 22.8874, + "step": 907 + }, + { + "epoch": 0.2188479151602796, + "grad_norm": 32.21171569824219, + "learning_rate": 3.637806379954273e-05, + "loss": 22.6383, + "step": 908 + }, + { + "epoch": 0.21908893709327548, + "grad_norm": 23.110246658325195, + "learning_rate": 3.636909728065309e-05, + "loss": 22.3739, + "step": 909 + }, + { + "epoch": 0.2193299590262714, + "grad_norm": 16.895479202270508, + "learning_rate": 3.636012078456732e-05, + "loss": 21.3967, + "step": 910 + }, + { + "epoch": 0.2195709809592673, + "grad_norm": 11.63868236541748, + "learning_rate": 3.6351134316756744e-05, + "loss": 20.9016, + "step": 911 + }, + { + "epoch": 0.2198120028922632, + "grad_norm": 2.457843780517578, + "learning_rate": 3.634213788269873e-05, + "loss": 20.4118, + "step": 912 + }, + { + "epoch": 0.2200530248252591, + "grad_norm": 5.580713748931885, + "learning_rate": 3.6333131487876745e-05, + "loss": 20.7661, + "step": 913 + }, + { + "epoch": 0.220294046758255, + "grad_norm": 11.97103214263916, + "learning_rate": 3.63241151377803e-05, + "loss": 22.0967, + "step": 914 + }, + { + "epoch": 0.2205350686912509, + "grad_norm": 17.604930877685547, + "learning_rate": 3.631508883790498e-05, + "loss": 20.9359, + "step": 915 + }, + { + "epoch": 0.2207760906242468, + "grad_norm": 12.8815336227417, + "learning_rate": 3.630605259375247e-05, + "loss": 20.9112, + "step": 916 + }, + { + "epoch": 0.2210171125572427, + "grad_norm": 4.970508575439453, + "learning_rate": 3.629700641083046e-05, + "loss": 21.0697, + "step": 917 + }, + { + "epoch": 0.22125813449023862, + "grad_norm": 2.208366632461548, + "learning_rate": 3.628795029465275e-05, + "loss": 21.5142, + "step": 918 + }, + { + "epoch": 0.2214991564232345, + "grad_norm": 7.31242036819458, + "learning_rate": 3.627888425073915e-05, + "loss": 20.5457, + "step": 919 + }, + { + "epoch": 0.22174017835623042, + "grad_norm": 19.3222713470459, + "learning_rate": 3.6269808284615564e-05, + "loss": 22.0854, + "step": 920 + }, + { + "epoch": 0.22198120028922633, + "grad_norm": 25.554269790649414, + "learning_rate": 3.62607224018139e-05, + "loss": 21.7646, + "step": 921 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 32.82530975341797, + "learning_rate": 3.625162660787214e-05, + "loss": 22.6456, + "step": 922 + }, + { + "epoch": 0.22246324415521812, + "grad_norm": 34.567626953125, + "learning_rate": 3.62425209083343e-05, + "loss": 22.4692, + "step": 923 + }, + { + "epoch": 0.22270426608821403, + "grad_norm": 39.08049392700195, + "learning_rate": 3.6233405308750446e-05, + "loss": 23.6224, + "step": 924 + }, + { + "epoch": 0.22294528802120994, + "grad_norm": 37.83222198486328, + "learning_rate": 3.622427981467664e-05, + "loss": 22.8781, + "step": 925 + }, + { + "epoch": 0.22318630995420582, + "grad_norm": 36.07973861694336, + "learning_rate": 3.6215144431675005e-05, + "loss": 23.1247, + "step": 926 + }, + { + "epoch": 0.22342733188720174, + "grad_norm": 29.961841583251953, + "learning_rate": 3.620599916531371e-05, + "loss": 23.1098, + "step": 927 + }, + { + "epoch": 0.22366835382019765, + "grad_norm": 25.833627700805664, + "learning_rate": 3.619684402116689e-05, + "loss": 21.8955, + "step": 928 + }, + { + "epoch": 0.22390937575319353, + "grad_norm": 21.096147537231445, + "learning_rate": 3.618767900481476e-05, + "loss": 22.6461, + "step": 929 + }, + { + "epoch": 0.22415039768618944, + "grad_norm": 17.73729705810547, + "learning_rate": 3.617850412184352e-05, + "loss": 22.4857, + "step": 930 + }, + { + "epoch": 0.22439141961918535, + "grad_norm": 8.917638778686523, + "learning_rate": 3.616931937784538e-05, + "loss": 22.3732, + "step": 931 + }, + { + "epoch": 0.22463244155218126, + "grad_norm": 1.6201690435409546, + "learning_rate": 3.616012477841858e-05, + "loss": 22.3905, + "step": 932 + }, + { + "epoch": 0.22487346348517714, + "grad_norm": 3.5188074111938477, + "learning_rate": 3.615092032916736e-05, + "loss": 21.6918, + "step": 933 + }, + { + "epoch": 0.22511448541817305, + "grad_norm": 4.96296501159668, + "learning_rate": 3.614170603570197e-05, + "loss": 22.6718, + "step": 934 + }, + { + "epoch": 0.22535550735116897, + "grad_norm": 4.870179176330566, + "learning_rate": 3.613248190363863e-05, + "loss": 21.1603, + "step": 935 + }, + { + "epoch": 0.22559652928416485, + "grad_norm": 3.689265251159668, + "learning_rate": 3.6123247938599596e-05, + "loss": 20.8046, + "step": 936 + }, + { + "epoch": 0.22583755121716076, + "grad_norm": 7.061726093292236, + "learning_rate": 3.61140041462131e-05, + "loss": 21.8837, + "step": 937 + }, + { + "epoch": 0.22607857315015667, + "grad_norm": 14.706998825073242, + "learning_rate": 3.610475053211335e-05, + "loss": 21.1741, + "step": 938 + }, + { + "epoch": 0.22631959508315258, + "grad_norm": 19.316429138183594, + "learning_rate": 3.6095487101940576e-05, + "loss": 20.6701, + "step": 939 + }, + { + "epoch": 0.22656061701614846, + "grad_norm": 27.0603084564209, + "learning_rate": 3.608621386134095e-05, + "loss": 21.6768, + "step": 940 + }, + { + "epoch": 0.22680163894914437, + "grad_norm": 33.565738677978516, + "learning_rate": 3.6076930815966654e-05, + "loss": 22.2974, + "step": 941 + }, + { + "epoch": 0.22704266088214028, + "grad_norm": 33.070281982421875, + "learning_rate": 3.6067637971475824e-05, + "loss": 22.0808, + "step": 942 + }, + { + "epoch": 0.22728368281513617, + "grad_norm": 37.57207107543945, + "learning_rate": 3.6058335333532595e-05, + "loss": 22.281, + "step": 943 + }, + { + "epoch": 0.22752470474813208, + "grad_norm": 38.577308654785156, + "learning_rate": 3.6049022907807046e-05, + "loss": 22.5049, + "step": 944 + }, + { + "epoch": 0.227765726681128, + "grad_norm": 33.57667541503906, + "learning_rate": 3.603970069997524e-05, + "loss": 23.4467, + "step": 945 + }, + { + "epoch": 0.22800674861412387, + "grad_norm": 33.078163146972656, + "learning_rate": 3.603036871571919e-05, + "loss": 23.2744, + "step": 946 + }, + { + "epoch": 0.22824777054711978, + "grad_norm": 28.704524993896484, + "learning_rate": 3.602102696072687e-05, + "loss": 21.6641, + "step": 947 + }, + { + "epoch": 0.2284887924801157, + "grad_norm": 25.95537757873535, + "learning_rate": 3.601167544069221e-05, + "loss": 22.3201, + "step": 948 + }, + { + "epoch": 0.2287298144131116, + "grad_norm": 23.590246200561523, + "learning_rate": 3.60023141613151e-05, + "loss": 22.2132, + "step": 949 + }, + { + "epoch": 0.2289708363461075, + "grad_norm": 16.733867645263672, + "learning_rate": 3.599294312830138e-05, + "loss": 21.3829, + "step": 950 + }, + { + "epoch": 0.2292118582791034, + "grad_norm": 14.630293846130371, + "learning_rate": 3.5983562347362806e-05, + "loss": 22.1841, + "step": 951 + }, + { + "epoch": 0.2294528802120993, + "grad_norm": 11.039003372192383, + "learning_rate": 3.5974171824217104e-05, + "loss": 21.4457, + "step": 952 + }, + { + "epoch": 0.2296939021450952, + "grad_norm": 4.33519172668457, + "learning_rate": 3.596477156458794e-05, + "loss": 21.6713, + "step": 953 + }, + { + "epoch": 0.2299349240780911, + "grad_norm": 2.3615939617156982, + "learning_rate": 3.595536157420489e-05, + "loss": 20.8649, + "step": 954 + }, + { + "epoch": 0.230175946011087, + "grad_norm": 7.1489667892456055, + "learning_rate": 3.5945941858803496e-05, + "loss": 20.4157, + "step": 955 + }, + { + "epoch": 0.23041696794408292, + "grad_norm": 7.502321720123291, + "learning_rate": 3.5936512424125196e-05, + "loss": 21.1113, + "step": 956 + }, + { + "epoch": 0.2306579898770788, + "grad_norm": 12.255635261535645, + "learning_rate": 3.592707327591736e-05, + "loss": 20.5975, + "step": 957 + }, + { + "epoch": 0.23089901181007472, + "grad_norm": 17.354801177978516, + "learning_rate": 3.591762441993329e-05, + "loss": 20.7926, + "step": 958 + }, + { + "epoch": 0.23114003374307063, + "grad_norm": 18.741708755493164, + "learning_rate": 3.590816586193219e-05, + "loss": 21.4677, + "step": 959 + }, + { + "epoch": 0.2313810556760665, + "grad_norm": 21.10553550720215, + "learning_rate": 3.589869760767919e-05, + "loss": 21.6874, + "step": 960 + }, + { + "epoch": 0.23162207760906242, + "grad_norm": 14.746813774108887, + "learning_rate": 3.5889219662945326e-05, + "loss": 20.4155, + "step": 961 + }, + { + "epoch": 0.23186309954205833, + "grad_norm": 15.809584617614746, + "learning_rate": 3.5879732033507535e-05, + "loss": 21.5371, + "step": 962 + }, + { + "epoch": 0.23210412147505424, + "grad_norm": 10.300724029541016, + "learning_rate": 3.587023472514866e-05, + "loss": 21.3097, + "step": 963 + }, + { + "epoch": 0.23234514340805013, + "grad_norm": 6.996102333068848, + "learning_rate": 3.586072774365745e-05, + "loss": 21.9895, + "step": 964 + }, + { + "epoch": 0.23258616534104604, + "grad_norm": 3.374385356903076, + "learning_rate": 3.585121109482854e-05, + "loss": 22.0854, + "step": 965 + }, + { + "epoch": 0.23282718727404195, + "grad_norm": 2.065126657485962, + "learning_rate": 3.584168478446247e-05, + "loss": 20.162, + "step": 966 + }, + { + "epoch": 0.23306820920703783, + "grad_norm": 2.247349500656128, + "learning_rate": 3.583214881836565e-05, + "loss": 21.6428, + "step": 967 + }, + { + "epoch": 0.23330923114003374, + "grad_norm": 1.8116333484649658, + "learning_rate": 3.582260320235039e-05, + "loss": 21.5941, + "step": 968 + }, + { + "epoch": 0.23355025307302965, + "grad_norm": 9.236371994018555, + "learning_rate": 3.581304794223488e-05, + "loss": 21.611, + "step": 969 + }, + { + "epoch": 0.23379127500602556, + "grad_norm": 21.687000274658203, + "learning_rate": 3.580348304384318e-05, + "loss": 20.6919, + "step": 970 + }, + { + "epoch": 0.23403229693902144, + "grad_norm": 23.700218200683594, + "learning_rate": 3.5793908513005245e-05, + "loss": 20.8001, + "step": 971 + }, + { + "epoch": 0.23427331887201736, + "grad_norm": 30.05514144897461, + "learning_rate": 3.5784324355556873e-05, + "loss": 21.5134, + "step": 972 + }, + { + "epoch": 0.23451434080501327, + "grad_norm": 32.29884719848633, + "learning_rate": 3.577473057733975e-05, + "loss": 22.4395, + "step": 973 + }, + { + "epoch": 0.23475536273800915, + "grad_norm": 36.54646682739258, + "learning_rate": 3.576512718420141e-05, + "loss": 21.5121, + "step": 974 + }, + { + "epoch": 0.23499638467100506, + "grad_norm": 36.796321868896484, + "learning_rate": 3.5755514181995266e-05, + "loss": 22.0916, + "step": 975 + }, + { + "epoch": 0.23523740660400097, + "grad_norm": 39.051551818847656, + "learning_rate": 3.574589157658058e-05, + "loss": 22.2231, + "step": 976 + }, + { + "epoch": 0.23547842853699685, + "grad_norm": 39.30126953125, + "learning_rate": 3.573625937382247e-05, + "loss": 21.9954, + "step": 977 + }, + { + "epoch": 0.23571945046999276, + "grad_norm": 38.05337142944336, + "learning_rate": 3.572661757959188e-05, + "loss": 21.3762, + "step": 978 + }, + { + "epoch": 0.23596047240298867, + "grad_norm": 33.306549072265625, + "learning_rate": 3.571696619976564e-05, + "loss": 22.5332, + "step": 979 + }, + { + "epoch": 0.23620149433598459, + "grad_norm": 30.303497314453125, + "learning_rate": 3.570730524022638e-05, + "loss": 21.9455, + "step": 980 + }, + { + "epoch": 0.23644251626898047, + "grad_norm": 28.56393814086914, + "learning_rate": 3.569763470686262e-05, + "loss": 22.3328, + "step": 981 + }, + { + "epoch": 0.23668353820197638, + "grad_norm": 22.815141677856445, + "learning_rate": 3.568795460556867e-05, + "loss": 21.3334, + "step": 982 + }, + { + "epoch": 0.2369245601349723, + "grad_norm": 20.5685977935791, + "learning_rate": 3.567826494224469e-05, + "loss": 20.7669, + "step": 983 + }, + { + "epoch": 0.23716558206796817, + "grad_norm": 13.338984489440918, + "learning_rate": 3.566856572279667e-05, + "loss": 20.2983, + "step": 984 + }, + { + "epoch": 0.23740660400096408, + "grad_norm": 9.199040412902832, + "learning_rate": 3.5658856953136426e-05, + "loss": 20.7857, + "step": 985 + }, + { + "epoch": 0.23764762593396, + "grad_norm": 3.1796629428863525, + "learning_rate": 3.564913863918157e-05, + "loss": 20.5052, + "step": 986 + }, + { + "epoch": 0.2378886478669559, + "grad_norm": 3.2163455486297607, + "learning_rate": 3.563941078685557e-05, + "loss": 21.3819, + "step": 987 + }, + { + "epoch": 0.2381296697999518, + "grad_norm": 1.505488395690918, + "learning_rate": 3.562967340208769e-05, + "loss": 21.2369, + "step": 988 + }, + { + "epoch": 0.2383706917329477, + "grad_norm": 2.5937600135803223, + "learning_rate": 3.5619926490812996e-05, + "loss": 20.6446, + "step": 989 + }, + { + "epoch": 0.2386117136659436, + "grad_norm": 4.670770645141602, + "learning_rate": 3.561017005897236e-05, + "loss": 20.2673, + "step": 990 + }, + { + "epoch": 0.2388527355989395, + "grad_norm": 5.8919878005981445, + "learning_rate": 3.5600404112512494e-05, + "loss": 21.2206, + "step": 991 + }, + { + "epoch": 0.2390937575319354, + "grad_norm": 5.057524681091309, + "learning_rate": 3.559062865738585e-05, + "loss": 20.8188, + "step": 992 + }, + { + "epoch": 0.2393347794649313, + "grad_norm": 5.101719379425049, + "learning_rate": 3.5580843699550716e-05, + "loss": 21.259, + "step": 993 + }, + { + "epoch": 0.23957580139792722, + "grad_norm": 6.483858108520508, + "learning_rate": 3.5571049244971174e-05, + "loss": 21.9059, + "step": 994 + }, + { + "epoch": 0.2398168233309231, + "grad_norm": 7.103264331817627, + "learning_rate": 3.5561245299617065e-05, + "loss": 21.2847, + "step": 995 + }, + { + "epoch": 0.24005784526391902, + "grad_norm": 4.3676934242248535, + "learning_rate": 3.5551431869464045e-05, + "loss": 21.5107, + "step": 996 + }, + { + "epoch": 0.24029886719691493, + "grad_norm": 3.7278223037719727, + "learning_rate": 3.5541608960493526e-05, + "loss": 20.8945, + "step": 997 + }, + { + "epoch": 0.2405398891299108, + "grad_norm": 6.421048641204834, + "learning_rate": 3.553177657869272e-05, + "loss": 20.7896, + "step": 998 + }, + { + "epoch": 0.24078091106290672, + "grad_norm": 11.5415678024292, + "learning_rate": 3.55219347300546e-05, + "loss": 20.7375, + "step": 999 + }, + { + "epoch": 0.24102193299590263, + "grad_norm": 12.665024757385254, + "learning_rate": 3.5512083420577915e-05, + "loss": 21.5208, + "step": 1000 + }, + { + "epoch": 0.24102193299590263, + "eval_cc_pretrain_accuracy": 0.9199999570846558, + "eval_cc_pretrain_loss": 2.142223358154297, + "eval_cc_pretrain_num_cand": 800.0, + "eval_cc_pretrain_runtime": 18.6384, + "eval_cc_pretrain_samples_per_second": 5.365, + "eval_cc_pretrain_steps_per_second": 0.054, + "eval_cc_pretrain_temperature": 0.06982421875, + "step": 1000 + }, + { + "epoch": 0.24102193299590263, + "eval_mscoco_pretrain_accuracy": 0.8100000023841858, + "eval_mscoco_pretrain_loss": 2.3884670734405518, + "eval_mscoco_pretrain_num_cand": 800.0, + "eval_mscoco_pretrain_runtime": 14.5934, + "eval_mscoco_pretrain_samples_per_second": 6.852, + "eval_mscoco_pretrain_steps_per_second": 0.069, + "eval_mscoco_pretrain_temperature": 0.06982421875, + "step": 1000 + }, + { + "epoch": 0.24126295492889854, + "grad_norm": 17.701807022094727, + "learning_rate": 3.550222265626717e-05, + "loss": 20.4916, + "step": 1001 + }, + { + "epoch": 0.24150397686189443, + "grad_norm": 16.345626831054688, + "learning_rate": 3.549235244313265e-05, + "loss": 21.4634, + "step": 1002 + }, + { + "epoch": 0.24174499879489034, + "grad_norm": 12.412630081176758, + "learning_rate": 3.5482472787190376e-05, + "loss": 21.2312, + "step": 1003 + }, + { + "epoch": 0.24198602072788625, + "grad_norm": 16.0887393951416, + "learning_rate": 3.5472583694462144e-05, + "loss": 21.6906, + "step": 1004 + }, + { + "epoch": 0.24222704266088213, + "grad_norm": 14.725573539733887, + "learning_rate": 3.5462685170975505e-05, + "loss": 21.9677, + "step": 1005 + }, + { + "epoch": 0.24246806459387804, + "grad_norm": 11.373269081115723, + "learning_rate": 3.545277722276374e-05, + "loss": 21.3956, + "step": 1006 + }, + { + "epoch": 0.24270908652687395, + "grad_norm": 11.857497215270996, + "learning_rate": 3.544285985586588e-05, + "loss": 20.612, + "step": 1007 + }, + { + "epoch": 0.24295010845986983, + "grad_norm": 8.23745059967041, + "learning_rate": 3.54329330763267e-05, + "loss": 21.1769, + "step": 1008 + }, + { + "epoch": 0.24319113039286575, + "grad_norm": 9.109553337097168, + "learning_rate": 3.542299689019673e-05, + "loss": 21.8805, + "step": 1009 + }, + { + "epoch": 0.24343215232586166, + "grad_norm": 2.4734878540039062, + "learning_rate": 3.54130513035322e-05, + "loss": 21.9092, + "step": 1010 + }, + { + "epoch": 0.24367317425885757, + "grad_norm": 8.138373374938965, + "learning_rate": 3.540309632239508e-05, + "loss": 20.6876, + "step": 1011 + }, + { + "epoch": 0.24391419619185345, + "grad_norm": 2.3619275093078613, + "learning_rate": 3.539313195285309e-05, + "loss": 21.3876, + "step": 1012 + }, + { + "epoch": 0.24415521812484936, + "grad_norm": 4.846138954162598, + "learning_rate": 3.5383158200979636e-05, + "loss": 21.1847, + "step": 1013 + }, + { + "epoch": 0.24439624005784527, + "grad_norm": 15.581999778747559, + "learning_rate": 3.537317507285388e-05, + "loss": 21.4995, + "step": 1014 + }, + { + "epoch": 0.24463726199084115, + "grad_norm": 32.0710563659668, + "learning_rate": 3.536318257456066e-05, + "loss": 22.1403, + "step": 1015 + }, + { + "epoch": 0.24487828392383706, + "grad_norm": 51.333431243896484, + "learning_rate": 3.535318071219056e-05, + "loss": 22.6505, + "step": 1016 + }, + { + "epoch": 0.24511930585683298, + "grad_norm": 69.13065338134766, + "learning_rate": 3.5343169491839844e-05, + "loss": 25.1537, + "step": 1017 + }, + { + "epoch": 0.24536032778982889, + "grad_norm": 75.64900970458984, + "learning_rate": 3.53331489196105e-05, + "loss": 26.1074, + "step": 1018 + }, + { + "epoch": 0.24560134972282477, + "grad_norm": 78.16192626953125, + "learning_rate": 3.532311900161021e-05, + "loss": 26.2218, + "step": 1019 + }, + { + "epoch": 0.24584237165582068, + "grad_norm": 83.18775177001953, + "learning_rate": 3.531307974395234e-05, + "loss": 27.0304, + "step": 1020 + }, + { + "epoch": 0.2460833935888166, + "grad_norm": 80.67652893066406, + "learning_rate": 3.530303115275597e-05, + "loss": 26.8368, + "step": 1021 + }, + { + "epoch": 0.24632441552181247, + "grad_norm": 75.1666030883789, + "learning_rate": 3.529297323414586e-05, + "loss": 26.1669, + "step": 1022 + }, + { + "epoch": 0.24656543745480838, + "grad_norm": 74.15137481689453, + "learning_rate": 3.528290599425245e-05, + "loss": 25.9959, + "step": 1023 + }, + { + "epoch": 0.2468064593878043, + "grad_norm": 63.63055419921875, + "learning_rate": 3.527282943921186e-05, + "loss": 24.7411, + "step": 1024 + }, + { + "epoch": 0.2470474813208002, + "grad_norm": 54.60872268676758, + "learning_rate": 3.526274357516589e-05, + "loss": 24.1965, + "step": 1025 + }, + { + "epoch": 0.2472885032537961, + "grad_norm": 42.83991241455078, + "learning_rate": 3.5252648408262034e-05, + "loss": 21.7545, + "step": 1026 + }, + { + "epoch": 0.247529525186792, + "grad_norm": 34.33694839477539, + "learning_rate": 3.5242543944653424e-05, + "loss": 22.5449, + "step": 1027 + }, + { + "epoch": 0.2477705471197879, + "grad_norm": 22.08848762512207, + "learning_rate": 3.523243019049888e-05, + "loss": 21.1876, + "step": 1028 + }, + { + "epoch": 0.2480115690527838, + "grad_norm": 9.783548355102539, + "learning_rate": 3.522230715196287e-05, + "loss": 21.5048, + "step": 1029 + }, + { + "epoch": 0.2482525909857797, + "grad_norm": 3.918970823287964, + "learning_rate": 3.521217483521554e-05, + "loss": 20.8358, + "step": 1030 + }, + { + "epoch": 0.2484936129187756, + "grad_norm": 2.4173386096954346, + "learning_rate": 3.520203324643267e-05, + "loss": 20.6262, + "step": 1031 + }, + { + "epoch": 0.24873463485177152, + "grad_norm": 1.5907330513000488, + "learning_rate": 3.5191882391795715e-05, + "loss": 21.0307, + "step": 1032 + }, + { + "epoch": 0.2489756567847674, + "grad_norm": 2.8790700435638428, + "learning_rate": 3.518172227749174e-05, + "loss": 20.658, + "step": 1033 + }, + { + "epoch": 0.24921667871776332, + "grad_norm": 10.532641410827637, + "learning_rate": 3.517155290971351e-05, + "loss": 20.9707, + "step": 1034 + }, + { + "epoch": 0.24945770065075923, + "grad_norm": 21.308048248291016, + "learning_rate": 3.516137429465937e-05, + "loss": 21.4556, + "step": 1035 + }, + { + "epoch": 0.2496987225837551, + "grad_norm": 30.05303955078125, + "learning_rate": 3.5151186438533356e-05, + "loss": 21.803, + "step": 1036 + }, + { + "epoch": 0.24993974451675102, + "grad_norm": 43.31438446044922, + "learning_rate": 3.514098934754509e-05, + "loss": 22.1758, + "step": 1037 + }, + { + "epoch": 0.2501807664497469, + "grad_norm": 46.31341552734375, + "learning_rate": 3.513078302790985e-05, + "loss": 23.8845, + "step": 1038 + }, + { + "epoch": 0.2504217883827428, + "grad_norm": 54.58085632324219, + "learning_rate": 3.5120567485848524e-05, + "loss": 22.7365, + "step": 1039 + }, + { + "epoch": 0.2506628103157387, + "grad_norm": 55.831092834472656, + "learning_rate": 3.511034272758765e-05, + "loss": 23.2844, + "step": 1040 + }, + { + "epoch": 0.25090383224873464, + "grad_norm": 58.843624114990234, + "learning_rate": 3.510010875935934e-05, + "loss": 23.4231, + "step": 1041 + }, + { + "epoch": 0.25114485418173055, + "grad_norm": 54.83195114135742, + "learning_rate": 3.508986558740135e-05, + "loss": 23.3931, + "step": 1042 + }, + { + "epoch": 0.25138587611472646, + "grad_norm": 54.34385299682617, + "learning_rate": 3.507961321795705e-05, + "loss": 23.7059, + "step": 1043 + }, + { + "epoch": 0.25162689804772237, + "grad_norm": 50.088321685791016, + "learning_rate": 3.506935165727539e-05, + "loss": 23.3152, + "step": 1044 + }, + { + "epoch": 0.2518679199807182, + "grad_norm": 39.5730094909668, + "learning_rate": 3.505908091161093e-05, + "loss": 23.3191, + "step": 1045 + }, + { + "epoch": 0.25210894191371414, + "grad_norm": 34.08430862426758, + "learning_rate": 3.5048800987223855e-05, + "loss": 21.9955, + "step": 1046 + }, + { + "epoch": 0.25234996384671005, + "grad_norm": 27.835969924926758, + "learning_rate": 3.5038511890379916e-05, + "loss": 21.8034, + "step": 1047 + }, + { + "epoch": 0.25259098577970596, + "grad_norm": 18.106658935546875, + "learning_rate": 3.502821362735045e-05, + "loss": 21.5712, + "step": 1048 + }, + { + "epoch": 0.25283200771270187, + "grad_norm": 10.46998119354248, + "learning_rate": 3.501790620441241e-05, + "loss": 21.6141, + "step": 1049 + }, + { + "epoch": 0.2530730296456978, + "grad_norm": 4.222621440887451, + "learning_rate": 3.500758962784832e-05, + "loss": 21.7755, + "step": 1050 + }, + { + "epoch": 0.2533140515786937, + "grad_norm": 5.731699466705322, + "learning_rate": 3.499726390394626e-05, + "loss": 22.2216, + "step": 1051 + }, + { + "epoch": 0.25355507351168954, + "grad_norm": 12.878254890441895, + "learning_rate": 3.498692903899992e-05, + "loss": 20.8766, + "step": 1052 + }, + { + "epoch": 0.25379609544468545, + "grad_norm": 15.377541542053223, + "learning_rate": 3.4976585039308535e-05, + "loss": 22.0753, + "step": 1053 + }, + { + "epoch": 0.25403711737768137, + "grad_norm": 18.7191104888916, + "learning_rate": 3.4966231911176925e-05, + "loss": 21.7658, + "step": 1054 + }, + { + "epoch": 0.2542781393106773, + "grad_norm": 20.832656860351562, + "learning_rate": 3.495586966091547e-05, + "loss": 20.4281, + "step": 1055 + }, + { + "epoch": 0.2545191612436732, + "grad_norm": 19.09907341003418, + "learning_rate": 3.494549829484011e-05, + "loss": 20.6698, + "step": 1056 + }, + { + "epoch": 0.2547601831766691, + "grad_norm": 16.841079711914062, + "learning_rate": 3.493511781927233e-05, + "loss": 20.1348, + "step": 1057 + }, + { + "epoch": 0.25500120510966495, + "grad_norm": 14.597882270812988, + "learning_rate": 3.4924728240539174e-05, + "loss": 20.4566, + "step": 1058 + }, + { + "epoch": 0.25524222704266086, + "grad_norm": 12.451142311096191, + "learning_rate": 3.491432956497326e-05, + "loss": 21.1818, + "step": 1059 + }, + { + "epoch": 0.2554832489756568, + "grad_norm": 4.179632186889648, + "learning_rate": 3.4903921798912694e-05, + "loss": 20.6121, + "step": 1060 + }, + { + "epoch": 0.2557242709086527, + "grad_norm": 3.831399917602539, + "learning_rate": 3.4893504948701185e-05, + "loss": 20.9686, + "step": 1061 + }, + { + "epoch": 0.2559652928416486, + "grad_norm": 9.917418479919434, + "learning_rate": 3.488307902068794e-05, + "loss": 21.0047, + "step": 1062 + }, + { + "epoch": 0.2562063147746445, + "grad_norm": 21.066722869873047, + "learning_rate": 3.487264402122771e-05, + "loss": 21.9398, + "step": 1063 + }, + { + "epoch": 0.2564473367076404, + "grad_norm": 27.55527114868164, + "learning_rate": 3.4862199956680776e-05, + "loss": 20.1635, + "step": 1064 + }, + { + "epoch": 0.25668835864063627, + "grad_norm": 31.68132781982422, + "learning_rate": 3.485174683341293e-05, + "loss": 20.6351, + "step": 1065 + }, + { + "epoch": 0.2569293805736322, + "grad_norm": 33.548744201660156, + "learning_rate": 3.4841284657795525e-05, + "loss": 21.0529, + "step": 1066 + }, + { + "epoch": 0.2571704025066281, + "grad_norm": 32.54629135131836, + "learning_rate": 3.483081343620538e-05, + "loss": 22.018, + "step": 1067 + }, + { + "epoch": 0.257411424439624, + "grad_norm": 34.79807662963867, + "learning_rate": 3.4820333175024864e-05, + "loss": 21.4094, + "step": 1068 + }, + { + "epoch": 0.2576524463726199, + "grad_norm": 31.04584503173828, + "learning_rate": 3.4809843880641843e-05, + "loss": 21.2749, + "step": 1069 + }, + { + "epoch": 0.2578934683056158, + "grad_norm": 31.172061920166016, + "learning_rate": 3.4799345559449683e-05, + "loss": 21.3221, + "step": 1070 + }, + { + "epoch": 0.25813449023861174, + "grad_norm": 32.5527458190918, + "learning_rate": 3.478883821784727e-05, + "loss": 20.7105, + "step": 1071 + }, + { + "epoch": 0.2583755121716076, + "grad_norm": 28.55086326599121, + "learning_rate": 3.4778321862238974e-05, + "loss": 20.7478, + "step": 1072 + }, + { + "epoch": 0.2586165341046035, + "grad_norm": 23.553485870361328, + "learning_rate": 3.4767796499034663e-05, + "loss": 21.3074, + "step": 1073 + }, + { + "epoch": 0.2588575560375994, + "grad_norm": 14.886459350585938, + "learning_rate": 3.4757262134649686e-05, + "loss": 20.2521, + "step": 1074 + }, + { + "epoch": 0.2590985779705953, + "grad_norm": 10.406126022338867, + "learning_rate": 3.474671877550489e-05, + "loss": 21.167, + "step": 1075 + }, + { + "epoch": 0.25933959990359123, + "grad_norm": 8.12576675415039, + "learning_rate": 3.473616642802662e-05, + "loss": 21.0025, + "step": 1076 + }, + { + "epoch": 0.25958062183658714, + "grad_norm": 5.860384464263916, + "learning_rate": 3.472560509864665e-05, + "loss": 20.8637, + "step": 1077 + }, + { + "epoch": 0.25982164376958305, + "grad_norm": 4.9385223388671875, + "learning_rate": 3.471503479380228e-05, + "loss": 21.8207, + "step": 1078 + }, + { + "epoch": 0.2600626657025789, + "grad_norm": 3.0662524700164795, + "learning_rate": 3.470445551993626e-05, + "loss": 21.4289, + "step": 1079 + }, + { + "epoch": 0.2603036876355748, + "grad_norm": 4.83866548538208, + "learning_rate": 3.4693867283496804e-05, + "loss": 20.5177, + "step": 1080 + }, + { + "epoch": 0.26054470956857073, + "grad_norm": 2.5639355182647705, + "learning_rate": 3.4683270090937584e-05, + "loss": 20.9853, + "step": 1081 + }, + { + "epoch": 0.26078573150156664, + "grad_norm": 4.179164409637451, + "learning_rate": 3.467266394871775e-05, + "loss": 21.2922, + "step": 1082 + }, + { + "epoch": 0.26102675343456255, + "grad_norm": 15.6422119140625, + "learning_rate": 3.466204886330189e-05, + "loss": 20.3648, + "step": 1083 + }, + { + "epoch": 0.26126777536755846, + "grad_norm": 16.695226669311523, + "learning_rate": 3.4651424841160045e-05, + "loss": 20.9822, + "step": 1084 + }, + { + "epoch": 0.2615087973005544, + "grad_norm": 25.442604064941406, + "learning_rate": 3.4640791888767715e-05, + "loss": 20.4251, + "step": 1085 + }, + { + "epoch": 0.26174981923355023, + "grad_norm": 32.06135559082031, + "learning_rate": 3.4630150012605825e-05, + "loss": 21.8393, + "step": 1086 + }, + { + "epoch": 0.26199084116654614, + "grad_norm": 37.069297790527344, + "learning_rate": 3.4619499219160756e-05, + "loss": 22.1131, + "step": 1087 + }, + { + "epoch": 0.26223186309954205, + "grad_norm": 37.81686782836914, + "learning_rate": 3.460883951492432e-05, + "loss": 20.9635, + "step": 1088 + }, + { + "epoch": 0.26247288503253796, + "grad_norm": 43.07620620727539, + "learning_rate": 3.459817090639375e-05, + "loss": 21.9599, + "step": 1089 + }, + { + "epoch": 0.26271390696553387, + "grad_norm": 38.32603454589844, + "learning_rate": 3.458749340007171e-05, + "loss": 22.246, + "step": 1090 + }, + { + "epoch": 0.2629549288985298, + "grad_norm": 32.321495056152344, + "learning_rate": 3.45768070024663e-05, + "loss": 21.1852, + "step": 1091 + }, + { + "epoch": 0.2631959508315257, + "grad_norm": 29.5781192779541, + "learning_rate": 3.456611172009103e-05, + "loss": 21.6032, + "step": 1092 + }, + { + "epoch": 0.26343697276452155, + "grad_norm": 19.592817306518555, + "learning_rate": 3.4555407559464825e-05, + "loss": 20.5517, + "step": 1093 + }, + { + "epoch": 0.26367799469751746, + "grad_norm": 18.10269546508789, + "learning_rate": 3.454469452711202e-05, + "loss": 20.6713, + "step": 1094 + }, + { + "epoch": 0.26391901663051337, + "grad_norm": 10.181206703186035, + "learning_rate": 3.4533972629562356e-05, + "loss": 20.9684, + "step": 1095 + }, + { + "epoch": 0.2641600385635093, + "grad_norm": 6.2799272537231445, + "learning_rate": 3.4523241873350987e-05, + "loss": 20.6299, + "step": 1096 + }, + { + "epoch": 0.2644010604965052, + "grad_norm": 3.6795523166656494, + "learning_rate": 3.4512502265018467e-05, + "loss": 21.2582, + "step": 1097 + }, + { + "epoch": 0.2646420824295011, + "grad_norm": 8.456650733947754, + "learning_rate": 3.450175381111072e-05, + "loss": 21.4121, + "step": 1098 + }, + { + "epoch": 0.264883104362497, + "grad_norm": 25.11989402770996, + "learning_rate": 3.4490996518179095e-05, + "loss": 21.3056, + "step": 1099 + }, + { + "epoch": 0.26512412629549287, + "grad_norm": 40.715415954589844, + "learning_rate": 3.448023039278031e-05, + "loss": 23.2732, + "step": 1100 + }, + { + "epoch": 0.2653651482284888, + "grad_norm": 50.81378173828125, + "learning_rate": 3.4469455441476475e-05, + "loss": 22.9321, + "step": 1101 + }, + { + "epoch": 0.2656061701614847, + "grad_norm": 51.33253479003906, + "learning_rate": 3.445867167083507e-05, + "loss": 23.1105, + "step": 1102 + }, + { + "epoch": 0.2658471920944806, + "grad_norm": 43.581390380859375, + "learning_rate": 3.4447879087428954e-05, + "loss": 22.3635, + "step": 1103 + }, + { + "epoch": 0.2660882140274765, + "grad_norm": 32.53447341918945, + "learning_rate": 3.443707769783636e-05, + "loss": 21.8484, + "step": 1104 + }, + { + "epoch": 0.2663292359604724, + "grad_norm": 8.793464660644531, + "learning_rate": 3.442626750864089e-05, + "loss": 22.5267, + "step": 1105 + }, + { + "epoch": 0.26657025789346833, + "grad_norm": 3.340601682662964, + "learning_rate": 3.441544852643151e-05, + "loss": 21.3906, + "step": 1106 + }, + { + "epoch": 0.2668112798264642, + "grad_norm": 16.370893478393555, + "learning_rate": 3.440462075780254e-05, + "loss": 21.6821, + "step": 1107 + }, + { + "epoch": 0.2670523017594601, + "grad_norm": 21.08144187927246, + "learning_rate": 3.439378420935366e-05, + "loss": 21.6144, + "step": 1108 + }, + { + "epoch": 0.267293323692456, + "grad_norm": 17.086633682250977, + "learning_rate": 3.438293888768989e-05, + "loss": 21.0185, + "step": 1109 + }, + { + "epoch": 0.2675343456254519, + "grad_norm": 15.961248397827148, + "learning_rate": 3.437208479942162e-05, + "loss": 21.2877, + "step": 1110 + }, + { + "epoch": 0.26777536755844783, + "grad_norm": 17.207378387451172, + "learning_rate": 3.436122195116455e-05, + "loss": 20.9633, + "step": 1111 + }, + { + "epoch": 0.26801638949144374, + "grad_norm": 17.320436477661133, + "learning_rate": 3.4350350349539766e-05, + "loss": 20.434, + "step": 1112 + }, + { + "epoch": 0.26825741142443965, + "grad_norm": 13.834088325500488, + "learning_rate": 3.433947000117364e-05, + "loss": 21.1618, + "step": 1113 + }, + { + "epoch": 0.2684984333574355, + "grad_norm": 9.934250831604004, + "learning_rate": 3.432858091269792e-05, + "loss": 21.3493, + "step": 1114 + }, + { + "epoch": 0.2687394552904314, + "grad_norm": 7.846599102020264, + "learning_rate": 3.431768309074964e-05, + "loss": 20.3912, + "step": 1115 + }, + { + "epoch": 0.26898047722342733, + "grad_norm": 3.3881731033325195, + "learning_rate": 3.430677654197118e-05, + "loss": 20.8841, + "step": 1116 + }, + { + "epoch": 0.26922149915642324, + "grad_norm": 1.7880970239639282, + "learning_rate": 3.429586127301025e-05, + "loss": 20.0946, + "step": 1117 + }, + { + "epoch": 0.26946252108941915, + "grad_norm": 7.102236270904541, + "learning_rate": 3.428493729051984e-05, + "loss": 20.7709, + "step": 1118 + }, + { + "epoch": 0.26970354302241506, + "grad_norm": 10.044575691223145, + "learning_rate": 3.4274004601158285e-05, + "loss": 20.6334, + "step": 1119 + }, + { + "epoch": 0.2699445649554109, + "grad_norm": 12.58563232421875, + "learning_rate": 3.4263063211589216e-05, + "loss": 20.5073, + "step": 1120 + }, + { + "epoch": 0.2701855868884068, + "grad_norm": 16.83108139038086, + "learning_rate": 3.425211312848155e-05, + "loss": 21.1459, + "step": 1121 + }, + { + "epoch": 0.27042660882140274, + "grad_norm": 11.102230072021484, + "learning_rate": 3.424115435850953e-05, + "loss": 22.0918, + "step": 1122 + }, + { + "epoch": 0.27066763075439865, + "grad_norm": 20.93665313720703, + "learning_rate": 3.4230186908352676e-05, + "loss": 21.6194, + "step": 1123 + }, + { + "epoch": 0.27090865268739456, + "grad_norm": 18.186857223510742, + "learning_rate": 3.421921078469581e-05, + "loss": 21.0806, + "step": 1124 + }, + { + "epoch": 0.27114967462039047, + "grad_norm": 20.433889389038086, + "learning_rate": 3.420822599422902e-05, + "loss": 20.8146, + "step": 1125 + }, + { + "epoch": 0.2713906965533864, + "grad_norm": 18.194169998168945, + "learning_rate": 3.4197232543647714e-05, + "loss": 21.492, + "step": 1126 + }, + { + "epoch": 0.27163171848638223, + "grad_norm": 18.191761016845703, + "learning_rate": 3.418623043965253e-05, + "loss": 21.9817, + "step": 1127 + }, + { + "epoch": 0.27187274041937814, + "grad_norm": 14.947210311889648, + "learning_rate": 3.417521968894943e-05, + "loss": 20.9768, + "step": 1128 + }, + { + "epoch": 0.27211376235237406, + "grad_norm": 14.576415061950684, + "learning_rate": 3.41642002982496e-05, + "loss": 20.7858, + "step": 1129 + }, + { + "epoch": 0.27235478428536997, + "grad_norm": 13.961993217468262, + "learning_rate": 3.415317227426953e-05, + "loss": 21.088, + "step": 1130 + }, + { + "epoch": 0.2725958062183659, + "grad_norm": 2.646620988845825, + "learning_rate": 3.4142135623730954e-05, + "loss": 20.5171, + "step": 1131 + }, + { + "epoch": 0.2728368281513618, + "grad_norm": 4.484507083892822, + "learning_rate": 3.4131090353360856e-05, + "loss": 20.7129, + "step": 1132 + }, + { + "epoch": 0.2730778500843577, + "grad_norm": 2.9901514053344727, + "learning_rate": 3.41200364698915e-05, + "loss": 20.7439, + "step": 1133 + }, + { + "epoch": 0.27331887201735355, + "grad_norm": 10.47524356842041, + "learning_rate": 3.4108973980060374e-05, + "loss": 19.4626, + "step": 1134 + }, + { + "epoch": 0.27355989395034946, + "grad_norm": 14.470396041870117, + "learning_rate": 3.409790289061022e-05, + "loss": 21.4933, + "step": 1135 + }, + { + "epoch": 0.2738009158833454, + "grad_norm": 12.385201454162598, + "learning_rate": 3.408682320828903e-05, + "loss": 21.9639, + "step": 1136 + }, + { + "epoch": 0.2740419378163413, + "grad_norm": 11.334243774414062, + "learning_rate": 3.407573493985003e-05, + "loss": 20.9418, + "step": 1137 + }, + { + "epoch": 0.2742829597493372, + "grad_norm": 4.433568000793457, + "learning_rate": 3.406463809205166e-05, + "loss": 20.4476, + "step": 1138 + }, + { + "epoch": 0.2745239816823331, + "grad_norm": 6.092807769775391, + "learning_rate": 3.4053532671657624e-05, + "loss": 20.7382, + "step": 1139 + }, + { + "epoch": 0.274765003615329, + "grad_norm": 21.61514663696289, + "learning_rate": 3.404241868543682e-05, + "loss": 21.3464, + "step": 1140 + }, + { + "epoch": 0.2750060255483249, + "grad_norm": 39.586971282958984, + "learning_rate": 3.403129614016339e-05, + "loss": 20.6867, + "step": 1141 + }, + { + "epoch": 0.2752470474813208, + "grad_norm": 61.61696243286133, + "learning_rate": 3.4020165042616675e-05, + "loss": 22.8829, + "step": 1142 + }, + { + "epoch": 0.2754880694143167, + "grad_norm": 75.14087677001953, + "learning_rate": 3.400902539958124e-05, + "loss": 22.8793, + "step": 1143 + }, + { + "epoch": 0.2757290913473126, + "grad_norm": 87.17111206054688, + "learning_rate": 3.3997877217846844e-05, + "loss": 25.8992, + "step": 1144 + }, + { + "epoch": 0.2759701132803085, + "grad_norm": 92.69955444335938, + "learning_rate": 3.3986720504208484e-05, + "loss": 27.1935, + "step": 1145 + }, + { + "epoch": 0.2762111352133044, + "grad_norm": 98.24156951904297, + "learning_rate": 3.3975555265466316e-05, + "loss": 28.6649, + "step": 1146 + }, + { + "epoch": 0.27645215714630034, + "grad_norm": 98.76190948486328, + "learning_rate": 3.396438150842572e-05, + "loss": 29.1262, + "step": 1147 + }, + { + "epoch": 0.2766931790792962, + "grad_norm": 97.29325866699219, + "learning_rate": 3.395319923989725e-05, + "loss": 28.4407, + "step": 1148 + }, + { + "epoch": 0.2769342010122921, + "grad_norm": 90.28309631347656, + "learning_rate": 3.394200846669666e-05, + "loss": 28.3694, + "step": 1149 + }, + { + "epoch": 0.277175222945288, + "grad_norm": 85.3031234741211, + "learning_rate": 3.393080919564489e-05, + "loss": 25.7273, + "step": 1150 + }, + { + "epoch": 0.2774162448782839, + "grad_norm": 76.29835510253906, + "learning_rate": 3.391960143356804e-05, + "loss": 24.3541, + "step": 1151 + }, + { + "epoch": 0.27765726681127983, + "grad_norm": 55.76112747192383, + "learning_rate": 3.3908385187297424e-05, + "loss": 23.439, + "step": 1152 + }, + { + "epoch": 0.27789828874427575, + "grad_norm": 34.27573776245117, + "learning_rate": 3.3897160463669476e-05, + "loss": 22.4599, + "step": 1153 + }, + { + "epoch": 0.27813931067727166, + "grad_norm": 17.89781379699707, + "learning_rate": 3.3885927269525844e-05, + "loss": 21.673, + "step": 1154 + }, + { + "epoch": 0.2783803326102675, + "grad_norm": 3.7058088779449463, + "learning_rate": 3.3874685611713305e-05, + "loss": 20.9401, + "step": 1155 + }, + { + "epoch": 0.2786213545432634, + "grad_norm": 8.469401359558105, + "learning_rate": 3.386343549708382e-05, + "loss": 21.4511, + "step": 1156 + }, + { + "epoch": 0.27886237647625933, + "grad_norm": 1.8528344631195068, + "learning_rate": 3.385217693249448e-05, + "loss": 20.6358, + "step": 1157 + }, + { + "epoch": 0.27910339840925524, + "grad_norm": 1.7013719081878662, + "learning_rate": 3.384090992480755e-05, + "loss": 21.1543, + "step": 1158 + }, + { + "epoch": 0.27934442034225115, + "grad_norm": 11.963706016540527, + "learning_rate": 3.3829634480890425e-05, + "loss": 21.1525, + "step": 1159 + }, + { + "epoch": 0.27958544227524706, + "grad_norm": 19.05413246154785, + "learning_rate": 3.381835060761564e-05, + "loss": 20.7124, + "step": 1160 + }, + { + "epoch": 0.279826464208243, + "grad_norm": 17.058368682861328, + "learning_rate": 3.38070583118609e-05, + "loss": 20.784, + "step": 1161 + }, + { + "epoch": 0.28006748614123883, + "grad_norm": 20.795507431030273, + "learning_rate": 3.379575760050899e-05, + "loss": 21.7715, + "step": 1162 + }, + { + "epoch": 0.28030850807423474, + "grad_norm": 23.29547119140625, + "learning_rate": 3.378444848044787e-05, + "loss": 21.0833, + "step": 1163 + }, + { + "epoch": 0.28054953000723065, + "grad_norm": 24.172136306762695, + "learning_rate": 3.37731309585706e-05, + "loss": 21.6049, + "step": 1164 + }, + { + "epoch": 0.28079055194022656, + "grad_norm": 25.79604148864746, + "learning_rate": 3.376180504177537e-05, + "loss": 22.2121, + "step": 1165 + }, + { + "epoch": 0.2810315738732225, + "grad_norm": 22.684629440307617, + "learning_rate": 3.37504707369655e-05, + "loss": 22.8907, + "step": 1166 + }, + { + "epoch": 0.2812725958062184, + "grad_norm": 24.292251586914062, + "learning_rate": 3.3739128051049395e-05, + "loss": 20.858, + "step": 1167 + }, + { + "epoch": 0.2815136177392143, + "grad_norm": 22.2999210357666, + "learning_rate": 3.372777699094059e-05, + "loss": 21.3143, + "step": 1168 + }, + { + "epoch": 0.28175463967221015, + "grad_norm": 17.18004608154297, + "learning_rate": 3.371641756355771e-05, + "loss": 20.5658, + "step": 1169 + }, + { + "epoch": 0.28199566160520606, + "grad_norm": 18.92638397216797, + "learning_rate": 3.3705049775824484e-05, + "loss": 20.9662, + "step": 1170 + }, + { + "epoch": 0.28223668353820197, + "grad_norm": 14.064903259277344, + "learning_rate": 3.369367363466976e-05, + "loss": 21.2765, + "step": 1171 + }, + { + "epoch": 0.2824777054711979, + "grad_norm": 12.69619369506836, + "learning_rate": 3.368228914702743e-05, + "loss": 20.6064, + "step": 1172 + }, + { + "epoch": 0.2827187274041938, + "grad_norm": 13.189311981201172, + "learning_rate": 3.367089631983651e-05, + "loss": 20.3139, + "step": 1173 + }, + { + "epoch": 0.2829597493371897, + "grad_norm": 7.8405609130859375, + "learning_rate": 3.365949516004109e-05, + "loss": 20.8374, + "step": 1174 + }, + { + "epoch": 0.2832007712701856, + "grad_norm": 7.064460277557373, + "learning_rate": 3.364808567459035e-05, + "loss": 20.4491, + "step": 1175 + }, + { + "epoch": 0.28344179320318147, + "grad_norm": 5.427570819854736, + "learning_rate": 3.363666787043851e-05, + "loss": 20.9427, + "step": 1176 + }, + { + "epoch": 0.2836828151361774, + "grad_norm": 2.56904673576355, + "learning_rate": 3.36252417545449e-05, + "loss": 21.5324, + "step": 1177 + }, + { + "epoch": 0.2839238370691733, + "grad_norm": 1.4917876720428467, + "learning_rate": 3.361380733387389e-05, + "loss": 21.7747, + "step": 1178 + }, + { + "epoch": 0.2841648590021692, + "grad_norm": 1.4184300899505615, + "learning_rate": 3.3602364615394925e-05, + "loss": 21.1208, + "step": 1179 + }, + { + "epoch": 0.2844058809351651, + "grad_norm": 1.479393482208252, + "learning_rate": 3.3590913606082505e-05, + "loss": 21.6643, + "step": 1180 + }, + { + "epoch": 0.284646902868161, + "grad_norm": 2.6357662677764893, + "learning_rate": 3.357945431291618e-05, + "loss": 21.2384, + "step": 1181 + }, + { + "epoch": 0.2848879248011569, + "grad_norm": 5.982410430908203, + "learning_rate": 3.356798674288055e-05, + "loss": 20.5564, + "step": 1182 + }, + { + "epoch": 0.2851289467341528, + "grad_norm": 15.161651611328125, + "learning_rate": 3.355651090296527e-05, + "loss": 21.6328, + "step": 1183 + }, + { + "epoch": 0.2853699686671487, + "grad_norm": 13.174759864807129, + "learning_rate": 3.354502680016501e-05, + "loss": 20.8218, + "step": 1184 + }, + { + "epoch": 0.2856109906001446, + "grad_norm": 11.291951179504395, + "learning_rate": 3.35335344414795e-05, + "loss": 20.307, + "step": 1185 + }, + { + "epoch": 0.2858520125331405, + "grad_norm": 8.039060592651367, + "learning_rate": 3.35220338339135e-05, + "loss": 20.9716, + "step": 1186 + }, + { + "epoch": 0.28609303446613643, + "grad_norm": 6.37565803527832, + "learning_rate": 3.351052498447679e-05, + "loss": 20.8839, + "step": 1187 + }, + { + "epoch": 0.28633405639913234, + "grad_norm": 3.45827579498291, + "learning_rate": 3.3499007900184175e-05, + "loss": 21.0244, + "step": 1188 + }, + { + "epoch": 0.2865750783321282, + "grad_norm": 6.565638065338135, + "learning_rate": 3.348748258805548e-05, + "loss": 20.9418, + "step": 1189 + }, + { + "epoch": 0.2868161002651241, + "grad_norm": 5.380870342254639, + "learning_rate": 3.3475949055115556e-05, + "loss": 20.0811, + "step": 1190 + }, + { + "epoch": 0.28705712219812, + "grad_norm": 2.079995632171631, + "learning_rate": 3.346440730839423e-05, + "loss": 20.207, + "step": 1191 + }, + { + "epoch": 0.28729814413111593, + "grad_norm": 8.02956771850586, + "learning_rate": 3.345285735492639e-05, + "loss": 19.7866, + "step": 1192 + }, + { + "epoch": 0.28753916606411184, + "grad_norm": 7.627566337585449, + "learning_rate": 3.3441299201751876e-05, + "loss": 19.9253, + "step": 1193 + }, + { + "epoch": 0.28778018799710775, + "grad_norm": 7.012007236480713, + "learning_rate": 3.3429732855915555e-05, + "loss": 20.9619, + "step": 1194 + }, + { + "epoch": 0.28802120993010366, + "grad_norm": 10.800537109375, + "learning_rate": 3.341815832446726e-05, + "loss": 21.0196, + "step": 1195 + }, + { + "epoch": 0.2882622318630995, + "grad_norm": 15.283639907836914, + "learning_rate": 3.340657561446186e-05, + "loss": 20.5163, + "step": 1196 + }, + { + "epoch": 0.2885032537960954, + "grad_norm": 12.22170639038086, + "learning_rate": 3.339498473295916e-05, + "loss": 19.9133, + "step": 1197 + }, + { + "epoch": 0.28874427572909134, + "grad_norm": 16.94639015197754, + "learning_rate": 3.338338568702398e-05, + "loss": 20.5871, + "step": 1198 + }, + { + "epoch": 0.28898529766208725, + "grad_norm": 16.57461166381836, + "learning_rate": 3.337177848372609e-05, + "loss": 20.9253, + "step": 1199 + }, + { + "epoch": 0.28922631959508316, + "grad_norm": 21.571935653686523, + "learning_rate": 3.3360163130140234e-05, + "loss": 21.1169, + "step": 1200 + }, + { + "epoch": 0.28922631959508316, + "eval_cc_pretrain_accuracy": 0.8999999761581421, + "eval_cc_pretrain_loss": 2.103113889694214, + "eval_cc_pretrain_num_cand": 800.0, + "eval_cc_pretrain_runtime": 18.2408, + "eval_cc_pretrain_samples_per_second": 5.482, + "eval_cc_pretrain_steps_per_second": 0.055, + "eval_cc_pretrain_temperature": 0.06982421875, + "step": 1200 + }, + { + "epoch": 0.28922631959508316, + "eval_mscoco_pretrain_accuracy": 0.8399999737739563, + "eval_mscoco_pretrain_loss": 2.339060068130493, + "eval_mscoco_pretrain_num_cand": 800.0, + "eval_mscoco_pretrain_runtime": 15.6133, + "eval_mscoco_pretrain_samples_per_second": 6.405, + "eval_mscoco_pretrain_steps_per_second": 0.064, + "eval_mscoco_pretrain_temperature": 0.06982421875, + "step": 1200 + }, + { + "epoch": 0.28946734152807907, + "grad_norm": 15.335203170776367, + "learning_rate": 3.3348539633346166e-05, + "loss": 20.9103, + "step": 1201 + }, + { + "epoch": 0.289708363461075, + "grad_norm": 17.334821701049805, + "learning_rate": 3.333690800042855e-05, + "loss": 19.8575, + "step": 1202 + }, + { + "epoch": 0.28994938539407084, + "grad_norm": 11.484573364257812, + "learning_rate": 3.332526823847704e-05, + "loss": 21.5223, + "step": 1203 + }, + { + "epoch": 0.29019040732706675, + "grad_norm": 6.081074237823486, + "learning_rate": 3.331362035458623e-05, + "loss": 21.0794, + "step": 1204 + }, + { + "epoch": 0.29043142926006266, + "grad_norm": 2.7230963706970215, + "learning_rate": 3.330196435585566e-05, + "loss": 21.2993, + "step": 1205 + }, + { + "epoch": 0.29067245119305857, + "grad_norm": 5.686221599578857, + "learning_rate": 3.3290300249389846e-05, + "loss": 21.6889, + "step": 1206 + }, + { + "epoch": 0.2909134731260545, + "grad_norm": 5.886171817779541, + "learning_rate": 3.327862804229821e-05, + "loss": 22.13, + "step": 1207 + }, + { + "epoch": 0.2911544950590504, + "grad_norm": 7.849846839904785, + "learning_rate": 3.326694774169514e-05, + "loss": 21.4839, + "step": 1208 + }, + { + "epoch": 0.2913955169920463, + "grad_norm": 8.135795593261719, + "learning_rate": 3.3255259354699933e-05, + "loss": 21.4839, + "step": 1209 + }, + { + "epoch": 0.29163653892504215, + "grad_norm": 3.594723701477051, + "learning_rate": 3.324356288843683e-05, + "loss": 20.4968, + "step": 1210 + }, + { + "epoch": 0.29187756085803807, + "grad_norm": 3.619638204574585, + "learning_rate": 3.323185835003499e-05, + "loss": 21.6396, + "step": 1211 + }, + { + "epoch": 0.292118582791034, + "grad_norm": 4.486485481262207, + "learning_rate": 3.32201457466285e-05, + "loss": 21.3346, + "step": 1212 + }, + { + "epoch": 0.2923596047240299, + "grad_norm": 10.971065521240234, + "learning_rate": 3.320842508535636e-05, + "loss": 21.6669, + "step": 1213 + }, + { + "epoch": 0.2926006266570258, + "grad_norm": 13.402292251586914, + "learning_rate": 3.3196696373362476e-05, + "loss": 22.1401, + "step": 1214 + }, + { + "epoch": 0.2928416485900217, + "grad_norm": 14.630956649780273, + "learning_rate": 3.318495961779565e-05, + "loss": 20.8062, + "step": 1215 + }, + { + "epoch": 0.2930826705230176, + "grad_norm": 17.695755004882812, + "learning_rate": 3.317321482580963e-05, + "loss": 21.1795, + "step": 1216 + }, + { + "epoch": 0.2933236924560135, + "grad_norm": 18.809019088745117, + "learning_rate": 3.3161462004562994e-05, + "loss": 21.2482, + "step": 1217 + }, + { + "epoch": 0.2935647143890094, + "grad_norm": 20.440954208374023, + "learning_rate": 3.314970116121928e-05, + "loss": 21.7449, + "step": 1218 + }, + { + "epoch": 0.2938057363220053, + "grad_norm": 20.317291259765625, + "learning_rate": 3.313793230294689e-05, + "loss": 21.7688, + "step": 1219 + }, + { + "epoch": 0.2940467582550012, + "grad_norm": 16.33997344970703, + "learning_rate": 3.312615543691909e-05, + "loss": 21.669, + "step": 1220 + }, + { + "epoch": 0.2942877801879971, + "grad_norm": 10.13342571258545, + "learning_rate": 3.311437057031406e-05, + "loss": 21.3418, + "step": 1221 + }, + { + "epoch": 0.294528802120993, + "grad_norm": 10.670565605163574, + "learning_rate": 3.3102577710314844e-05, + "loss": 20.636, + "step": 1222 + }, + { + "epoch": 0.29476982405398894, + "grad_norm": 7.126454830169678, + "learning_rate": 3.309077686410935e-05, + "loss": 21.4506, + "step": 1223 + }, + { + "epoch": 0.2950108459869848, + "grad_norm": 4.566921234130859, + "learning_rate": 3.307896803889036e-05, + "loss": 21.5201, + "step": 1224 + }, + { + "epoch": 0.2952518679199807, + "grad_norm": 3.590477705001831, + "learning_rate": 3.3067151241855527e-05, + "loss": 20.7464, + "step": 1225 + }, + { + "epoch": 0.2954928898529766, + "grad_norm": 1.496376872062683, + "learning_rate": 3.3055326480207345e-05, + "loss": 21.0493, + "step": 1226 + }, + { + "epoch": 0.2957339117859725, + "grad_norm": 3.5954551696777344, + "learning_rate": 3.304349376115318e-05, + "loss": 21.3662, + "step": 1227 + }, + { + "epoch": 0.29597493371896844, + "grad_norm": 5.609082221984863, + "learning_rate": 3.303165309190525e-05, + "loss": 21.6228, + "step": 1228 + }, + { + "epoch": 0.29621595565196435, + "grad_norm": 11.045539855957031, + "learning_rate": 3.3019804479680595e-05, + "loss": 22.3641, + "step": 1229 + }, + { + "epoch": 0.29645697758496026, + "grad_norm": 17.820255279541016, + "learning_rate": 3.300794793170113e-05, + "loss": 21.627, + "step": 1230 + }, + { + "epoch": 0.2966979995179561, + "grad_norm": 25.825254440307617, + "learning_rate": 3.299608345519357e-05, + "loss": 21.0061, + "step": 1231 + }, + { + "epoch": 0.296939021450952, + "grad_norm": 27.925594329833984, + "learning_rate": 3.298421105738948e-05, + "loss": 21.4683, + "step": 1232 + }, + { + "epoch": 0.29718004338394793, + "grad_norm": 34.04878234863281, + "learning_rate": 3.297233074552527e-05, + "loss": 20.7684, + "step": 1233 + }, + { + "epoch": 0.29742106531694384, + "grad_norm": 34.29916763305664, + "learning_rate": 3.2960442526842145e-05, + "loss": 21.3355, + "step": 1234 + }, + { + "epoch": 0.29766208724993976, + "grad_norm": 34.29690170288086, + "learning_rate": 3.294854640858615e-05, + "loss": 21.6883, + "step": 1235 + }, + { + "epoch": 0.29790310918293567, + "grad_norm": 36.29655838012695, + "learning_rate": 3.293664239800814e-05, + "loss": 22.1218, + "step": 1236 + }, + { + "epoch": 0.2981441311159316, + "grad_norm": 30.673803329467773, + "learning_rate": 3.292473050236377e-05, + "loss": 22.0426, + "step": 1237 + }, + { + "epoch": 0.29838515304892743, + "grad_norm": 30.551267623901367, + "learning_rate": 3.2912810728913507e-05, + "loss": 22.0698, + "step": 1238 + }, + { + "epoch": 0.29862617498192334, + "grad_norm": 29.184446334838867, + "learning_rate": 3.290088308492263e-05, + "loss": 21.4699, + "step": 1239 + }, + { + "epoch": 0.29886719691491925, + "grad_norm": 26.687070846557617, + "learning_rate": 3.2888947577661194e-05, + "loss": 21.7592, + "step": 1240 + }, + { + "epoch": 0.29910821884791516, + "grad_norm": 19.46657943725586, + "learning_rate": 3.287700421440408e-05, + "loss": 21.4755, + "step": 1241 + }, + { + "epoch": 0.2993492407809111, + "grad_norm": 15.102832794189453, + "learning_rate": 3.286505300243091e-05, + "loss": 20.9836, + "step": 1242 + }, + { + "epoch": 0.299590262713907, + "grad_norm": 10.242476463317871, + "learning_rate": 3.2853093949026136e-05, + "loss": 20.6052, + "step": 1243 + }, + { + "epoch": 0.2998312846469029, + "grad_norm": 5.075689315795898, + "learning_rate": 3.284112706147897e-05, + "loss": 21.3756, + "step": 1244 + }, + { + "epoch": 0.30007230657989875, + "grad_norm": 3.5132157802581787, + "learning_rate": 3.2829152347083395e-05, + "loss": 20.9926, + "step": 1245 + }, + { + "epoch": 0.30031332851289466, + "grad_norm": 6.65875768661499, + "learning_rate": 3.281716981313816e-05, + "loss": 20.5334, + "step": 1246 + }, + { + "epoch": 0.3005543504458906, + "grad_norm": 7.629268169403076, + "learning_rate": 3.28051794669468e-05, + "loss": 19.7518, + "step": 1247 + }, + { + "epoch": 0.3007953723788865, + "grad_norm": 4.251559734344482, + "learning_rate": 3.279318131581759e-05, + "loss": 20.9906, + "step": 1248 + }, + { + "epoch": 0.3010363943118824, + "grad_norm": 1.6763757467269897, + "learning_rate": 3.2781175367063587e-05, + "loss": 21.8369, + "step": 1249 + }, + { + "epoch": 0.3012774162448783, + "grad_norm": 8.24956226348877, + "learning_rate": 3.276916162800258e-05, + "loss": 20.5364, + "step": 1250 + }, + { + "epoch": 0.30151843817787416, + "grad_norm": 17.946407318115234, + "learning_rate": 3.275714010595711e-05, + "loss": 20.2721, + "step": 1251 + }, + { + "epoch": 0.30175946011087007, + "grad_norm": 24.069671630859375, + "learning_rate": 3.274511080825446e-05, + "loss": 21.1379, + "step": 1252 + }, + { + "epoch": 0.302000482043866, + "grad_norm": 22.697362899780273, + "learning_rate": 3.273307374222667e-05, + "loss": 21.4702, + "step": 1253 + }, + { + "epoch": 0.3022415039768619, + "grad_norm": 25.201139450073242, + "learning_rate": 3.2721028915210484e-05, + "loss": 21.9193, + "step": 1254 + }, + { + "epoch": 0.3024825259098578, + "grad_norm": 24.32606315612793, + "learning_rate": 3.2708976334547415e-05, + "loss": 21.6, + "step": 1255 + }, + { + "epoch": 0.3027235478428537, + "grad_norm": 24.071449279785156, + "learning_rate": 3.269691600758367e-05, + "loss": 21.9541, + "step": 1256 + }, + { + "epoch": 0.3029645697758496, + "grad_norm": 21.073070526123047, + "learning_rate": 3.268484794167019e-05, + "loss": 20.6573, + "step": 1257 + }, + { + "epoch": 0.3032055917088455, + "grad_norm": 17.830936431884766, + "learning_rate": 3.267277214416263e-05, + "loss": 21.0984, + "step": 1258 + }, + { + "epoch": 0.3034466136418414, + "grad_norm": 13.060230255126953, + "learning_rate": 3.2660688622421356e-05, + "loss": 20.865, + "step": 1259 + }, + { + "epoch": 0.3036876355748373, + "grad_norm": 11.44905948638916, + "learning_rate": 3.2648597383811455e-05, + "loss": 20.5757, + "step": 1260 + }, + { + "epoch": 0.3039286575078332, + "grad_norm": 6.360955715179443, + "learning_rate": 3.263649843570271e-05, + "loss": 20.3735, + "step": 1261 + }, + { + "epoch": 0.3041696794408291, + "grad_norm": 1.8064438104629517, + "learning_rate": 3.2624391785469584e-05, + "loss": 21.205, + "step": 1262 + }, + { + "epoch": 0.30441070137382503, + "grad_norm": 4.246091842651367, + "learning_rate": 3.261227744049127e-05, + "loss": 19.8686, + "step": 1263 + }, + { + "epoch": 0.30465172330682094, + "grad_norm": 10.009968757629395, + "learning_rate": 3.260015540815161e-05, + "loss": 21.0964, + "step": 1264 + }, + { + "epoch": 0.3048927452398168, + "grad_norm": 8.5691556930542, + "learning_rate": 3.258802569583918e-05, + "loss": 20.6696, + "step": 1265 + }, + { + "epoch": 0.3051337671728127, + "grad_norm": 5.145843029022217, + "learning_rate": 3.25758883109472e-05, + "loss": 20.9872, + "step": 1266 + }, + { + "epoch": 0.3053747891058086, + "grad_norm": 3.219116687774658, + "learning_rate": 3.256374326087357e-05, + "loss": 20.4141, + "step": 1267 + }, + { + "epoch": 0.30561581103880453, + "grad_norm": 7.6407036781311035, + "learning_rate": 3.2551590553020874e-05, + "loss": 21.0416, + "step": 1268 + }, + { + "epoch": 0.30585683297180044, + "grad_norm": 4.719003200531006, + "learning_rate": 3.2539430194796366e-05, + "loss": 20.7321, + "step": 1269 + }, + { + "epoch": 0.30609785490479635, + "grad_norm": 8.439708709716797, + "learning_rate": 3.252726219361195e-05, + "loss": 20.776, + "step": 1270 + }, + { + "epoch": 0.30633887683779226, + "grad_norm": 6.529338836669922, + "learning_rate": 3.2515086556884195e-05, + "loss": 21.0097, + "step": 1271 + }, + { + "epoch": 0.3065798987707881, + "grad_norm": 8.183405876159668, + "learning_rate": 3.250290329203433e-05, + "loss": 21.4427, + "step": 1272 + }, + { + "epoch": 0.30682092070378403, + "grad_norm": 3.4692769050598145, + "learning_rate": 3.249071240648822e-05, + "loss": 21.6691, + "step": 1273 + }, + { + "epoch": 0.30706194263677994, + "grad_norm": 1.5826505422592163, + "learning_rate": 3.247851390767638e-05, + "loss": 21.4849, + "step": 1274 + }, + { + "epoch": 0.30730296456977585, + "grad_norm": 3.2541022300720215, + "learning_rate": 3.2466307803033974e-05, + "loss": 20.3364, + "step": 1275 + }, + { + "epoch": 0.30754398650277176, + "grad_norm": 4.493983268737793, + "learning_rate": 3.245409410000079e-05, + "loss": 20.451, + "step": 1276 + }, + { + "epoch": 0.30778500843576767, + "grad_norm": 4.035463333129883, + "learning_rate": 3.244187280602126e-05, + "loss": 20.9513, + "step": 1277 + }, + { + "epoch": 0.3080260303687636, + "grad_norm": 2.365455150604248, + "learning_rate": 3.2429643928544435e-05, + "loss": 21.4327, + "step": 1278 + }, + { + "epoch": 0.30826705230175944, + "grad_norm": 5.2006354331970215, + "learning_rate": 3.241740747502397e-05, + "loss": 19.9455, + "step": 1279 + }, + { + "epoch": 0.30850807423475535, + "grad_norm": 10.596508979797363, + "learning_rate": 3.240516345291818e-05, + "loss": 20.8848, + "step": 1280 + }, + { + "epoch": 0.30874909616775126, + "grad_norm": 11.582011222839355, + "learning_rate": 3.239291186968996e-05, + "loss": 20.917, + "step": 1281 + }, + { + "epoch": 0.30899011810074717, + "grad_norm": 18.31144142150879, + "learning_rate": 3.2380652732806816e-05, + "loss": 20.5056, + "step": 1282 + }, + { + "epoch": 0.3092311400337431, + "grad_norm": 19.316181182861328, + "learning_rate": 3.236838604974088e-05, + "loss": 22.2138, + "step": 1283 + }, + { + "epoch": 0.309472161966739, + "grad_norm": 17.56629753112793, + "learning_rate": 3.2356111827968855e-05, + "loss": 20.8343, + "step": 1284 + }, + { + "epoch": 0.3097131838997349, + "grad_norm": 20.063045501708984, + "learning_rate": 3.234383007497206e-05, + "loss": 21.1734, + "step": 1285 + }, + { + "epoch": 0.30995420583273076, + "grad_norm": 13.902097702026367, + "learning_rate": 3.2331540798236406e-05, + "loss": 20.5139, + "step": 1286 + }, + { + "epoch": 0.31019522776572667, + "grad_norm": 8.868664741516113, + "learning_rate": 3.231924400525236e-05, + "loss": 20.4966, + "step": 1287 + }, + { + "epoch": 0.3104362496987226, + "grad_norm": 10.66475772857666, + "learning_rate": 3.2306939703515006e-05, + "loss": 20.5951, + "step": 1288 + }, + { + "epoch": 0.3106772716317185, + "grad_norm": 3.394270420074463, + "learning_rate": 3.229462790052398e-05, + "loss": 21.2353, + "step": 1289 + }, + { + "epoch": 0.3109182935647144, + "grad_norm": 1.5402913093566895, + "learning_rate": 3.2282308603783506e-05, + "loss": 21.4931, + "step": 1290 + }, + { + "epoch": 0.3111593154977103, + "grad_norm": 5.272421836853027, + "learning_rate": 3.226998182080237e-05, + "loss": 22.0476, + "step": 1291 + }, + { + "epoch": 0.3114003374307062, + "grad_norm": 7.2602105140686035, + "learning_rate": 3.225764755909392e-05, + "loss": 20.8412, + "step": 1292 + }, + { + "epoch": 0.3116413593637021, + "grad_norm": 6.783151626586914, + "learning_rate": 3.2245305826176063e-05, + "loss": 20.5457, + "step": 1293 + }, + { + "epoch": 0.311882381296698, + "grad_norm": 3.1205391883850098, + "learning_rate": 3.223295662957126e-05, + "loss": 21.3377, + "step": 1294 + }, + { + "epoch": 0.3121234032296939, + "grad_norm": 2.037216901779175, + "learning_rate": 3.222059997680651e-05, + "loss": 21.2914, + "step": 1295 + }, + { + "epoch": 0.3123644251626898, + "grad_norm": 9.300247192382812, + "learning_rate": 3.2208235875413386e-05, + "loss": 20.9182, + "step": 1296 + }, + { + "epoch": 0.3126054470956857, + "grad_norm": 16.559049606323242, + "learning_rate": 3.219586433292797e-05, + "loss": 20.6043, + "step": 1297 + }, + { + "epoch": 0.31284646902868163, + "grad_norm": 27.042957305908203, + "learning_rate": 3.21834853568909e-05, + "loss": 21.3199, + "step": 1298 + }, + { + "epoch": 0.31308749096167754, + "grad_norm": 29.416393280029297, + "learning_rate": 3.217109895484733e-05, + "loss": 21.1016, + "step": 1299 + }, + { + "epoch": 0.3133285128946734, + "grad_norm": 39.54520034790039, + "learning_rate": 3.215870513434695e-05, + "loss": 22.0794, + "step": 1300 + }, + { + "epoch": 0.3135695348276693, + "grad_norm": 44.29729461669922, + "learning_rate": 3.214630390294396e-05, + "loss": 21.8779, + "step": 1301 + }, + { + "epoch": 0.3138105567606652, + "grad_norm": 48.8061408996582, + "learning_rate": 3.21338952681971e-05, + "loss": 21.9157, + "step": 1302 + }, + { + "epoch": 0.3140515786936611, + "grad_norm": 57.804744720458984, + "learning_rate": 3.212147923766961e-05, + "loss": 22.3703, + "step": 1303 + }, + { + "epoch": 0.31429260062665704, + "grad_norm": 51.300960540771484, + "learning_rate": 3.210905581892922e-05, + "loss": 22.2134, + "step": 1304 + }, + { + "epoch": 0.31453362255965295, + "grad_norm": 46.55207443237305, + "learning_rate": 3.209662501954818e-05, + "loss": 22.0472, + "step": 1305 + }, + { + "epoch": 0.31477464449264886, + "grad_norm": 46.05329513549805, + "learning_rate": 3.208418684710326e-05, + "loss": 21.558, + "step": 1306 + }, + { + "epoch": 0.3150156664256447, + "grad_norm": 38.551090240478516, + "learning_rate": 3.207174130917568e-05, + "loss": 20.5821, + "step": 1307 + }, + { + "epoch": 0.3152566883586406, + "grad_norm": 39.80511474609375, + "learning_rate": 3.205928841335118e-05, + "loss": 22.0285, + "step": 1308 + }, + { + "epoch": 0.31549771029163654, + "grad_norm": 26.312957763671875, + "learning_rate": 3.204682816721997e-05, + "loss": 20.9324, + "step": 1309 + }, + { + "epoch": 0.31573873222463245, + "grad_norm": 21.195674896240234, + "learning_rate": 3.203436057837675e-05, + "loss": 21.906, + "step": 1310 + }, + { + "epoch": 0.31597975415762836, + "grad_norm": 15.27059268951416, + "learning_rate": 3.20218856544207e-05, + "loss": 21.1416, + "step": 1311 + }, + { + "epoch": 0.31622077609062427, + "grad_norm": 1.9604835510253906, + "learning_rate": 3.2009403402955454e-05, + "loss": 20.5332, + "step": 1312 + }, + { + "epoch": 0.3164617980236201, + "grad_norm": 3.6866726875305176, + "learning_rate": 3.199691383158912e-05, + "loss": 21.0244, + "step": 1313 + }, + { + "epoch": 0.31670281995661603, + "grad_norm": 9.84805965423584, + "learning_rate": 3.1984416947934265e-05, + "loss": 20.1307, + "step": 1314 + }, + { + "epoch": 0.31694384188961194, + "grad_norm": 12.973222732543945, + "learning_rate": 3.197191275960792e-05, + "loss": 20.5858, + "step": 1315 + }, + { + "epoch": 0.31718486382260785, + "grad_norm": 15.207507133483887, + "learning_rate": 3.195940127423157e-05, + "loss": 20.9496, + "step": 1316 + }, + { + "epoch": 0.31742588575560376, + "grad_norm": 18.693788528442383, + "learning_rate": 3.194688249943114e-05, + "loss": 20.6099, + "step": 1317 + }, + { + "epoch": 0.3176669076885997, + "grad_norm": 18.192468643188477, + "learning_rate": 3.1934356442836994e-05, + "loss": 20.7871, + "step": 1318 + }, + { + "epoch": 0.3179079296215956, + "grad_norm": 16.4764347076416, + "learning_rate": 3.1921823112083946e-05, + "loss": 21.1429, + "step": 1319 + }, + { + "epoch": 0.31814895155459144, + "grad_norm": 18.325191497802734, + "learning_rate": 3.190928251481123e-05, + "loss": 20.7818, + "step": 1320 + }, + { + "epoch": 0.31838997348758735, + "grad_norm": 17.599693298339844, + "learning_rate": 3.1896734658662526e-05, + "loss": 21.5484, + "step": 1321 + }, + { + "epoch": 0.31863099542058326, + "grad_norm": 12.379862785339355, + "learning_rate": 3.188417955128593e-05, + "loss": 21.9614, + "step": 1322 + }, + { + "epoch": 0.3188720173535792, + "grad_norm": 5.349165916442871, + "learning_rate": 3.1871617200333946e-05, + "loss": 20.9932, + "step": 1323 + }, + { + "epoch": 0.3191130392865751, + "grad_norm": 5.058958530426025, + "learning_rate": 3.185904761346352e-05, + "loss": 20.0862, + "step": 1324 + }, + { + "epoch": 0.319354061219571, + "grad_norm": 11.371088981628418, + "learning_rate": 3.1846470798335985e-05, + "loss": 21.4716, + "step": 1325 + }, + { + "epoch": 0.3195950831525669, + "grad_norm": 14.777135848999023, + "learning_rate": 3.183388676261709e-05, + "loss": 20.9312, + "step": 1326 + }, + { + "epoch": 0.31983610508556276, + "grad_norm": 24.18636703491211, + "learning_rate": 3.1821295513976976e-05, + "loss": 21.4925, + "step": 1327 + }, + { + "epoch": 0.32007712701855867, + "grad_norm": 32.555362701416016, + "learning_rate": 3.18086970600902e-05, + "loss": 21.0286, + "step": 1328 + }, + { + "epoch": 0.3203181489515546, + "grad_norm": 35.803829193115234, + "learning_rate": 3.1796091408635684e-05, + "loss": 20.9661, + "step": 1329 + }, + { + "epoch": 0.3205591708845505, + "grad_norm": 48.559471130371094, + "learning_rate": 3.178347856729676e-05, + "loss": 22.1706, + "step": 1330 + }, + { + "epoch": 0.3208001928175464, + "grad_norm": 47.56074523925781, + "learning_rate": 3.177085854376112e-05, + "loss": 21.5605, + "step": 1331 + }, + { + "epoch": 0.3210412147505423, + "grad_norm": 51.06733322143555, + "learning_rate": 3.175823134572086e-05, + "loss": 22.7234, + "step": 1332 + }, + { + "epoch": 0.3212822366835382, + "grad_norm": 48.31516647338867, + "learning_rate": 3.174559698087244e-05, + "loss": 21.6811, + "step": 1333 + }, + { + "epoch": 0.3215232586165341, + "grad_norm": 49.32585144042969, + "learning_rate": 3.173295545691666e-05, + "loss": 21.6326, + "step": 1334 + }, + { + "epoch": 0.32176428054953, + "grad_norm": 42.817596435546875, + "learning_rate": 3.1720306781558735e-05, + "loss": 21.4104, + "step": 1335 + }, + { + "epoch": 0.3220053024825259, + "grad_norm": 40.574729919433594, + "learning_rate": 3.1707650962508195e-05, + "loss": 20.8194, + "step": 1336 + }, + { + "epoch": 0.3222463244155218, + "grad_norm": 30.575023651123047, + "learning_rate": 3.169498800747895e-05, + "loss": 21.323, + "step": 1337 + }, + { + "epoch": 0.3224873463485177, + "grad_norm": 18.105714797973633, + "learning_rate": 3.168231792418924e-05, + "loss": 21.2014, + "step": 1338 + }, + { + "epoch": 0.32272836828151363, + "grad_norm": 12.250198364257812, + "learning_rate": 3.1669640720361666e-05, + "loss": 21.1184, + "step": 1339 + }, + { + "epoch": 0.32296939021450954, + "grad_norm": 4.6272807121276855, + "learning_rate": 3.1656956403723155e-05, + "loss": 21.8738, + "step": 1340 + }, + { + "epoch": 0.3232104121475054, + "grad_norm": 3.5020339488983154, + "learning_rate": 3.1644264982005e-05, + "loss": 21.7477, + "step": 1341 + }, + { + "epoch": 0.3234514340805013, + "grad_norm": 7.568626403808594, + "learning_rate": 3.163156646294277e-05, + "loss": 21.0905, + "step": 1342 + }, + { + "epoch": 0.3236924560134972, + "grad_norm": 3.521045684814453, + "learning_rate": 3.1618860854276414e-05, + "loss": 20.8808, + "step": 1343 + }, + { + "epoch": 0.32393347794649313, + "grad_norm": 7.265933513641357, + "learning_rate": 3.160614816375017e-05, + "loss": 20.5409, + "step": 1344 + }, + { + "epoch": 0.32417449987948904, + "grad_norm": 3.8664371967315674, + "learning_rate": 3.159342839911261e-05, + "loss": 20.2814, + "step": 1345 + }, + { + "epoch": 0.32441552181248495, + "grad_norm": 2.336228847503662, + "learning_rate": 3.158070156811661e-05, + "loss": 21.4393, + "step": 1346 + }, + { + "epoch": 0.32465654374548086, + "grad_norm": 5.010268688201904, + "learning_rate": 3.1567967678519344e-05, + "loss": 20.7072, + "step": 1347 + }, + { + "epoch": 0.3248975656784767, + "grad_norm": 4.1832780838012695, + "learning_rate": 3.1555226738082315e-05, + "loss": 20.7938, + "step": 1348 + }, + { + "epoch": 0.32513858761147263, + "grad_norm": 13.066758155822754, + "learning_rate": 3.154247875457128e-05, + "loss": 20.2343, + "step": 1349 + }, + { + "epoch": 0.32537960954446854, + "grad_norm": 15.319171905517578, + "learning_rate": 3.152972373575634e-05, + "loss": 21.1457, + "step": 1350 + }, + { + "epoch": 0.32562063147746445, + "grad_norm": 17.8040714263916, + "learning_rate": 3.151696168941184e-05, + "loss": 20.8147, + "step": 1351 + }, + { + "epoch": 0.32586165341046036, + "grad_norm": 25.043079376220703, + "learning_rate": 3.150419262331644e-05, + "loss": 20.6531, + "step": 1352 + }, + { + "epoch": 0.32610267534345627, + "grad_norm": 26.66877555847168, + "learning_rate": 3.1491416545253064e-05, + "loss": 20.7276, + "step": 1353 + }, + { + "epoch": 0.3263436972764522, + "grad_norm": 32.292442321777344, + "learning_rate": 3.1478633463008915e-05, + "loss": 20.9602, + "step": 1354 + }, + { + "epoch": 0.32658471920944804, + "grad_norm": 28.7908878326416, + "learning_rate": 3.146584338437545e-05, + "loss": 20.8859, + "step": 1355 + }, + { + "epoch": 0.32682574114244395, + "grad_norm": 33.540985107421875, + "learning_rate": 3.145304631714842e-05, + "loss": 21.5351, + "step": 1356 + }, + { + "epoch": 0.32706676307543986, + "grad_norm": 32.29022216796875, + "learning_rate": 3.1440242269127805e-05, + "loss": 20.1801, + "step": 1357 + }, + { + "epoch": 0.32730778500843577, + "grad_norm": 24.67045783996582, + "learning_rate": 3.142743124811785e-05, + "loss": 21.2573, + "step": 1358 + }, + { + "epoch": 0.3275488069414317, + "grad_norm": 31.54035186767578, + "learning_rate": 3.1414613261927075e-05, + "loss": 19.6853, + "step": 1359 + }, + { + "epoch": 0.3277898288744276, + "grad_norm": 23.67510986328125, + "learning_rate": 3.14017883183682e-05, + "loss": 21.0609, + "step": 1360 + }, + { + "epoch": 0.3280308508074235, + "grad_norm": 24.42256736755371, + "learning_rate": 3.138895642525822e-05, + "loss": 20.9763, + "step": 1361 + }, + { + "epoch": 0.32827187274041936, + "grad_norm": 18.430713653564453, + "learning_rate": 3.137611759041836e-05, + "loss": 20.4002, + "step": 1362 + }, + { + "epoch": 0.32851289467341527, + "grad_norm": 14.069979667663574, + "learning_rate": 3.1363271821674054e-05, + "loss": 20.4508, + "step": 1363 + }, + { + "epoch": 0.3287539166064112, + "grad_norm": 7.7065324783325195, + "learning_rate": 3.1350419126855e-05, + "loss": 21.0403, + "step": 1364 + }, + { + "epoch": 0.3289949385394071, + "grad_norm": 9.845030784606934, + "learning_rate": 3.1337559513795086e-05, + "loss": 20.0077, + "step": 1365 + }, + { + "epoch": 0.329235960472403, + "grad_norm": 1.3762286901474, + "learning_rate": 3.132469299033243e-05, + "loss": 20.7534, + "step": 1366 + }, + { + "epoch": 0.3294769824053989, + "grad_norm": 1.6755554676055908, + "learning_rate": 3.131181956430936e-05, + "loss": 20.2897, + "step": 1367 + }, + { + "epoch": 0.3297180043383948, + "grad_norm": 5.176748752593994, + "learning_rate": 3.129893924357241e-05, + "loss": 21.0447, + "step": 1368 + }, + { + "epoch": 0.3299590262713907, + "grad_norm": 2.6094484329223633, + "learning_rate": 3.128605203597232e-05, + "loss": 20.5052, + "step": 1369 + }, + { + "epoch": 0.3302000482043866, + "grad_norm": 1.6769565343856812, + "learning_rate": 3.1273157949364014e-05, + "loss": 19.7987, + "step": 1370 + }, + { + "epoch": 0.3304410701373825, + "grad_norm": 2.013998031616211, + "learning_rate": 3.126025699160665e-05, + "loss": 19.7666, + "step": 1371 + }, + { + "epoch": 0.3306820920703784, + "grad_norm": 3.40673565864563, + "learning_rate": 3.124734917056351e-05, + "loss": 20.9094, + "step": 1372 + }, + { + "epoch": 0.3309231140033743, + "grad_norm": 15.132380485534668, + "learning_rate": 3.123443449410211e-05, + "loss": 19.5007, + "step": 1373 + }, + { + "epoch": 0.33116413593637023, + "grad_norm": 18.07408905029297, + "learning_rate": 3.1221512970094114e-05, + "loss": 20.9604, + "step": 1374 + }, + { + "epoch": 0.3314051578693661, + "grad_norm": 28.93204689025879, + "learning_rate": 3.12085846064154e-05, + "loss": 20.5801, + "step": 1375 + }, + { + "epoch": 0.331646179802362, + "grad_norm": 27.56015396118164, + "learning_rate": 3.119564941094596e-05, + "loss": 19.9528, + "step": 1376 + }, + { + "epoch": 0.3318872017353579, + "grad_norm": 33.316349029541016, + "learning_rate": 3.118270739157e-05, + "loss": 21.638, + "step": 1377 + }, + { + "epoch": 0.3321282236683538, + "grad_norm": 39.30671310424805, + "learning_rate": 3.116975855617585e-05, + "loss": 21.4363, + "step": 1378 + }, + { + "epoch": 0.3323692456013497, + "grad_norm": 42.305137634277344, + "learning_rate": 3.115680291265602e-05, + "loss": 21.2893, + "step": 1379 + }, + { + "epoch": 0.33261026753434564, + "grad_norm": 42.55583953857422, + "learning_rate": 3.114384046890714e-05, + "loss": 21.7681, + "step": 1380 + }, + { + "epoch": 0.33285128946734155, + "grad_norm": 35.558624267578125, + "learning_rate": 3.113087123283002e-05, + "loss": 21.3046, + "step": 1381 + }, + { + "epoch": 0.3330923114003374, + "grad_norm": 41.05385208129883, + "learning_rate": 3.1117895212329586e-05, + "loss": 21.1223, + "step": 1382 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 35.062461853027344, + "learning_rate": 3.1104912415314905e-05, + "loss": 21.913, + "step": 1383 + }, + { + "epoch": 0.3335743552663292, + "grad_norm": 27.43922233581543, + "learning_rate": 3.1091922849699176e-05, + "loss": 20.8756, + "step": 1384 + }, + { + "epoch": 0.33381537719932514, + "grad_norm": 26.196046829223633, + "learning_rate": 3.107892652339972e-05, + "loss": 22.3345, + "step": 1385 + }, + { + "epoch": 0.33405639913232105, + "grad_norm": 21.580429077148438, + "learning_rate": 3.106592344433799e-05, + "loss": 20.6701, + "step": 1386 + }, + { + "epoch": 0.33429742106531696, + "grad_norm": 11.685224533081055, + "learning_rate": 3.105291362043954e-05, + "loss": 20.6957, + "step": 1387 + }, + { + "epoch": 0.33453844299831287, + "grad_norm": 12.07873249053955, + "learning_rate": 3.103989705963404e-05, + "loss": 20.869, + "step": 1388 + }, + { + "epoch": 0.3347794649313087, + "grad_norm": 1.800771951675415, + "learning_rate": 3.102687376985527e-05, + "loss": 20.7328, + "step": 1389 + }, + { + "epoch": 0.33502048686430463, + "grad_norm": 1.6751655340194702, + "learning_rate": 3.1013843759041114e-05, + "loss": 20.7712, + "step": 1390 + }, + { + "epoch": 0.33526150879730054, + "grad_norm": 2.367445230484009, + "learning_rate": 3.100080703513354e-05, + "loss": 20.9687, + "step": 1391 + }, + { + "epoch": 0.33550253073029646, + "grad_norm": 6.564997673034668, + "learning_rate": 3.098776360607862e-05, + "loss": 20.9989, + "step": 1392 + }, + { + "epoch": 0.33574355266329237, + "grad_norm": 21.07494354248047, + "learning_rate": 3.0974713479826494e-05, + "loss": 20.2915, + "step": 1393 + }, + { + "epoch": 0.3359845745962883, + "grad_norm": 30.941186904907227, + "learning_rate": 3.096165666433143e-05, + "loss": 20.8009, + "step": 1394 + }, + { + "epoch": 0.3362255965292842, + "grad_norm": 50.311771392822266, + "learning_rate": 3.0948593167551714e-05, + "loss": 21.1185, + "step": 1395 + }, + { + "epoch": 0.33646661846228004, + "grad_norm": 63.81370162963867, + "learning_rate": 3.093552299744974e-05, + "loss": 23.4458, + "step": 1396 + }, + { + "epoch": 0.33670764039527595, + "grad_norm": 75.58320617675781, + "learning_rate": 3.092244616199196e-05, + "loss": 23.5091, + "step": 1397 + }, + { + "epoch": 0.33694866232827186, + "grad_norm": 85.09021759033203, + "learning_rate": 3.09093626691489e-05, + "loss": 23.5022, + "step": 1398 + }, + { + "epoch": 0.3371896842612678, + "grad_norm": 87.59384155273438, + "learning_rate": 3.0896272526895125e-05, + "loss": 24.445, + "step": 1399 + }, + { + "epoch": 0.3374307061942637, + "grad_norm": 99.11080932617188, + "learning_rate": 3.0883175743209254e-05, + "loss": 25.4969, + "step": 1400 + }, + { + "epoch": 0.3374307061942637, + "eval_cc_pretrain_accuracy": 0.9099999666213989, + "eval_cc_pretrain_loss": 2.196439266204834, + "eval_cc_pretrain_num_cand": 800.0, + "eval_cc_pretrain_runtime": 18.6714, + "eval_cc_pretrain_samples_per_second": 5.356, + "eval_cc_pretrain_steps_per_second": 0.054, + "eval_cc_pretrain_temperature": 0.06982421875, + "step": 1400 + }, + { + "epoch": 0.3374307061942637, + "eval_mscoco_pretrain_accuracy": 0.7999999523162842, + "eval_mscoco_pretrain_loss": 2.383352279663086, + "eval_mscoco_pretrain_num_cand": 800.0, + "eval_mscoco_pretrain_runtime": 15.4749, + "eval_mscoco_pretrain_samples_per_second": 6.462, + "eval_mscoco_pretrain_steps_per_second": 0.065, + "eval_mscoco_pretrain_temperature": 0.06982421875, + "step": 1400 + }, + { + "epoch": 0.3376717281272596, + "grad_norm": 100.6182861328125, + "learning_rate": 3.087007232607398e-05, + "loss": 25.556, + "step": 1401 + }, + { + "epoch": 0.3379127500602555, + "grad_norm": 99.61454772949219, + "learning_rate": 3.0856962283476004e-05, + "loss": 25.6002, + "step": 1402 + }, + { + "epoch": 0.33815377199325136, + "grad_norm": 90.60435485839844, + "learning_rate": 3.08438456234061e-05, + "loss": 25.6582, + "step": 1403 + }, + { + "epoch": 0.3383947939262473, + "grad_norm": 90.6102066040039, + "learning_rate": 3.0830722353859035e-05, + "loss": 25.0173, + "step": 1404 + }, + { + "epoch": 0.3386358158592432, + "grad_norm": 80.09077453613281, + "learning_rate": 3.081759248283364e-05, + "loss": 23.9294, + "step": 1405 + }, + { + "epoch": 0.3388768377922391, + "grad_norm": 71.08293151855469, + "learning_rate": 3.080445601833276e-05, + "loss": 22.8869, + "step": 1406 + }, + { + "epoch": 0.339117859725235, + "grad_norm": 62.07583999633789, + "learning_rate": 3.079131296836326e-05, + "loss": 22.0607, + "step": 1407 + }, + { + "epoch": 0.3393588816582309, + "grad_norm": 47.575923919677734, + "learning_rate": 3.0778163340935994e-05, + "loss": 22.57, + "step": 1408 + }, + { + "epoch": 0.3395999035912268, + "grad_norm": 33.06901931762695, + "learning_rate": 3.0765007144065864e-05, + "loss": 21.4955, + "step": 1409 + }, + { + "epoch": 0.3398409255242227, + "grad_norm": 19.467117309570312, + "learning_rate": 3.075184438577175e-05, + "loss": 21.4371, + "step": 1410 + }, + { + "epoch": 0.3400819474572186, + "grad_norm": 4.747889041900635, + "learning_rate": 3.0738675074076544e-05, + "loss": 20.8065, + "step": 1411 + }, + { + "epoch": 0.3403229693902145, + "grad_norm": 4.249185085296631, + "learning_rate": 3.072549921700713e-05, + "loss": 20.3237, + "step": 1412 + }, + { + "epoch": 0.3405639913232104, + "grad_norm": 16.094860076904297, + "learning_rate": 3.071231682259437e-05, + "loss": 22.0383, + "step": 1413 + }, + { + "epoch": 0.3408050132562063, + "grad_norm": 24.81263542175293, + "learning_rate": 3.069912789887312e-05, + "loss": 21.6985, + "step": 1414 + }, + { + "epoch": 0.34104603518920223, + "grad_norm": 21.808988571166992, + "learning_rate": 3.0685932453882216e-05, + "loss": 20.2199, + "step": 1415 + }, + { + "epoch": 0.34128705712219815, + "grad_norm": 30.80314064025879, + "learning_rate": 3.067273049566448e-05, + "loss": 21.6707, + "step": 1416 + }, + { + "epoch": 0.341528079055194, + "grad_norm": 32.03923797607422, + "learning_rate": 3.065952203226668e-05, + "loss": 20.6001, + "step": 1417 + }, + { + "epoch": 0.3417691009881899, + "grad_norm": 30.17572784423828, + "learning_rate": 3.064630707173957e-05, + "loss": 20.8462, + "step": 1418 + }, + { + "epoch": 0.3420101229211858, + "grad_norm": 28.42971420288086, + "learning_rate": 3.063308562213784e-05, + "loss": 20.5969, + "step": 1419 + }, + { + "epoch": 0.34225114485418173, + "grad_norm": 27.559463500976562, + "learning_rate": 3.061985769152017e-05, + "loss": 21.303, + "step": 1420 + }, + { + "epoch": 0.34249216678717764, + "grad_norm": 25.555932998657227, + "learning_rate": 3.060662328794916e-05, + "loss": 20.6349, + "step": 1421 + }, + { + "epoch": 0.34273318872017355, + "grad_norm": 19.077045440673828, + "learning_rate": 3.059338241949136e-05, + "loss": 21.689, + "step": 1422 + }, + { + "epoch": 0.34297421065316946, + "grad_norm": 23.562686920166016, + "learning_rate": 3.058013509421728e-05, + "loss": 21.3517, + "step": 1423 + }, + { + "epoch": 0.3432152325861653, + "grad_norm": 11.533621788024902, + "learning_rate": 3.056688132020134e-05, + "loss": 20.175, + "step": 1424 + }, + { + "epoch": 0.34345625451916123, + "grad_norm": 10.870552062988281, + "learning_rate": 3.055362110552191e-05, + "loss": 21.4943, + "step": 1425 + }, + { + "epoch": 0.34369727645215714, + "grad_norm": 3.5334136486053467, + "learning_rate": 3.054035445826128e-05, + "loss": 20.0819, + "step": 1426 + }, + { + "epoch": 0.34393829838515305, + "grad_norm": 2.4338431358337402, + "learning_rate": 3.052708138650565e-05, + "loss": 20.464, + "step": 1427 + }, + { + "epoch": 0.34417932031814896, + "grad_norm": 9.234247207641602, + "learning_rate": 3.0513801898345155e-05, + "loss": 20.6039, + "step": 1428 + }, + { + "epoch": 0.3444203422511449, + "grad_norm": 15.25523567199707, + "learning_rate": 3.050051600187382e-05, + "loss": 20.7501, + "step": 1429 + }, + { + "epoch": 0.3446613641841408, + "grad_norm": 20.178754806518555, + "learning_rate": 3.0487223705189594e-05, + "loss": 20.6394, + "step": 1430 + }, + { + "epoch": 0.34490238611713664, + "grad_norm": 32.541748046875, + "learning_rate": 3.0473925016394317e-05, + "loss": 21.0889, + "step": 1431 + }, + { + "epoch": 0.34514340805013255, + "grad_norm": 35.5442008972168, + "learning_rate": 3.046061994359373e-05, + "loss": 21.4467, + "step": 1432 + }, + { + "epoch": 0.34538442998312846, + "grad_norm": 35.54304504394531, + "learning_rate": 3.0447308494897466e-05, + "loss": 20.6922, + "step": 1433 + }, + { + "epoch": 0.34562545191612437, + "grad_norm": 40.04494857788086, + "learning_rate": 3.043399067841903e-05, + "loss": 20.9229, + "step": 1434 + }, + { + "epoch": 0.3458664738491203, + "grad_norm": 41.04765701293945, + "learning_rate": 3.0420666502275832e-05, + "loss": 21.2889, + "step": 1435 + }, + { + "epoch": 0.3461074957821162, + "grad_norm": 37.043663024902344, + "learning_rate": 3.040733597458913e-05, + "loss": 21.7981, + "step": 1436 + }, + { + "epoch": 0.34634851771511205, + "grad_norm": 38.30061340332031, + "learning_rate": 3.039399910348409e-05, + "loss": 20.279, + "step": 1437 + }, + { + "epoch": 0.34658953964810796, + "grad_norm": 34.54994201660156, + "learning_rate": 3.0380655897089707e-05, + "loss": 20.9236, + "step": 1438 + }, + { + "epoch": 0.34683056158110387, + "grad_norm": 29.809053421020508, + "learning_rate": 3.0367306363538856e-05, + "loss": 21.1446, + "step": 1439 + }, + { + "epoch": 0.3470715835140998, + "grad_norm": 22.33033561706543, + "learning_rate": 3.0353950510968273e-05, + "loss": 21.6267, + "step": 1440 + }, + { + "epoch": 0.3473126054470957, + "grad_norm": 21.452713012695312, + "learning_rate": 3.0340588347518536e-05, + "loss": 20.3764, + "step": 1441 + }, + { + "epoch": 0.3475536273800916, + "grad_norm": 5.390617370605469, + "learning_rate": 3.032721988133407e-05, + "loss": 20.8378, + "step": 1442 + }, + { + "epoch": 0.3477946493130875, + "grad_norm": 2.0384159088134766, + "learning_rate": 3.0313845120563147e-05, + "loss": 21.081, + "step": 1443 + }, + { + "epoch": 0.34803567124608337, + "grad_norm": 9.021862983703613, + "learning_rate": 3.030046407335787e-05, + "loss": 21.466, + "step": 1444 + }, + { + "epoch": 0.3482766931790793, + "grad_norm": 4.764529705047607, + "learning_rate": 3.0287076747874176e-05, + "loss": 20.9265, + "step": 1445 + }, + { + "epoch": 0.3485177151120752, + "grad_norm": 4.217637062072754, + "learning_rate": 3.0273683152271833e-05, + "loss": 21.2076, + "step": 1446 + }, + { + "epoch": 0.3487587370450711, + "grad_norm": 5.7294511795043945, + "learning_rate": 3.0260283294714427e-05, + "loss": 20.2941, + "step": 1447 + }, + { + "epoch": 0.348999758978067, + "grad_norm": 5.93262243270874, + "learning_rate": 3.0246877183369352e-05, + "loss": 20.3322, + "step": 1448 + }, + { + "epoch": 0.3492407809110629, + "grad_norm": 10.463139533996582, + "learning_rate": 3.0233464826407837e-05, + "loss": 20.0894, + "step": 1449 + }, + { + "epoch": 0.34948180284405883, + "grad_norm": 16.06177520751953, + "learning_rate": 3.0220046232004887e-05, + "loss": 20.0905, + "step": 1450 + }, + { + "epoch": 0.3497228247770547, + "grad_norm": 13.699326515197754, + "learning_rate": 3.0206621408339334e-05, + "loss": 20.2047, + "step": 1451 + }, + { + "epoch": 0.3499638467100506, + "grad_norm": 25.301389694213867, + "learning_rate": 3.0193190363593802e-05, + "loss": 20.4643, + "step": 1452 + }, + { + "epoch": 0.3502048686430465, + "grad_norm": 19.564783096313477, + "learning_rate": 3.017975310595469e-05, + "loss": 21.0979, + "step": 1453 + }, + { + "epoch": 0.3504458905760424, + "grad_norm": 24.812124252319336, + "learning_rate": 3.0166309643612206e-05, + "loss": 20.4602, + "step": 1454 + }, + { + "epoch": 0.35068691250903833, + "grad_norm": 17.56059455871582, + "learning_rate": 3.0152859984760323e-05, + "loss": 20.196, + "step": 1455 + }, + { + "epoch": 0.35092793444203424, + "grad_norm": 14.578927040100098, + "learning_rate": 3.0139404137596808e-05, + "loss": 21.205, + "step": 1456 + }, + { + "epoch": 0.35116895637503015, + "grad_norm": 10.35690689086914, + "learning_rate": 3.0125942110323182e-05, + "loss": 20.3764, + "step": 1457 + }, + { + "epoch": 0.351409978308026, + "grad_norm": 2.6226367950439453, + "learning_rate": 3.011247391114475e-05, + "loss": 19.8943, + "step": 1458 + }, + { + "epoch": 0.3516510002410219, + "grad_norm": 6.060166835784912, + "learning_rate": 3.0098999548270554e-05, + "loss": 20.6253, + "step": 1459 + }, + { + "epoch": 0.3518920221740178, + "grad_norm": 9.60909652709961, + "learning_rate": 3.0085519029913423e-05, + "loss": 20.5143, + "step": 1460 + }, + { + "epoch": 0.35213304410701374, + "grad_norm": 16.70155906677246, + "learning_rate": 3.0072032364289914e-05, + "loss": 21.0704, + "step": 1461 + }, + { + "epoch": 0.35237406604000965, + "grad_norm": 24.806793212890625, + "learning_rate": 3.0058539559620347e-05, + "loss": 21.7076, + "step": 1462 + }, + { + "epoch": 0.35261508797300556, + "grad_norm": 23.303836822509766, + "learning_rate": 3.0045040624128777e-05, + "loss": 20.5369, + "step": 1463 + }, + { + "epoch": 0.35285610990600147, + "grad_norm": 25.44264030456543, + "learning_rate": 3.0031535566042992e-05, + "loss": 22.2081, + "step": 1464 + }, + { + "epoch": 0.3530971318389973, + "grad_norm": 23.30732536315918, + "learning_rate": 3.001802439359452e-05, + "loss": 21.0383, + "step": 1465 + }, + { + "epoch": 0.35333815377199324, + "grad_norm": 24.305978775024414, + "learning_rate": 3.00045071150186e-05, + "loss": 20.2398, + "step": 1466 + }, + { + "epoch": 0.35357917570498915, + "grad_norm": 19.06390953063965, + "learning_rate": 2.9990983738554225e-05, + "loss": 20.3984, + "step": 1467 + }, + { + "epoch": 0.35382019763798506, + "grad_norm": 12.906436920166016, + "learning_rate": 2.9977454272444064e-05, + "loss": 20.484, + "step": 1468 + }, + { + "epoch": 0.35406121957098097, + "grad_norm": 14.08121109008789, + "learning_rate": 2.996391872493453e-05, + "loss": 20.3192, + "step": 1469 + }, + { + "epoch": 0.3543022415039769, + "grad_norm": 8.00682258605957, + "learning_rate": 2.9950377104275727e-05, + "loss": 20.6926, + "step": 1470 + }, + { + "epoch": 0.3545432634369728, + "grad_norm": 4.84312629699707, + "learning_rate": 2.9936829418721454e-05, + "loss": 20.1052, + "step": 1471 + }, + { + "epoch": 0.35478428536996864, + "grad_norm": 1.570924997329712, + "learning_rate": 2.992327567652924e-05, + "loss": 20.6122, + "step": 1472 + }, + { + "epoch": 0.35502530730296455, + "grad_norm": 6.11937141418457, + "learning_rate": 2.9909715885960264e-05, + "loss": 20.3783, + "step": 1473 + }, + { + "epoch": 0.35526632923596047, + "grad_norm": 21.053916931152344, + "learning_rate": 2.9896150055279417e-05, + "loss": 20.3957, + "step": 1474 + }, + { + "epoch": 0.3555073511689564, + "grad_norm": 23.30117416381836, + "learning_rate": 2.988257819275526e-05, + "loss": 20.9402, + "step": 1475 + }, + { + "epoch": 0.3557483731019523, + "grad_norm": 31.295032501220703, + "learning_rate": 2.9869000306660033e-05, + "loss": 20.3306, + "step": 1476 + }, + { + "epoch": 0.3559893950349482, + "grad_norm": 42.793636322021484, + "learning_rate": 2.9855416405269655e-05, + "loss": 20.9763, + "step": 1477 + }, + { + "epoch": 0.3562304169679441, + "grad_norm": 42.04426574707031, + "learning_rate": 2.984182649686371e-05, + "loss": 20.1947, + "step": 1478 + }, + { + "epoch": 0.35647143890093996, + "grad_norm": 43.29927062988281, + "learning_rate": 2.9828230589725435e-05, + "loss": 21.5002, + "step": 1479 + }, + { + "epoch": 0.3567124608339359, + "grad_norm": 41.050418853759766, + "learning_rate": 2.981462869214172e-05, + "loss": 20.4167, + "step": 1480 + }, + { + "epoch": 0.3569534827669318, + "grad_norm": 40.548797607421875, + "learning_rate": 2.980102081240313e-05, + "loss": 21.3773, + "step": 1481 + }, + { + "epoch": 0.3571945046999277, + "grad_norm": 36.05020523071289, + "learning_rate": 2.9787406958803836e-05, + "loss": 21.2885, + "step": 1482 + }, + { + "epoch": 0.3574355266329236, + "grad_norm": 34.29867172241211, + "learning_rate": 2.9773787139641696e-05, + "loss": 21.4549, + "step": 1483 + }, + { + "epoch": 0.3576765485659195, + "grad_norm": 29.55168914794922, + "learning_rate": 2.9760161363218177e-05, + "loss": 20.4275, + "step": 1484 + }, + { + "epoch": 0.3579175704989154, + "grad_norm": 25.178916931152344, + "learning_rate": 2.9746529637838378e-05, + "loss": 20.2317, + "step": 1485 + }, + { + "epoch": 0.3581585924319113, + "grad_norm": 14.461463928222656, + "learning_rate": 2.973289197181103e-05, + "loss": 20.555, + "step": 1486 + }, + { + "epoch": 0.3583996143649072, + "grad_norm": 10.9751558303833, + "learning_rate": 2.9719248373448478e-05, + "loss": 20.1065, + "step": 1487 + }, + { + "epoch": 0.3586406362979031, + "grad_norm": 4.605621814727783, + "learning_rate": 2.9705598851066705e-05, + "loss": 21.8455, + "step": 1488 + }, + { + "epoch": 0.358881658230899, + "grad_norm": 1.5424895286560059, + "learning_rate": 2.9691943412985266e-05, + "loss": 20.6893, + "step": 1489 + }, + { + "epoch": 0.3591226801638949, + "grad_norm": 6.903262615203857, + "learning_rate": 2.967828206752736e-05, + "loss": 20.0449, + "step": 1490 + }, + { + "epoch": 0.35936370209689084, + "grad_norm": 10.04992389678955, + "learning_rate": 2.9664614823019762e-05, + "loss": 20.2214, + "step": 1491 + }, + { + "epoch": 0.35960472402988675, + "grad_norm": 18.566913604736328, + "learning_rate": 2.965094168779285e-05, + "loss": 20.3891, + "step": 1492 + }, + { + "epoch": 0.3598457459628826, + "grad_norm": 19.070194244384766, + "learning_rate": 2.9637262670180597e-05, + "loss": 20.7175, + "step": 1493 + }, + { + "epoch": 0.3600867678958785, + "grad_norm": 12.845840454101562, + "learning_rate": 2.9623577778520554e-05, + "loss": 20.3157, + "step": 1494 + }, + { + "epoch": 0.3603277898288744, + "grad_norm": 14.224474906921387, + "learning_rate": 2.9609887021153866e-05, + "loss": 20.554, + "step": 1495 + }, + { + "epoch": 0.36056881176187033, + "grad_norm": 15.02224063873291, + "learning_rate": 2.9596190406425225e-05, + "loss": 19.8087, + "step": 1496 + }, + { + "epoch": 0.36080983369486624, + "grad_norm": 6.966197967529297, + "learning_rate": 2.9582487942682924e-05, + "loss": 19.6251, + "step": 1497 + }, + { + "epoch": 0.36105085562786216, + "grad_norm": 2.695448160171509, + "learning_rate": 2.9568779638278807e-05, + "loss": 20.4593, + "step": 1498 + }, + { + "epoch": 0.36129187756085807, + "grad_norm": 7.9432148933410645, + "learning_rate": 2.9555065501568275e-05, + "loss": 21.3526, + "step": 1499 + }, + { + "epoch": 0.3615328994938539, + "grad_norm": 14.201664924621582, + "learning_rate": 2.95413455409103e-05, + "loss": 19.9229, + "step": 1500 + }, + { + "epoch": 0.36177392142684983, + "grad_norm": 20.804256439208984, + "learning_rate": 2.9527619764667376e-05, + "loss": 20.6668, + "step": 1501 + }, + { + "epoch": 0.36201494335984574, + "grad_norm": 30.552570343017578, + "learning_rate": 2.9513888181205565e-05, + "loss": 20.2924, + "step": 1502 + }, + { + "epoch": 0.36225596529284165, + "grad_norm": 26.171714782714844, + "learning_rate": 2.950015079889446e-05, + "loss": 20.4364, + "step": 1503 + }, + { + "epoch": 0.36249698722583756, + "grad_norm": 33.04325866699219, + "learning_rate": 2.9486407626107194e-05, + "loss": 20.9188, + "step": 1504 + }, + { + "epoch": 0.3627380091588335, + "grad_norm": 32.042049407958984, + "learning_rate": 2.947265867122042e-05, + "loss": 20.571, + "step": 1505 + }, + { + "epoch": 0.36297903109182933, + "grad_norm": 32.29316711425781, + "learning_rate": 2.9458903942614322e-05, + "loss": 21.2716, + "step": 1506 + }, + { + "epoch": 0.36322005302482524, + "grad_norm": 28.424022674560547, + "learning_rate": 2.9445143448672595e-05, + "loss": 21.1091, + "step": 1507 + }, + { + "epoch": 0.36346107495782115, + "grad_norm": 29.796525955200195, + "learning_rate": 2.9431377197782462e-05, + "loss": 21.6771, + "step": 1508 + }, + { + "epoch": 0.36370209689081706, + "grad_norm": 28.551025390625, + "learning_rate": 2.9417605198334653e-05, + "loss": 21.1281, + "step": 1509 + }, + { + "epoch": 0.36394311882381297, + "grad_norm": 18.807798385620117, + "learning_rate": 2.9403827458723383e-05, + "loss": 20.0711, + "step": 1510 + }, + { + "epoch": 0.3641841407568089, + "grad_norm": 17.19504165649414, + "learning_rate": 2.939004398734639e-05, + "loss": 21.4522, + "step": 1511 + }, + { + "epoch": 0.3644251626898048, + "grad_norm": 9.848954200744629, + "learning_rate": 2.937625479260489e-05, + "loss": 20.1649, + "step": 1512 + }, + { + "epoch": 0.36466618462280065, + "grad_norm": 2.2058334350585938, + "learning_rate": 2.9362459882903586e-05, + "loss": 20.5707, + "step": 1513 + }, + { + "epoch": 0.36490720655579656, + "grad_norm": 5.549267292022705, + "learning_rate": 2.9348659266650678e-05, + "loss": 20.3198, + "step": 1514 + }, + { + "epoch": 0.36514822848879247, + "grad_norm": 9.849757194519043, + "learning_rate": 2.9334852952257836e-05, + "loss": 19.9881, + "step": 1515 + }, + { + "epoch": 0.3653892504217884, + "grad_norm": 18.31901741027832, + "learning_rate": 2.9321040948140206e-05, + "loss": 20.6694, + "step": 1516 + }, + { + "epoch": 0.3656302723547843, + "grad_norm": 25.046419143676758, + "learning_rate": 2.9307223262716395e-05, + "loss": 20.065, + "step": 1517 + }, + { + "epoch": 0.3658712942877802, + "grad_norm": 30.302623748779297, + "learning_rate": 2.929339990440848e-05, + "loss": 20.7811, + "step": 1518 + }, + { + "epoch": 0.3661123162207761, + "grad_norm": 28.55877685546875, + "learning_rate": 2.927957088164199e-05, + "loss": 20.5507, + "step": 1519 + }, + { + "epoch": 0.36635333815377197, + "grad_norm": 32.54976272583008, + "learning_rate": 2.9265736202845914e-05, + "loss": 20.79, + "step": 1520 + }, + { + "epoch": 0.3665943600867679, + "grad_norm": 27.43111801147461, + "learning_rate": 2.9251895876452678e-05, + "loss": 20.259, + "step": 1521 + }, + { + "epoch": 0.3668353820197638, + "grad_norm": 28.822019577026367, + "learning_rate": 2.9238049910898162e-05, + "loss": 21.9098, + "step": 1522 + }, + { + "epoch": 0.3670764039527597, + "grad_norm": 24.93014907836914, + "learning_rate": 2.922419831462168e-05, + "loss": 19.9985, + "step": 1523 + }, + { + "epoch": 0.3673174258857556, + "grad_norm": 22.687963485717773, + "learning_rate": 2.9210341096065967e-05, + "loss": 20.2677, + "step": 1524 + }, + { + "epoch": 0.3675584478187515, + "grad_norm": 13.419022560119629, + "learning_rate": 2.9196478263677207e-05, + "loss": 20.0278, + "step": 1525 + }, + { + "epoch": 0.36779946975174743, + "grad_norm": 11.315451622009277, + "learning_rate": 2.9182609825904973e-05, + "loss": 20.7959, + "step": 1526 + }, + { + "epoch": 0.3680404916847433, + "grad_norm": 3.6923937797546387, + "learning_rate": 2.9168735791202295e-05, + "loss": 19.9203, + "step": 1527 + }, + { + "epoch": 0.3682815136177392, + "grad_norm": 5.345596790313721, + "learning_rate": 2.915485616802557e-05, + "loss": 20.4006, + "step": 1528 + }, + { + "epoch": 0.3685225355507351, + "grad_norm": 11.333114624023438, + "learning_rate": 2.914097096483465e-05, + "loss": 20.0265, + "step": 1529 + }, + { + "epoch": 0.368763557483731, + "grad_norm": 18.932947158813477, + "learning_rate": 2.9127080190092746e-05, + "loss": 20.9073, + "step": 1530 + }, + { + "epoch": 0.36900457941672693, + "grad_norm": 19.80172348022461, + "learning_rate": 2.9113183852266487e-05, + "loss": 20.2884, + "step": 1531 + }, + { + "epoch": 0.36924560134972284, + "grad_norm": 29.795146942138672, + "learning_rate": 2.9099281959825883e-05, + "loss": 20.3906, + "step": 1532 + }, + { + "epoch": 0.36948662328271875, + "grad_norm": 32.547119140625, + "learning_rate": 2.9085374521244333e-05, + "loss": 21.4692, + "step": 1533 + }, + { + "epoch": 0.3697276452157146, + "grad_norm": 35.04736328125, + "learning_rate": 2.9071461544998628e-05, + "loss": 21.949, + "step": 1534 + }, + { + "epoch": 0.3699686671487105, + "grad_norm": 36.04890823364258, + "learning_rate": 2.9057543039568916e-05, + "loss": 20.8712, + "step": 1535 + }, + { + "epoch": 0.37020968908170643, + "grad_norm": 34.549644470214844, + "learning_rate": 2.904361901343872e-05, + "loss": 21.0949, + "step": 1536 + }, + { + "epoch": 0.37045071101470234, + "grad_norm": 33.054203033447266, + "learning_rate": 2.9029689475094936e-05, + "loss": 21.1647, + "step": 1537 + }, + { + "epoch": 0.37069173294769825, + "grad_norm": 23.556377410888672, + "learning_rate": 2.9015754433027812e-05, + "loss": 20.4017, + "step": 1538 + }, + { + "epoch": 0.37093275488069416, + "grad_norm": 25.556522369384766, + "learning_rate": 2.9001813895730948e-05, + "loss": 20.307, + "step": 1539 + }, + { + "epoch": 0.37117377681369007, + "grad_norm": 16.32256507873535, + "learning_rate": 2.8987867871701315e-05, + "loss": 20.1099, + "step": 1540 + }, + { + "epoch": 0.3714147987466859, + "grad_norm": 8.920893669128418, + "learning_rate": 2.8973916369439194e-05, + "loss": 19.7658, + "step": 1541 + }, + { + "epoch": 0.37165582067968184, + "grad_norm": 1.6702994108200073, + "learning_rate": 2.8959959397448233e-05, + "loss": 21.4536, + "step": 1542 + }, + { + "epoch": 0.37189684261267775, + "grad_norm": 7.296195030212402, + "learning_rate": 2.8945996964235395e-05, + "loss": 21.061, + "step": 1543 + }, + { + "epoch": 0.37213786454567366, + "grad_norm": 10.851767539978027, + "learning_rate": 2.893202907831099e-05, + "loss": 20.2956, + "step": 1544 + }, + { + "epoch": 0.37237888647866957, + "grad_norm": 16.69705581665039, + "learning_rate": 2.891805574818863e-05, + "loss": 20.0024, + "step": 1545 + }, + { + "epoch": 0.3726199084116655, + "grad_norm": 19.19455337524414, + "learning_rate": 2.890407698238528e-05, + "loss": 20.1927, + "step": 1546 + }, + { + "epoch": 0.3728609303446614, + "grad_norm": 25.68177032470703, + "learning_rate": 2.8890092789421166e-05, + "loss": 20.5242, + "step": 1547 + }, + { + "epoch": 0.37310195227765725, + "grad_norm": 22.813249588012695, + "learning_rate": 2.887610317781987e-05, + "loss": 20.0266, + "step": 1548 + }, + { + "epoch": 0.37334297421065316, + "grad_norm": 19.5789737701416, + "learning_rate": 2.886210815610825e-05, + "loss": 20.504, + "step": 1549 + }, + { + "epoch": 0.37358399614364907, + "grad_norm": 16.8349666595459, + "learning_rate": 2.8848107732816482e-05, + "loss": 19.8231, + "step": 1550 + }, + { + "epoch": 0.373825018076645, + "grad_norm": 17.591854095458984, + "learning_rate": 2.8834101916478006e-05, + "loss": 20.668, + "step": 1551 + }, + { + "epoch": 0.3740660400096409, + "grad_norm": 8.027555465698242, + "learning_rate": 2.8820090715629566e-05, + "loss": 20.4299, + "step": 1552 + }, + { + "epoch": 0.3743070619426368, + "grad_norm": 3.413877487182617, + "learning_rate": 2.88060741388112e-05, + "loss": 20.0975, + "step": 1553 + }, + { + "epoch": 0.3745480838756327, + "grad_norm": 11.719271659851074, + "learning_rate": 2.87920521945662e-05, + "loss": 20.1098, + "step": 1554 + }, + { + "epoch": 0.37478910580862856, + "grad_norm": 19.066082000732422, + "learning_rate": 2.8778024891441143e-05, + "loss": 21.5317, + "step": 1555 + }, + { + "epoch": 0.3750301277416245, + "grad_norm": 28.54660987854004, + "learning_rate": 2.8763992237985866e-05, + "loss": 21.8785, + "step": 1556 + }, + { + "epoch": 0.3752711496746204, + "grad_norm": 33.79404830932617, + "learning_rate": 2.874995424275347e-05, + "loss": 20.3145, + "step": 1557 + }, + { + "epoch": 0.3755121716076163, + "grad_norm": 41.043487548828125, + "learning_rate": 2.873591091430031e-05, + "loss": 21.2737, + "step": 1558 + }, + { + "epoch": 0.3757531935406122, + "grad_norm": 45.796348571777344, + "learning_rate": 2.8721862261185997e-05, + "loss": 21.0037, + "step": 1559 + }, + { + "epoch": 0.3759942154736081, + "grad_norm": 50.554874420166016, + "learning_rate": 2.870780829197338e-05, + "loss": 21.4934, + "step": 1560 + }, + { + "epoch": 0.37623523740660403, + "grad_norm": 49.05250549316406, + "learning_rate": 2.8693749015228555e-05, + "loss": 21.6969, + "step": 1561 + }, + { + "epoch": 0.3764762593395999, + "grad_norm": 49.80152893066406, + "learning_rate": 2.8679684439520848e-05, + "loss": 21.2407, + "step": 1562 + }, + { + "epoch": 0.3767172812725958, + "grad_norm": 42.048065185546875, + "learning_rate": 2.8665614573422817e-05, + "loss": 20.3583, + "step": 1563 + }, + { + "epoch": 0.3769583032055917, + "grad_norm": 40.555057525634766, + "learning_rate": 2.8651539425510245e-05, + "loss": 21.6775, + "step": 1564 + }, + { + "epoch": 0.3771993251385876, + "grad_norm": 36.79814910888672, + "learning_rate": 2.8637459004362128e-05, + "loss": 20.8425, + "step": 1565 + }, + { + "epoch": 0.3774403470715835, + "grad_norm": 33.05589294433594, + "learning_rate": 2.8623373318560693e-05, + "loss": 20.6437, + "step": 1566 + }, + { + "epoch": 0.37768136900457944, + "grad_norm": 24.564809799194336, + "learning_rate": 2.8609282376691352e-05, + "loss": 20.9141, + "step": 1567 + }, + { + "epoch": 0.3779223909375753, + "grad_norm": 18.446107864379883, + "learning_rate": 2.8595186187342738e-05, + "loss": 20.9103, + "step": 1568 + }, + { + "epoch": 0.3781634128705712, + "grad_norm": 5.8441667556762695, + "learning_rate": 2.8581084759106685e-05, + "loss": 21.9844, + "step": 1569 + }, + { + "epoch": 0.3784044348035671, + "grad_norm": 3.7728545665740967, + "learning_rate": 2.85669781005782e-05, + "loss": 21.0247, + "step": 1570 + }, + { + "epoch": 0.378645456736563, + "grad_norm": 10.467411041259766, + "learning_rate": 2.8552866220355505e-05, + "loss": 19.9644, + "step": 1571 + }, + { + "epoch": 0.37888647866955893, + "grad_norm": 12.59469985961914, + "learning_rate": 2.8538749127039972e-05, + "loss": 20.6729, + "step": 1572 + }, + { + "epoch": 0.37912750060255485, + "grad_norm": 24.554025650024414, + "learning_rate": 2.852462682923619e-05, + "loss": 20.7013, + "step": 1573 + }, + { + "epoch": 0.37936852253555076, + "grad_norm": 23.05268669128418, + "learning_rate": 2.8510499335551887e-05, + "loss": 19.9322, + "step": 1574 + }, + { + "epoch": 0.3796095444685466, + "grad_norm": 27.31005096435547, + "learning_rate": 2.8496366654597965e-05, + "loss": 21.3213, + "step": 1575 + }, + { + "epoch": 0.3798505664015425, + "grad_norm": 31.55510902404785, + "learning_rate": 2.8482228794988502e-05, + "loss": 21.1724, + "step": 1576 + }, + { + "epoch": 0.38009158833453843, + "grad_norm": 27.809093475341797, + "learning_rate": 2.846808576534072e-05, + "loss": 20.8552, + "step": 1577 + }, + { + "epoch": 0.38033261026753434, + "grad_norm": 25.061681747436523, + "learning_rate": 2.8453937574275003e-05, + "loss": 20.0312, + "step": 1578 + }, + { + "epoch": 0.38057363220053025, + "grad_norm": 21.21062660217285, + "learning_rate": 2.843978423041486e-05, + "loss": 21.0574, + "step": 1579 + }, + { + "epoch": 0.38081465413352616, + "grad_norm": 16.960458755493164, + "learning_rate": 2.842562574238696e-05, + "loss": 20.7615, + "step": 1580 + }, + { + "epoch": 0.3810556760665221, + "grad_norm": 11.160027503967285, + "learning_rate": 2.84114621188211e-05, + "loss": 18.9538, + "step": 1581 + }, + { + "epoch": 0.38129669799951793, + "grad_norm": 10.508767127990723, + "learning_rate": 2.8397293368350215e-05, + "loss": 21.0594, + "step": 1582 + }, + { + "epoch": 0.38153771993251384, + "grad_norm": 12.551162719726562, + "learning_rate": 2.8383119499610357e-05, + "loss": 20.8798, + "step": 1583 + }, + { + "epoch": 0.38177874186550975, + "grad_norm": 10.176121711730957, + "learning_rate": 2.8368940521240692e-05, + "loss": 20.648, + "step": 1584 + }, + { + "epoch": 0.38201976379850566, + "grad_norm": 16.44469451904297, + "learning_rate": 2.8354756441883514e-05, + "loss": 20.6556, + "step": 1585 + }, + { + "epoch": 0.3822607857315016, + "grad_norm": 24.674057006835938, + "learning_rate": 2.8340567270184216e-05, + "loss": 21.1197, + "step": 1586 + }, + { + "epoch": 0.3825018076644975, + "grad_norm": 27.294158935546875, + "learning_rate": 2.832637301479131e-05, + "loss": 20.4224, + "step": 1587 + }, + { + "epoch": 0.3827428295974934, + "grad_norm": 29.421127319335938, + "learning_rate": 2.8312173684356376e-05, + "loss": 20.9906, + "step": 1588 + }, + { + "epoch": 0.38298385153048925, + "grad_norm": 30.67142677307129, + "learning_rate": 2.8297969287534117e-05, + "loss": 20.6006, + "step": 1589 + }, + { + "epoch": 0.38322487346348516, + "grad_norm": 29.668041229248047, + "learning_rate": 2.8283759832982314e-05, + "loss": 20.4854, + "step": 1590 + }, + { + "epoch": 0.38346589539648107, + "grad_norm": 39.04167175292969, + "learning_rate": 2.826954532936183e-05, + "loss": 20.0361, + "step": 1591 + }, + { + "epoch": 0.383706917329477, + "grad_norm": 29.4206485748291, + "learning_rate": 2.825532578533661e-05, + "loss": 20.5119, + "step": 1592 + }, + { + "epoch": 0.3839479392624729, + "grad_norm": 24.173444747924805, + "learning_rate": 2.8241101209573663e-05, + "loss": 20.1116, + "step": 1593 + }, + { + "epoch": 0.3841889611954688, + "grad_norm": 22.79840850830078, + "learning_rate": 2.8226871610743064e-05, + "loss": 21.1884, + "step": 1594 + }, + { + "epoch": 0.3844299831284647, + "grad_norm": 20.80494499206543, + "learning_rate": 2.821263699751796e-05, + "loss": 21.0069, + "step": 1595 + }, + { + "epoch": 0.38467100506146057, + "grad_norm": 15.506486892700195, + "learning_rate": 2.8198397378574545e-05, + "loss": 20.7445, + "step": 1596 + }, + { + "epoch": 0.3849120269944565, + "grad_norm": 7.406441688537598, + "learning_rate": 2.8184152762592076e-05, + "loss": 20.7318, + "step": 1597 + }, + { + "epoch": 0.3851530489274524, + "grad_norm": 1.7860994338989258, + "learning_rate": 2.816990315825284e-05, + "loss": 20.7196, + "step": 1598 + }, + { + "epoch": 0.3853940708604483, + "grad_norm": 1.9799412488937378, + "learning_rate": 2.8155648574242175e-05, + "loss": 20.3337, + "step": 1599 + }, + { + "epoch": 0.3856350927934442, + "grad_norm": 4.286228656768799, + "learning_rate": 2.8141389019248452e-05, + "loss": 20.5726, + "step": 1600 + }, + { + "epoch": 0.3856350927934442, + "eval_cc_pretrain_accuracy": 0.9399999976158142, + "eval_cc_pretrain_loss": 2.145751476287842, + "eval_cc_pretrain_num_cand": 800.0, + "eval_cc_pretrain_runtime": 16.9397, + "eval_cc_pretrain_samples_per_second": 5.903, + "eval_cc_pretrain_steps_per_second": 0.059, + "eval_cc_pretrain_temperature": 0.06982421875, + "step": 1600 + }, + { + "epoch": 0.3856350927934442, + "eval_mscoco_pretrain_accuracy": 0.7999999523162842, + "eval_mscoco_pretrain_loss": 2.4378254413604736, + "eval_mscoco_pretrain_num_cand": 800.0, + "eval_mscoco_pretrain_runtime": 13.2896, + "eval_mscoco_pretrain_samples_per_second": 7.525, + "eval_mscoco_pretrain_steps_per_second": 0.075, + "eval_mscoco_pretrain_temperature": 0.06982421875, + "step": 1600 + }, + { + "epoch": 0.3858761147264401, + "grad_norm": 10.371548652648926, + "learning_rate": 2.812712450196307e-05, + "loss": 20.6856, + "step": 1601 + }, + { + "epoch": 0.38611713665943603, + "grad_norm": 8.215214729309082, + "learning_rate": 2.811285503108045e-05, + "loss": 20.2828, + "step": 1602 + }, + { + "epoch": 0.3863581585924319, + "grad_norm": 7.466929912567139, + "learning_rate": 2.809858061529805e-05, + "loss": 20.2624, + "step": 1603 + }, + { + "epoch": 0.3865991805254278, + "grad_norm": 2.6624181270599365, + "learning_rate": 2.808430126331631e-05, + "loss": 20.4773, + "step": 1604 + }, + { + "epoch": 0.3868402024584237, + "grad_norm": 7.871978759765625, + "learning_rate": 2.807001698383871e-05, + "loss": 20.3783, + "step": 1605 + }, + { + "epoch": 0.3870812243914196, + "grad_norm": 19.809232711791992, + "learning_rate": 2.8055727785571715e-05, + "loss": 20.2164, + "step": 1606 + }, + { + "epoch": 0.38732224632441553, + "grad_norm": 21.54745101928711, + "learning_rate": 2.8041433677224792e-05, + "loss": 19.6909, + "step": 1607 + }, + { + "epoch": 0.38756326825741144, + "grad_norm": 30.79233169555664, + "learning_rate": 2.802713466751041e-05, + "loss": 21.2479, + "step": 1608 + }, + { + "epoch": 0.38780429019040735, + "grad_norm": 30.543725967407227, + "learning_rate": 2.8012830765144007e-05, + "loss": 20.0771, + "step": 1609 + }, + { + "epoch": 0.3880453121234032, + "grad_norm": 35.7928466796875, + "learning_rate": 2.799852197884402e-05, + "loss": 20.5518, + "step": 1610 + }, + { + "epoch": 0.3882863340563991, + "grad_norm": 34.04623794555664, + "learning_rate": 2.7984208317331848e-05, + "loss": 20.4628, + "step": 1611 + }, + { + "epoch": 0.38852735598939503, + "grad_norm": 32.79801940917969, + "learning_rate": 2.796988978933188e-05, + "loss": 20.8763, + "step": 1612 + }, + { + "epoch": 0.38876837792239094, + "grad_norm": 32.29470443725586, + "learning_rate": 2.7955566403571464e-05, + "loss": 21.3619, + "step": 1613 + }, + { + "epoch": 0.38900939985538685, + "grad_norm": 36.041561126708984, + "learning_rate": 2.7941238168780893e-05, + "loss": 21.4396, + "step": 1614 + }, + { + "epoch": 0.38925042178838276, + "grad_norm": 33.54945373535156, + "learning_rate": 2.792690509369344e-05, + "loss": 20.5716, + "step": 1615 + }, + { + "epoch": 0.38949144372137867, + "grad_norm": 30.173303604125977, + "learning_rate": 2.7912567187045306e-05, + "loss": 19.8054, + "step": 1616 + }, + { + "epoch": 0.3897324656543745, + "grad_norm": 26.05309295654297, + "learning_rate": 2.789822445757566e-05, + "loss": 20.0772, + "step": 1617 + }, + { + "epoch": 0.38997348758737044, + "grad_norm": 20.687585830688477, + "learning_rate": 2.7883876914026592e-05, + "loss": 20.3166, + "step": 1618 + }, + { + "epoch": 0.39021450952036635, + "grad_norm": 16.822052001953125, + "learning_rate": 2.7869524565143127e-05, + "loss": 20.7471, + "step": 1619 + }, + { + "epoch": 0.39045553145336226, + "grad_norm": 7.696086883544922, + "learning_rate": 2.7855167419673238e-05, + "loss": 20.4046, + "step": 1620 + }, + { + "epoch": 0.39069655338635817, + "grad_norm": 1.4723519086837769, + "learning_rate": 2.7840805486367792e-05, + "loss": 19.0024, + "step": 1621 + }, + { + "epoch": 0.3909375753193541, + "grad_norm": 13.090544700622559, + "learning_rate": 2.7826438773980603e-05, + "loss": 20.7533, + "step": 1622 + }, + { + "epoch": 0.39117859725235, + "grad_norm": 19.558670043945312, + "learning_rate": 2.781206729126838e-05, + "loss": 19.9555, + "step": 1623 + }, + { + "epoch": 0.39141961918534585, + "grad_norm": 25.306337356567383, + "learning_rate": 2.7797691046990744e-05, + "loss": 20.4562, + "step": 1624 + }, + { + "epoch": 0.39166064111834176, + "grad_norm": 30.048534393310547, + "learning_rate": 2.7783310049910227e-05, + "loss": 20.381, + "step": 1625 + }, + { + "epoch": 0.39190166305133767, + "grad_norm": 32.04338073730469, + "learning_rate": 2.7768924308792234e-05, + "loss": 19.6596, + "step": 1626 + }, + { + "epoch": 0.3921426849843336, + "grad_norm": 32.30269241333008, + "learning_rate": 2.7754533832405095e-05, + "loss": 20.1991, + "step": 1627 + }, + { + "epoch": 0.3923837069173295, + "grad_norm": 32.30233383178711, + "learning_rate": 2.7740138629519995e-05, + "loss": 20.3917, + "step": 1628 + }, + { + "epoch": 0.3926247288503254, + "grad_norm": 32.80160140991211, + "learning_rate": 2.7725738708911027e-05, + "loss": 20.7419, + "step": 1629 + }, + { + "epoch": 0.39286575078332125, + "grad_norm": 31.575252532958984, + "learning_rate": 2.771133407935513e-05, + "loss": 22.1366, + "step": 1630 + }, + { + "epoch": 0.39310677271631717, + "grad_norm": 26.807491302490234, + "learning_rate": 2.769692474963215e-05, + "loss": 20.3544, + "step": 1631 + }, + { + "epoch": 0.3933477946493131, + "grad_norm": 21.580474853515625, + "learning_rate": 2.7682510728524746e-05, + "loss": 20.7171, + "step": 1632 + }, + { + "epoch": 0.393588816582309, + "grad_norm": 22.194595336914062, + "learning_rate": 2.76680920248185e-05, + "loss": 20.1616, + "step": 1633 + }, + { + "epoch": 0.3938298385153049, + "grad_norm": 13.477630615234375, + "learning_rate": 2.7653668647301797e-05, + "loss": 20.0433, + "step": 1634 + }, + { + "epoch": 0.3940708604483008, + "grad_norm": 16.96648597717285, + "learning_rate": 2.7639240604765896e-05, + "loss": 21.6048, + "step": 1635 + }, + { + "epoch": 0.3943118823812967, + "grad_norm": 1.5714850425720215, + "learning_rate": 2.7624807906004895e-05, + "loss": 20.1419, + "step": 1636 + }, + { + "epoch": 0.3945529043142926, + "grad_norm": 10.470202445983887, + "learning_rate": 2.7610370559815717e-05, + "loss": 20.598, + "step": 1637 + }, + { + "epoch": 0.3947939262472885, + "grad_norm": 24.292104721069336, + "learning_rate": 2.759592857499814e-05, + "loss": 20.1074, + "step": 1638 + }, + { + "epoch": 0.3950349481802844, + "grad_norm": 34.04033660888672, + "learning_rate": 2.7581481960354756e-05, + "loss": 20.9786, + "step": 1639 + }, + { + "epoch": 0.3952759701132803, + "grad_norm": 56.046295166015625, + "learning_rate": 2.7567030724690975e-05, + "loss": 21.1675, + "step": 1640 + }, + { + "epoch": 0.3955169920462762, + "grad_norm": 58.0461311340332, + "learning_rate": 2.7552574876815036e-05, + "loss": 22.619, + "step": 1641 + }, + { + "epoch": 0.3957580139792721, + "grad_norm": 59.55189895629883, + "learning_rate": 2.7538114425537973e-05, + "loss": 21.9609, + "step": 1642 + }, + { + "epoch": 0.39599903591226804, + "grad_norm": 69.5550308227539, + "learning_rate": 2.7523649379673644e-05, + "loss": 23.0059, + "step": 1643 + }, + { + "epoch": 0.3962400578452639, + "grad_norm": 70.0625228881836, + "learning_rate": 2.7509179748038696e-05, + "loss": 21.6444, + "step": 1644 + }, + { + "epoch": 0.3964810797782598, + "grad_norm": 72.06565856933594, + "learning_rate": 2.7494705539452583e-05, + "loss": 22.8722, + "step": 1645 + }, + { + "epoch": 0.3967221017112557, + "grad_norm": 73.5696029663086, + "learning_rate": 2.7480226762737522e-05, + "loss": 22.4627, + "step": 1646 + }, + { + "epoch": 0.3969631236442516, + "grad_norm": 72.067626953125, + "learning_rate": 2.746574342671854e-05, + "loss": 22.6529, + "step": 1647 + }, + { + "epoch": 0.39720414557724754, + "grad_norm": 66.5684585571289, + "learning_rate": 2.745125554022344e-05, + "loss": 22.3877, + "step": 1648 + }, + { + "epoch": 0.39744516751024345, + "grad_norm": 60.06290054321289, + "learning_rate": 2.7436763112082795e-05, + "loss": 22.7722, + "step": 1649 + }, + { + "epoch": 0.39768618944323936, + "grad_norm": 48.06271743774414, + "learning_rate": 2.7422266151129933e-05, + "loss": 20.5242, + "step": 1650 + }, + { + "epoch": 0.3979272113762352, + "grad_norm": 39.80897521972656, + "learning_rate": 2.740776466620096e-05, + "loss": 21.9048, + "step": 1651 + }, + { + "epoch": 0.3981682333092311, + "grad_norm": 37.07662582397461, + "learning_rate": 2.7393258666134746e-05, + "loss": 21.1105, + "step": 1652 + }, + { + "epoch": 0.39840925524222703, + "grad_norm": 29.69209098815918, + "learning_rate": 2.7378748159772888e-05, + "loss": 20.7863, + "step": 1653 + }, + { + "epoch": 0.39865027717522294, + "grad_norm": 20.58364486694336, + "learning_rate": 2.736423315595976e-05, + "loss": 20.2347, + "step": 1654 + }, + { + "epoch": 0.39889129910821886, + "grad_norm": 9.206841468811035, + "learning_rate": 2.7349713663542452e-05, + "loss": 19.9338, + "step": 1655 + }, + { + "epoch": 0.39913232104121477, + "grad_norm": 5.201184272766113, + "learning_rate": 2.73351896913708e-05, + "loss": 20.672, + "step": 1656 + }, + { + "epoch": 0.3993733429742107, + "grad_norm": 11.352221488952637, + "learning_rate": 2.7320661248297386e-05, + "loss": 20.2016, + "step": 1657 + }, + { + "epoch": 0.39961436490720653, + "grad_norm": 26.804494857788086, + "learning_rate": 2.7306128343177468e-05, + "loss": 21.2768, + "step": 1658 + }, + { + "epoch": 0.39985538684020244, + "grad_norm": 26.674224853515625, + "learning_rate": 2.729159098486909e-05, + "loss": 20.6548, + "step": 1659 + }, + { + "epoch": 0.40009640877319835, + "grad_norm": 33.548404693603516, + "learning_rate": 2.727704918223296e-05, + "loss": 21.4328, + "step": 1660 + }, + { + "epoch": 0.40033743070619426, + "grad_norm": 35.79685974121094, + "learning_rate": 2.7262502944132526e-05, + "loss": 21.0148, + "step": 1661 + }, + { + "epoch": 0.4005784526391902, + "grad_norm": 37.29298782348633, + "learning_rate": 2.7247952279433905e-05, + "loss": 20.3104, + "step": 1662 + }, + { + "epoch": 0.4008194745721861, + "grad_norm": 37.542205810546875, + "learning_rate": 2.7233397197005946e-05, + "loss": 19.6147, + "step": 1663 + }, + { + "epoch": 0.401060496505182, + "grad_norm": 33.54704284667969, + "learning_rate": 2.7218837705720176e-05, + "loss": 20.188, + "step": 1664 + }, + { + "epoch": 0.40130151843817785, + "grad_norm": 38.045169830322266, + "learning_rate": 2.720427381445081e-05, + "loss": 20.8864, + "step": 1665 + }, + { + "epoch": 0.40154254037117376, + "grad_norm": 34.306175231933594, + "learning_rate": 2.7189705532074752e-05, + "loss": 21.378, + "step": 1666 + }, + { + "epoch": 0.4017835623041697, + "grad_norm": 34.30035400390625, + "learning_rate": 2.7175132867471562e-05, + "loss": 20.5663, + "step": 1667 + }, + { + "epoch": 0.4020245842371656, + "grad_norm": 33.29867935180664, + "learning_rate": 2.7160555829523495e-05, + "loss": 19.7448, + "step": 1668 + }, + { + "epoch": 0.4022656061701615, + "grad_norm": 35.050437927246094, + "learning_rate": 2.7145974427115462e-05, + "loss": 20.9944, + "step": 1669 + }, + { + "epoch": 0.4025066281031574, + "grad_norm": 29.18819808959961, + "learning_rate": 2.713138866913504e-05, + "loss": 21.1771, + "step": 1670 + }, + { + "epoch": 0.4027476500361533, + "grad_norm": 22.689937591552734, + "learning_rate": 2.7116798564472428e-05, + "loss": 19.984, + "step": 1671 + }, + { + "epoch": 0.40298867196914917, + "grad_norm": 10.733839988708496, + "learning_rate": 2.7102204122020528e-05, + "loss": 19.495, + "step": 1672 + }, + { + "epoch": 0.4032296939021451, + "grad_norm": 15.214982986450195, + "learning_rate": 2.7087605350674847e-05, + "loss": 20.8022, + "step": 1673 + }, + { + "epoch": 0.403470715835141, + "grad_norm": 3.092938184738159, + "learning_rate": 2.7073002259333545e-05, + "loss": 19.6658, + "step": 1674 + }, + { + "epoch": 0.4037117377681369, + "grad_norm": 7.201705455780029, + "learning_rate": 2.705839485689742e-05, + "loss": 20.4894, + "step": 1675 + }, + { + "epoch": 0.4039527597011328, + "grad_norm": 14.139400482177734, + "learning_rate": 2.7043783152269877e-05, + "loss": 20.696, + "step": 1676 + }, + { + "epoch": 0.4041937816341287, + "grad_norm": 27.791608810424805, + "learning_rate": 2.7029167154356964e-05, + "loss": 21.3353, + "step": 1677 + }, + { + "epoch": 0.40443480356712463, + "grad_norm": 39.53437423706055, + "learning_rate": 2.7014546872067334e-05, + "loss": 20.2114, + "step": 1678 + }, + { + "epoch": 0.4046758255001205, + "grad_norm": 42.539207458496094, + "learning_rate": 2.699992231431226e-05, + "loss": 20.6339, + "step": 1679 + }, + { + "epoch": 0.4049168474331164, + "grad_norm": 43.037559509277344, + "learning_rate": 2.6985293490005614e-05, + "loss": 21.295, + "step": 1680 + }, + { + "epoch": 0.4051578693661123, + "grad_norm": 42.53929138183594, + "learning_rate": 2.6970660408063875e-05, + "loss": 21.5011, + "step": 1681 + }, + { + "epoch": 0.4053988912991082, + "grad_norm": 50.54296875, + "learning_rate": 2.695602307740611e-05, + "loss": 21.2699, + "step": 1682 + }, + { + "epoch": 0.40563991323210413, + "grad_norm": 49.791629791259766, + "learning_rate": 2.6941381506953973e-05, + "loss": 21.2749, + "step": 1683 + }, + { + "epoch": 0.40588093516510004, + "grad_norm": 54.29286575317383, + "learning_rate": 2.692673570563172e-05, + "loss": 21.0128, + "step": 1684 + }, + { + "epoch": 0.40612195709809595, + "grad_norm": 58.29850769042969, + "learning_rate": 2.6912085682366166e-05, + "loss": 20.7314, + "step": 1685 + }, + { + "epoch": 0.4063629790310918, + "grad_norm": 48.04816436767578, + "learning_rate": 2.6897431446086716e-05, + "loss": 21.4864, + "step": 1686 + }, + { + "epoch": 0.4066040009640877, + "grad_norm": 41.542198181152344, + "learning_rate": 2.6882773005725318e-05, + "loss": 20.359, + "step": 1687 + }, + { + "epoch": 0.40684502289708363, + "grad_norm": 33.79315185546875, + "learning_rate": 2.6868110370216514e-05, + "loss": 21.0905, + "step": 1688 + }, + { + "epoch": 0.40708604483007954, + "grad_norm": 26.176483154296875, + "learning_rate": 2.685344354849738e-05, + "loss": 20.8836, + "step": 1689 + }, + { + "epoch": 0.40732706676307545, + "grad_norm": 21.06539535522461, + "learning_rate": 2.6838772549507547e-05, + "loss": 20.011, + "step": 1690 + }, + { + "epoch": 0.40756808869607136, + "grad_norm": 14.575056076049805, + "learning_rate": 2.6824097382189216e-05, + "loss": 20.8976, + "step": 1691 + }, + { + "epoch": 0.4078091106290672, + "grad_norm": 4.005792617797852, + "learning_rate": 2.6809418055487082e-05, + "loss": 21.0553, + "step": 1692 + }, + { + "epoch": 0.40805013256206313, + "grad_norm": 4.743707180023193, + "learning_rate": 2.679473457834842e-05, + "loss": 19.7884, + "step": 1693 + }, + { + "epoch": 0.40829115449505904, + "grad_norm": 7.877179145812988, + "learning_rate": 2.678004695972301e-05, + "loss": 19.7951, + "step": 1694 + }, + { + "epoch": 0.40853217642805495, + "grad_norm": 11.901662826538086, + "learning_rate": 2.6765355208563164e-05, + "loss": 20.031, + "step": 1695 + }, + { + "epoch": 0.40877319836105086, + "grad_norm": 19.444589614868164, + "learning_rate": 2.6750659333823717e-05, + "loss": 20.6164, + "step": 1696 + }, + { + "epoch": 0.40901422029404677, + "grad_norm": 27.9329776763916, + "learning_rate": 2.6735959344462005e-05, + "loss": 21.2625, + "step": 1697 + }, + { + "epoch": 0.4092552422270427, + "grad_norm": 30.922395706176758, + "learning_rate": 2.6721255249437886e-05, + "loss": 20.4768, + "step": 1698 + }, + { + "epoch": 0.40949626416003854, + "grad_norm": 26.179119110107422, + "learning_rate": 2.6706547057713702e-05, + "loss": 20.0225, + "step": 1699 + }, + { + "epoch": 0.40973728609303445, + "grad_norm": 29.800405502319336, + "learning_rate": 2.6691834778254323e-05, + "loss": 19.7246, + "step": 1700 + }, + { + "epoch": 0.40997830802603036, + "grad_norm": 28.055761337280273, + "learning_rate": 2.667711842002707e-05, + "loss": 20.2155, + "step": 1701 + }, + { + "epoch": 0.41021932995902627, + "grad_norm": 19.82750701904297, + "learning_rate": 2.666239799200178e-05, + "loss": 20.4209, + "step": 1702 + }, + { + "epoch": 0.4104603518920222, + "grad_norm": 20.190141677856445, + "learning_rate": 2.6647673503150776e-05, + "loss": 19.9061, + "step": 1703 + }, + { + "epoch": 0.4107013738250181, + "grad_norm": 22.569286346435547, + "learning_rate": 2.6632944962448824e-05, + "loss": 20.4864, + "step": 1704 + }, + { + "epoch": 0.410942395758014, + "grad_norm": 20.06157112121582, + "learning_rate": 2.6618212378873177e-05, + "loss": 19.7565, + "step": 1705 + }, + { + "epoch": 0.41118341769100986, + "grad_norm": 8.836545944213867, + "learning_rate": 2.660347576140356e-05, + "loss": 20.1275, + "step": 1706 + }, + { + "epoch": 0.41142443962400577, + "grad_norm": 5.988829135894775, + "learning_rate": 2.6588735119022157e-05, + "loss": 20.2693, + "step": 1707 + }, + { + "epoch": 0.4116654615570017, + "grad_norm": 13.894827842712402, + "learning_rate": 2.657399046071358e-05, + "loss": 20.0423, + "step": 1708 + }, + { + "epoch": 0.4119064834899976, + "grad_norm": 12.343192100524902, + "learning_rate": 2.6559241795464918e-05, + "loss": 20.46, + "step": 1709 + }, + { + "epoch": 0.4121475054229935, + "grad_norm": 16.44587516784668, + "learning_rate": 2.654448913226569e-05, + "loss": 20.117, + "step": 1710 + }, + { + "epoch": 0.4123885273559894, + "grad_norm": 21.176149368286133, + "learning_rate": 2.6529732480107848e-05, + "loss": 19.6685, + "step": 1711 + }, + { + "epoch": 0.4126295492889853, + "grad_norm": 31.792823791503906, + "learning_rate": 2.651497184798579e-05, + "loss": 21.0867, + "step": 1712 + }, + { + "epoch": 0.4128705712219812, + "grad_norm": 31.040081024169922, + "learning_rate": 2.650020724489632e-05, + "loss": 20.4385, + "step": 1713 + }, + { + "epoch": 0.4131115931549771, + "grad_norm": 28.666217803955078, + "learning_rate": 2.648543867983868e-05, + "loss": 20.3419, + "step": 1714 + }, + { + "epoch": 0.413352615087973, + "grad_norm": 30.539920806884766, + "learning_rate": 2.6470666161814505e-05, + "loss": 19.3804, + "step": 1715 + }, + { + "epoch": 0.4135936370209689, + "grad_norm": 28.666786193847656, + "learning_rate": 2.6455889699827872e-05, + "loss": 20.0901, + "step": 1716 + }, + { + "epoch": 0.4138346589539648, + "grad_norm": 32.043880462646484, + "learning_rate": 2.6441109302885227e-05, + "loss": 21.0666, + "step": 1717 + }, + { + "epoch": 0.41407568088696073, + "grad_norm": 28.543638229370117, + "learning_rate": 2.6426324979995443e-05, + "loss": 20.2895, + "step": 1718 + }, + { + "epoch": 0.41431670281995664, + "grad_norm": 22.04825210571289, + "learning_rate": 2.641153674016977e-05, + "loss": 19.9732, + "step": 1719 + }, + { + "epoch": 0.4145577247529525, + "grad_norm": 20.8031063079834, + "learning_rate": 2.6396744592421844e-05, + "loss": 19.8613, + "step": 1720 + }, + { + "epoch": 0.4147987466859484, + "grad_norm": 15.755630493164062, + "learning_rate": 2.63819485457677e-05, + "loss": 20.1486, + "step": 1721 + }, + { + "epoch": 0.4150397686189443, + "grad_norm": 7.440023899078369, + "learning_rate": 2.636714860922572e-05, + "loss": 20.0265, + "step": 1722 + }, + { + "epoch": 0.4152807905519402, + "grad_norm": 1.4032973051071167, + "learning_rate": 2.6352344791816698e-05, + "loss": 20.5949, + "step": 1723 + }, + { + "epoch": 0.41552181248493614, + "grad_norm": 1.9755672216415405, + "learning_rate": 2.6337537102563744e-05, + "loss": 20.6017, + "step": 1724 + }, + { + "epoch": 0.41576283441793205, + "grad_norm": 8.812068939208984, + "learning_rate": 2.632272555049237e-05, + "loss": 20.9012, + "step": 1725 + }, + { + "epoch": 0.41600385635092796, + "grad_norm": 9.98613452911377, + "learning_rate": 2.630791014463042e-05, + "loss": 20.0798, + "step": 1726 + }, + { + "epoch": 0.4162448782839238, + "grad_norm": 8.63389778137207, + "learning_rate": 2.6293090894008097e-05, + "loss": 20.3813, + "step": 1727 + }, + { + "epoch": 0.4164859002169197, + "grad_norm": 10.738114356994629, + "learning_rate": 2.6278267807657952e-05, + "loss": 20.3291, + "step": 1728 + }, + { + "epoch": 0.41672692214991564, + "grad_norm": 6.338959693908691, + "learning_rate": 2.6263440894614843e-05, + "loss": 18.9627, + "step": 1729 + }, + { + "epoch": 0.41696794408291155, + "grad_norm": 1.9649852514266968, + "learning_rate": 2.6248610163916e-05, + "loss": 19.6087, + "step": 1730 + }, + { + "epoch": 0.41720896601590746, + "grad_norm": 4.858740329742432, + "learning_rate": 2.6233775624600958e-05, + "loss": 19.7708, + "step": 1731 + }, + { + "epoch": 0.41744998794890337, + "grad_norm": 8.800115585327148, + "learning_rate": 2.6218937285711577e-05, + "loss": 19.5257, + "step": 1732 + }, + { + "epoch": 0.4176910098818993, + "grad_norm": 16.073280334472656, + "learning_rate": 2.6204095156292048e-05, + "loss": 20.6934, + "step": 1733 + }, + { + "epoch": 0.41793203181489513, + "grad_norm": 16.443336486816406, + "learning_rate": 2.6189249245388837e-05, + "loss": 20.3518, + "step": 1734 + }, + { + "epoch": 0.41817305374789104, + "grad_norm": 22.433931350708008, + "learning_rate": 2.6174399562050754e-05, + "loss": 20.6261, + "step": 1735 + }, + { + "epoch": 0.41841407568088695, + "grad_norm": 29.296772003173828, + "learning_rate": 2.615954611532887e-05, + "loss": 20.7488, + "step": 1736 + }, + { + "epoch": 0.41865509761388287, + "grad_norm": 25.555997848510742, + "learning_rate": 2.61446889142766e-05, + "loss": 21.1154, + "step": 1737 + }, + { + "epoch": 0.4188961195468788, + "grad_norm": 31.42162322998047, + "learning_rate": 2.6129827967949593e-05, + "loss": 21.2318, + "step": 1738 + }, + { + "epoch": 0.4191371414798747, + "grad_norm": 31.179109573364258, + "learning_rate": 2.6114963285405816e-05, + "loss": 21.194, + "step": 1739 + }, + { + "epoch": 0.4193781634128706, + "grad_norm": 22.432605743408203, + "learning_rate": 2.6100094875705502e-05, + "loss": 21.2679, + "step": 1740 + }, + { + "epoch": 0.41961918534586645, + "grad_norm": 28.552104949951172, + "learning_rate": 2.6085222747911155e-05, + "loss": 21.6513, + "step": 1741 + }, + { + "epoch": 0.41986020727886236, + "grad_norm": 28.930131912231445, + "learning_rate": 2.6070346911087553e-05, + "loss": 19.9146, + "step": 1742 + }, + { + "epoch": 0.4201012292118583, + "grad_norm": 24.181055068969727, + "learning_rate": 2.6055467374301726e-05, + "loss": 20.0577, + "step": 1743 + }, + { + "epoch": 0.4203422511448542, + "grad_norm": 17.572296142578125, + "learning_rate": 2.604058414662296e-05, + "loss": 21.4508, + "step": 1744 + }, + { + "epoch": 0.4205832730778501, + "grad_norm": 8.802281379699707, + "learning_rate": 2.602569723712279e-05, + "loss": 20.0183, + "step": 1745 + }, + { + "epoch": 0.420824295010846, + "grad_norm": 2.956613302230835, + "learning_rate": 2.6010806654875e-05, + "loss": 20.28, + "step": 1746 + }, + { + "epoch": 0.4210653169438419, + "grad_norm": 1.4986950159072876, + "learning_rate": 2.5995912408955614e-05, + "loss": 20.397, + "step": 1747 + }, + { + "epoch": 0.42130633887683777, + "grad_norm": 7.147834300994873, + "learning_rate": 2.5981014508442883e-05, + "loss": 20.4899, + "step": 1748 + }, + { + "epoch": 0.4215473608098337, + "grad_norm": 12.029915809631348, + "learning_rate": 2.5966112962417285e-05, + "loss": 20.7954, + "step": 1749 + }, + { + "epoch": 0.4217883827428296, + "grad_norm": 14.949239730834961, + "learning_rate": 2.5951207779961527e-05, + "loss": 20.2219, + "step": 1750 + }, + { + "epoch": 0.4220294046758255, + "grad_norm": 13.588357925415039, + "learning_rate": 2.593629897016053e-05, + "loss": 20.9449, + "step": 1751 + }, + { + "epoch": 0.4222704266088214, + "grad_norm": 17.446346282958984, + "learning_rate": 2.5921386542101412e-05, + "loss": 20.7837, + "step": 1752 + }, + { + "epoch": 0.4225114485418173, + "grad_norm": 13.777868270874023, + "learning_rate": 2.590647050487353e-05, + "loss": 20.5597, + "step": 1753 + }, + { + "epoch": 0.42275247047481324, + "grad_norm": 7.023062229156494, + "learning_rate": 2.5891550867568395e-05, + "loss": 19.9075, + "step": 1754 + }, + { + "epoch": 0.4229934924078091, + "grad_norm": 10.226380348205566, + "learning_rate": 2.5876627639279756e-05, + "loss": 19.8865, + "step": 1755 + }, + { + "epoch": 0.423234514340805, + "grad_norm": 5.835995197296143, + "learning_rate": 2.5861700829103523e-05, + "loss": 20.9417, + "step": 1756 + }, + { + "epoch": 0.4234755362738009, + "grad_norm": 2.6280148029327393, + "learning_rate": 2.5846770446137795e-05, + "loss": 20.754, + "step": 1757 + }, + { + "epoch": 0.4237165582067968, + "grad_norm": 2.132152795791626, + "learning_rate": 2.5831836499482865e-05, + "loss": 21.2556, + "step": 1758 + }, + { + "epoch": 0.42395758013979273, + "grad_norm": 13.196029663085938, + "learning_rate": 2.5816898998241167e-05, + "loss": 19.4128, + "step": 1759 + }, + { + "epoch": 0.42419860207278864, + "grad_norm": 18.432388305664062, + "learning_rate": 2.5801957951517335e-05, + "loss": 21.1415, + "step": 1760 + }, + { + "epoch": 0.4244396240057845, + "grad_norm": 19.056171417236328, + "learning_rate": 2.578701336841813e-05, + "loss": 20.3784, + "step": 1761 + }, + { + "epoch": 0.4246806459387804, + "grad_norm": 34.79256057739258, + "learning_rate": 2.5772065258052505e-05, + "loss": 20.6137, + "step": 1762 + }, + { + "epoch": 0.4249216678717763, + "grad_norm": 37.546077728271484, + "learning_rate": 2.5757113629531532e-05, + "loss": 20.2401, + "step": 1763 + }, + { + "epoch": 0.42516268980477223, + "grad_norm": 32.2938117980957, + "learning_rate": 2.5742158491968443e-05, + "loss": 20.4585, + "step": 1764 + }, + { + "epoch": 0.42540371173776814, + "grad_norm": 31.043460845947266, + "learning_rate": 2.5727199854478615e-05, + "loss": 21.2695, + "step": 1765 + }, + { + "epoch": 0.42564473367076405, + "grad_norm": 35.79366683959961, + "learning_rate": 2.5712237726179532e-05, + "loss": 20.7637, + "step": 1766 + }, + { + "epoch": 0.42588575560375996, + "grad_norm": 32.04559326171875, + "learning_rate": 2.5697272116190835e-05, + "loss": 20.6656, + "step": 1767 + }, + { + "epoch": 0.4261267775367558, + "grad_norm": 30.673603057861328, + "learning_rate": 2.5682303033634274e-05, + "loss": 20.6512, + "step": 1768 + }, + { + "epoch": 0.42636779946975173, + "grad_norm": 22.43357276916504, + "learning_rate": 2.566733048763371e-05, + "loss": 20.5709, + "step": 1769 + }, + { + "epoch": 0.42660882140274764, + "grad_norm": 14.382721900939941, + "learning_rate": 2.5652354487315127e-05, + "loss": 20.3108, + "step": 1770 + }, + { + "epoch": 0.42684984333574355, + "grad_norm": 15.884061813354492, + "learning_rate": 2.5637375041806607e-05, + "loss": 20.5579, + "step": 1771 + }, + { + "epoch": 0.42709086526873946, + "grad_norm": 5.124815940856934, + "learning_rate": 2.5622392160238327e-05, + "loss": 20.7163, + "step": 1772 + }, + { + "epoch": 0.42733188720173537, + "grad_norm": 3.449737071990967, + "learning_rate": 2.5607405851742578e-05, + "loss": 20.3019, + "step": 1773 + }, + { + "epoch": 0.4275729091347313, + "grad_norm": 5.936408996582031, + "learning_rate": 2.5592416125453724e-05, + "loss": 19.5121, + "step": 1774 + }, + { + "epoch": 0.42781393106772714, + "grad_norm": 17.937850952148438, + "learning_rate": 2.557742299050821e-05, + "loss": 20.3415, + "step": 1775 + }, + { + "epoch": 0.42805495300072305, + "grad_norm": 21.058258056640625, + "learning_rate": 2.556242645604456e-05, + "loss": 20.4219, + "step": 1776 + }, + { + "epoch": 0.42829597493371896, + "grad_norm": 22.5670223236084, + "learning_rate": 2.5547426531203377e-05, + "loss": 21.1796, + "step": 1777 + }, + { + "epoch": 0.42853699686671487, + "grad_norm": 22.30674934387207, + "learning_rate": 2.5532423225127333e-05, + "loss": 20.1581, + "step": 1778 + }, + { + "epoch": 0.4287780187997108, + "grad_norm": 23.81880760192871, + "learning_rate": 2.5517416546961156e-05, + "loss": 21.1925, + "step": 1779 + }, + { + "epoch": 0.4290190407327067, + "grad_norm": 25.559518814086914, + "learning_rate": 2.5502406505851615e-05, + "loss": 20.7599, + "step": 1780 + }, + { + "epoch": 0.4292600626657026, + "grad_norm": 17.694242477416992, + "learning_rate": 2.5487393110947557e-05, + "loss": 20.3206, + "step": 1781 + }, + { + "epoch": 0.42950108459869846, + "grad_norm": 20.06827163696289, + "learning_rate": 2.5472376371399845e-05, + "loss": 20.2616, + "step": 1782 + }, + { + "epoch": 0.42974210653169437, + "grad_norm": 13.033296585083008, + "learning_rate": 2.5457356296361413e-05, + "loss": 19.853, + "step": 1783 + }, + { + "epoch": 0.4299831284646903, + "grad_norm": 10.719050407409668, + "learning_rate": 2.544233289498719e-05, + "loss": 19.1051, + "step": 1784 + }, + { + "epoch": 0.4302241503976862, + "grad_norm": 5.036662578582764, + "learning_rate": 2.5427306176434154e-05, + "loss": 19.5138, + "step": 1785 + }, + { + "epoch": 0.4304651723306821, + "grad_norm": 4.549376487731934, + "learning_rate": 2.5412276149861313e-05, + "loss": 21.1549, + "step": 1786 + }, + { + "epoch": 0.430706194263678, + "grad_norm": 7.982868671417236, + "learning_rate": 2.5397242824429666e-05, + "loss": 19.4312, + "step": 1787 + }, + { + "epoch": 0.4309472161966739, + "grad_norm": 15.24974536895752, + "learning_rate": 2.5382206209302257e-05, + "loss": 20.0069, + "step": 1788 + }, + { + "epoch": 0.4311882381296698, + "grad_norm": 23.546255111694336, + "learning_rate": 2.536716631364409e-05, + "loss": 20.0157, + "step": 1789 + }, + { + "epoch": 0.4314292600626657, + "grad_norm": 22.301273345947266, + "learning_rate": 2.5352123146622223e-05, + "loss": 21.0208, + "step": 1790 + }, + { + "epoch": 0.4316702819956616, + "grad_norm": 22.41976547241211, + "learning_rate": 2.5337076717405657e-05, + "loss": 19.8022, + "step": 1791 + }, + { + "epoch": 0.4319113039286575, + "grad_norm": 28.67011833190918, + "learning_rate": 2.532202703516541e-05, + "loss": 19.7505, + "step": 1792 + }, + { + "epoch": 0.4321523258616534, + "grad_norm": 31.41421127319336, + "learning_rate": 2.5306974109074475e-05, + "loss": 20.4195, + "step": 1793 + }, + { + "epoch": 0.43239334779464933, + "grad_norm": 25.920007705688477, + "learning_rate": 2.5291917948307822e-05, + "loss": 20.3314, + "step": 1794 + }, + { + "epoch": 0.43263436972764524, + "grad_norm": 23.551593780517578, + "learning_rate": 2.52768585620424e-05, + "loss": 19.959, + "step": 1795 + }, + { + "epoch": 0.4328753916606411, + "grad_norm": 25.17153549194336, + "learning_rate": 2.5261795959457115e-05, + "loss": 19.5841, + "step": 1796 + }, + { + "epoch": 0.433116413593637, + "grad_norm": 20.184518814086914, + "learning_rate": 2.5246730149732835e-05, + "loss": 20.9066, + "step": 1797 + }, + { + "epoch": 0.4333574355266329, + "grad_norm": 18.305349349975586, + "learning_rate": 2.5231661142052376e-05, + "loss": 20.0068, + "step": 1798 + }, + { + "epoch": 0.4335984574596288, + "grad_norm": 12.890597343444824, + "learning_rate": 2.5216588945600527e-05, + "loss": 20.0781, + "step": 1799 + }, + { + "epoch": 0.43383947939262474, + "grad_norm": 5.2488603591918945, + "learning_rate": 2.5201513569563996e-05, + "loss": 20.4247, + "step": 1800 + }, + { + "epoch": 0.43383947939262474, + "eval_cc_pretrain_accuracy": 0.949999988079071, + "eval_cc_pretrain_loss": 2.112483501434326, + "eval_cc_pretrain_num_cand": 800.0, + "eval_cc_pretrain_runtime": 17.4532, + "eval_cc_pretrain_samples_per_second": 5.73, + "eval_cc_pretrain_steps_per_second": 0.057, + "eval_cc_pretrain_temperature": 0.06982421875, + "step": 1800 + }, + { + "epoch": 0.43383947939262474, + "eval_mscoco_pretrain_accuracy": 0.8299999833106995, + "eval_mscoco_pretrain_loss": 2.4305453300476074, + "eval_mscoco_pretrain_num_cand": 800.0, + "eval_mscoco_pretrain_runtime": 14.1586, + "eval_mscoco_pretrain_samples_per_second": 7.063, + "eval_mscoco_pretrain_steps_per_second": 0.071, + "eval_mscoco_pretrain_temperature": 0.06982421875, + "step": 1800 + }, + { + "epoch": 0.43408050132562065, + "grad_norm": 2.5802388191223145, + "learning_rate": 2.5186435023131432e-05, + "loss": 19.8349, + "step": 1801 + }, + { + "epoch": 0.43432152325861656, + "grad_norm": 4.741922855377197, + "learning_rate": 2.517135331549344e-05, + "loss": 20.0177, + "step": 1802 + }, + { + "epoch": 0.4345625451916124, + "grad_norm": 19.340620040893555, + "learning_rate": 2.515626845584251e-05, + "loss": 21.4192, + "step": 1803 + }, + { + "epoch": 0.4348035671246083, + "grad_norm": 20.685556411743164, + "learning_rate": 2.51411804533731e-05, + "loss": 20.9681, + "step": 1804 + }, + { + "epoch": 0.43504458905760424, + "grad_norm": 20.679786682128906, + "learning_rate": 2.512608931728154e-05, + "loss": 20.2874, + "step": 1805 + }, + { + "epoch": 0.43528561099060015, + "grad_norm": 17.194252014160156, + "learning_rate": 2.5110995056766112e-05, + "loss": 20.7428, + "step": 1806 + }, + { + "epoch": 0.43552663292359606, + "grad_norm": 14.948983192443848, + "learning_rate": 2.5095897681026965e-05, + "loss": 19.7102, + "step": 1807 + }, + { + "epoch": 0.43576765485659197, + "grad_norm": 21.308107376098633, + "learning_rate": 2.508079719926617e-05, + "loss": 21.053, + "step": 1808 + }, + { + "epoch": 0.4360086767895879, + "grad_norm": 13.133545875549316, + "learning_rate": 2.5065693620687675e-05, + "loss": 19.058, + "step": 1809 + }, + { + "epoch": 0.43624969872258373, + "grad_norm": 13.970298767089844, + "learning_rate": 2.5050586954497337e-05, + "loss": 19.4619, + "step": 1810 + }, + { + "epoch": 0.43649072065557964, + "grad_norm": 11.600422859191895, + "learning_rate": 2.5035477209902874e-05, + "loss": 20.8335, + "step": 1811 + }, + { + "epoch": 0.43673174258857556, + "grad_norm": 9.227972984313965, + "learning_rate": 2.5020364396113897e-05, + "loss": 19.472, + "step": 1812 + }, + { + "epoch": 0.43697276452157147, + "grad_norm": 6.886537551879883, + "learning_rate": 2.5005248522341868e-05, + "loss": 20.0704, + "step": 1813 + }, + { + "epoch": 0.4372137864545674, + "grad_norm": 3.2710273265838623, + "learning_rate": 2.4990129597800136e-05, + "loss": 20.4851, + "step": 1814 + }, + { + "epoch": 0.4374548083875633, + "grad_norm": 3.3578920364379883, + "learning_rate": 2.4975007631703894e-05, + "loss": 20.1659, + "step": 1815 + }, + { + "epoch": 0.4376958303205592, + "grad_norm": 17.804019927978516, + "learning_rate": 2.49598826332702e-05, + "loss": 19.7922, + "step": 1816 + }, + { + "epoch": 0.43793685225355505, + "grad_norm": 14.748939514160156, + "learning_rate": 2.4944754611717948e-05, + "loss": 19.5863, + "step": 1817 + }, + { + "epoch": 0.43817787418655096, + "grad_norm": 24.171924591064453, + "learning_rate": 2.4929623576267884e-05, + "loss": 20.2281, + "step": 1818 + }, + { + "epoch": 0.4384188961195469, + "grad_norm": 32.292842864990234, + "learning_rate": 2.4914489536142575e-05, + "loss": 21.8144, + "step": 1819 + }, + { + "epoch": 0.4386599180525428, + "grad_norm": 28.916790008544922, + "learning_rate": 2.489935250056646e-05, + "loss": 20.8626, + "step": 1820 + }, + { + "epoch": 0.4389009399855387, + "grad_norm": 33.79328918457031, + "learning_rate": 2.4884212478765747e-05, + "loss": 20.2425, + "step": 1821 + }, + { + "epoch": 0.4391419619185346, + "grad_norm": 27.303606033325195, + "learning_rate": 2.4869069479968513e-05, + "loss": 20.3001, + "step": 1822 + }, + { + "epoch": 0.43938298385153046, + "grad_norm": 31.048748016357422, + "learning_rate": 2.4853923513404622e-05, + "loss": 20.2089, + "step": 1823 + }, + { + "epoch": 0.4396240057845264, + "grad_norm": 29.42336082458496, + "learning_rate": 2.4838774588305753e-05, + "loss": 21.3105, + "step": 1824 + }, + { + "epoch": 0.4398650277175223, + "grad_norm": 32.046390533447266, + "learning_rate": 2.4823622713905396e-05, + "loss": 20.5442, + "step": 1825 + }, + { + "epoch": 0.4401060496505182, + "grad_norm": 21.058713912963867, + "learning_rate": 2.480846789943883e-05, + "loss": 20.5031, + "step": 1826 + }, + { + "epoch": 0.4403470715835141, + "grad_norm": 18.071760177612305, + "learning_rate": 2.479331015414313e-05, + "loss": 19.239, + "step": 1827 + }, + { + "epoch": 0.44058809351651, + "grad_norm": 17.816442489624023, + "learning_rate": 2.4778149487257146e-05, + "loss": 18.9923, + "step": 1828 + }, + { + "epoch": 0.4408291154495059, + "grad_norm": 3.8892791271209717, + "learning_rate": 2.476298590802153e-05, + "loss": 19.941, + "step": 1829 + }, + { + "epoch": 0.4410701373825018, + "grad_norm": 5.990808010101318, + "learning_rate": 2.4747819425678694e-05, + "loss": 19.5703, + "step": 1830 + }, + { + "epoch": 0.4413111593154977, + "grad_norm": 6.240020275115967, + "learning_rate": 2.4732650049472824e-05, + "loss": 19.4861, + "step": 1831 + }, + { + "epoch": 0.4415521812484936, + "grad_norm": 18.82268714904785, + "learning_rate": 2.4717477788649872e-05, + "loss": 20.9192, + "step": 1832 + }, + { + "epoch": 0.4417932031814895, + "grad_norm": 18.806936264038086, + "learning_rate": 2.4702302652457534e-05, + "loss": 19.6596, + "step": 1833 + }, + { + "epoch": 0.4420342251144854, + "grad_norm": 23.05655860900879, + "learning_rate": 2.4687124650145284e-05, + "loss": 20.2474, + "step": 1834 + }, + { + "epoch": 0.44227524704748133, + "grad_norm": 24.797334671020508, + "learning_rate": 2.4671943790964317e-05, + "loss": 20.3784, + "step": 1835 + }, + { + "epoch": 0.44251626898047725, + "grad_norm": 22.562654495239258, + "learning_rate": 2.4656760084167595e-05, + "loss": 20.4098, + "step": 1836 + }, + { + "epoch": 0.4427572909134731, + "grad_norm": 23.557716369628906, + "learning_rate": 2.464157353900979e-05, + "loss": 20.7021, + "step": 1837 + }, + { + "epoch": 0.442998312846469, + "grad_norm": 24.920331954956055, + "learning_rate": 2.462638416474732e-05, + "loss": 19.3723, + "step": 1838 + }, + { + "epoch": 0.4432393347794649, + "grad_norm": 24.44020652770996, + "learning_rate": 2.4611191970638322e-05, + "loss": 21.0774, + "step": 1839 + }, + { + "epoch": 0.44348035671246083, + "grad_norm": 22.060659408569336, + "learning_rate": 2.4595996965942653e-05, + "loss": 20.5103, + "step": 1840 + }, + { + "epoch": 0.44372137864545674, + "grad_norm": 20.197526931762695, + "learning_rate": 2.4580799159921895e-05, + "loss": 20.7158, + "step": 1841 + }, + { + "epoch": 0.44396240057845265, + "grad_norm": 20.200345993041992, + "learning_rate": 2.4565598561839307e-05, + "loss": 21.1568, + "step": 1842 + }, + { + "epoch": 0.44420342251144856, + "grad_norm": 7.545557022094727, + "learning_rate": 2.455039518095988e-05, + "loss": 19.9951, + "step": 1843 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 11.600110054016113, + "learning_rate": 2.4535189026550277e-05, + "loss": 20.6387, + "step": 1844 + }, + { + "epoch": 0.44468546637744033, + "grad_norm": 5.719697952270508, + "learning_rate": 2.451998010787888e-05, + "loss": 20.2354, + "step": 1845 + }, + { + "epoch": 0.44492648831043624, + "grad_norm": 6.15648078918457, + "learning_rate": 2.4504768434215725e-05, + "loss": 19.8492, + "step": 1846 + }, + { + "epoch": 0.44516751024343215, + "grad_norm": 19.057260513305664, + "learning_rate": 2.4489554014832554e-05, + "loss": 20.1383, + "step": 1847 + }, + { + "epoch": 0.44540853217642806, + "grad_norm": 20.547948837280273, + "learning_rate": 2.447433685900277e-05, + "loss": 19.4625, + "step": 1848 + }, + { + "epoch": 0.445649554109424, + "grad_norm": 23.29857063293457, + "learning_rate": 2.4459116976001433e-05, + "loss": 20.5153, + "step": 1849 + }, + { + "epoch": 0.4458905760424199, + "grad_norm": 29.78900146484375, + "learning_rate": 2.444389437510529e-05, + "loss": 20.5952, + "step": 1850 + }, + { + "epoch": 0.44613159797541574, + "grad_norm": 28.414796829223633, + "learning_rate": 2.4428669065592724e-05, + "loss": 20.7686, + "step": 1851 + }, + { + "epoch": 0.44637261990841165, + "grad_norm": 28.16595458984375, + "learning_rate": 2.4413441056743775e-05, + "loss": 20.0673, + "step": 1852 + }, + { + "epoch": 0.44661364184140756, + "grad_norm": 30.541664123535156, + "learning_rate": 2.439821035784014e-05, + "loss": 19.7397, + "step": 1853 + }, + { + "epoch": 0.44685466377440347, + "grad_norm": 32.79475402832031, + "learning_rate": 2.4382976978165134e-05, + "loss": 19.9737, + "step": 1854 + }, + { + "epoch": 0.4470956857073994, + "grad_norm": 29.294368743896484, + "learning_rate": 2.4367740927003716e-05, + "loss": 20.1894, + "step": 1855 + }, + { + "epoch": 0.4473367076403953, + "grad_norm": 20.42129898071289, + "learning_rate": 2.435250221364249e-05, + "loss": 19.3575, + "step": 1856 + }, + { + "epoch": 0.4475777295733912, + "grad_norm": 19.801822662353516, + "learning_rate": 2.4337260847369652e-05, + "loss": 19.8687, + "step": 1857 + }, + { + "epoch": 0.44781875150638706, + "grad_norm": 13.509747505187988, + "learning_rate": 2.4322016837475033e-05, + "loss": 20.3226, + "step": 1858 + }, + { + "epoch": 0.44805977343938297, + "grad_norm": 13.147985458374023, + "learning_rate": 2.4306770193250076e-05, + "loss": 20.0534, + "step": 1859 + }, + { + "epoch": 0.4483007953723789, + "grad_norm": 3.7185678482055664, + "learning_rate": 2.4291520923987823e-05, + "loss": 20.1158, + "step": 1860 + }, + { + "epoch": 0.4485418173053748, + "grad_norm": 1.8376868963241577, + "learning_rate": 2.427626903898292e-05, + "loss": 19.699, + "step": 1861 + }, + { + "epoch": 0.4487828392383707, + "grad_norm": 9.30166244506836, + "learning_rate": 2.4261014547531613e-05, + "loss": 20.8933, + "step": 1862 + }, + { + "epoch": 0.4490238611713666, + "grad_norm": 14.14896011352539, + "learning_rate": 2.4245757458931717e-05, + "loss": 20.3241, + "step": 1863 + }, + { + "epoch": 0.4492648831043625, + "grad_norm": 17.198637008666992, + "learning_rate": 2.423049778248266e-05, + "loss": 20.6612, + "step": 1864 + }, + { + "epoch": 0.4495059050373584, + "grad_norm": 17.440717697143555, + "learning_rate": 2.4215235527485405e-05, + "loss": 19.9458, + "step": 1865 + }, + { + "epoch": 0.4497469269703543, + "grad_norm": 21.681293487548828, + "learning_rate": 2.419997070324254e-05, + "loss": 19.8863, + "step": 1866 + }, + { + "epoch": 0.4499879489033502, + "grad_norm": 21.19106101989746, + "learning_rate": 2.418470331905818e-05, + "loss": 20.3402, + "step": 1867 + }, + { + "epoch": 0.4502289708363461, + "grad_norm": 17.453590393066406, + "learning_rate": 2.4169433384238e-05, + "loss": 20.3084, + "step": 1868 + }, + { + "epoch": 0.450469992769342, + "grad_norm": 13.730365753173828, + "learning_rate": 2.4154160908089265e-05, + "loss": 21.0296, + "step": 1869 + }, + { + "epoch": 0.45071101470233793, + "grad_norm": 14.90967845916748, + "learning_rate": 2.4138885899920735e-05, + "loss": 21.3211, + "step": 1870 + }, + { + "epoch": 0.45095203663533384, + "grad_norm": 8.871872901916504, + "learning_rate": 2.412360836904277e-05, + "loss": 19.8757, + "step": 1871 + }, + { + "epoch": 0.4511930585683297, + "grad_norm": 2.935136318206787, + "learning_rate": 2.410832832476723e-05, + "loss": 19.0509, + "step": 1872 + }, + { + "epoch": 0.4514340805013256, + "grad_norm": 3.684075117111206, + "learning_rate": 2.409304577640752e-05, + "loss": 19.0763, + "step": 1873 + }, + { + "epoch": 0.4516751024343215, + "grad_norm": 10.907751083374023, + "learning_rate": 2.4077760733278566e-05, + "loss": 19.6751, + "step": 1874 + }, + { + "epoch": 0.45191612436731743, + "grad_norm": 15.262578010559082, + "learning_rate": 2.4062473204696816e-05, + "loss": 20.2934, + "step": 1875 + }, + { + "epoch": 0.45215714630031334, + "grad_norm": 23.671499252319336, + "learning_rate": 2.4047183199980243e-05, + "loss": 20.2516, + "step": 1876 + }, + { + "epoch": 0.45239816823330925, + "grad_norm": 22.680160522460938, + "learning_rate": 2.403189072844831e-05, + "loss": 20.7893, + "step": 1877 + }, + { + "epoch": 0.45263919016630516, + "grad_norm": 19.803909301757812, + "learning_rate": 2.401659579942201e-05, + "loss": 20.0103, + "step": 1878 + }, + { + "epoch": 0.452880212099301, + "grad_norm": 19.429237365722656, + "learning_rate": 2.40012984222238e-05, + "loss": 20.3362, + "step": 1879 + }, + { + "epoch": 0.4531212340322969, + "grad_norm": 20.31014060974121, + "learning_rate": 2.3985998606177667e-05, + "loss": 21.0, + "step": 1880 + }, + { + "epoch": 0.45336225596529284, + "grad_norm": 24.046619415283203, + "learning_rate": 2.3970696360609044e-05, + "loss": 19.7284, + "step": 1881 + }, + { + "epoch": 0.45360327789828875, + "grad_norm": 22.55208969116211, + "learning_rate": 2.3955391694844884e-05, + "loss": 19.3014, + "step": 1882 + }, + { + "epoch": 0.45384429983128466, + "grad_norm": 14.6911039352417, + "learning_rate": 2.3940084618213587e-05, + "loss": 19.8825, + "step": 1883 + }, + { + "epoch": 0.45408532176428057, + "grad_norm": 14.893192291259766, + "learning_rate": 2.3924775140045036e-05, + "loss": 20.5237, + "step": 1884 + }, + { + "epoch": 0.4543263436972764, + "grad_norm": 13.702637672424316, + "learning_rate": 2.3909463269670575e-05, + "loss": 20.2407, + "step": 1885 + }, + { + "epoch": 0.45456736563027234, + "grad_norm": 8.999434471130371, + "learning_rate": 2.3894149016422996e-05, + "loss": 20.6539, + "step": 1886 + }, + { + "epoch": 0.45480838756326825, + "grad_norm": 1.6822892427444458, + "learning_rate": 2.3878832389636572e-05, + "loss": 20.0135, + "step": 1887 + }, + { + "epoch": 0.45504940949626416, + "grad_norm": 8.935791015625, + "learning_rate": 2.386351339864699e-05, + "loss": 20.482, + "step": 1888 + }, + { + "epoch": 0.45529043142926007, + "grad_norm": 10.960128784179688, + "learning_rate": 2.3848192052791387e-05, + "loss": 19.4283, + "step": 1889 + }, + { + "epoch": 0.455531453362256, + "grad_norm": 11.523211479187012, + "learning_rate": 2.3832868361408355e-05, + "loss": 19.3707, + "step": 1890 + }, + { + "epoch": 0.4557724752952519, + "grad_norm": 14.202197074890137, + "learning_rate": 2.381754233383788e-05, + "loss": 20.0312, + "step": 1891 + }, + { + "epoch": 0.45601349722824774, + "grad_norm": 10.972423553466797, + "learning_rate": 2.3802213979421415e-05, + "loss": 19.9541, + "step": 1892 + }, + { + "epoch": 0.45625451916124365, + "grad_norm": 12.478150367736816, + "learning_rate": 2.3786883307501794e-05, + "loss": 20.9215, + "step": 1893 + }, + { + "epoch": 0.45649554109423957, + "grad_norm": 17.068302154541016, + "learning_rate": 2.3771550327423287e-05, + "loss": 20.1491, + "step": 1894 + }, + { + "epoch": 0.4567365630272355, + "grad_norm": 13.903290748596191, + "learning_rate": 2.375621504853155e-05, + "loss": 20.261, + "step": 1895 + }, + { + "epoch": 0.4569775849602314, + "grad_norm": 6.449080467224121, + "learning_rate": 2.3740877480173663e-05, + "loss": 20.5299, + "step": 1896 + }, + { + "epoch": 0.4572186068932273, + "grad_norm": 1.61079740524292, + "learning_rate": 2.3725537631698083e-05, + "loss": 20.421, + "step": 1897 + }, + { + "epoch": 0.4574596288262232, + "grad_norm": 2.7922861576080322, + "learning_rate": 2.3710195512454674e-05, + "loss": 19.6849, + "step": 1898 + }, + { + "epoch": 0.45770065075921906, + "grad_norm": 8.119808197021484, + "learning_rate": 2.369485113179467e-05, + "loss": 19.8303, + "step": 1899 + }, + { + "epoch": 0.457941672692215, + "grad_norm": 18.060640335083008, + "learning_rate": 2.367950449907068e-05, + "loss": 20.019, + "step": 1900 + }, + { + "epoch": 0.4581826946252109, + "grad_norm": 17.197132110595703, + "learning_rate": 2.3664155623636715e-05, + "loss": 21.0935, + "step": 1901 + }, + { + "epoch": 0.4584237165582068, + "grad_norm": 26.54437255859375, + "learning_rate": 2.3648804514848097e-05, + "loss": 20.144, + "step": 1902 + }, + { + "epoch": 0.4586647384912027, + "grad_norm": 29.552303314208984, + "learning_rate": 2.3633451182061576e-05, + "loss": 19.4919, + "step": 1903 + }, + { + "epoch": 0.4589057604241986, + "grad_norm": 27.67404556274414, + "learning_rate": 2.361809563463521e-05, + "loss": 20.6119, + "step": 1904 + }, + { + "epoch": 0.4591467823571945, + "grad_norm": 21.81255531311035, + "learning_rate": 2.3602737881928423e-05, + "loss": 20.3825, + "step": 1905 + }, + { + "epoch": 0.4593878042901904, + "grad_norm": 19.314594268798828, + "learning_rate": 2.3587377933301987e-05, + "loss": 20.5724, + "step": 1906 + }, + { + "epoch": 0.4596288262231863, + "grad_norm": 28.803354263305664, + "learning_rate": 2.3572015798118e-05, + "loss": 20.6087, + "step": 1907 + }, + { + "epoch": 0.4598698481561822, + "grad_norm": 29.30719757080078, + "learning_rate": 2.355665148573991e-05, + "loss": 21.1545, + "step": 1908 + }, + { + "epoch": 0.4601108700891781, + "grad_norm": 13.658269882202148, + "learning_rate": 2.354128500553248e-05, + "loss": 20.4322, + "step": 1909 + }, + { + "epoch": 0.460351892022174, + "grad_norm": 18.69731903076172, + "learning_rate": 2.3525916366861796e-05, + "loss": 19.9605, + "step": 1910 + }, + { + "epoch": 0.46059291395516994, + "grad_norm": 8.636419296264648, + "learning_rate": 2.3510545579095255e-05, + "loss": 19.8297, + "step": 1911 + }, + { + "epoch": 0.46083393588816585, + "grad_norm": 4.310947895050049, + "learning_rate": 2.3495172651601574e-05, + "loss": 20.3831, + "step": 1912 + }, + { + "epoch": 0.4610749578211617, + "grad_norm": 3.4353671073913574, + "learning_rate": 2.3479797593750767e-05, + "loss": 20.3087, + "step": 1913 + }, + { + "epoch": 0.4613159797541576, + "grad_norm": 4.253045558929443, + "learning_rate": 2.3464420414914156e-05, + "loss": 19.9394, + "step": 1914 + }, + { + "epoch": 0.4615570016871535, + "grad_norm": 8.456063270568848, + "learning_rate": 2.3449041124464347e-05, + "loss": 20.0004, + "step": 1915 + }, + { + "epoch": 0.46179802362014943, + "grad_norm": 8.220453262329102, + "learning_rate": 2.3433659731775226e-05, + "loss": 20.1673, + "step": 1916 + }, + { + "epoch": 0.46203904555314534, + "grad_norm": 16.331741333007812, + "learning_rate": 2.3418276246221975e-05, + "loss": 20.1259, + "step": 1917 + }, + { + "epoch": 0.46228006748614126, + "grad_norm": 12.78930950164795, + "learning_rate": 2.3402890677181044e-05, + "loss": 20.7414, + "step": 1918 + }, + { + "epoch": 0.46252108941913717, + "grad_norm": 17.962650299072266, + "learning_rate": 2.338750303403017e-05, + "loss": 21.2571, + "step": 1919 + }, + { + "epoch": 0.462762111352133, + "grad_norm": 14.207361221313477, + "learning_rate": 2.3372113326148313e-05, + "loss": 20.7853, + "step": 1920 + }, + { + "epoch": 0.46300313328512893, + "grad_norm": 3.136791467666626, + "learning_rate": 2.3356721562915735e-05, + "loss": 19.9958, + "step": 1921 + }, + { + "epoch": 0.46324415521812484, + "grad_norm": 5.995757102966309, + "learning_rate": 2.3341327753713925e-05, + "loss": 20.8529, + "step": 1922 + }, + { + "epoch": 0.46348517715112075, + "grad_norm": 5.928219318389893, + "learning_rate": 2.332593190792563e-05, + "loss": 20.0411, + "step": 1923 + }, + { + "epoch": 0.46372619908411666, + "grad_norm": 7.629456520080566, + "learning_rate": 2.331053403493484e-05, + "loss": 20.5664, + "step": 1924 + }, + { + "epoch": 0.4639672210171126, + "grad_norm": 12.512802124023438, + "learning_rate": 2.3295134144126772e-05, + "loss": 19.4455, + "step": 1925 + }, + { + "epoch": 0.4642082429501085, + "grad_norm": 16.193378448486328, + "learning_rate": 2.3279732244887878e-05, + "loss": 20.9782, + "step": 1926 + }, + { + "epoch": 0.46444926488310434, + "grad_norm": 26.300033569335938, + "learning_rate": 2.3264328346605828e-05, + "loss": 20.5259, + "step": 1927 + }, + { + "epoch": 0.46469028681610025, + "grad_norm": 28.293800354003906, + "learning_rate": 2.3248922458669524e-05, + "loss": 19.6187, + "step": 1928 + }, + { + "epoch": 0.46493130874909616, + "grad_norm": 25.42544174194336, + "learning_rate": 2.3233514590469072e-05, + "loss": 20.0621, + "step": 1929 + }, + { + "epoch": 0.4651723306820921, + "grad_norm": 24.921363830566406, + "learning_rate": 2.3218104751395777e-05, + "loss": 20.3477, + "step": 1930 + }, + { + "epoch": 0.465413352615088, + "grad_norm": 25.300065994262695, + "learning_rate": 2.3202692950842167e-05, + "loss": 20.297, + "step": 1931 + }, + { + "epoch": 0.4656543745480839, + "grad_norm": 20.804677963256836, + "learning_rate": 2.3187279198201935e-05, + "loss": 20.0072, + "step": 1932 + }, + { + "epoch": 0.4658953964810798, + "grad_norm": 18.065580368041992, + "learning_rate": 2.3171863502869993e-05, + "loss": 20.538, + "step": 1933 + }, + { + "epoch": 0.46613641841407566, + "grad_norm": 12.336984634399414, + "learning_rate": 2.315644587424242e-05, + "loss": 20.2884, + "step": 1934 + }, + { + "epoch": 0.46637744034707157, + "grad_norm": 7.539339065551758, + "learning_rate": 2.314102632171648e-05, + "loss": 20.3682, + "step": 1935 + }, + { + "epoch": 0.4666184622800675, + "grad_norm": 8.930306434631348, + "learning_rate": 2.312560485469062e-05, + "loss": 19.7759, + "step": 1936 + }, + { + "epoch": 0.4668594842130634, + "grad_norm": 8.239495277404785, + "learning_rate": 2.3110181482564424e-05, + "loss": 19.7183, + "step": 1937 + }, + { + "epoch": 0.4671005061460593, + "grad_norm": 2.1577677726745605, + "learning_rate": 2.309475621473866e-05, + "loss": 20.8677, + "step": 1938 + }, + { + "epoch": 0.4673415280790552, + "grad_norm": 2.5941951274871826, + "learning_rate": 2.3079329060615247e-05, + "loss": 19.4548, + "step": 1939 + }, + { + "epoch": 0.4675825500120511, + "grad_norm": 4.779300212860107, + "learning_rate": 2.3063900029597264e-05, + "loss": 20.1585, + "step": 1940 + }, + { + "epoch": 0.467823571945047, + "grad_norm": 8.010509490966797, + "learning_rate": 2.304846913108891e-05, + "loss": 20.5582, + "step": 1941 + }, + { + "epoch": 0.4680645938780429, + "grad_norm": 8.310798645019531, + "learning_rate": 2.303303637449554e-05, + "loss": 19.7113, + "step": 1942 + }, + { + "epoch": 0.4683056158110388, + "grad_norm": 8.338432312011719, + "learning_rate": 2.3017601769223637e-05, + "loss": 19.8223, + "step": 1943 + }, + { + "epoch": 0.4685466377440347, + "grad_norm": 9.549720764160156, + "learning_rate": 2.3002165324680813e-05, + "loss": 20.0561, + "step": 1944 + }, + { + "epoch": 0.4687876596770306, + "grad_norm": 8.120071411132812, + "learning_rate": 2.2986727050275802e-05, + "loss": 18.8324, + "step": 1945 + }, + { + "epoch": 0.46902868161002653, + "grad_norm": 10.84874439239502, + "learning_rate": 2.2971286955418446e-05, + "loss": 19.5592, + "step": 1946 + }, + { + "epoch": 0.4692697035430224, + "grad_norm": 7.861667633056641, + "learning_rate": 2.2955845049519702e-05, + "loss": 20.2722, + "step": 1947 + }, + { + "epoch": 0.4695107254760183, + "grad_norm": 1.5581390857696533, + "learning_rate": 2.294040134199162e-05, + "loss": 19.2608, + "step": 1948 + }, + { + "epoch": 0.4697517474090142, + "grad_norm": 3.243753671646118, + "learning_rate": 2.292495584224738e-05, + "loss": 19.597, + "step": 1949 + }, + { + "epoch": 0.4699927693420101, + "grad_norm": 5.136959075927734, + "learning_rate": 2.2909508559701208e-05, + "loss": 19.4655, + "step": 1950 + }, + { + "epoch": 0.47023379127500603, + "grad_norm": 10.432117462158203, + "learning_rate": 2.2894059503768457e-05, + "loss": 20.7608, + "step": 1951 + }, + { + "epoch": 0.47047481320800194, + "grad_norm": 15.768499374389648, + "learning_rate": 2.2878608683865545e-05, + "loss": 20.1251, + "step": 1952 + }, + { + "epoch": 0.47071583514099785, + "grad_norm": 19.314693450927734, + "learning_rate": 2.2863156109409953e-05, + "loss": 20.0316, + "step": 1953 + }, + { + "epoch": 0.4709568570739937, + "grad_norm": 18.690948486328125, + "learning_rate": 2.2847701789820264e-05, + "loss": 20.1732, + "step": 1954 + }, + { + "epoch": 0.4711978790069896, + "grad_norm": 15.330910682678223, + "learning_rate": 2.283224573451608e-05, + "loss": 20.114, + "step": 1955 + }, + { + "epoch": 0.47143890093998553, + "grad_norm": 18.692724227905273, + "learning_rate": 2.2816787952918103e-05, + "loss": 20.1723, + "step": 1956 + }, + { + "epoch": 0.47167992287298144, + "grad_norm": 17.578039169311523, + "learning_rate": 2.2801328454448058e-05, + "loss": 19.9386, + "step": 1957 + }, + { + "epoch": 0.47192094480597735, + "grad_norm": 12.23282241821289, + "learning_rate": 2.278586724852873e-05, + "loss": 20.3189, + "step": 1958 + }, + { + "epoch": 0.47216196673897326, + "grad_norm": 17.35020637512207, + "learning_rate": 2.2770404344583952e-05, + "loss": 21.0936, + "step": 1959 + }, + { + "epoch": 0.47240298867196917, + "grad_norm": 9.992749214172363, + "learning_rate": 2.2754939752038573e-05, + "loss": 20.589, + "step": 1960 + }, + { + "epoch": 0.472644010604965, + "grad_norm": 2.476226329803467, + "learning_rate": 2.2739473480318487e-05, + "loss": 20.0648, + "step": 1961 + }, + { + "epoch": 0.47288503253796094, + "grad_norm": 1.802409291267395, + "learning_rate": 2.2724005538850596e-05, + "loss": 20.5694, + "step": 1962 + }, + { + "epoch": 0.47312605447095685, + "grad_norm": 5.154036521911621, + "learning_rate": 2.2708535937062833e-05, + "loss": 20.041, + "step": 1963 + }, + { + "epoch": 0.47336707640395276, + "grad_norm": 8.691493034362793, + "learning_rate": 2.2693064684384142e-05, + "loss": 20.2315, + "step": 1964 + }, + { + "epoch": 0.47360809833694867, + "grad_norm": 9.182756423950195, + "learning_rate": 2.267759179024447e-05, + "loss": 19.9686, + "step": 1965 + }, + { + "epoch": 0.4738491202699446, + "grad_norm": 6.705470085144043, + "learning_rate": 2.2662117264074767e-05, + "loss": 19.7442, + "step": 1966 + }, + { + "epoch": 0.4740901422029405, + "grad_norm": 1.5670584440231323, + "learning_rate": 2.2646641115306968e-05, + "loss": 19.2471, + "step": 1967 + }, + { + "epoch": 0.47433116413593635, + "grad_norm": 7.95893669128418, + "learning_rate": 2.2631163353374007e-05, + "loss": 20.1041, + "step": 1968 + }, + { + "epoch": 0.47457218606893226, + "grad_norm": 1.8925632238388062, + "learning_rate": 2.2615683987709796e-05, + "loss": 19.9794, + "step": 1969 + }, + { + "epoch": 0.47481320800192817, + "grad_norm": 1.5670338869094849, + "learning_rate": 2.2600203027749235e-05, + "loss": 20.2362, + "step": 1970 + }, + { + "epoch": 0.4750542299349241, + "grad_norm": 13.082667350769043, + "learning_rate": 2.2584720482928183e-05, + "loss": 20.166, + "step": 1971 + }, + { + "epoch": 0.47529525186792, + "grad_norm": 15.019366264343262, + "learning_rate": 2.256923636268347e-05, + "loss": 20.25, + "step": 1972 + }, + { + "epoch": 0.4755362738009159, + "grad_norm": 25.681678771972656, + "learning_rate": 2.255375067645289e-05, + "loss": 20.4301, + "step": 1973 + }, + { + "epoch": 0.4757772957339118, + "grad_norm": 27.80413818359375, + "learning_rate": 2.2538263433675173e-05, + "loss": 19.1664, + "step": 1974 + }, + { + "epoch": 0.47601831766690766, + "grad_norm": 32.559913635253906, + "learning_rate": 2.2522774643790034e-05, + "loss": 21.1224, + "step": 1975 + }, + { + "epoch": 0.4762593395999036, + "grad_norm": 35.059288024902344, + "learning_rate": 2.2507284316238094e-05, + "loss": 21.0951, + "step": 1976 + }, + { + "epoch": 0.4765003615328995, + "grad_norm": 39.813995361328125, + "learning_rate": 2.2491792460460936e-05, + "loss": 21.2921, + "step": 1977 + }, + { + "epoch": 0.4767413834658954, + "grad_norm": 39.06175994873047, + "learning_rate": 2.2476299085901054e-05, + "loss": 20.071, + "step": 1978 + }, + { + "epoch": 0.4769824053988913, + "grad_norm": 39.56377410888672, + "learning_rate": 2.2460804202001892e-05, + "loss": 20.6726, + "step": 1979 + }, + { + "epoch": 0.4772234273318872, + "grad_norm": 36.073307037353516, + "learning_rate": 2.2445307818207785e-05, + "loss": 20.9457, + "step": 1980 + }, + { + "epoch": 0.47746444926488313, + "grad_norm": 35.0748405456543, + "learning_rate": 2.242980994396401e-05, + "loss": 21.6211, + "step": 1981 + }, + { + "epoch": 0.477705471197879, + "grad_norm": 27.089656829833984, + "learning_rate": 2.2414310588716742e-05, + "loss": 20.7825, + "step": 1982 + }, + { + "epoch": 0.4779464931308749, + "grad_norm": 22.229307174682617, + "learning_rate": 2.239880976191305e-05, + "loss": 20.5549, + "step": 1983 + }, + { + "epoch": 0.4781875150638708, + "grad_norm": 11.916168212890625, + "learning_rate": 2.238330747300091e-05, + "loss": 20.559, + "step": 1984 + }, + { + "epoch": 0.4784285369968667, + "grad_norm": 5.680569648742676, + "learning_rate": 2.2367803731429173e-05, + "loss": 20.4524, + "step": 1985 + }, + { + "epoch": 0.4786695589298626, + "grad_norm": 10.466980934143066, + "learning_rate": 2.2352298546647614e-05, + "loss": 20.9829, + "step": 1986 + }, + { + "epoch": 0.47891058086285854, + "grad_norm": 15.402517318725586, + "learning_rate": 2.2336791928106846e-05, + "loss": 20.1957, + "step": 1987 + }, + { + "epoch": 0.47915160279585445, + "grad_norm": 19.324575424194336, + "learning_rate": 2.2321283885258368e-05, + "loss": 19.9329, + "step": 1988 + }, + { + "epoch": 0.4793926247288503, + "grad_norm": 24.93718719482422, + "learning_rate": 2.230577442755457e-05, + "loss": 20.6904, + "step": 1989 + }, + { + "epoch": 0.4796336466618462, + "grad_norm": 31.18131446838379, + "learning_rate": 2.229026356444866e-05, + "loss": 21.0508, + "step": 1990 + }, + { + "epoch": 0.4798746685948421, + "grad_norm": 30.928367614746094, + "learning_rate": 2.2274751305394756e-05, + "loss": 20.3635, + "step": 1991 + }, + { + "epoch": 0.48011569052783803, + "grad_norm": 26.435148239135742, + "learning_rate": 2.225923765984778e-05, + "loss": 20.0189, + "step": 1992 + }, + { + "epoch": 0.48035671246083395, + "grad_norm": 37.301536560058594, + "learning_rate": 2.2243722637263522e-05, + "loss": 21.2239, + "step": 1993 + }, + { + "epoch": 0.48059773439382986, + "grad_norm": 28.935400009155273, + "learning_rate": 2.2228206247098614e-05, + "loss": 21.047, + "step": 1994 + }, + { + "epoch": 0.48083875632682577, + "grad_norm": 28.568130493164062, + "learning_rate": 2.2212688498810505e-05, + "loss": 21.0378, + "step": 1995 + }, + { + "epoch": 0.4810797782598216, + "grad_norm": 19.08187484741211, + "learning_rate": 2.2197169401857485e-05, + "loss": 20.348, + "step": 1996 + }, + { + "epoch": 0.48132080019281753, + "grad_norm": 18.708393096923828, + "learning_rate": 2.2181648965698668e-05, + "loss": 20.5239, + "step": 1997 + }, + { + "epoch": 0.48156182212581344, + "grad_norm": 20.326255798339844, + "learning_rate": 2.216612719979398e-05, + "loss": 20.6208, + "step": 1998 + }, + { + "epoch": 0.48180284405880935, + "grad_norm": 22.1959228515625, + "learning_rate": 2.215060411360415e-05, + "loss": 20.6837, + "step": 1999 + }, + { + "epoch": 0.48204386599180526, + "grad_norm": 4.444324493408203, + "learning_rate": 2.213507971659072e-05, + "loss": 19.1374, + "step": 2000 + }, + { + "epoch": 0.48204386599180526, + "eval_cc_pretrain_accuracy": 0.9399999976158142, + "eval_cc_pretrain_loss": 2.1168198585510254, + "eval_cc_pretrain_num_cand": 800.0, + "eval_cc_pretrain_runtime": 17.2355, + "eval_cc_pretrain_samples_per_second": 5.802, + "eval_cc_pretrain_steps_per_second": 0.058, + "eval_cc_pretrain_temperature": 0.06982421875, + "step": 2000 + }, + { + "epoch": 0.48204386599180526, + "eval_mscoco_pretrain_accuracy": 0.8299999833106995, + "eval_mscoco_pretrain_loss": 2.3792147636413574, + "eval_mscoco_pretrain_num_cand": 800.0, + "eval_mscoco_pretrain_runtime": 14.1258, + "eval_mscoco_pretrain_samples_per_second": 7.079, + "eval_mscoco_pretrain_steps_per_second": 0.071, + "eval_mscoco_pretrain_temperature": 0.06982421875, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 4149, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}