{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5349023803155925, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010698047606311847, "grad_norm": 1.7288448810577393, "learning_rate": 2.02e-06, "loss": 2.6627, "step": 1 }, { "epoch": 0.0021396095212623694, "grad_norm": 1.7482303380966187, "learning_rate": 4.04e-06, "loss": 2.8135, "step": 2 }, { "epoch": 0.0032094142818935543, "grad_norm": 2.0931556224823, "learning_rate": 6.06e-06, "loss": 2.6959, "step": 3 }, { "epoch": 0.004279219042524739, "grad_norm": 2.3097078800201416, "learning_rate": 8.08e-06, "loss": 2.8474, "step": 4 }, { "epoch": 0.005349023803155924, "grad_norm": 2.0344951152801514, "learning_rate": 1.0100000000000002e-05, "loss": 2.7842, "step": 5 }, { "epoch": 0.006418828563787109, "grad_norm": 1.9919047355651855, "learning_rate": 1.212e-05, "loss": 3.089, "step": 6 }, { "epoch": 0.007488633324418294, "grad_norm": 0.9272979497909546, "learning_rate": 1.4140000000000002e-05, "loss": 2.3151, "step": 7 }, { "epoch": 0.008558438085049478, "grad_norm": 1.6376522779464722, "learning_rate": 1.616e-05, "loss": 2.824, "step": 8 }, { "epoch": 0.009628242845680663, "grad_norm": 1.8005309104919434, "learning_rate": 1.818e-05, "loss": 2.6834, "step": 9 }, { "epoch": 0.010698047606311848, "grad_norm": 1.8941646814346313, "learning_rate": 2.0200000000000003e-05, "loss": 2.6678, "step": 10 }, { "epoch": 0.011767852366943032, "grad_norm": 1.860886573791504, "learning_rate": 2.222e-05, "loss": 2.7923, "step": 11 }, { "epoch": 0.012837657127574217, "grad_norm": 1.6937692165374756, "learning_rate": 2.424e-05, "loss": 2.4482, "step": 12 }, { "epoch": 0.013907461888205403, "grad_norm": 0.988033652305603, "learning_rate": 2.6260000000000003e-05, "loss": 2.3207, "step": 13 }, { "epoch": 0.014977266648836588, "grad_norm": 1.067140817642212, "learning_rate": 2.8280000000000004e-05, "loss": 2.3765, "step": 14 }, { "epoch": 0.016047071409467772, "grad_norm": 1.0069628953933716, "learning_rate": 3.0299999999999998e-05, "loss": 2.2754, "step": 15 }, { "epoch": 0.017116876170098955, "grad_norm": 0.8966661095619202, "learning_rate": 3.232e-05, "loss": 1.9436, "step": 16 }, { "epoch": 0.018186680930730142, "grad_norm": 0.8621421456336975, "learning_rate": 3.434e-05, "loss": 2.0105, "step": 17 }, { "epoch": 0.019256485691361326, "grad_norm": 0.6790559887886047, "learning_rate": 3.636e-05, "loss": 2.1134, "step": 18 }, { "epoch": 0.02032629045199251, "grad_norm": 0.8534191250801086, "learning_rate": 3.838e-05, "loss": 2.6721, "step": 19 }, { "epoch": 0.021396095212623697, "grad_norm": 0.6541608572006226, "learning_rate": 4.0400000000000006e-05, "loss": 2.5973, "step": 20 }, { "epoch": 0.02246589997325488, "grad_norm": 0.5651280283927917, "learning_rate": 4.242e-05, "loss": 2.068, "step": 21 }, { "epoch": 0.023535704733886064, "grad_norm": 0.6473293900489807, "learning_rate": 4.444e-05, "loss": 2.2025, "step": 22 }, { "epoch": 0.02460550949451725, "grad_norm": 0.6129287481307983, "learning_rate": 4.6460000000000006e-05, "loss": 2.2035, "step": 23 }, { "epoch": 0.025675314255148435, "grad_norm": 0.8293673992156982, "learning_rate": 4.848e-05, "loss": 2.1585, "step": 24 }, { "epoch": 0.026745119015779622, "grad_norm": 0.6636125445365906, "learning_rate": 5.05e-05, "loss": 1.9494, "step": 25 }, { "epoch": 0.027814923776410806, "grad_norm": 0.7317002415657043, "learning_rate": 5.2520000000000005e-05, "loss": 2.3731, "step": 26 }, { "epoch": 0.02888472853704199, "grad_norm": 0.5912388563156128, "learning_rate": 5.454e-05, "loss": 1.9946, "step": 27 }, { "epoch": 0.029954533297673176, "grad_norm": 0.509111225605011, "learning_rate": 5.656000000000001e-05, "loss": 1.906, "step": 28 }, { "epoch": 0.03102433805830436, "grad_norm": 0.5736451745033264, "learning_rate": 5.858e-05, "loss": 2.1449, "step": 29 }, { "epoch": 0.032094142818935543, "grad_norm": 0.5325722098350525, "learning_rate": 6.0599999999999996e-05, "loss": 1.8039, "step": 30 }, { "epoch": 0.03316394757956673, "grad_norm": 0.6035159826278687, "learning_rate": 6.262000000000001e-05, "loss": 2.0611, "step": 31 }, { "epoch": 0.03423375234019791, "grad_norm": 0.7616627812385559, "learning_rate": 6.464e-05, "loss": 2.519, "step": 32 }, { "epoch": 0.0353035571008291, "grad_norm": 0.5022464394569397, "learning_rate": 6.666e-05, "loss": 2.1251, "step": 33 }, { "epoch": 0.036373361861460285, "grad_norm": 0.5941365957260132, "learning_rate": 6.868e-05, "loss": 2.0392, "step": 34 }, { "epoch": 0.03744316662209147, "grad_norm": 0.5984216928482056, "learning_rate": 7.07e-05, "loss": 2.0259, "step": 35 }, { "epoch": 0.03851297138272265, "grad_norm": 0.5933278799057007, "learning_rate": 7.272e-05, "loss": 2.0123, "step": 36 }, { "epoch": 0.039582776143353836, "grad_norm": 0.5975232720375061, "learning_rate": 7.474e-05, "loss": 2.0471, "step": 37 }, { "epoch": 0.04065258090398502, "grad_norm": 0.7600012421607971, "learning_rate": 7.676e-05, "loss": 1.9952, "step": 38 }, { "epoch": 0.04172238566461621, "grad_norm": 0.641670286655426, "learning_rate": 7.878e-05, "loss": 2.0748, "step": 39 }, { "epoch": 0.042792190425247394, "grad_norm": 0.616679310798645, "learning_rate": 8.080000000000001e-05, "loss": 2.0396, "step": 40 }, { "epoch": 0.04386199518587858, "grad_norm": 0.7081180214881897, "learning_rate": 8.282e-05, "loss": 1.7703, "step": 41 }, { "epoch": 0.04493179994650976, "grad_norm": 0.7007136940956116, "learning_rate": 8.484e-05, "loss": 2.2135, "step": 42 }, { "epoch": 0.046001604707140945, "grad_norm": 0.729378879070282, "learning_rate": 8.686e-05, "loss": 1.8132, "step": 43 }, { "epoch": 0.04707140946777213, "grad_norm": 0.6952872276306152, "learning_rate": 8.888e-05, "loss": 2.2158, "step": 44 }, { "epoch": 0.04814121422840332, "grad_norm": 0.605029284954071, "learning_rate": 9.09e-05, "loss": 2.333, "step": 45 }, { "epoch": 0.0492110189890345, "grad_norm": 0.6177989840507507, "learning_rate": 9.292000000000001e-05, "loss": 1.6994, "step": 46 }, { "epoch": 0.050280823749665686, "grad_norm": 0.5986810922622681, "learning_rate": 9.494e-05, "loss": 1.8176, "step": 47 }, { "epoch": 0.05135062851029687, "grad_norm": 0.6833954453468323, "learning_rate": 9.696e-05, "loss": 1.9501, "step": 48 }, { "epoch": 0.05242043327092805, "grad_norm": 0.8341196775436401, "learning_rate": 9.898e-05, "loss": 1.8998, "step": 49 }, { "epoch": 0.053490238031559244, "grad_norm": 1.0498520135879517, "learning_rate": 0.000101, "loss": 1.6787, "step": 50 }, { "epoch": 0.05456004279219043, "grad_norm": 0.6734796762466431, "learning_rate": 0.00010302, "loss": 1.6638, "step": 51 }, { "epoch": 0.05562984755282161, "grad_norm": 0.6819894313812256, "learning_rate": 0.00010504000000000001, "loss": 2.0493, "step": 52 }, { "epoch": 0.056699652313452795, "grad_norm": 0.8042811155319214, "learning_rate": 0.00010706000000000001, "loss": 1.9026, "step": 53 }, { "epoch": 0.05776945707408398, "grad_norm": 0.9070219993591309, "learning_rate": 0.00010908, "loss": 2.0546, "step": 54 }, { "epoch": 0.05883926183471516, "grad_norm": 0.8149216771125793, "learning_rate": 0.00011110000000000002, "loss": 1.5653, "step": 55 }, { "epoch": 0.05990906659534635, "grad_norm": 0.8578245043754578, "learning_rate": 0.00011312000000000001, "loss": 1.7776, "step": 56 }, { "epoch": 0.060978871355977536, "grad_norm": 0.6369242668151855, "learning_rate": 0.00011514, "loss": 1.4798, "step": 57 }, { "epoch": 0.06204867611660872, "grad_norm": 0.8663727045059204, "learning_rate": 0.00011716, "loss": 1.8753, "step": 58 }, { "epoch": 0.06311848087723991, "grad_norm": 0.844870924949646, "learning_rate": 0.00011918, "loss": 1.7667, "step": 59 }, { "epoch": 0.06418828563787109, "grad_norm": 0.8465819954872131, "learning_rate": 0.00012119999999999999, "loss": 1.6944, "step": 60 }, { "epoch": 0.06525809039850228, "grad_norm": 0.807226300239563, "learning_rate": 0.00012322, "loss": 1.723, "step": 61 }, { "epoch": 0.06632789515913345, "grad_norm": 0.8274418711662292, "learning_rate": 0.00012524000000000001, "loss": 1.6919, "step": 62 }, { "epoch": 0.06739769991976464, "grad_norm": 0.8100050091743469, "learning_rate": 0.00012726, "loss": 1.8884, "step": 63 }, { "epoch": 0.06846750468039582, "grad_norm": 1.0959488153457642, "learning_rate": 0.00012928, "loss": 1.932, "step": 64 }, { "epoch": 0.06953730944102701, "grad_norm": 1.0040597915649414, "learning_rate": 0.00013130000000000002, "loss": 1.8512, "step": 65 }, { "epoch": 0.0706071142016582, "grad_norm": 1.096177101135254, "learning_rate": 0.00013332, "loss": 1.8671, "step": 66 }, { "epoch": 0.07167691896228938, "grad_norm": 1.0180017948150635, "learning_rate": 0.00013534000000000002, "loss": 1.8652, "step": 67 }, { "epoch": 0.07274672372292057, "grad_norm": 0.8910913467407227, "learning_rate": 0.00013736, "loss": 1.6867, "step": 68 }, { "epoch": 0.07381652848355175, "grad_norm": 0.7205930352210999, "learning_rate": 0.00013937999999999998, "loss": 1.3372, "step": 69 }, { "epoch": 0.07488633324418294, "grad_norm": 0.8050198554992676, "learning_rate": 0.0001414, "loss": 1.8053, "step": 70 }, { "epoch": 0.07595613800481413, "grad_norm": 0.9537407159805298, "learning_rate": 0.00014342, "loss": 1.9222, "step": 71 }, { "epoch": 0.0770259427654453, "grad_norm": 0.8652852773666382, "learning_rate": 0.00014544, "loss": 1.6321, "step": 72 }, { "epoch": 0.0780957475260765, "grad_norm": 0.9287482500076294, "learning_rate": 0.00014746, "loss": 1.6623, "step": 73 }, { "epoch": 0.07916555228670767, "grad_norm": 0.8908249735832214, "learning_rate": 0.00014948, "loss": 1.9778, "step": 74 }, { "epoch": 0.08023535704733886, "grad_norm": 1.09617280960083, "learning_rate": 0.0001515, "loss": 1.6459, "step": 75 }, { "epoch": 0.08130516180797004, "grad_norm": 0.7375260591506958, "learning_rate": 0.00015352, "loss": 1.5546, "step": 76 }, { "epoch": 0.08237496656860123, "grad_norm": 0.8195641040802002, "learning_rate": 0.00015554000000000002, "loss": 1.6602, "step": 77 }, { "epoch": 0.08344477132923242, "grad_norm": 0.8275863528251648, "learning_rate": 0.00015756, "loss": 1.7362, "step": 78 }, { "epoch": 0.0845145760898636, "grad_norm": 0.8596509695053101, "learning_rate": 0.00015958000000000001, "loss": 1.8343, "step": 79 }, { "epoch": 0.08558438085049479, "grad_norm": 0.8495697975158691, "learning_rate": 0.00016160000000000002, "loss": 1.5171, "step": 80 }, { "epoch": 0.08665418561112596, "grad_norm": 0.8796412944793701, "learning_rate": 0.00016362, "loss": 1.3298, "step": 81 }, { "epoch": 0.08772399037175715, "grad_norm": 0.7089576125144958, "learning_rate": 0.00016564, "loss": 1.5468, "step": 82 }, { "epoch": 0.08879379513238835, "grad_norm": 0.8609843850135803, "learning_rate": 0.00016766, "loss": 1.8827, "step": 83 }, { "epoch": 0.08986359989301952, "grad_norm": 0.8612833023071289, "learning_rate": 0.00016968, "loss": 1.7344, "step": 84 }, { "epoch": 0.09093340465365071, "grad_norm": 0.8177778124809265, "learning_rate": 0.0001717, "loss": 1.6543, "step": 85 }, { "epoch": 0.09200320941428189, "grad_norm": 0.9328483939170837, "learning_rate": 0.00017372, "loss": 1.452, "step": 86 }, { "epoch": 0.09307301417491308, "grad_norm": 1.2951349020004272, "learning_rate": 0.00017574, "loss": 1.5672, "step": 87 }, { "epoch": 0.09414281893554426, "grad_norm": 1.0605595111846924, "learning_rate": 0.00017776, "loss": 1.7043, "step": 88 }, { "epoch": 0.09521262369617545, "grad_norm": 0.8237940073013306, "learning_rate": 0.00017978000000000002, "loss": 1.7331, "step": 89 }, { "epoch": 0.09628242845680664, "grad_norm": 0.8034561276435852, "learning_rate": 0.0001818, "loss": 1.6586, "step": 90 }, { "epoch": 0.09735223321743781, "grad_norm": 0.7969396114349365, "learning_rate": 0.00018382, "loss": 1.5385, "step": 91 }, { "epoch": 0.098422037978069, "grad_norm": 0.792550265789032, "learning_rate": 0.00018584000000000002, "loss": 1.3909, "step": 92 }, { "epoch": 0.09949184273870018, "grad_norm": 0.8620875477790833, "learning_rate": 0.00018786, "loss": 1.8194, "step": 93 }, { "epoch": 0.10056164749933137, "grad_norm": 0.6592405438423157, "learning_rate": 0.00018988, "loss": 1.5115, "step": 94 }, { "epoch": 0.10163145225996256, "grad_norm": 0.9358750581741333, "learning_rate": 0.0001919, "loss": 1.5659, "step": 95 }, { "epoch": 0.10270125702059374, "grad_norm": 0.7691030502319336, "learning_rate": 0.00019392, "loss": 1.6011, "step": 96 }, { "epoch": 0.10377106178122493, "grad_norm": 0.8470796942710876, "learning_rate": 0.00019594, "loss": 1.9553, "step": 97 }, { "epoch": 0.1048408665418561, "grad_norm": 0.8427095413208008, "learning_rate": 0.00019796, "loss": 1.5329, "step": 98 }, { "epoch": 0.1059106713024873, "grad_norm": 0.8238065838813782, "learning_rate": 0.00019998, "loss": 1.8338, "step": 99 }, { "epoch": 0.10698047606311849, "grad_norm": 0.8195500373840332, "learning_rate": 0.000202, "loss": 1.4167, "step": 100 }, { "epoch": 0.10805028082374966, "grad_norm": 0.7097288966178894, "learning_rate": 0.00020199688492212377, "loss": 1.9694, "step": 101 }, { "epoch": 0.10912008558438085, "grad_norm": 0.8986706137657166, "learning_rate": 0.00020198753988064772, "loss": 1.7537, "step": 102 }, { "epoch": 0.11018989034501203, "grad_norm": 0.8444643616676331, "learning_rate": 0.00020197196545201806, "loss": 1.8293, "step": 103 }, { "epoch": 0.11125969510564322, "grad_norm": 0.961504340171814, "learning_rate": 0.0002019501625969389, "loss": 1.7586, "step": 104 }, { "epoch": 0.1123294998662744, "grad_norm": 1.3284465074539185, "learning_rate": 0.00020192213266031304, "loss": 1.5878, "step": 105 }, { "epoch": 0.11339930462690559, "grad_norm": 0.8253253102302551, "learning_rate": 0.00020188787737115897, "loss": 1.3733, "step": 106 }, { "epoch": 0.11446910938753678, "grad_norm": 0.7246334552764893, "learning_rate": 0.00020184739884250436, "loss": 1.9136, "step": 107 }, { "epoch": 0.11553891414816796, "grad_norm": 0.7832014560699463, "learning_rate": 0.00020180069957125544, "loss": 1.5472, "step": 108 }, { "epoch": 0.11660871890879915, "grad_norm": 1.0413076877593994, "learning_rate": 0.0002017477824380433, "loss": 1.5569, "step": 109 }, { "epoch": 0.11767852366943032, "grad_norm": 0.8862333297729492, "learning_rate": 0.00020168865070704594, "loss": 1.4746, "step": 110 }, { "epoch": 0.11874832843006151, "grad_norm": 1.0966826677322388, "learning_rate": 0.00020162330802578706, "loss": 1.6068, "step": 111 }, { "epoch": 0.1198181331906927, "grad_norm": 1.2533254623413086, "learning_rate": 0.00020155175842491107, "loss": 1.2275, "step": 112 }, { "epoch": 0.12088793795132388, "grad_norm": 0.8213242888450623, "learning_rate": 0.0002014740063179344, "loss": 1.665, "step": 113 }, { "epoch": 0.12195774271195507, "grad_norm": 0.907412052154541, "learning_rate": 0.00020139005650097317, "loss": 1.5866, "step": 114 }, { "epoch": 0.12302754747258625, "grad_norm": 1.1394984722137451, "learning_rate": 0.00020129991415244756, "loss": 1.7026, "step": 115 }, { "epoch": 0.12409735223321744, "grad_norm": 0.8012979626655579, "learning_rate": 0.00020120358483276227, "loss": 1.4903, "step": 116 }, { "epoch": 0.12516715699384862, "grad_norm": 0.7759647369384766, "learning_rate": 0.00020110107448396346, "loss": 1.6247, "step": 117 }, { "epoch": 0.12623696175447982, "grad_norm": 0.8149462342262268, "learning_rate": 0.0002009923894293723, "loss": 1.7867, "step": 118 }, { "epoch": 0.127306766515111, "grad_norm": 0.6406301259994507, "learning_rate": 0.00020087753637319499, "loss": 1.8543, "step": 119 }, { "epoch": 0.12837657127574217, "grad_norm": 0.6905284523963928, "learning_rate": 0.00020075652240010892, "loss": 1.7696, "step": 120 }, { "epoch": 0.12944637603637335, "grad_norm": 0.7446596622467041, "learning_rate": 0.00020062935497482606, "loss": 1.6399, "step": 121 }, { "epoch": 0.13051618079700456, "grad_norm": 0.6539953351020813, "learning_rate": 0.00020049604194163217, "loss": 1.3393, "step": 122 }, { "epoch": 0.13158598555763573, "grad_norm": 0.9102329611778259, "learning_rate": 0.00020035659152390313, "loss": 1.4309, "step": 123 }, { "epoch": 0.1326557903182669, "grad_norm": 0.9899008870124817, "learning_rate": 0.00020021101232359757, "loss": 1.4651, "step": 124 }, { "epoch": 0.1337255950788981, "grad_norm": 0.9645147323608398, "learning_rate": 0.0002000593133207263, "loss": 1.3662, "step": 125 }, { "epoch": 0.1347953998395293, "grad_norm": 0.5051845908164978, "learning_rate": 0.00019990150387279835, "loss": 1.7894, "step": 126 }, { "epoch": 0.13586520460016047, "grad_norm": 0.9146197438240051, "learning_rate": 0.00019973759371424388, "loss": 1.3153, "step": 127 }, { "epoch": 0.13693500936079164, "grad_norm": 0.9138407707214355, "learning_rate": 0.0001995675929558135, "loss": 1.6197, "step": 128 }, { "epoch": 0.13800481412142285, "grad_norm": 0.8355864882469177, "learning_rate": 0.0001993915120839548, "loss": 1.5158, "step": 129 }, { "epoch": 0.13907461888205402, "grad_norm": 0.7430102825164795, "learning_rate": 0.00019920936196016534, "loss": 1.4604, "step": 130 }, { "epoch": 0.1401444236426852, "grad_norm": 0.8076910376548767, "learning_rate": 0.0001990211538203228, "loss": 1.2259, "step": 131 }, { "epoch": 0.1412142284033164, "grad_norm": 0.5874120593070984, "learning_rate": 0.00019882689927399174, "loss": 1.3672, "step": 132 }, { "epoch": 0.14228403316394758, "grad_norm": 0.9105700254440308, "learning_rate": 0.00019862661030370764, "loss": 1.7311, "step": 133 }, { "epoch": 0.14335383792457876, "grad_norm": 0.8008758425712585, "learning_rate": 0.00019842029926423762, "loss": 1.6541, "step": 134 }, { "epoch": 0.14442364268520994, "grad_norm": 0.8042259216308594, "learning_rate": 0.00019820797888181837, "loss": 1.4167, "step": 135 }, { "epoch": 0.14549344744584114, "grad_norm": 0.7910740971565247, "learning_rate": 0.00019798966225337126, "loss": 1.7123, "step": 136 }, { "epoch": 0.14656325220647232, "grad_norm": 1.0059454441070557, "learning_rate": 0.00019776536284569425, "loss": 1.2412, "step": 137 }, { "epoch": 0.1476330569671035, "grad_norm": 0.5858060717582703, "learning_rate": 0.00019753509449463134, "loss": 1.5322, "step": 138 }, { "epoch": 0.1487028617277347, "grad_norm": 0.7342561483383179, "learning_rate": 0.00019729887140421912, "loss": 1.6668, "step": 139 }, { "epoch": 0.14977266648836587, "grad_norm": 0.8164688348770142, "learning_rate": 0.00019705670814581052, "loss": 1.3033, "step": 140 }, { "epoch": 0.15084247124899705, "grad_norm": 1.2722523212432861, "learning_rate": 0.00019680861965717597, "loss": 1.6553, "step": 141 }, { "epoch": 0.15191227600962826, "grad_norm": 0.8512006402015686, "learning_rate": 0.0001965546212415821, "loss": 1.5321, "step": 142 }, { "epoch": 0.15298208077025943, "grad_norm": 0.9400825500488281, "learning_rate": 0.00019629472856684755, "loss": 1.2994, "step": 143 }, { "epoch": 0.1540518855308906, "grad_norm": 0.6759757399559021, "learning_rate": 0.00019602895766437678, "loss": 1.6195, "step": 144 }, { "epoch": 0.15512169029152179, "grad_norm": 0.5927717089653015, "learning_rate": 0.00019575732492817092, "loss": 1.4048, "step": 145 }, { "epoch": 0.156191495052153, "grad_norm": 0.7187090516090393, "learning_rate": 0.00019547984711381662, "loss": 1.6072, "step": 146 }, { "epoch": 0.15726129981278417, "grad_norm": 0.7754374146461487, "learning_rate": 0.0001951965413374525, "loss": 1.407, "step": 147 }, { "epoch": 0.15833110457341534, "grad_norm": 1.0608164072036743, "learning_rate": 0.00019490742507471338, "loss": 1.5289, "step": 148 }, { "epoch": 0.15940090933404655, "grad_norm": 1.0499017238616943, "learning_rate": 0.0001946125161596522, "loss": 1.3187, "step": 149 }, { "epoch": 0.16047071409467772, "grad_norm": 0.9605572819709778, "learning_rate": 0.00019431183278363997, "loss": 1.203, "step": 150 }, { "epoch": 0.1615405188553089, "grad_norm": 0.6870492696762085, "learning_rate": 0.00019400539349424367, "loss": 1.9006, "step": 151 }, { "epoch": 0.16261032361594008, "grad_norm": 0.8582913875579834, "learning_rate": 0.0001936932171940821, "loss": 1.828, "step": 152 }, { "epoch": 0.16368012837657128, "grad_norm": 0.8130080699920654, "learning_rate": 0.00019337532313966, "loss": 1.5142, "step": 153 }, { "epoch": 0.16474993313720246, "grad_norm": 1.1094551086425781, "learning_rate": 0.00019305173094017996, "loss": 1.3347, "step": 154 }, { "epoch": 0.16581973789783364, "grad_norm": 0.7829759120941162, "learning_rate": 0.0001927224605563332, "loss": 1.2676, "step": 155 }, { "epoch": 0.16688954265846484, "grad_norm": 0.8161009550094604, "learning_rate": 0.00019238753229906797, "loss": 1.2813, "step": 156 }, { "epoch": 0.16795934741909602, "grad_norm": 0.794654369354248, "learning_rate": 0.00019204696682833682, "loss": 1.9873, "step": 157 }, { "epoch": 0.1690291521797272, "grad_norm": 0.7361302375793457, "learning_rate": 0.00019170078515182216, "loss": 1.6699, "step": 158 }, { "epoch": 0.1700989569403584, "grad_norm": 0.7557123899459839, "learning_rate": 0.00019134900862364054, "loss": 1.4325, "step": 159 }, { "epoch": 0.17116876170098957, "grad_norm": 0.863318681716919, "learning_rate": 0.00019099165894302515, "loss": 1.5635, "step": 160 }, { "epoch": 0.17223856646162075, "grad_norm": 0.8100736141204834, "learning_rate": 0.00019062875815298763, "loss": 1.3452, "step": 161 }, { "epoch": 0.17330837122225193, "grad_norm": 0.8432340025901794, "learning_rate": 0.00019026032863895805, "loss": 1.3103, "step": 162 }, { "epoch": 0.17437817598288313, "grad_norm": 0.8291115164756775, "learning_rate": 0.00018988639312740433, "loss": 1.3356, "step": 163 }, { "epoch": 0.1754479807435143, "grad_norm": 0.6323521733283997, "learning_rate": 0.0001895069746844302, "loss": 1.2827, "step": 164 }, { "epoch": 0.17651778550414549, "grad_norm": 0.6580619812011719, "learning_rate": 0.00018912209671435252, "loss": 1.483, "step": 165 }, { "epoch": 0.1775875902647767, "grad_norm": 0.927400529384613, "learning_rate": 0.00018873178295825732, "loss": 1.214, "step": 166 }, { "epoch": 0.17865739502540787, "grad_norm": 1.0609151124954224, "learning_rate": 0.00018833605749253566, "loss": 1.5702, "step": 167 }, { "epoch": 0.17972719978603904, "grad_norm": 0.9172216653823853, "learning_rate": 0.00018793494472739831, "loss": 1.4003, "step": 168 }, { "epoch": 0.18079700454667022, "grad_norm": 0.793006956577301, "learning_rate": 0.00018752846940537003, "loss": 1.5463, "step": 169 }, { "epoch": 0.18186680930730142, "grad_norm": 0.608418345451355, "learning_rate": 0.0001871166565997633, "loss": 1.4574, "step": 170 }, { "epoch": 0.1829366140679326, "grad_norm": 1.0822337865829468, "learning_rate": 0.00018669953171313188, "loss": 1.5066, "step": 171 }, { "epoch": 0.18400641882856378, "grad_norm": 0.8970229625701904, "learning_rate": 0.00018627712047570352, "loss": 1.5338, "step": 172 }, { "epoch": 0.18507622358919498, "grad_norm": 0.9221557378768921, "learning_rate": 0.0001858494489437931, "loss": 1.2652, "step": 173 }, { "epoch": 0.18614602834982616, "grad_norm": 0.8210504651069641, "learning_rate": 0.0001854165434981953, "loss": 1.6507, "step": 174 }, { "epoch": 0.18721583311045734, "grad_norm": 0.9927299618721008, "learning_rate": 0.00018497843084255708, "loss": 1.1338, "step": 175 }, { "epoch": 0.1882856378710885, "grad_norm": 0.640484631061554, "learning_rate": 0.00018453513800173072, "loss": 1.2064, "step": 176 }, { "epoch": 0.18935544263171972, "grad_norm": 1.0150686502456665, "learning_rate": 0.00018408669232010684, "loss": 1.4428, "step": 177 }, { "epoch": 0.1904252473923509, "grad_norm": 1.0852081775665283, "learning_rate": 0.00018363312145992737, "loss": 1.612, "step": 178 }, { "epoch": 0.19149505215298207, "grad_norm": 0.884283185005188, "learning_rate": 0.0001831744533995795, "loss": 1.4523, "step": 179 }, { "epoch": 0.19256485691361327, "grad_norm": 0.7068943977355957, "learning_rate": 0.00018271071643186968, "loss": 0.9464, "step": 180 }, { "epoch": 0.19363466167424445, "grad_norm": 0.9215288162231445, "learning_rate": 0.00018224193916227852, "loss": 1.255, "step": 181 }, { "epoch": 0.19470446643487563, "grad_norm": 0.6173170208930969, "learning_rate": 0.00018176815050719615, "loss": 1.556, "step": 182 }, { "epoch": 0.19577427119550683, "grad_norm": 0.7621954083442688, "learning_rate": 0.00018128937969213852, "loss": 1.7331, "step": 183 }, { "epoch": 0.196844075956138, "grad_norm": 0.8637974858283997, "learning_rate": 0.00018080565624994474, "loss": 1.2933, "step": 184 }, { "epoch": 0.19791388071676919, "grad_norm": 0.9591987729072571, "learning_rate": 0.00018031701001895524, "loss": 1.4638, "step": 185 }, { "epoch": 0.19898368547740036, "grad_norm": 0.7968719601631165, "learning_rate": 0.0001798234711411713, "loss": 1.2952, "step": 186 }, { "epoch": 0.20005349023803157, "grad_norm": 1.2682313919067383, "learning_rate": 0.00017932507006039567, "loss": 1.3475, "step": 187 }, { "epoch": 0.20112329499866274, "grad_norm": 0.5984092354774475, "learning_rate": 0.0001788218375203547, "loss": 1.4973, "step": 188 }, { "epoch": 0.20219309975929392, "grad_norm": 0.7179667949676514, "learning_rate": 0.00017831380456280192, "loss": 1.4728, "step": 189 }, { "epoch": 0.20326290451992512, "grad_norm": 1.1618380546569824, "learning_rate": 0.00017780100252560313, "loss": 1.6948, "step": 190 }, { "epoch": 0.2043327092805563, "grad_norm": 0.7051059603691101, "learning_rate": 0.00017728346304080357, "loss": 1.1708, "step": 191 }, { "epoch": 0.20540251404118748, "grad_norm": 0.8025861382484436, "learning_rate": 0.0001767612180326764, "loss": 1.2174, "step": 192 }, { "epoch": 0.20647231880181866, "grad_norm": 0.88255774974823, "learning_rate": 0.00017623429971575384, "loss": 1.4061, "step": 193 }, { "epoch": 0.20754212356244986, "grad_norm": 0.7437373995780945, "learning_rate": 0.0001757027405928396, "loss": 1.1144, "step": 194 }, { "epoch": 0.20861192832308104, "grad_norm": 1.0885889530181885, "learning_rate": 0.00017516657345300425, "loss": 1.6319, "step": 195 }, { "epoch": 0.2096817330837122, "grad_norm": 0.7450828552246094, "learning_rate": 0.00017462583136956258, "loss": 1.2593, "step": 196 }, { "epoch": 0.21075153784434342, "grad_norm": 0.7687603831291199, "learning_rate": 0.00017408054769803337, "loss": 1.4812, "step": 197 }, { "epoch": 0.2118213426049746, "grad_norm": 0.9665757417678833, "learning_rate": 0.00017353075607408209, "loss": 1.2375, "step": 198 }, { "epoch": 0.21289114736560577, "grad_norm": 0.9301120042800903, "learning_rate": 0.00017297649041144575, "loss": 1.4734, "step": 199 }, { "epoch": 0.21396095212623698, "grad_norm": 0.9758381843566895, "learning_rate": 0.0001724177848998413, "loss": 1.1196, "step": 200 }, { "epoch": 0.21503075688686815, "grad_norm": 0.6634808778762817, "learning_rate": 0.00017185467400285644, "loss": 1.3473, "step": 201 }, { "epoch": 0.21610056164749933, "grad_norm": 1.0161389112472534, "learning_rate": 0.00017128719245582374, "loss": 1.5335, "step": 202 }, { "epoch": 0.2171703664081305, "grad_norm": 1.047826886177063, "learning_rate": 0.00017071537526367817, "loss": 1.4653, "step": 203 }, { "epoch": 0.2182401711687617, "grad_norm": 0.8670541644096375, "learning_rate": 0.00017013925769879755, "loss": 1.1689, "step": 204 }, { "epoch": 0.2193099759293929, "grad_norm": 1.1081433296203613, "learning_rate": 0.00016955887529882714, "loss": 1.3757, "step": 205 }, { "epoch": 0.22037978069002406, "grad_norm": 1.3014432191848755, "learning_rate": 0.0001689742638644871, "loss": 1.1107, "step": 206 }, { "epoch": 0.22144958545065527, "grad_norm": 0.779435396194458, "learning_rate": 0.00016838545945736458, "loss": 1.4839, "step": 207 }, { "epoch": 0.22251939021128644, "grad_norm": 1.2251975536346436, "learning_rate": 0.00016779249839768884, "loss": 1.6717, "step": 208 }, { "epoch": 0.22358919497191762, "grad_norm": 1.197913646697998, "learning_rate": 0.00016719541726209117, "loss": 1.4974, "step": 209 }, { "epoch": 0.2246589997325488, "grad_norm": 1.0760390758514404, "learning_rate": 0.00016659425288134854, "loss": 1.3019, "step": 210 }, { "epoch": 0.22572880449318, "grad_norm": 1.208444356918335, "learning_rate": 0.00016598904233811168, "loss": 1.1918, "step": 211 }, { "epoch": 0.22679860925381118, "grad_norm": 1.3097633123397827, "learning_rate": 0.00016537982296461768, "loss": 1.1058, "step": 212 }, { "epoch": 0.22786841401444236, "grad_norm": 0.8760459423065186, "learning_rate": 0.00016476663234038717, "loss": 1.6432, "step": 213 }, { "epoch": 0.22893821877507356, "grad_norm": 0.7652115225791931, "learning_rate": 0.00016414950828990625, "loss": 1.7073, "step": 214 }, { "epoch": 0.23000802353570474, "grad_norm": 0.8084537386894226, "learning_rate": 0.00016352848888029326, "loss": 1.3418, "step": 215 }, { "epoch": 0.2310778282963359, "grad_norm": 1.2894420623779297, "learning_rate": 0.00016290361241895064, "loss": 1.4992, "step": 216 }, { "epoch": 0.2321476330569671, "grad_norm": 0.9421677589416504, "learning_rate": 0.00016227491745120196, "loss": 1.3181, "step": 217 }, { "epoch": 0.2332174378175983, "grad_norm": 1.0605345964431763, "learning_rate": 0.0001616424427579143, "loss": 1.4584, "step": 218 }, { "epoch": 0.23428724257822947, "grad_norm": 0.8410621285438538, "learning_rate": 0.0001610062273531059, "loss": 1.5476, "step": 219 }, { "epoch": 0.23535704733886065, "grad_norm": 0.6994707584381104, "learning_rate": 0.00016036631048153979, "loss": 1.176, "step": 220 }, { "epoch": 0.23642685209949185, "grad_norm": 1.1372922658920288, "learning_rate": 0.0001597227316163029, "loss": 1.3494, "step": 221 }, { "epoch": 0.23749665686012303, "grad_norm": 1.1717870235443115, "learning_rate": 0.00015907553045637116, "loss": 1.313, "step": 222 }, { "epoch": 0.2385664616207542, "grad_norm": 1.0904631614685059, "learning_rate": 0.00015842474692416068, "loss": 1.3035, "step": 223 }, { "epoch": 0.2396362663813854, "grad_norm": 1.2501357793807983, "learning_rate": 0.0001577704211630652, "loss": 1.2295, "step": 224 }, { "epoch": 0.2407060711420166, "grad_norm": 1.003653645515442, "learning_rate": 0.00015711259353497981, "loss": 1.0317, "step": 225 }, { "epoch": 0.24177587590264776, "grad_norm": 0.6239796876907349, "learning_rate": 0.0001564513046178113, "loss": 1.7061, "step": 226 }, { "epoch": 0.24284568066327894, "grad_norm": 0.8568412065505981, "learning_rate": 0.000155786595202975, "loss": 1.37, "step": 227 }, { "epoch": 0.24391548542391014, "grad_norm": 0.9232256412506104, "learning_rate": 0.00015511850629287865, "loss": 1.1996, "step": 228 }, { "epoch": 0.24498529018454132, "grad_norm": 1.1868820190429688, "learning_rate": 0.00015444707909839325, "loss": 1.5739, "step": 229 }, { "epoch": 0.2460550949451725, "grad_norm": 1.0000498294830322, "learning_rate": 0.00015377235503631083, "loss": 1.1122, "step": 230 }, { "epoch": 0.2471248997058037, "grad_norm": 1.3450075387954712, "learning_rate": 0.0001530943757267898, "loss": 1.1177, "step": 231 }, { "epoch": 0.24819470446643488, "grad_norm": 0.6483561992645264, "learning_rate": 0.00015241318299078751, "loss": 1.5464, "step": 232 }, { "epoch": 0.24926450922706606, "grad_norm": 0.8778035044670105, "learning_rate": 0.00015172881884748063, "loss": 1.3679, "step": 233 }, { "epoch": 0.25033431398769723, "grad_norm": 1.1430999040603638, "learning_rate": 0.00015104132551167318, "loss": 1.7691, "step": 234 }, { "epoch": 0.2514041187483284, "grad_norm": 1.1102079153060913, "learning_rate": 0.00015035074539119248, "loss": 1.4866, "step": 235 }, { "epoch": 0.25247392350895964, "grad_norm": 1.0472543239593506, "learning_rate": 0.00014965712108427323, "loss": 1.1737, "step": 236 }, { "epoch": 0.2535437282695908, "grad_norm": 1.1687076091766357, "learning_rate": 0.00014896049537693005, "loss": 1.1114, "step": 237 }, { "epoch": 0.254613533030222, "grad_norm": 0.5792782306671143, "learning_rate": 0.00014826091124031792, "loss": 1.1166, "step": 238 }, { "epoch": 0.25568333779085317, "grad_norm": 0.8307198286056519, "learning_rate": 0.0001475584118280817, "loss": 1.4741, "step": 239 }, { "epoch": 0.25675314255148435, "grad_norm": 0.6716192960739136, "learning_rate": 0.00014685304047369423, "loss": 1.2407, "step": 240 }, { "epoch": 0.2578229473121155, "grad_norm": 1.034075140953064, "learning_rate": 0.00014614484068778324, "loss": 1.4235, "step": 241 }, { "epoch": 0.2588927520727467, "grad_norm": 0.9473350644111633, "learning_rate": 0.00014543385615544744, "loss": 1.4101, "step": 242 }, { "epoch": 0.25996255683337793, "grad_norm": 1.1106480360031128, "learning_rate": 0.00014472013073356184, "loss": 0.9895, "step": 243 }, { "epoch": 0.2610323615940091, "grad_norm": 0.8599668145179749, "learning_rate": 0.00014400370844807234, "loss": 1.2244, "step": 244 }, { "epoch": 0.2621021663546403, "grad_norm": 0.7908911108970642, "learning_rate": 0.00014328463349128025, "loss": 1.5923, "step": 245 }, { "epoch": 0.26317197111527146, "grad_norm": 0.8245794773101807, "learning_rate": 0.000142562950219116, "loss": 1.4023, "step": 246 }, { "epoch": 0.26424177587590264, "grad_norm": 0.9234296679496765, "learning_rate": 0.00014183870314840325, "loss": 1.3907, "step": 247 }, { "epoch": 0.2653115806365338, "grad_norm": 1.1608610153198242, "learning_rate": 0.00014111193695411285, "loss": 1.4156, "step": 248 }, { "epoch": 0.266381385397165, "grad_norm": 1.186960220336914, "learning_rate": 0.00014038269646660703, "loss": 0.9267, "step": 249 }, { "epoch": 0.2674511901577962, "grad_norm": 1.0800729990005493, "learning_rate": 0.00013965102666887408, "loss": 1.1525, "step": 250 }, { "epoch": 0.2685209949184274, "grad_norm": 0.5610882639884949, "learning_rate": 0.0001389169726937536, "loss": 1.4339, "step": 251 }, { "epoch": 0.2695907996790586, "grad_norm": 0.9096266627311707, "learning_rate": 0.0001381805798211525, "loss": 1.4273, "step": 252 }, { "epoch": 0.27066060443968976, "grad_norm": 0.930641770362854, "learning_rate": 0.00013744189347525182, "loss": 1.0906, "step": 253 }, { "epoch": 0.27173040920032093, "grad_norm": 1.0747754573822021, "learning_rate": 0.00013670095922170498, "loss": 1.3499, "step": 254 }, { "epoch": 0.2728002139609521, "grad_norm": 1.0504320859909058, "learning_rate": 0.00013595782276482678, "loss": 0.9918, "step": 255 }, { "epoch": 0.2738700187215833, "grad_norm": 1.225920557975769, "learning_rate": 0.00013521252994477446, "loss": 1.2121, "step": 256 }, { "epoch": 0.2749398234822145, "grad_norm": 0.6419029235839844, "learning_rate": 0.00013446512673471965, "loss": 1.4319, "step": 257 }, { "epoch": 0.2760096282428457, "grad_norm": 0.9120138883590698, "learning_rate": 0.0001337156592380131, "loss": 1.7584, "step": 258 }, { "epoch": 0.27707943300347687, "grad_norm": 0.8686931133270264, "learning_rate": 0.0001329641736853402, "loss": 1.1114, "step": 259 }, { "epoch": 0.27814923776410805, "grad_norm": 1.1766819953918457, "learning_rate": 0.0001322107164318697, "loss": 1.322, "step": 260 }, { "epoch": 0.2792190425247392, "grad_norm": 1.1522575616836548, "learning_rate": 0.00013145533395439405, "loss": 1.6013, "step": 261 }, { "epoch": 0.2802888472853704, "grad_norm": 1.0668003559112549, "learning_rate": 0.0001306980728484627, "loss": 0.9711, "step": 262 }, { "epoch": 0.2813586520460016, "grad_norm": 0.7991006374359131, "learning_rate": 0.00012993897982550764, "loss": 1.5706, "step": 263 }, { "epoch": 0.2824284568066328, "grad_norm": 0.7399187684059143, "learning_rate": 0.00012917810170996218, "loss": 1.4344, "step": 264 }, { "epoch": 0.283498261567264, "grad_norm": 0.8639386296272278, "learning_rate": 0.0001284154854363725, "loss": 1.2841, "step": 265 }, { "epoch": 0.28456806632789516, "grad_norm": 1.039579153060913, "learning_rate": 0.00012765117804650267, "loss": 1.2761, "step": 266 }, { "epoch": 0.28563787108852634, "grad_norm": 1.2643564939498901, "learning_rate": 0.00012688522668643268, "loss": 0.9961, "step": 267 }, { "epoch": 0.2867076758491575, "grad_norm": 1.101484775543213, "learning_rate": 0.00012611767860365038, "loss": 1.0579, "step": 268 }, { "epoch": 0.2877774806097887, "grad_norm": 0.8096588253974915, "learning_rate": 0.00012534858114413692, "loss": 1.4865, "step": 269 }, { "epoch": 0.28884728537041987, "grad_norm": 0.9671081900596619, "learning_rate": 0.00012457798174944645, "loss": 1.712, "step": 270 }, { "epoch": 0.2899170901310511, "grad_norm": 0.8346510529518127, "learning_rate": 0.0001238059279537795, "loss": 1.4498, "step": 271 }, { "epoch": 0.2909868948916823, "grad_norm": 0.8946216702461243, "learning_rate": 0.00012303246738105082, "loss": 1.1354, "step": 272 }, { "epoch": 0.29205669965231346, "grad_norm": 1.1908957958221436, "learning_rate": 0.00012225764774195186, "loss": 1.2882, "step": 273 }, { "epoch": 0.29312650441294463, "grad_norm": 1.5412572622299194, "learning_rate": 0.00012148151683100776, "loss": 1.3073, "step": 274 }, { "epoch": 0.2941963091735758, "grad_norm": 1.269574522972107, "learning_rate": 0.00012070412252362897, "loss": 0.7722, "step": 275 }, { "epoch": 0.295266113934207, "grad_norm": 0.6998116970062256, "learning_rate": 0.0001199255127731582, "loss": 1.6273, "step": 276 }, { "epoch": 0.2963359186948382, "grad_norm": 1.2294608354568481, "learning_rate": 0.00011914573560791246, "loss": 1.3577, "step": 277 }, { "epoch": 0.2974057234554694, "grad_norm": 1.464188814163208, "learning_rate": 0.00011836483912822035, "loss": 1.5042, "step": 278 }, { "epoch": 0.29847552821610057, "grad_norm": 0.8934053778648376, "learning_rate": 0.00011758287150345516, "loss": 1.2751, "step": 279 }, { "epoch": 0.29954533297673175, "grad_norm": 1.2757556438446045, "learning_rate": 0.00011679988096906333, "loss": 1.0978, "step": 280 }, { "epoch": 0.3006151377373629, "grad_norm": 1.4734572172164917, "learning_rate": 0.00011601591582358924, "loss": 0.9759, "step": 281 }, { "epoch": 0.3016849424979941, "grad_norm": 0.5812798738479614, "learning_rate": 0.00011523102442569585, "loss": 1.8345, "step": 282 }, { "epoch": 0.3027547472586253, "grad_norm": 0.6622848510742188, "learning_rate": 0.00011444525519118179, "loss": 1.3788, "step": 283 }, { "epoch": 0.3038245520192565, "grad_norm": 0.8406071066856384, "learning_rate": 0.00011365865658999474, "loss": 1.2666, "step": 284 }, { "epoch": 0.3048943567798877, "grad_norm": 0.8677452802658081, "learning_rate": 0.00011287127714324162, "loss": 1.2111, "step": 285 }, { "epoch": 0.30596416154051886, "grad_norm": 1.3912626504898071, "learning_rate": 0.00011208316542019556, "loss": 1.0299, "step": 286 }, { "epoch": 0.30703396630115004, "grad_norm": 1.1326332092285156, "learning_rate": 0.00011129437003530006, "loss": 0.8733, "step": 287 }, { "epoch": 0.3081037710617812, "grad_norm": 0.8693293333053589, "learning_rate": 0.00011050493964516997, "loss": 1.2772, "step": 288 }, { "epoch": 0.3091735758224124, "grad_norm": 0.9244024753570557, "learning_rate": 0.00010971492294559029, "loss": 1.6244, "step": 289 }, { "epoch": 0.31024338058304357, "grad_norm": 0.99155592918396, "learning_rate": 0.00010892436866851235, "loss": 1.3652, "step": 290 }, { "epoch": 0.3113131853436748, "grad_norm": 1.061529517173767, "learning_rate": 0.00010813332557904784, "loss": 1.2438, "step": 291 }, { "epoch": 0.312382990104306, "grad_norm": 0.8850228190422058, "learning_rate": 0.00010734184247246066, "loss": 1.1902, "step": 292 }, { "epoch": 0.31345279486493716, "grad_norm": 1.0577521324157715, "learning_rate": 0.00010654996817115704, "loss": 0.8845, "step": 293 }, { "epoch": 0.31452259962556833, "grad_norm": 0.9758553504943848, "learning_rate": 0.00010575775152167391, "loss": 1.4599, "step": 294 }, { "epoch": 0.3155924043861995, "grad_norm": 0.6535763740539551, "learning_rate": 0.00010496524139166594, "loss": 1.7197, "step": 295 }, { "epoch": 0.3166622091468307, "grad_norm": 0.8583334684371948, "learning_rate": 0.00010417248666689095, "loss": 1.0697, "step": 296 }, { "epoch": 0.31773201390746186, "grad_norm": 0.9801604747772217, "learning_rate": 0.00010337953624819464, "loss": 1.3483, "step": 297 }, { "epoch": 0.3188018186680931, "grad_norm": 0.8529968857765198, "learning_rate": 0.0001025864390484939, "loss": 0.9852, "step": 298 }, { "epoch": 0.31987162342872427, "grad_norm": 1.1832444667816162, "learning_rate": 0.00010179324398975984, "loss": 0.9118, "step": 299 }, { "epoch": 0.32094142818935545, "grad_norm": 1.5987980365753174, "learning_rate": 0.000101, "loss": 0.8357, "step": 300 }, { "epoch": 0.3220112329499866, "grad_norm": 0.8217248916625977, "learning_rate": 0.00010020675601024019, "loss": 1.4393, "step": 301 }, { "epoch": 0.3230810377106178, "grad_norm": 0.8473594188690186, "learning_rate": 9.941356095150613e-05, "loss": 1.4425, "step": 302 }, { "epoch": 0.324150842471249, "grad_norm": 1.3870422840118408, "learning_rate": 9.862046375180539e-05, "loss": 1.6543, "step": 303 }, { "epoch": 0.32522064723188016, "grad_norm": 1.0462790727615356, "learning_rate": 9.782751333310905e-05, "loss": 1.1367, "step": 304 }, { "epoch": 0.3262904519925114, "grad_norm": 0.9303992986679077, "learning_rate": 9.70347586083341e-05, "loss": 1.2189, "step": 305 }, { "epoch": 0.32736025675314256, "grad_norm": 1.2022167444229126, "learning_rate": 9.62422484783261e-05, "loss": 0.9073, "step": 306 }, { "epoch": 0.32843006151377374, "grad_norm": 0.7831782102584839, "learning_rate": 9.5450031828843e-05, "loss": 1.3442, "step": 307 }, { "epoch": 0.3294998662744049, "grad_norm": 1.0228748321533203, "learning_rate": 9.465815752753935e-05, "loss": 1.3841, "step": 308 }, { "epoch": 0.3305696710350361, "grad_norm": 1.2510477304458618, "learning_rate": 9.386667442095219e-05, "loss": 1.387, "step": 309 }, { "epoch": 0.33163947579566727, "grad_norm": 1.1324783563613892, "learning_rate": 9.307563133148767e-05, "loss": 1.187, "step": 310 }, { "epoch": 0.33270928055629845, "grad_norm": 1.0329921245574951, "learning_rate": 9.228507705440976e-05, "loss": 1.0911, "step": 311 }, { "epoch": 0.3337790853169297, "grad_norm": 1.0315637588500977, "learning_rate": 9.149506035483005e-05, "loss": 0.8645, "step": 312 }, { "epoch": 0.33484889007756086, "grad_norm": 0.8189682364463806, "learning_rate": 9.070562996469997e-05, "loss": 1.6589, "step": 313 }, { "epoch": 0.33591869483819203, "grad_norm": 0.7126203775405884, "learning_rate": 8.991683457980443e-05, "loss": 1.2723, "step": 314 }, { "epoch": 0.3369884995988232, "grad_norm": 1.3155546188354492, "learning_rate": 8.912872285675841e-05, "loss": 1.5234, "step": 315 }, { "epoch": 0.3380583043594544, "grad_norm": 1.2776226997375488, "learning_rate": 8.834134341000527e-05, "loss": 1.3699, "step": 316 }, { "epoch": 0.33912810912008556, "grad_norm": 0.9939321279525757, "learning_rate": 8.755474480881823e-05, "loss": 1.0941, "step": 317 }, { "epoch": 0.3401979138807168, "grad_norm": 1.078808307647705, "learning_rate": 8.676897557430415e-05, "loss": 0.9849, "step": 318 }, { "epoch": 0.341267718641348, "grad_norm": 0.692589282989502, "learning_rate": 8.598408417641078e-05, "loss": 0.9682, "step": 319 }, { "epoch": 0.34233752340197915, "grad_norm": 0.69404137134552, "learning_rate": 8.520011903093666e-05, "loss": 1.4386, "step": 320 }, { "epoch": 0.3434073281626103, "grad_norm": 1.0537830591201782, "learning_rate": 8.441712849654485e-05, "loss": 1.3422, "step": 321 }, { "epoch": 0.3444771329232415, "grad_norm": 1.2558127641677856, "learning_rate": 8.363516087177962e-05, "loss": 1.1179, "step": 322 }, { "epoch": 0.3455469376838727, "grad_norm": 0.9815801382064819, "learning_rate": 8.285426439208755e-05, "loss": 1.159, "step": 323 }, { "epoch": 0.34661674244450386, "grad_norm": 1.1159148216247559, "learning_rate": 8.20744872268418e-05, "loss": 1.0194, "step": 324 }, { "epoch": 0.3476865472051351, "grad_norm": 0.904017448425293, "learning_rate": 8.129587747637105e-05, "loss": 0.8047, "step": 325 }, { "epoch": 0.34875635196576626, "grad_norm": 0.6387720108032227, "learning_rate": 8.051848316899227e-05, "loss": 1.5516, "step": 326 }, { "epoch": 0.34982615672639744, "grad_norm": 0.9615854024887085, "learning_rate": 7.974235225804814e-05, "loss": 1.4077, "step": 327 }, { "epoch": 0.3508959614870286, "grad_norm": 0.9426089525222778, "learning_rate": 7.896753261894923e-05, "loss": 1.4134, "step": 328 }, { "epoch": 0.3519657662476598, "grad_norm": 1.0092761516571045, "learning_rate": 7.819407204622054e-05, "loss": 1.1219, "step": 329 }, { "epoch": 0.35303557100829097, "grad_norm": 0.9588394165039062, "learning_rate": 7.74220182505536e-05, "loss": 0.9877, "step": 330 }, { "epoch": 0.35410537576892215, "grad_norm": 1.3635826110839844, "learning_rate": 7.665141885586312e-05, "loss": 0.8919, "step": 331 }, { "epoch": 0.3551751805295534, "grad_norm": 0.543036699295044, "learning_rate": 7.588232139634968e-05, "loss": 1.2026, "step": 332 }, { "epoch": 0.35624498529018456, "grad_norm": 1.009917974472046, "learning_rate": 7.511477331356733e-05, "loss": 1.4086, "step": 333 }, { "epoch": 0.35731479005081573, "grad_norm": 0.9390615224838257, "learning_rate": 7.434882195349736e-05, "loss": 1.2364, "step": 334 }, { "epoch": 0.3583845948114469, "grad_norm": 1.1638097763061523, "learning_rate": 7.358451456362751e-05, "loss": 1.4135, "step": 335 }, { "epoch": 0.3594543995720781, "grad_norm": 1.2236016988754272, "learning_rate": 7.282189829003785e-05, "loss": 1.1124, "step": 336 }, { "epoch": 0.36052420433270926, "grad_norm": 1.398525357246399, "learning_rate": 7.206102017449237e-05, "loss": 0.8598, "step": 337 }, { "epoch": 0.36159400909334044, "grad_norm": 1.0281710624694824, "learning_rate": 7.130192715153731e-05, "loss": 1.48, "step": 338 }, { "epoch": 0.3626638138539717, "grad_norm": 0.7850888967514038, "learning_rate": 7.054466604560595e-05, "loss": 1.1451, "step": 339 }, { "epoch": 0.36373361861460285, "grad_norm": 1.0542351007461548, "learning_rate": 6.978928356813031e-05, "loss": 1.2867, "step": 340 }, { "epoch": 0.364803423375234, "grad_norm": 1.0121204853057861, "learning_rate": 6.90358263146598e-05, "loss": 0.8742, "step": 341 }, { "epoch": 0.3658732281358652, "grad_norm": 1.2728453874588013, "learning_rate": 6.828434076198693e-05, "loss": 0.9807, "step": 342 }, { "epoch": 0.3669430328964964, "grad_norm": 1.3085882663726807, "learning_rate": 6.753487326528033e-05, "loss": 0.9157, "step": 343 }, { "epoch": 0.36801283765712756, "grad_norm": 0.9796567559242249, "learning_rate": 6.678747005522557e-05, "loss": 0.9743, "step": 344 }, { "epoch": 0.36908264241775873, "grad_norm": 0.6303224563598633, "learning_rate": 6.60421772351732e-05, "loss": 1.0932, "step": 345 }, { "epoch": 0.37015244717838997, "grad_norm": 1.1777071952819824, "learning_rate": 6.529904077829505e-05, "loss": 1.2724, "step": 346 }, { "epoch": 0.37122225193902114, "grad_norm": 0.9197458624839783, "learning_rate": 6.455810652474817e-05, "loss": 1.4357, "step": 347 }, { "epoch": 0.3722920566996523, "grad_norm": 1.5600042343139648, "learning_rate": 6.381942017884753e-05, "loss": 1.1761, "step": 348 }, { "epoch": 0.3733618614602835, "grad_norm": 1.1098041534423828, "learning_rate": 6.30830273062464e-05, "loss": 1.0017, "step": 349 }, { "epoch": 0.37443166622091467, "grad_norm": 1.2631731033325195, "learning_rate": 6.234897333112594e-05, "loss": 0.8865, "step": 350 }, { "epoch": 0.37550147098154585, "grad_norm": 0.8299064040184021, "learning_rate": 6.161730353339302e-05, "loss": 1.2372, "step": 351 }, { "epoch": 0.376571275742177, "grad_norm": 1.0870029926300049, "learning_rate": 6.088806304588717e-05, "loss": 1.6713, "step": 352 }, { "epoch": 0.37764108050280826, "grad_norm": 1.1188887357711792, "learning_rate": 6.0161296851596766e-05, "loss": 1.3496, "step": 353 }, { "epoch": 0.37871088526343943, "grad_norm": 1.0576889514923096, "learning_rate": 5.943704978088402e-05, "loss": 1.211, "step": 354 }, { "epoch": 0.3797806900240706, "grad_norm": 1.3803176879882812, "learning_rate": 5.871536650871979e-05, "loss": 1.0773, "step": 355 }, { "epoch": 0.3808504947847018, "grad_norm": 1.1492490768432617, "learning_rate": 5.7996291551927666e-05, "loss": 0.9198, "step": 356 }, { "epoch": 0.38192029954533296, "grad_norm": 0.6801052093505859, "learning_rate": 5.7279869266438234e-05, "loss": 1.4337, "step": 357 }, { "epoch": 0.38299010430596414, "grad_norm": 1.2174427509307861, "learning_rate": 5.656614384455257e-05, "loss": 1.2353, "step": 358 }, { "epoch": 0.3840599090665954, "grad_norm": 1.1382440328598022, "learning_rate": 5.585515931221677e-05, "loss": 1.3431, "step": 359 }, { "epoch": 0.38512971382722655, "grad_norm": 0.9358460307121277, "learning_rate": 5.514695952630578e-05, "loss": 0.9496, "step": 360 }, { "epoch": 0.3861995185878577, "grad_norm": 1.1950464248657227, "learning_rate": 5.444158817191832e-05, "loss": 0.9529, "step": 361 }, { "epoch": 0.3872693233484889, "grad_norm": 1.0980271100997925, "learning_rate": 5.373908875968211e-05, "loss": 1.1347, "step": 362 }, { "epoch": 0.3883391281091201, "grad_norm": 0.7558470368385315, "learning_rate": 5.3039504623069965e-05, "loss": 0.9622, "step": 363 }, { "epoch": 0.38940893286975126, "grad_norm": 1.0493141412734985, "learning_rate": 5.234287891572674e-05, "loss": 1.1268, "step": 364 }, { "epoch": 0.39047873763038243, "grad_norm": 1.2050443887710571, "learning_rate": 5.164925460880758e-05, "loss": 1.2384, "step": 365 }, { "epoch": 0.39154854239101367, "grad_norm": 1.2287790775299072, "learning_rate": 5.095867448832683e-05, "loss": 1.2972, "step": 366 }, { "epoch": 0.39261834715164484, "grad_norm": 1.3510652780532837, "learning_rate": 5.027118115251938e-05, "loss": 1.3639, "step": 367 }, { "epoch": 0.393688151912276, "grad_norm": 1.2277965545654297, "learning_rate": 4.95868170092125e-05, "loss": 1.1627, "step": 368 }, { "epoch": 0.3947579566729072, "grad_norm": 1.1306562423706055, "learning_rate": 4.890562427321021e-05, "loss": 1.1845, "step": 369 }, { "epoch": 0.39582776143353837, "grad_norm": 0.8683123588562012, "learning_rate": 4.822764496368917e-05, "loss": 1.1248, "step": 370 }, { "epoch": 0.39689756619416955, "grad_norm": 1.0992448329925537, "learning_rate": 4.755292090160676e-05, "loss": 1.4496, "step": 371 }, { "epoch": 0.3979673709548007, "grad_norm": 1.0645350217819214, "learning_rate": 4.6881493707121315e-05, "loss": 0.943, "step": 372 }, { "epoch": 0.39903717571543196, "grad_norm": 1.105506420135498, "learning_rate": 4.621340479702503e-05, "loss": 1.0031, "step": 373 }, { "epoch": 0.40010698047606313, "grad_norm": 0.8552220463752747, "learning_rate": 4.554869538218868e-05, "loss": 1.0468, "step": 374 }, { "epoch": 0.4011767852366943, "grad_norm": 0.8983728289604187, "learning_rate": 4.48874064650202e-05, "loss": 0.8188, "step": 375 }, { "epoch": 0.4022465899973255, "grad_norm": 0.6168304085731506, "learning_rate": 4.422957883693483e-05, "loss": 1.394, "step": 376 }, { "epoch": 0.40331639475795666, "grad_norm": 1.0310661792755127, "learning_rate": 4.357525307583933e-05, "loss": 1.2572, "step": 377 }, { "epoch": 0.40438619951858784, "grad_norm": 0.9725519418716431, "learning_rate": 4.29244695436289e-05, "loss": 1.2638, "step": 378 }, { "epoch": 0.405456004279219, "grad_norm": 0.768764317035675, "learning_rate": 4.227726838369711e-05, "loss": 1.1837, "step": 379 }, { "epoch": 0.40652580903985025, "grad_norm": 1.267078161239624, "learning_rate": 4.1633689518460225e-05, "loss": 1.0828, "step": 380 }, { "epoch": 0.4075956138004814, "grad_norm": 1.0793355703353882, "learning_rate": 4.0993772646894116e-05, "loss": 0.9099, "step": 381 }, { "epoch": 0.4086654185611126, "grad_norm": 0.7304561138153076, "learning_rate": 4.035755724208573e-05, "loss": 1.3844, "step": 382 }, { "epoch": 0.4097352233217438, "grad_norm": 0.9201098680496216, "learning_rate": 3.972508254879805e-05, "loss": 1.3788, "step": 383 }, { "epoch": 0.41080502808237496, "grad_norm": 1.1659053564071655, "learning_rate": 3.90963875810494e-05, "loss": 1.3056, "step": 384 }, { "epoch": 0.41187483284300613, "grad_norm": 0.9076784253120422, "learning_rate": 3.847151111970676e-05, "loss": 1.2791, "step": 385 }, { "epoch": 0.4129446376036373, "grad_norm": 1.2633960247039795, "learning_rate": 3.785049171009381e-05, "loss": 1.2781, "step": 386 }, { "epoch": 0.41401444236426854, "grad_norm": 0.8448993563652039, "learning_rate": 3.723336765961285e-05, "loss": 0.7594, "step": 387 }, { "epoch": 0.4150842471248997, "grad_norm": 0.9559032917022705, "learning_rate": 3.662017703538234e-05, "loss": 1.238, "step": 388 }, { "epoch": 0.4161540518855309, "grad_norm": 0.7692968845367432, "learning_rate": 3.601095766188833e-05, "loss": 1.0494, "step": 389 }, { "epoch": 0.4172238566461621, "grad_norm": 0.9550884962081909, "learning_rate": 3.540574711865146e-05, "loss": 1.2024, "step": 390 }, { "epoch": 0.41829366140679325, "grad_norm": 0.9791415333747864, "learning_rate": 3.4804582737908825e-05, "loss": 1.1066, "step": 391 }, { "epoch": 0.4193634661674244, "grad_norm": 0.876552402973175, "learning_rate": 3.420750160231118e-05, "loss": 0.9091, "step": 392 }, { "epoch": 0.4204332709280556, "grad_norm": 1.0393965244293213, "learning_rate": 3.361454054263541e-05, "loss": 0.7999, "step": 393 }, { "epoch": 0.42150307568868683, "grad_norm": 0.6238046288490295, "learning_rate": 3.302573613551292e-05, "loss": 0.9947, "step": 394 }, { "epoch": 0.422572880449318, "grad_norm": 1.2074321508407593, "learning_rate": 3.244112470117288e-05, "loss": 1.4528, "step": 395 }, { "epoch": 0.4236426852099492, "grad_norm": 0.8338631987571716, "learning_rate": 3.186074230120244e-05, "loss": 1.2835, "step": 396 }, { "epoch": 0.42471248997058036, "grad_norm": 0.9252687096595764, "learning_rate": 3.1284624736321846e-05, "loss": 1.1478, "step": 397 }, { "epoch": 0.42578229473121154, "grad_norm": 0.896878182888031, "learning_rate": 3.071280754417626e-05, "loss": 1.0608, "step": 398 }, { "epoch": 0.4268520994918427, "grad_norm": 1.1846908330917358, "learning_rate": 3.0145325997143577e-05, "loss": 0.9898, "step": 399 }, { "epoch": 0.42792190425247395, "grad_norm": 0.8462516665458679, "learning_rate": 2.9582215100158706e-05, "loss": 0.6714, "step": 400 }, { "epoch": 0.4289917090131051, "grad_norm": 0.5424546003341675, "learning_rate": 2.902350958855426e-05, "loss": 1.304, "step": 401 }, { "epoch": 0.4300615137737363, "grad_norm": 1.0380676984786987, "learning_rate": 2.846924392591794e-05, "loss": 1.4114, "step": 402 }, { "epoch": 0.4311313185343675, "grad_norm": 1.0235589742660522, "learning_rate": 2.791945230196663e-05, "loss": 1.1948, "step": 403 }, { "epoch": 0.43220112329499866, "grad_norm": 0.9752116799354553, "learning_rate": 2.7374168630437456e-05, "loss": 0.8311, "step": 404 }, { "epoch": 0.43327092805562983, "grad_norm": 1.1082531213760376, "learning_rate": 2.6833426546995782e-05, "loss": 0.8029, "step": 405 }, { "epoch": 0.434340732816261, "grad_norm": 1.0731070041656494, "learning_rate": 2.629725940716041e-05, "loss": 0.6362, "step": 406 }, { "epoch": 0.43541053757689224, "grad_norm": 0.7748804688453674, "learning_rate": 2.57657002842462e-05, "loss": 1.3502, "step": 407 }, { "epoch": 0.4364803423375234, "grad_norm": 0.9873160719871521, "learning_rate": 2.523878196732358e-05, "loss": 1.4015, "step": 408 }, { "epoch": 0.4375501470981546, "grad_norm": 1.168376088142395, "learning_rate": 2.4716536959196462e-05, "loss": 0.9562, "step": 409 }, { "epoch": 0.4386199518587858, "grad_norm": 0.9947543740272522, "learning_rate": 2.4198997474396877e-05, "loss": 1.0764, "step": 410 }, { "epoch": 0.43968975661941695, "grad_norm": 1.0038477182388306, "learning_rate": 2.3686195437198112e-05, "loss": 1.1025, "step": 411 }, { "epoch": 0.4407595613800481, "grad_norm": 1.073735237121582, "learning_rate": 2.31781624796453e-05, "loss": 0.9347, "step": 412 }, { "epoch": 0.4418293661406793, "grad_norm": 0.8761813044548035, "learning_rate": 2.2674929939604332e-05, "loss": 1.3846, "step": 413 }, { "epoch": 0.44289917090131053, "grad_norm": 0.707740068435669, "learning_rate": 2.217652885882869e-05, "loss": 1.2739, "step": 414 }, { "epoch": 0.4439689756619417, "grad_norm": 0.8587148189544678, "learning_rate": 2.1682989981044783e-05, "loss": 1.4457, "step": 415 }, { "epoch": 0.4450387804225729, "grad_norm": 0.9313194155693054, "learning_rate": 2.119434375005527e-05, "loss": 1.3368, "step": 416 }, { "epoch": 0.44610858518320406, "grad_norm": 0.9590698480606079, "learning_rate": 2.071062030786149e-05, "loss": 1.0, "step": 417 }, { "epoch": 0.44717838994383524, "grad_norm": 1.0751738548278809, "learning_rate": 2.0231849492803852e-05, "loss": 0.8612, "step": 418 }, { "epoch": 0.4482481947044664, "grad_norm": 0.8183998465538025, "learning_rate": 1.9758060837721467e-05, "loss": 1.1862, "step": 419 }, { "epoch": 0.4493179994650976, "grad_norm": 0.9243028163909912, "learning_rate": 1.928928356813032e-05, "loss": 1.6195, "step": 420 }, { "epoch": 0.4503878042257288, "grad_norm": 1.1141655445098877, "learning_rate": 1.882554660042052e-05, "loss": 1.2306, "step": 421 }, { "epoch": 0.45145760898636, "grad_norm": 0.9904926419258118, "learning_rate": 1.8366878540072614e-05, "loss": 1.1581, "step": 422 }, { "epoch": 0.4525274137469912, "grad_norm": 0.8445707559585571, "learning_rate": 1.7913307679893173e-05, "loss": 1.1488, "step": 423 }, { "epoch": 0.45359721850762236, "grad_norm": 1.067460060119629, "learning_rate": 1.7464861998269243e-05, "loss": 0.9826, "step": 424 }, { "epoch": 0.45466702326825353, "grad_norm": 1.1314057111740112, "learning_rate": 1.702156915744292e-05, "loss": 0.6977, "step": 425 }, { "epoch": 0.4557368280288847, "grad_norm": 0.8268198370933533, "learning_rate": 1.6583456501804725e-05, "loss": 1.6189, "step": 426 }, { "epoch": 0.4568066327895159, "grad_norm": 0.9529428482055664, "learning_rate": 1.6150551056206867e-05, "loss": 1.2327, "step": 427 }, { "epoch": 0.4578764375501471, "grad_norm": 1.1057630777359009, "learning_rate": 1.57228795242965e-05, "loss": 1.1263, "step": 428 }, { "epoch": 0.4589462423107783, "grad_norm": 0.8888829350471497, "learning_rate": 1.5300468286868137e-05, "loss": 0.9244, "step": 429 }, { "epoch": 0.4600160470714095, "grad_norm": 1.019853949546814, "learning_rate": 1.488334340023669e-05, "loss": 1.0842, "step": 430 }, { "epoch": 0.46108585183204065, "grad_norm": 1.1532846689224243, "learning_rate": 1.4471530594629996e-05, "loss": 0.7479, "step": 431 }, { "epoch": 0.4621556565926718, "grad_norm": 0.9575594663619995, "learning_rate": 1.4065055272601703e-05, "loss": 1.3278, "step": 432 }, { "epoch": 0.463225461353303, "grad_norm": 0.9033411145210266, "learning_rate": 1.3663942507464348e-05, "loss": 1.6109, "step": 433 }, { "epoch": 0.4642952661139342, "grad_norm": 0.9153274297714233, "learning_rate": 1.3268217041742701e-05, "loss": 0.9475, "step": 434 }, { "epoch": 0.4653650708745654, "grad_norm": 1.0005955696105957, "learning_rate": 1.2877903285647486e-05, "loss": 0.9418, "step": 435 }, { "epoch": 0.4664348756351966, "grad_norm": 1.0038702487945557, "learning_rate": 1.2493025315569801e-05, "loss": 0.8441, "step": 436 }, { "epoch": 0.46750468039582777, "grad_norm": 1.012077808380127, "learning_rate": 1.2113606872595673e-05, "loss": 0.5747, "step": 437 }, { "epoch": 0.46857448515645894, "grad_norm": 0.9703443646430969, "learning_rate": 1.173967136104196e-05, "loss": 1.2232, "step": 438 }, { "epoch": 0.4696442899170901, "grad_norm": 1.00784170627594, "learning_rate": 1.1371241847012401e-05, "loss": 1.1904, "step": 439 }, { "epoch": 0.4707140946777213, "grad_norm": 0.7357102036476135, "learning_rate": 1.1008341056974854e-05, "loss": 1.2484, "step": 440 }, { "epoch": 0.4717838994383525, "grad_norm": 1.120018482208252, "learning_rate": 1.0650991376359473e-05, "loss": 1.0129, "step": 441 }, { "epoch": 0.4728537041989837, "grad_norm": 1.1301084756851196, "learning_rate": 1.029921484817783e-05, "loss": 1.0191, "step": 442 }, { "epoch": 0.4739235089596149, "grad_norm": 0.9591827988624573, "learning_rate": 9.953033171663175e-06, "loss": 0.7102, "step": 443 }, { "epoch": 0.47499331372024606, "grad_norm": 0.9715290665626526, "learning_rate": 9.612467700932045e-06, "loss": 0.9119, "step": 444 }, { "epoch": 0.47606311848087723, "grad_norm": 0.8477181196212769, "learning_rate": 9.277539443666783e-06, "loss": 1.434, "step": 445 }, { "epoch": 0.4771329232415084, "grad_norm": 1.0625755786895752, "learning_rate": 8.948269059820025e-06, "loss": 0.9963, "step": 446 }, { "epoch": 0.4782027280021396, "grad_norm": 0.9794643521308899, "learning_rate": 8.624676860340025e-06, "loss": 0.9641, "step": 447 }, { "epoch": 0.4792725327627708, "grad_norm": 1.0721542835235596, "learning_rate": 8.306782805917904e-06, "loss": 1.2077, "step": 448 }, { "epoch": 0.480342337523402, "grad_norm": 1.1416406631469727, "learning_rate": 7.994606505756355e-06, "loss": 0.793, "step": 449 }, { "epoch": 0.4814121422840332, "grad_norm": 1.3246476650238037, "learning_rate": 7.68816721636004e-06, "loss": 0.6877, "step": 450 }, { "epoch": 0.48248194704466435, "grad_norm": 0.7753664255142212, "learning_rate": 7.3874838403478e-06, "loss": 1.5715, "step": 451 }, { "epoch": 0.4835517518052955, "grad_norm": 0.8879972696304321, "learning_rate": 7.092574925286614e-06, "loss": 1.5482, "step": 452 }, { "epoch": 0.4846215565659267, "grad_norm": 0.9376313090324402, "learning_rate": 6.803458662547507e-06, "loss": 1.0972, "step": 453 }, { "epoch": 0.4856913613265579, "grad_norm": 1.133258581161499, "learning_rate": 6.520152886183406e-06, "loss": 0.8188, "step": 454 }, { "epoch": 0.4867611660871891, "grad_norm": 1.023056983947754, "learning_rate": 6.242675071829111e-06, "loss": 0.9871, "step": 455 }, { "epoch": 0.4878309708478203, "grad_norm": 1.0413668155670166, "learning_rate": 5.971042335623229e-06, "loss": 0.8659, "step": 456 }, { "epoch": 0.48890077560845147, "grad_norm": 0.8116885423660278, "learning_rate": 5.705271433152458e-06, "loss": 1.4952, "step": 457 }, { "epoch": 0.48997058036908264, "grad_norm": 0.7350386381149292, "learning_rate": 5.445378758417925e-06, "loss": 1.3216, "step": 458 }, { "epoch": 0.4910403851297138, "grad_norm": 0.9509938359260559, "learning_rate": 5.191380342824035e-06, "loss": 1.033, "step": 459 }, { "epoch": 0.492110189890345, "grad_norm": 0.8455548882484436, "learning_rate": 4.943291854189493e-06, "loss": 0.9927, "step": 460 }, { "epoch": 0.4931799946509762, "grad_norm": 1.0387928485870361, "learning_rate": 4.701128595780878e-06, "loss": 0.8588, "step": 461 }, { "epoch": 0.4942497994116074, "grad_norm": 0.9830685257911682, "learning_rate": 4.464905505368658e-06, "loss": 0.6431, "step": 462 }, { "epoch": 0.4953196041722386, "grad_norm": 0.8270193338394165, "learning_rate": 4.23463715430577e-06, "loss": 1.4119, "step": 463 }, { "epoch": 0.49638940893286976, "grad_norm": 0.9378442764282227, "learning_rate": 4.010337746628751e-06, "loss": 1.3177, "step": 464 }, { "epoch": 0.49745921369350093, "grad_norm": 1.149201512336731, "learning_rate": 3.792021118181636e-06, "loss": 1.1545, "step": 465 }, { "epoch": 0.4985290184541321, "grad_norm": 1.167047381401062, "learning_rate": 3.5797007357623945e-06, "loss": 1.4185, "step": 466 }, { "epoch": 0.4995988232147633, "grad_norm": 1.0257937908172607, "learning_rate": 3.3733896962923658e-06, "loss": 0.8571, "step": 467 }, { "epoch": 0.5006686279753945, "grad_norm": 1.0537946224212646, "learning_rate": 3.1731007260082616e-06, "loss": 1.0841, "step": 468 }, { "epoch": 0.5017384327360257, "grad_norm": 0.7948933839797974, "learning_rate": 2.9788461796772114e-06, "loss": 0.9228, "step": 469 }, { "epoch": 0.5028082374966568, "grad_norm": 0.8775575160980225, "learning_rate": 2.790638039834668e-06, "loss": 1.0854, "step": 470 }, { "epoch": 0.503878042257288, "grad_norm": 0.8913066387176514, "learning_rate": 2.6084879160452166e-06, "loss": 1.2876, "step": 471 }, { "epoch": 0.5049478470179193, "grad_norm": 1.0341969728469849, "learning_rate": 2.432407044186509e-06, "loss": 1.0843, "step": 472 }, { "epoch": 0.5060176517785504, "grad_norm": 1.077120304107666, "learning_rate": 2.26240628575615e-06, "loss": 1.1631, "step": 473 }, { "epoch": 0.5070874565391816, "grad_norm": 1.2969963550567627, "learning_rate": 2.098496127201648e-06, "loss": 0.9719, "step": 474 }, { "epoch": 0.5081572612998128, "grad_norm": 0.9883275032043457, "learning_rate": 1.9406866792737267e-06, "loss": 0.7292, "step": 475 }, { "epoch": 0.509227066060444, "grad_norm": 0.8158979415893555, "learning_rate": 1.7889876764024505e-06, "loss": 1.7438, "step": 476 }, { "epoch": 0.5102968708210751, "grad_norm": 1.0562134981155396, "learning_rate": 1.6434084760968697e-06, "loss": 1.1742, "step": 477 }, { "epoch": 0.5113666755817063, "grad_norm": 1.1257410049438477, "learning_rate": 1.5039580583678393e-06, "loss": 1.4287, "step": 478 }, { "epoch": 0.5124364803423376, "grad_norm": 0.9307600855827332, "learning_rate": 1.3706450251739613e-06, "loss": 1.0974, "step": 479 }, { "epoch": 0.5135062851029687, "grad_norm": 1.2765836715698242, "learning_rate": 1.2434775998910964e-06, "loss": 0.913, "step": 480 }, { "epoch": 0.5145760898635999, "grad_norm": 0.960442304611206, "learning_rate": 1.1224636268050439e-06, "loss": 0.7802, "step": 481 }, { "epoch": 0.515645894624231, "grad_norm": 0.7446873188018799, "learning_rate": 1.0076105706276888e-06, "loss": 1.3411, "step": 482 }, { "epoch": 0.5167156993848623, "grad_norm": 0.8171153664588928, "learning_rate": 8.989255160365527e-07, "loss": 1.1665, "step": 483 }, { "epoch": 0.5177855041454934, "grad_norm": 1.2924435138702393, "learning_rate": 7.964151672377458e-07, "loss": 1.0651, "step": 484 }, { "epoch": 0.5188553089061246, "grad_norm": 0.8738812804222107, "learning_rate": 7.000858475524444e-07, "loss": 0.9887, "step": 485 }, { "epoch": 0.5199251136667559, "grad_norm": 1.1796964406967163, "learning_rate": 6.099434990268609e-07, "loss": 1.0128, "step": 486 }, { "epoch": 0.520994918427387, "grad_norm": 0.9124283194541931, "learning_rate": 5.259936820656257e-07, "loss": 0.5644, "step": 487 }, { "epoch": 0.5220647231880182, "grad_norm": 0.7489669322967529, "learning_rate": 4.482415750889204e-07, "loss": 1.1193, "step": 488 }, { "epoch": 0.5231345279486493, "grad_norm": 0.969879150390625, "learning_rate": 3.766919742129331e-07, "loss": 1.2911, "step": 489 }, { "epoch": 0.5242043327092806, "grad_norm": 0.9446693062782288, "learning_rate": 3.1134929295407564e-07, "loss": 1.2359, "step": 490 }, { "epoch": 0.5252741374699117, "grad_norm": 0.8747495412826538, "learning_rate": 2.5221756195672563e-07, "loss": 1.0005, "step": 491 }, { "epoch": 0.5263439422305429, "grad_norm": 1.1830285787582397, "learning_rate": 1.9930042874457254e-07, "loss": 0.8599, "step": 492 }, { "epoch": 0.5274137469911742, "grad_norm": 0.886387825012207, "learning_rate": 1.5260115749566882e-07, "loss": 0.8124, "step": 493 }, { "epoch": 0.5284835517518053, "grad_norm": 0.7844628691673279, "learning_rate": 1.1212262884103974e-07, "loss": 1.4931, "step": 494 }, { "epoch": 0.5295533565124365, "grad_norm": 0.8858184218406677, "learning_rate": 7.7867339686987e-08, "loss": 1.2632, "step": 495 }, { "epoch": 0.5306231612730676, "grad_norm": 0.9755517840385437, "learning_rate": 4.98374030611084e-08, "loss": 1.1289, "step": 496 }, { "epoch": 0.5316929660336989, "grad_norm": 0.9477503299713135, "learning_rate": 2.8034547981943713e-08, "loss": 1.1403, "step": 497 }, { "epoch": 0.53276277079433, "grad_norm": 0.9157533645629883, "learning_rate": 1.246011935228064e-08, "loss": 0.9965, "step": 498 }, { "epoch": 0.5338325755549612, "grad_norm": 1.1439539194107056, "learning_rate": 3.115077876243988e-09, "loss": 1.0751, "step": 499 }, { "epoch": 0.5349023803155925, "grad_norm": 1.0133370161056519, "learning_rate": 0.0, "loss": 0.8007, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0690671548959293e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }