diff --git "a/checkpoint-1082/trainer_state.json" "b/checkpoint-1082/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1082/trainer_state.json" @@ -0,0 +1,7639 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 271, + "global_step": 1082, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009242144177449168, + "grad_norm": 2.5604841709136963, + "learning_rate": 2e-05, + "loss": 1.4237, + "step": 1 + }, + { + "epoch": 0.0009242144177449168, + "eval_loss": 1.3103359937667847, + "eval_runtime": 257.0967, + "eval_samples_per_second": 19.557, + "eval_steps_per_second": 9.778, + "step": 1 + }, + { + "epoch": 0.0018484288354898336, + "grad_norm": 3.4046003818511963, + "learning_rate": 4e-05, + "loss": 1.5145, + "step": 2 + }, + { + "epoch": 0.0027726432532347504, + "grad_norm": 2.8815054893493652, + "learning_rate": 6e-05, + "loss": 1.2893, + "step": 3 + }, + { + "epoch": 0.0036968576709796672, + "grad_norm": 2.740572214126587, + "learning_rate": 8e-05, + "loss": 1.3412, + "step": 4 + }, + { + "epoch": 0.0046210720887245845, + "grad_norm": 5.069173336029053, + "learning_rate": 0.0001, + "loss": 1.1734, + "step": 5 + }, + { + "epoch": 0.005545286506469501, + "grad_norm": 2.2007312774658203, + "learning_rate": 0.00012, + "loss": 1.2551, + "step": 6 + }, + { + "epoch": 0.006469500924214418, + "grad_norm": 2.2793197631835938, + "learning_rate": 0.00014, + "loss": 1.0768, + "step": 7 + }, + { + "epoch": 0.0073937153419593345, + "grad_norm": 1.179013729095459, + "learning_rate": 0.00016, + "loss": 1.1537, + "step": 8 + }, + { + "epoch": 0.00831792975970425, + "grad_norm": 2.2400803565979004, + "learning_rate": 0.00018, + "loss": 1.1018, + "step": 9 + }, + { + "epoch": 0.009242144177449169, + "grad_norm": 2.3832433223724365, + "learning_rate": 0.0002, + "loss": 1.0434, + "step": 10 + }, + { + "epoch": 0.010166358595194085, + "grad_norm": 3.0864202976226807, + "learning_rate": 0.0001999995705823725, + "loss": 1.3045, + "step": 11 + }, + { + "epoch": 0.011090573012939002, + "grad_norm": 1.042543888092041, + "learning_rate": 0.0001999982823331779, + "loss": 1.0877, + "step": 12 + }, + { + "epoch": 0.012014787430683918, + "grad_norm": 3.1928391456604004, + "learning_rate": 0.00019999613526348019, + "loss": 1.1363, + "step": 13 + }, + { + "epoch": 0.012939001848428836, + "grad_norm": 1.5532358884811401, + "learning_rate": 0.00019999312939171914, + "loss": 1.2242, + "step": 14 + }, + { + "epoch": 0.013863216266173753, + "grad_norm": 0.9416839480400085, + "learning_rate": 0.00019998926474371022, + "loss": 1.0268, + "step": 15 + }, + { + "epoch": 0.014787430683918669, + "grad_norm": 0.8814142942428589, + "learning_rate": 0.00019998454135264444, + "loss": 1.0985, + "step": 16 + }, + { + "epoch": 0.015711645101663587, + "grad_norm": 1.1409777402877808, + "learning_rate": 0.0001999789592590879, + "loss": 1.1398, + "step": 17 + }, + { + "epoch": 0.0166358595194085, + "grad_norm": 1.041243314743042, + "learning_rate": 0.0001999725185109816, + "loss": 0.9227, + "step": 18 + }, + { + "epoch": 0.01756007393715342, + "grad_norm": 0.9114219546318054, + "learning_rate": 0.00019996521916364096, + "loss": 1.1407, + "step": 19 + }, + { + "epoch": 0.018484288354898338, + "grad_norm": 0.9067578911781311, + "learning_rate": 0.00019995706127975537, + "loss": 1.0716, + "step": 20 + }, + { + "epoch": 0.019408502772643253, + "grad_norm": 1.1850439310073853, + "learning_rate": 0.0001999480449293876, + "loss": 1.0858, + "step": 21 + }, + { + "epoch": 0.02033271719038817, + "grad_norm": 0.8613914251327515, + "learning_rate": 0.00019993817018997323, + "loss": 1.049, + "step": 22 + }, + { + "epoch": 0.021256931608133085, + "grad_norm": 0.9955759644508362, + "learning_rate": 0.00019992743714632, + "loss": 1.1809, + "step": 23 + }, + { + "epoch": 0.022181146025878003, + "grad_norm": 0.9438485503196716, + "learning_rate": 0.0001999158458906071, + "loss": 1.064, + "step": 24 + }, + { + "epoch": 0.02310536044362292, + "grad_norm": 0.8871132135391235, + "learning_rate": 0.0001999033965223843, + "loss": 1.0686, + "step": 25 + }, + { + "epoch": 0.024029574861367836, + "grad_norm": 0.834404706954956, + "learning_rate": 0.00019989008914857116, + "loss": 1.0521, + "step": 26 + }, + { + "epoch": 0.024953789279112754, + "grad_norm": 0.8184621334075928, + "learning_rate": 0.00019987592388345611, + "loss": 1.1696, + "step": 27 + }, + { + "epoch": 0.025878003696857672, + "grad_norm": 0.7676454782485962, + "learning_rate": 0.00019986090084869545, + "loss": 0.9901, + "step": 28 + }, + { + "epoch": 0.026802218114602587, + "grad_norm": 1.0301235914230347, + "learning_rate": 0.00019984502017331225, + "loss": 1.1206, + "step": 29 + }, + { + "epoch": 0.027726432532347505, + "grad_norm": 1.0618919134140015, + "learning_rate": 0.00019982828199369541, + "loss": 0.9438, + "step": 30 + }, + { + "epoch": 0.02865064695009242, + "grad_norm": 0.851746678352356, + "learning_rate": 0.0001998106864535983, + "loss": 1.0018, + "step": 31 + }, + { + "epoch": 0.029574861367837338, + "grad_norm": 0.8407313823699951, + "learning_rate": 0.00019979223370413763, + "loss": 1.036, + "step": 32 + }, + { + "epoch": 0.030499075785582256, + "grad_norm": 0.9378889799118042, + "learning_rate": 0.00019977292390379207, + "loss": 1.154, + "step": 33 + }, + { + "epoch": 0.031423290203327174, + "grad_norm": 0.797001838684082, + "learning_rate": 0.00019975275721840103, + "loss": 0.9886, + "step": 34 + }, + { + "epoch": 0.03234750462107209, + "grad_norm": 0.8715914487838745, + "learning_rate": 0.0001997317338211631, + "loss": 1.1029, + "step": 35 + }, + { + "epoch": 0.033271719038817, + "grad_norm": 0.8055230975151062, + "learning_rate": 0.00019970985389263467, + "loss": 0.9684, + "step": 36 + }, + { + "epoch": 0.034195933456561925, + "grad_norm": 0.9023709893226624, + "learning_rate": 0.0001996871176207282, + "loss": 1.0087, + "step": 37 + }, + { + "epoch": 0.03512014787430684, + "grad_norm": 0.8392109274864197, + "learning_rate": 0.0001996635252007109, + "loss": 0.9661, + "step": 38 + }, + { + "epoch": 0.036044362292051754, + "grad_norm": 0.8920149207115173, + "learning_rate": 0.00019963907683520274, + "loss": 1.1255, + "step": 39 + }, + { + "epoch": 0.036968576709796676, + "grad_norm": 1.0578594207763672, + "learning_rate": 0.00019961377273417487, + "loss": 1.159, + "step": 40 + }, + { + "epoch": 0.03789279112754159, + "grad_norm": 0.9505748748779297, + "learning_rate": 0.0001995876131149479, + "loss": 1.1709, + "step": 41 + }, + { + "epoch": 0.038817005545286505, + "grad_norm": 0.9387400150299072, + "learning_rate": 0.00019956059820218982, + "loss": 1.0767, + "step": 42 + }, + { + "epoch": 0.03974121996303143, + "grad_norm": 0.8217892646789551, + "learning_rate": 0.00019953272822791424, + "loss": 1.1424, + "step": 43 + }, + { + "epoch": 0.04066543438077634, + "grad_norm": 0.8881214261054993, + "learning_rate": 0.00019950400343147833, + "loss": 1.1806, + "step": 44 + }, + { + "epoch": 0.041589648798521256, + "grad_norm": 0.7667908072471619, + "learning_rate": 0.00019947442405958074, + "loss": 1.0047, + "step": 45 + }, + { + "epoch": 0.04251386321626617, + "grad_norm": 0.7907620668411255, + "learning_rate": 0.0001994439903662596, + "loss": 0.9458, + "step": 46 + }, + { + "epoch": 0.04343807763401109, + "grad_norm": 0.810932457447052, + "learning_rate": 0.00019941270261289012, + "loss": 1.072, + "step": 47 + }, + { + "epoch": 0.04436229205175601, + "grad_norm": 0.7343953847885132, + "learning_rate": 0.00019938056106818261, + "loss": 0.949, + "step": 48 + }, + { + "epoch": 0.04528650646950092, + "grad_norm": 0.8735368847846985, + "learning_rate": 0.00019934756600817997, + "loss": 1.0664, + "step": 49 + }, + { + "epoch": 0.04621072088724584, + "grad_norm": 0.8688865900039673, + "learning_rate": 0.00019931371771625544, + "loss": 0.9868, + "step": 50 + }, + { + "epoch": 0.04713493530499076, + "grad_norm": 0.8754143714904785, + "learning_rate": 0.00019927901648311003, + "loss": 1.0468, + "step": 51 + }, + { + "epoch": 0.04805914972273567, + "grad_norm": 1.0708386898040771, + "learning_rate": 0.0001992434626067702, + "loss": 1.2202, + "step": 52 + }, + { + "epoch": 0.048983364140480594, + "grad_norm": 0.9158581495285034, + "learning_rate": 0.00019920705639258517, + "loss": 1.0832, + "step": 53 + }, + { + "epoch": 0.04990757855822551, + "grad_norm": 0.81212317943573, + "learning_rate": 0.00019916979815322433, + "loss": 0.9221, + "step": 54 + }, + { + "epoch": 0.05083179297597042, + "grad_norm": 0.900019645690918, + "learning_rate": 0.00019913168820867458, + "loss": 0.9403, + "step": 55 + }, + { + "epoch": 0.051756007393715345, + "grad_norm": 0.8837038278579712, + "learning_rate": 0.00019909272688623756, + "loss": 1.0986, + "step": 56 + }, + { + "epoch": 0.05268022181146026, + "grad_norm": 0.9048298597335815, + "learning_rate": 0.00019905291452052687, + "loss": 1.1226, + "step": 57 + }, + { + "epoch": 0.053604436229205174, + "grad_norm": 0.9006702303886414, + "learning_rate": 0.0001990122514534651, + "loss": 1.0095, + "step": 58 + }, + { + "epoch": 0.054528650646950096, + "grad_norm": 0.8431378602981567, + "learning_rate": 0.00019897073803428104, + "loss": 1.1415, + "step": 59 + }, + { + "epoch": 0.05545286506469501, + "grad_norm": 0.8463863134384155, + "learning_rate": 0.00019892837461950652, + "loss": 1.1103, + "step": 60 + }, + { + "epoch": 0.056377079482439925, + "grad_norm": 0.8911519050598145, + "learning_rate": 0.00019888516157297358, + "loss": 1.0765, + "step": 61 + }, + { + "epoch": 0.05730129390018484, + "grad_norm": 0.8233453035354614, + "learning_rate": 0.00019884109926581096, + "loss": 1.0181, + "step": 62 + }, + { + "epoch": 0.05822550831792976, + "grad_norm": 0.8035470247268677, + "learning_rate": 0.00019879618807644138, + "loss": 0.9194, + "step": 63 + }, + { + "epoch": 0.059149722735674676, + "grad_norm": 0.9088996052742004, + "learning_rate": 0.00019875042839057798, + "loss": 1.0055, + "step": 64 + }, + { + "epoch": 0.06007393715341959, + "grad_norm": 0.8611018657684326, + "learning_rate": 0.000198703820601221, + "loss": 1.0828, + "step": 65 + }, + { + "epoch": 0.06099815157116451, + "grad_norm": 0.9225314855575562, + "learning_rate": 0.00019865636510865467, + "loss": 1.302, + "step": 66 + }, + { + "epoch": 0.06192236598890943, + "grad_norm": 0.9531003832817078, + "learning_rate": 0.00019860806232044337, + "loss": 1.1366, + "step": 67 + }, + { + "epoch": 0.06284658040665435, + "grad_norm": 0.863047182559967, + "learning_rate": 0.0001985589126514286, + "loss": 0.9941, + "step": 68 + }, + { + "epoch": 0.06377079482439926, + "grad_norm": 0.8294839262962341, + "learning_rate": 0.0001985089165237249, + "loss": 1.0746, + "step": 69 + }, + { + "epoch": 0.06469500924214418, + "grad_norm": 0.907938539981842, + "learning_rate": 0.0001984580743667168, + "loss": 0.9875, + "step": 70 + }, + { + "epoch": 0.06561922365988909, + "grad_norm": 0.8769300580024719, + "learning_rate": 0.00019840638661705454, + "loss": 1.1933, + "step": 71 + }, + { + "epoch": 0.066543438077634, + "grad_norm": 0.8994389176368713, + "learning_rate": 0.0001983538537186508, + "loss": 1.1666, + "step": 72 + }, + { + "epoch": 0.06746765249537892, + "grad_norm": 0.8897333741188049, + "learning_rate": 0.00019830047612267663, + "loss": 1.1059, + "step": 73 + }, + { + "epoch": 0.06839186691312385, + "grad_norm": 0.869263768196106, + "learning_rate": 0.0001982462542875576, + "loss": 1.0403, + "step": 74 + }, + { + "epoch": 0.06931608133086876, + "grad_norm": 0.9123412370681763, + "learning_rate": 0.00019819118867897003, + "loss": 0.9915, + "step": 75 + }, + { + "epoch": 0.07024029574861368, + "grad_norm": 0.7876412868499756, + "learning_rate": 0.0001981352797698367, + "loss": 1.1364, + "step": 76 + }, + { + "epoch": 0.0711645101663586, + "grad_norm": 0.9286438822746277, + "learning_rate": 0.00019807852804032305, + "loss": 1.2299, + "step": 77 + }, + { + "epoch": 0.07208872458410351, + "grad_norm": 0.7597365975379944, + "learning_rate": 0.00019802093397783296, + "loss": 0.999, + "step": 78 + }, + { + "epoch": 0.07301293900184842, + "grad_norm": 0.8065080046653748, + "learning_rate": 0.00019796249807700457, + "loss": 1.1116, + "step": 79 + }, + { + "epoch": 0.07393715341959335, + "grad_norm": 0.739787220954895, + "learning_rate": 0.0001979032208397059, + "loss": 0.8557, + "step": 80 + }, + { + "epoch": 0.07486136783733827, + "grad_norm": 0.818260133266449, + "learning_rate": 0.00019784310277503085, + "loss": 1.0661, + "step": 81 + }, + { + "epoch": 0.07578558225508318, + "grad_norm": 0.6905511021614075, + "learning_rate": 0.00019778214439929452, + "loss": 0.9828, + "step": 82 + }, + { + "epoch": 0.0767097966728281, + "grad_norm": 0.8324970602989197, + "learning_rate": 0.00019772034623602894, + "loss": 1.0965, + "step": 83 + }, + { + "epoch": 0.07763401109057301, + "grad_norm": 0.9057538509368896, + "learning_rate": 0.00019765770881597855, + "loss": 1.1846, + "step": 84 + }, + { + "epoch": 0.07855822550831792, + "grad_norm": 0.7174400091171265, + "learning_rate": 0.00019759423267709555, + "loss": 0.9941, + "step": 85 + }, + { + "epoch": 0.07948243992606285, + "grad_norm": 0.8102924227714539, + "learning_rate": 0.00019752991836453543, + "loss": 1.1113, + "step": 86 + }, + { + "epoch": 0.08040665434380777, + "grad_norm": 0.8962113261222839, + "learning_rate": 0.00019746476643065216, + "loss": 1.2425, + "step": 87 + }, + { + "epoch": 0.08133086876155268, + "grad_norm": 0.7245362997055054, + "learning_rate": 0.00019739877743499352, + "loss": 1.0007, + "step": 88 + }, + { + "epoch": 0.0822550831792976, + "grad_norm": 0.8629370331764221, + "learning_rate": 0.00019733195194429628, + "loss": 1.0246, + "step": 89 + }, + { + "epoch": 0.08317929759704251, + "grad_norm": 0.8182492852210999, + "learning_rate": 0.0001972642905324813, + "loss": 0.9944, + "step": 90 + }, + { + "epoch": 0.08410351201478743, + "grad_norm": 0.8758109211921692, + "learning_rate": 0.00019719579378064869, + "loss": 1.1746, + "step": 91 + }, + { + "epoch": 0.08502772643253234, + "grad_norm": 0.8601279258728027, + "learning_rate": 0.00019712646227707263, + "loss": 1.0339, + "step": 92 + }, + { + "epoch": 0.08595194085027727, + "grad_norm": 0.9349322319030762, + "learning_rate": 0.00019705629661719652, + "loss": 1.0134, + "step": 93 + }, + { + "epoch": 0.08687615526802218, + "grad_norm": 0.7070643305778503, + "learning_rate": 0.00019698529740362785, + "loss": 1.0092, + "step": 94 + }, + { + "epoch": 0.0878003696857671, + "grad_norm": 0.8130859732627869, + "learning_rate": 0.00019691346524613286, + "loss": 1.0961, + "step": 95 + }, + { + "epoch": 0.08872458410351201, + "grad_norm": 0.8331341743469238, + "learning_rate": 0.00019684080076163142, + "loss": 1.0237, + "step": 96 + }, + { + "epoch": 0.08964879852125693, + "grad_norm": 0.8766252398490906, + "learning_rate": 0.00019676730457419178, + "loss": 1.1368, + "step": 97 + }, + { + "epoch": 0.09057301293900184, + "grad_norm": 0.8121916651725769, + "learning_rate": 0.00019669297731502507, + "loss": 1.0288, + "step": 98 + }, + { + "epoch": 0.09149722735674677, + "grad_norm": 0.842100203037262, + "learning_rate": 0.00019661781962248003, + "loss": 1.1347, + "step": 99 + }, + { + "epoch": 0.09242144177449169, + "grad_norm": 0.7509603500366211, + "learning_rate": 0.0001965418321420374, + "loss": 0.9508, + "step": 100 + }, + { + "epoch": 0.0933456561922366, + "grad_norm": 0.7321972846984863, + "learning_rate": 0.00019646501552630444, + "loss": 1.2226, + "step": 101 + }, + { + "epoch": 0.09426987060998152, + "grad_norm": 0.8529574871063232, + "learning_rate": 0.0001963873704350094, + "loss": 0.9725, + "step": 102 + }, + { + "epoch": 0.09519408502772643, + "grad_norm": 0.7662296891212463, + "learning_rate": 0.0001963088975349956, + "loss": 0.9445, + "step": 103 + }, + { + "epoch": 0.09611829944547134, + "grad_norm": 0.7230259776115417, + "learning_rate": 0.00019622959750021605, + "loss": 0.9691, + "step": 104 + }, + { + "epoch": 0.09704251386321626, + "grad_norm": 0.7084821462631226, + "learning_rate": 0.00019614947101172732, + "loss": 0.9546, + "step": 105 + }, + { + "epoch": 0.09796672828096119, + "grad_norm": 0.7990809679031372, + "learning_rate": 0.000196068518757684, + "loss": 0.998, + "step": 106 + }, + { + "epoch": 0.0988909426987061, + "grad_norm": 0.825856626033783, + "learning_rate": 0.00019598674143333263, + "loss": 0.9768, + "step": 107 + }, + { + "epoch": 0.09981515711645102, + "grad_norm": 0.782279372215271, + "learning_rate": 0.0001959041397410056, + "loss": 1.0431, + "step": 108 + }, + { + "epoch": 0.10073937153419593, + "grad_norm": 0.7822299003601074, + "learning_rate": 0.00019582071439011546, + "loss": 1.0298, + "step": 109 + }, + { + "epoch": 0.10166358595194085, + "grad_norm": 0.8558406233787537, + "learning_rate": 0.0001957364660971485, + "loss": 1.3647, + "step": 110 + }, + { + "epoch": 0.10258780036968576, + "grad_norm": 0.9535115361213684, + "learning_rate": 0.0001956513955856587, + "loss": 1.0358, + "step": 111 + }, + { + "epoch": 0.10351201478743069, + "grad_norm": 0.7181050181388855, + "learning_rate": 0.0001955655035862617, + "loss": 1.0234, + "step": 112 + }, + { + "epoch": 0.1044362292051756, + "grad_norm": 1.088115930557251, + "learning_rate": 0.00019547879083662819, + "loss": 1.1732, + "step": 113 + }, + { + "epoch": 0.10536044362292052, + "grad_norm": 0.8000708222389221, + "learning_rate": 0.0001953912580814779, + "loss": 1.088, + "step": 114 + }, + { + "epoch": 0.10628465804066543, + "grad_norm": 0.8368499875068665, + "learning_rate": 0.000195302906072573, + "loss": 1.1577, + "step": 115 + }, + { + "epoch": 0.10720887245841035, + "grad_norm": 0.6857576370239258, + "learning_rate": 0.0001952137355687116, + "loss": 1.0038, + "step": 116 + }, + { + "epoch": 0.10813308687615526, + "grad_norm": 0.7136886715888977, + "learning_rate": 0.00019512374733572153, + "loss": 0.9336, + "step": 117 + }, + { + "epoch": 0.10905730129390019, + "grad_norm": 0.6903534531593323, + "learning_rate": 0.00019503294214645337, + "loss": 1.1292, + "step": 118 + }, + { + "epoch": 0.1099815157116451, + "grad_norm": 0.7180472016334534, + "learning_rate": 0.00019494132078077414, + "loss": 0.9238, + "step": 119 + }, + { + "epoch": 0.11090573012939002, + "grad_norm": 0.8095017075538635, + "learning_rate": 0.00019484888402556045, + "loss": 1.0366, + "step": 120 + }, + { + "epoch": 0.11182994454713494, + "grad_norm": 0.7536140084266663, + "learning_rate": 0.00019475563267469173, + "loss": 1.108, + "step": 121 + }, + { + "epoch": 0.11275415896487985, + "grad_norm": 0.8670536875724792, + "learning_rate": 0.00019466156752904343, + "loss": 1.0401, + "step": 122 + }, + { + "epoch": 0.11367837338262476, + "grad_norm": 0.640887975692749, + "learning_rate": 0.0001945666893964802, + "loss": 0.9349, + "step": 123 + }, + { + "epoch": 0.11460258780036968, + "grad_norm": 0.714077889919281, + "learning_rate": 0.0001944709990918489, + "loss": 0.924, + "step": 124 + }, + { + "epoch": 0.11552680221811461, + "grad_norm": 0.7269852161407471, + "learning_rate": 0.00019437449743697164, + "loss": 0.9067, + "step": 125 + }, + { + "epoch": 0.11645101663585952, + "grad_norm": 0.6728264689445496, + "learning_rate": 0.00019427718526063856, + "loss": 1.0145, + "step": 126 + }, + { + "epoch": 0.11737523105360444, + "grad_norm": 0.8546668291091919, + "learning_rate": 0.00019417906339860098, + "loss": 1.2389, + "step": 127 + }, + { + "epoch": 0.11829944547134935, + "grad_norm": 0.6989983916282654, + "learning_rate": 0.00019408013269356408, + "loss": 1.0556, + "step": 128 + }, + { + "epoch": 0.11922365988909427, + "grad_norm": 0.762776792049408, + "learning_rate": 0.0001939803939951796, + "loss": 1.0673, + "step": 129 + }, + { + "epoch": 0.12014787430683918, + "grad_norm": 0.7708427906036377, + "learning_rate": 0.00019387984816003867, + "loss": 1.1755, + "step": 130 + }, + { + "epoch": 0.12107208872458411, + "grad_norm": 0.7115902900695801, + "learning_rate": 0.0001937784960516643, + "loss": 1.1696, + "step": 131 + }, + { + "epoch": 0.12199630314232902, + "grad_norm": 0.7481346130371094, + "learning_rate": 0.00019367633854050422, + "loss": 0.9057, + "step": 132 + }, + { + "epoch": 0.12292051756007394, + "grad_norm": 0.6975292563438416, + "learning_rate": 0.0001935733765039231, + "loss": 0.9558, + "step": 133 + }, + { + "epoch": 0.12384473197781885, + "grad_norm": 0.8065600395202637, + "learning_rate": 0.00019346961082619522, + "loss": 1.1788, + "step": 134 + }, + { + "epoch": 0.12476894639556377, + "grad_norm": 0.6492029428482056, + "learning_rate": 0.00019336504239849677, + "loss": 0.9413, + "step": 135 + }, + { + "epoch": 0.1256931608133087, + "grad_norm": 0.8229854702949524, + "learning_rate": 0.00019325967211889834, + "loss": 1.1889, + "step": 136 + }, + { + "epoch": 0.1266173752310536, + "grad_norm": 0.7550889849662781, + "learning_rate": 0.000193153500892357, + "loss": 1.1466, + "step": 137 + }, + { + "epoch": 0.12754158964879853, + "grad_norm": 0.7214056253433228, + "learning_rate": 0.0001930465296307087, + "loss": 1.0057, + "step": 138 + }, + { + "epoch": 0.12846580406654343, + "grad_norm": 0.7544134259223938, + "learning_rate": 0.00019293875925266028, + "loss": 1.0656, + "step": 139 + }, + { + "epoch": 0.12939001848428835, + "grad_norm": 0.7095998525619507, + "learning_rate": 0.00019283019068378182, + "loss": 1.0903, + "step": 140 + }, + { + "epoch": 0.13031423290203328, + "grad_norm": 0.7648972868919373, + "learning_rate": 0.0001927208248564984, + "loss": 0.9388, + "step": 141 + }, + { + "epoch": 0.13123844731977818, + "grad_norm": 0.7109007835388184, + "learning_rate": 0.00019261066271008235, + "loss": 0.9681, + "step": 142 + }, + { + "epoch": 0.1321626617375231, + "grad_norm": 0.7397719621658325, + "learning_rate": 0.00019249970519064503, + "loss": 1.345, + "step": 143 + }, + { + "epoch": 0.133086876155268, + "grad_norm": 0.8710482716560364, + "learning_rate": 0.0001923879532511287, + "loss": 1.0109, + "step": 144 + }, + { + "epoch": 0.13401109057301294, + "grad_norm": 0.8579124212265015, + "learning_rate": 0.0001922754078512984, + "loss": 1.1195, + "step": 145 + }, + { + "epoch": 0.13493530499075784, + "grad_norm": 1.0125819444656372, + "learning_rate": 0.00019216206995773373, + "loss": 1.1219, + "step": 146 + }, + { + "epoch": 0.13585951940850277, + "grad_norm": 0.6876494884490967, + "learning_rate": 0.00019204794054382052, + "loss": 0.9757, + "step": 147 + }, + { + "epoch": 0.1367837338262477, + "grad_norm": 0.7080855965614319, + "learning_rate": 0.00019193302058974232, + "loss": 0.9973, + "step": 148 + }, + { + "epoch": 0.1377079482439926, + "grad_norm": 0.6936303973197937, + "learning_rate": 0.00019181731108247228, + "loss": 1.0311, + "step": 149 + }, + { + "epoch": 0.13863216266173753, + "grad_norm": 0.6997070908546448, + "learning_rate": 0.00019170081301576444, + "loss": 0.9524, + "step": 150 + }, + { + "epoch": 0.13955637707948243, + "grad_norm": 0.6929956078529358, + "learning_rate": 0.00019158352739014523, + "loss": 0.9242, + "step": 151 + }, + { + "epoch": 0.14048059149722736, + "grad_norm": 0.658458411693573, + "learning_rate": 0.00019146545521290495, + "loss": 0.9067, + "step": 152 + }, + { + "epoch": 0.1414048059149723, + "grad_norm": 0.6803703904151917, + "learning_rate": 0.00019134659749808913, + "loss": 0.9616, + "step": 153 + }, + { + "epoch": 0.1423290203327172, + "grad_norm": 0.6902372241020203, + "learning_rate": 0.00019122695526648968, + "loss": 1.0902, + "step": 154 + }, + { + "epoch": 0.14325323475046212, + "grad_norm": 0.7907196879386902, + "learning_rate": 0.00019110652954563631, + "loss": 1.0511, + "step": 155 + }, + { + "epoch": 0.14417744916820702, + "grad_norm": 0.8718544840812683, + "learning_rate": 0.00019098532136978754, + "loss": 1.1125, + "step": 156 + }, + { + "epoch": 0.14510166358595195, + "grad_norm": 0.7161783576011658, + "learning_rate": 0.00019086333177992191, + "loss": 1.0796, + "step": 157 + }, + { + "epoch": 0.14602587800369685, + "grad_norm": 0.6105704307556152, + "learning_rate": 0.00019074056182372907, + "loss": 0.9869, + "step": 158 + }, + { + "epoch": 0.14695009242144177, + "grad_norm": 0.8143605589866638, + "learning_rate": 0.0001906170125556006, + "loss": 1.0158, + "step": 159 + }, + { + "epoch": 0.1478743068391867, + "grad_norm": 0.737464964389801, + "learning_rate": 0.00019049268503662126, + "loss": 1.1014, + "step": 160 + }, + { + "epoch": 0.1487985212569316, + "grad_norm": 0.7588536143302917, + "learning_rate": 0.00019036758033455956, + "loss": 1.1994, + "step": 161 + }, + { + "epoch": 0.14972273567467653, + "grad_norm": 0.7603012919425964, + "learning_rate": 0.00019024169952385885, + "loss": 1.0045, + "step": 162 + }, + { + "epoch": 0.15064695009242143, + "grad_norm": 0.7441855669021606, + "learning_rate": 0.00019011504368562782, + "loss": 0.9352, + "step": 163 + }, + { + "epoch": 0.15157116451016636, + "grad_norm": 0.8306816220283508, + "learning_rate": 0.00018998761390763154, + "loss": 1.0285, + "step": 164 + }, + { + "epoch": 0.15249537892791126, + "grad_norm": 0.712649405002594, + "learning_rate": 0.00018985941128428185, + "loss": 1.1093, + "step": 165 + }, + { + "epoch": 0.1534195933456562, + "grad_norm": 0.7444804310798645, + "learning_rate": 0.00018973043691662803, + "loss": 1.0998, + "step": 166 + }, + { + "epoch": 0.15434380776340112, + "grad_norm": 0.7480347752571106, + "learning_rate": 0.00018960069191234746, + "loss": 0.9268, + "step": 167 + }, + { + "epoch": 0.15526802218114602, + "grad_norm": 0.775780439376831, + "learning_rate": 0.000189470177385736, + "loss": 1.093, + "step": 168 + }, + { + "epoch": 0.15619223659889095, + "grad_norm": 0.6818462610244751, + "learning_rate": 0.00018933889445769836, + "loss": 1.0451, + "step": 169 + }, + { + "epoch": 0.15711645101663585, + "grad_norm": 0.7714971899986267, + "learning_rate": 0.00018920684425573865, + "loss": 1.0461, + "step": 170 + }, + { + "epoch": 0.15804066543438078, + "grad_norm": 0.7886775135993958, + "learning_rate": 0.00018907402791395057, + "loss": 1.0813, + "step": 171 + }, + { + "epoch": 0.1589648798521257, + "grad_norm": 0.7273821830749512, + "learning_rate": 0.00018894044657300765, + "loss": 0.9833, + "step": 172 + }, + { + "epoch": 0.1598890942698706, + "grad_norm": 0.7884137630462646, + "learning_rate": 0.00018880610138015356, + "loss": 1.2085, + "step": 173 + }, + { + "epoch": 0.16081330868761554, + "grad_norm": 0.689659595489502, + "learning_rate": 0.00018867099348919217, + "loss": 0.9477, + "step": 174 + }, + { + "epoch": 0.16173752310536044, + "grad_norm": 0.6827597618103027, + "learning_rate": 0.00018853512406047772, + "loss": 1.303, + "step": 175 + }, + { + "epoch": 0.16266173752310537, + "grad_norm": 0.7574382424354553, + "learning_rate": 0.0001883984942609047, + "loss": 1.1157, + "step": 176 + }, + { + "epoch": 0.16358595194085027, + "grad_norm": 0.6791549324989319, + "learning_rate": 0.00018826110526389803, + "loss": 1.006, + "step": 177 + }, + { + "epoch": 0.1645101663585952, + "grad_norm": 0.7198850512504578, + "learning_rate": 0.00018812295824940285, + "loss": 1.0868, + "step": 178 + }, + { + "epoch": 0.16543438077634012, + "grad_norm": 0.7608547806739807, + "learning_rate": 0.00018798405440387445, + "loss": 0.7716, + "step": 179 + }, + { + "epoch": 0.16635859519408502, + "grad_norm": 0.9131553173065186, + "learning_rate": 0.00018784439492026798, + "loss": 1.0254, + "step": 180 + }, + { + "epoch": 0.16728280961182995, + "grad_norm": 0.6674430966377258, + "learning_rate": 0.00018770398099802836, + "loss": 0.8047, + "step": 181 + }, + { + "epoch": 0.16820702402957485, + "grad_norm": 0.6775060296058655, + "learning_rate": 0.00018756281384307982, + "loss": 0.9752, + "step": 182 + }, + { + "epoch": 0.16913123844731978, + "grad_norm": 0.7324479818344116, + "learning_rate": 0.0001874208946678157, + "loss": 1.0063, + "step": 183 + }, + { + "epoch": 0.17005545286506468, + "grad_norm": 0.7278487682342529, + "learning_rate": 0.0001872782246910879, + "loss": 1.0593, + "step": 184 + }, + { + "epoch": 0.1709796672828096, + "grad_norm": 0.6093087196350098, + "learning_rate": 0.00018713480513819644, + "loss": 0.8663, + "step": 185 + }, + { + "epoch": 0.17190388170055454, + "grad_norm": 1.1111029386520386, + "learning_rate": 0.00018699063724087904, + "loss": 0.9937, + "step": 186 + }, + { + "epoch": 0.17282809611829944, + "grad_norm": 0.7506117820739746, + "learning_rate": 0.00018684572223730045, + "loss": 1.2419, + "step": 187 + }, + { + "epoch": 0.17375231053604437, + "grad_norm": 0.6952787637710571, + "learning_rate": 0.0001867000613720417, + "loss": 1.0363, + "step": 188 + }, + { + "epoch": 0.17467652495378927, + "grad_norm": 0.7385110855102539, + "learning_rate": 0.0001865536558960898, + "loss": 1.1671, + "step": 189 + }, + { + "epoch": 0.1756007393715342, + "grad_norm": 0.7654502391815186, + "learning_rate": 0.0001864065070668265, + "loss": 1.108, + "step": 190 + }, + { + "epoch": 0.17652495378927913, + "grad_norm": 1.2794867753982544, + "learning_rate": 0.00018625861614801785, + "loss": 1.19, + "step": 191 + }, + { + "epoch": 0.17744916820702403, + "grad_norm": 0.7433966398239136, + "learning_rate": 0.00018610998440980324, + "loss": 1.0539, + "step": 192 + }, + { + "epoch": 0.17837338262476896, + "grad_norm": 0.7351729273796082, + "learning_rate": 0.0001859606131286843, + "loss": 0.9915, + "step": 193 + }, + { + "epoch": 0.17929759704251386, + "grad_norm": 0.6647420525550842, + "learning_rate": 0.00018581050358751445, + "loss": 1.0903, + "step": 194 + }, + { + "epoch": 0.18022181146025879, + "grad_norm": 0.7536112666130066, + "learning_rate": 0.0001856596570754872, + "loss": 1.2017, + "step": 195 + }, + { + "epoch": 0.18114602587800369, + "grad_norm": 0.6352774500846863, + "learning_rate": 0.00018550807488812562, + "loss": 0.913, + "step": 196 + }, + { + "epoch": 0.18207024029574861, + "grad_norm": 0.7909333109855652, + "learning_rate": 0.00018535575832727102, + "loss": 1.2359, + "step": 197 + }, + { + "epoch": 0.18299445471349354, + "grad_norm": 0.711321234703064, + "learning_rate": 0.00018520270870107166, + "loss": 1.1239, + "step": 198 + }, + { + "epoch": 0.18391866913123844, + "grad_norm": 0.6280321478843689, + "learning_rate": 0.00018504892732397173, + "loss": 0.9277, + "step": 199 + }, + { + "epoch": 0.18484288354898337, + "grad_norm": 0.7469192147254944, + "learning_rate": 0.00018489441551669986, + "loss": 1.1021, + "step": 200 + }, + { + "epoch": 0.18576709796672827, + "grad_norm": 0.6786873936653137, + "learning_rate": 0.00018473917460625798, + "loss": 0.9679, + "step": 201 + }, + { + "epoch": 0.1866913123844732, + "grad_norm": 0.6588121056556702, + "learning_rate": 0.00018458320592590975, + "loss": 0.9737, + "step": 202 + }, + { + "epoch": 0.1876155268022181, + "grad_norm": 0.8966547846794128, + "learning_rate": 0.00018442651081516917, + "loss": 1.2208, + "step": 203 + }, + { + "epoch": 0.18853974121996303, + "grad_norm": 0.6928480267524719, + "learning_rate": 0.00018426909061978908, + "loss": 0.9534, + "step": 204 + }, + { + "epoch": 0.18946395563770796, + "grad_norm": 0.799971342086792, + "learning_rate": 0.00018411094669174965, + "loss": 1.0563, + "step": 205 + }, + { + "epoch": 0.19038817005545286, + "grad_norm": 0.676296055316925, + "learning_rate": 0.00018395208038924667, + "loss": 1.0369, + "step": 206 + }, + { + "epoch": 0.1913123844731978, + "grad_norm": 0.7316782474517822, + "learning_rate": 0.00018379249307667994, + "loss": 0.9643, + "step": 207 + }, + { + "epoch": 0.1922365988909427, + "grad_norm": 0.6900054812431335, + "learning_rate": 0.00018363218612464158, + "loss": 0.9629, + "step": 208 + }, + { + "epoch": 0.19316081330868762, + "grad_norm": 0.8323389291763306, + "learning_rate": 0.00018347116090990424, + "loss": 0.9887, + "step": 209 + }, + { + "epoch": 0.19408502772643252, + "grad_norm": 0.7214491963386536, + "learning_rate": 0.00018330941881540915, + "loss": 0.8915, + "step": 210 + }, + { + "epoch": 0.19500924214417745, + "grad_norm": 0.6861477494239807, + "learning_rate": 0.00018314696123025454, + "loss": 1.0244, + "step": 211 + }, + { + "epoch": 0.19593345656192238, + "grad_norm": 0.7220661640167236, + "learning_rate": 0.00018298378954968337, + "loss": 0.8882, + "step": 212 + }, + { + "epoch": 0.19685767097966728, + "grad_norm": 0.7545854449272156, + "learning_rate": 0.00018281990517507156, + "loss": 1.0317, + "step": 213 + }, + { + "epoch": 0.1977818853974122, + "grad_norm": 0.8720320463180542, + "learning_rate": 0.0001826553095139159, + "loss": 1.2824, + "step": 214 + }, + { + "epoch": 0.1987060998151571, + "grad_norm": 0.6537701487541199, + "learning_rate": 0.00018249000397982195, + "loss": 0.9396, + "step": 215 + }, + { + "epoch": 0.19963031423290203, + "grad_norm": 0.8188151121139526, + "learning_rate": 0.00018232398999249192, + "loss": 1.4162, + "step": 216 + }, + { + "epoch": 0.20055452865064696, + "grad_norm": 0.7201845645904541, + "learning_rate": 0.00018215726897771248, + "loss": 1.1039, + "step": 217 + }, + { + "epoch": 0.20147874306839186, + "grad_norm": 0.7602134346961975, + "learning_rate": 0.00018198984236734246, + "loss": 1.0142, + "step": 218 + }, + { + "epoch": 0.2024029574861368, + "grad_norm": 0.6614157557487488, + "learning_rate": 0.00018182171159930065, + "loss": 0.9878, + "step": 219 + }, + { + "epoch": 0.2033271719038817, + "grad_norm": 0.661346435546875, + "learning_rate": 0.0001816528781175533, + "loss": 1.0213, + "step": 220 + }, + { + "epoch": 0.20425138632162662, + "grad_norm": 0.707669198513031, + "learning_rate": 0.00018148334337210193, + "loss": 1.1438, + "step": 221 + }, + { + "epoch": 0.20517560073937152, + "grad_norm": 0.7131678462028503, + "learning_rate": 0.0001813131088189707, + "loss": 1.1101, + "step": 222 + }, + { + "epoch": 0.20609981515711645, + "grad_norm": 0.6725898385047913, + "learning_rate": 0.00018114217592019393, + "loss": 1.0486, + "step": 223 + }, + { + "epoch": 0.20702402957486138, + "grad_norm": 0.6992596387863159, + "learning_rate": 0.00018097054614380365, + "loss": 1.18, + "step": 224 + }, + { + "epoch": 0.20794824399260628, + "grad_norm": 0.7368807792663574, + "learning_rate": 0.00018079822096381688, + "loss": 1.0584, + "step": 225 + }, + { + "epoch": 0.2088724584103512, + "grad_norm": 0.7491002678871155, + "learning_rate": 0.000180625201860223, + "loss": 1.2422, + "step": 226 + }, + { + "epoch": 0.2097966728280961, + "grad_norm": 0.6869086623191833, + "learning_rate": 0.0001804514903189711, + "loss": 1.1148, + "step": 227 + }, + { + "epoch": 0.21072088724584104, + "grad_norm": 0.6771584749221802, + "learning_rate": 0.0001802770878319571, + "loss": 0.9445, + "step": 228 + }, + { + "epoch": 0.21164510166358594, + "grad_norm": 0.7400600910186768, + "learning_rate": 0.00018010199589701107, + "loss": 1.1358, + "step": 229 + }, + { + "epoch": 0.21256931608133087, + "grad_norm": 0.7790092825889587, + "learning_rate": 0.00017992621601788428, + "loss": 1.2502, + "step": 230 + }, + { + "epoch": 0.2134935304990758, + "grad_norm": 0.6150336861610413, + "learning_rate": 0.00017974974970423632, + "loss": 0.873, + "step": 231 + }, + { + "epoch": 0.2144177449168207, + "grad_norm": 0.6341018676757812, + "learning_rate": 0.00017957259847162205, + "loss": 0.9765, + "step": 232 + }, + { + "epoch": 0.21534195933456562, + "grad_norm": 0.6409825086593628, + "learning_rate": 0.00017939476384147877, + "loss": 0.961, + "step": 233 + }, + { + "epoch": 0.21626617375231053, + "grad_norm": 0.6662341356277466, + "learning_rate": 0.00017921624734111292, + "loss": 1.0044, + "step": 234 + }, + { + "epoch": 0.21719038817005545, + "grad_norm": 0.6861549615859985, + "learning_rate": 0.00017903705050368722, + "loss": 0.9655, + "step": 235 + }, + { + "epoch": 0.21811460258780038, + "grad_norm": 0.6224799156188965, + "learning_rate": 0.00017885717486820722, + "loss": 0.9168, + "step": 236 + }, + { + "epoch": 0.21903881700554528, + "grad_norm": 0.7034262418746948, + "learning_rate": 0.0001786766219795083, + "loss": 0.9834, + "step": 237 + }, + { + "epoch": 0.2199630314232902, + "grad_norm": 0.6560840606689453, + "learning_rate": 0.00017849539338824231, + "loss": 1.085, + "step": 238 + }, + { + "epoch": 0.2208872458410351, + "grad_norm": 0.6557031869888306, + "learning_rate": 0.00017831349065086435, + "loss": 0.9681, + "step": 239 + }, + { + "epoch": 0.22181146025878004, + "grad_norm": 0.5857090950012207, + "learning_rate": 0.0001781309153296192, + "loss": 0.9118, + "step": 240 + }, + { + "epoch": 0.22273567467652494, + "grad_norm": 0.6605651378631592, + "learning_rate": 0.00017794766899252812, + "loss": 1.0714, + "step": 241 + }, + { + "epoch": 0.22365988909426987, + "grad_norm": 0.6810103058815002, + "learning_rate": 0.00017776375321337521, + "loss": 1.149, + "step": 242 + }, + { + "epoch": 0.2245841035120148, + "grad_norm": 0.7103047370910645, + "learning_rate": 0.00017757916957169404, + "loss": 1.0941, + "step": 243 + }, + { + "epoch": 0.2255083179297597, + "grad_norm": 0.6880953311920166, + "learning_rate": 0.00017739391965275404, + "loss": 1.1278, + "step": 244 + }, + { + "epoch": 0.22643253234750463, + "grad_norm": 0.6372288465499878, + "learning_rate": 0.0001772080050475468, + "loss": 1.007, + "step": 245 + }, + { + "epoch": 0.22735674676524953, + "grad_norm": 0.6732058525085449, + "learning_rate": 0.00017702142735277247, + "loss": 1.1345, + "step": 246 + }, + { + "epoch": 0.22828096118299446, + "grad_norm": 0.7241851687431335, + "learning_rate": 0.0001768341881708261, + "loss": 1.0881, + "step": 247 + }, + { + "epoch": 0.22920517560073936, + "grad_norm": 0.7192524671554565, + "learning_rate": 0.00017664628910978375, + "loss": 1.2325, + "step": 248 + }, + { + "epoch": 0.2301293900184843, + "grad_norm": 0.8972578048706055, + "learning_rate": 0.00017645773178338886, + "loss": 1.588, + "step": 249 + }, + { + "epoch": 0.23105360443622922, + "grad_norm": 0.7118556499481201, + "learning_rate": 0.0001762685178110382, + "loss": 1.1287, + "step": 250 + }, + { + "epoch": 0.23197781885397412, + "grad_norm": 0.6829894185066223, + "learning_rate": 0.00017607864881776807, + "loss": 1.0851, + "step": 251 + }, + { + "epoch": 0.23290203327171904, + "grad_norm": 0.6413517594337463, + "learning_rate": 0.00017588812643424032, + "loss": 0.941, + "step": 252 + }, + { + "epoch": 0.23382624768946395, + "grad_norm": 0.750128984451294, + "learning_rate": 0.00017569695229672835, + "loss": 0.9655, + "step": 253 + }, + { + "epoch": 0.23475046210720887, + "grad_norm": 0.7154532670974731, + "learning_rate": 0.0001755051280471031, + "loss": 0.971, + "step": 254 + }, + { + "epoch": 0.2356746765249538, + "grad_norm": 0.7001273036003113, + "learning_rate": 0.00017531265533281872, + "loss": 1.0347, + "step": 255 + }, + { + "epoch": 0.2365988909426987, + "grad_norm": 0.6109774112701416, + "learning_rate": 0.00017511953580689888, + "loss": 1.0437, + "step": 256 + }, + { + "epoch": 0.23752310536044363, + "grad_norm": 0.6632633805274963, + "learning_rate": 0.00017492577112792208, + "loss": 1.1123, + "step": 257 + }, + { + "epoch": 0.23844731977818853, + "grad_norm": 0.7349646687507629, + "learning_rate": 0.00017473136296000772, + "loss": 1.1411, + "step": 258 + }, + { + "epoch": 0.23937153419593346, + "grad_norm": 0.739486575126648, + "learning_rate": 0.00017453631297280166, + "loss": 0.98, + "step": 259 + }, + { + "epoch": 0.24029574861367836, + "grad_norm": 0.6610244512557983, + "learning_rate": 0.000174340622841462, + "loss": 1.0162, + "step": 260 + }, + { + "epoch": 0.2412199630314233, + "grad_norm": 0.6939518451690674, + "learning_rate": 0.00017414429424664454, + "loss": 1.137, + "step": 261 + }, + { + "epoch": 0.24214417744916822, + "grad_norm": 0.6127789616584778, + "learning_rate": 0.00017394732887448847, + "loss": 0.9861, + "step": 262 + }, + { + "epoch": 0.24306839186691312, + "grad_norm": 0.7002210021018982, + "learning_rate": 0.00017374972841660186, + "loss": 1.0062, + "step": 263 + }, + { + "epoch": 0.24399260628465805, + "grad_norm": 0.7472264170646667, + "learning_rate": 0.00017355149457004709, + "loss": 0.8847, + "step": 264 + }, + { + "epoch": 0.24491682070240295, + "grad_norm": 0.6717665195465088, + "learning_rate": 0.00017335262903732634, + "loss": 0.8006, + "step": 265 + }, + { + "epoch": 0.24584103512014788, + "grad_norm": 0.7069761753082275, + "learning_rate": 0.0001731531335263669, + "loss": 1.0659, + "step": 266 + }, + { + "epoch": 0.24676524953789278, + "grad_norm": 0.678119957447052, + "learning_rate": 0.00017295300975050658, + "loss": 0.972, + "step": 267 + }, + { + "epoch": 0.2476894639556377, + "grad_norm": 0.7060781121253967, + "learning_rate": 0.0001727522594284789, + "loss": 1.0823, + "step": 268 + }, + { + "epoch": 0.24861367837338263, + "grad_norm": 0.8115720748901367, + "learning_rate": 0.00017255088428439836, + "loss": 1.139, + "step": 269 + }, + { + "epoch": 0.24953789279112754, + "grad_norm": 0.7960264682769775, + "learning_rate": 0.00017234888604774574, + "loss": 1.1639, + "step": 270 + }, + { + "epoch": 0.25046210720887246, + "grad_norm": 0.6807265877723694, + "learning_rate": 0.00017214626645335314, + "loss": 0.8682, + "step": 271 + }, + { + "epoch": 0.25046210720887246, + "eval_loss": 1.0369349718093872, + "eval_runtime": 259.0763, + "eval_samples_per_second": 19.407, + "eval_steps_per_second": 9.704, + "step": 271 + }, + { + "epoch": 0.2513863216266174, + "grad_norm": 0.6943592429161072, + "learning_rate": 0.00017194302724138903, + "loss": 0.9869, + "step": 272 + }, + { + "epoch": 0.25231053604436227, + "grad_norm": 0.7785775661468506, + "learning_rate": 0.00017173917015734336, + "loss": 0.9965, + "step": 273 + }, + { + "epoch": 0.2532347504621072, + "grad_norm": 0.7634903788566589, + "learning_rate": 0.00017153469695201277, + "loss": 1.0031, + "step": 274 + }, + { + "epoch": 0.2541589648798521, + "grad_norm": 1.1226165294647217, + "learning_rate": 0.00017132960938148512, + "loss": 0.9671, + "step": 275 + }, + { + "epoch": 0.25508317929759705, + "grad_norm": 0.7175688743591309, + "learning_rate": 0.0001711239092071248, + "loss": 1.1323, + "step": 276 + }, + { + "epoch": 0.256007393715342, + "grad_norm": 0.711585521697998, + "learning_rate": 0.00017091759819555744, + "loss": 1.0969, + "step": 277 + }, + { + "epoch": 0.25693160813308685, + "grad_norm": 0.6999589204788208, + "learning_rate": 0.00017071067811865476, + "loss": 0.994, + "step": 278 + }, + { + "epoch": 0.2578558225508318, + "grad_norm": 0.7230240106582642, + "learning_rate": 0.0001705031507535193, + "loss": 1.0185, + "step": 279 + }, + { + "epoch": 0.2587800369685767, + "grad_norm": 0.7297222018241882, + "learning_rate": 0.00017029501788246924, + "loss": 1.1727, + "step": 280 + }, + { + "epoch": 0.25970425138632164, + "grad_norm": 0.7299928665161133, + "learning_rate": 0.00017008628129302307, + "loss": 0.9436, + "step": 281 + }, + { + "epoch": 0.26062846580406657, + "grad_norm": 0.6015912294387817, + "learning_rate": 0.00016987694277788417, + "loss": 0.9869, + "step": 282 + }, + { + "epoch": 0.26155268022181144, + "grad_norm": 0.7023926377296448, + "learning_rate": 0.00016966700413492556, + "loss": 1.2392, + "step": 283 + }, + { + "epoch": 0.26247689463955637, + "grad_norm": 0.6982827186584473, + "learning_rate": 0.0001694564671671743, + "loss": 1.1813, + "step": 284 + }, + { + "epoch": 0.2634011090573013, + "grad_norm": 0.6689731478691101, + "learning_rate": 0.00016924533368279607, + "loss": 1.1206, + "step": 285 + }, + { + "epoch": 0.2643253234750462, + "grad_norm": 0.7199162244796753, + "learning_rate": 0.0001690336054950797, + "loss": 1.176, + "step": 286 + }, + { + "epoch": 0.26524953789279115, + "grad_norm": 0.6056158542633057, + "learning_rate": 0.00016882128442242156, + "loss": 0.9041, + "step": 287 + }, + { + "epoch": 0.266173752310536, + "grad_norm": 0.6410432457923889, + "learning_rate": 0.00016860837228830974, + "loss": 0.8289, + "step": 288 + }, + { + "epoch": 0.26709796672828096, + "grad_norm": 0.6758670210838318, + "learning_rate": 0.00016839487092130883, + "loss": 0.9673, + "step": 289 + }, + { + "epoch": 0.2680221811460259, + "grad_norm": 0.6756737232208252, + "learning_rate": 0.0001681807821550438, + "loss": 1.1143, + "step": 290 + }, + { + "epoch": 0.2689463955637708, + "grad_norm": 0.6370823383331299, + "learning_rate": 0.00016796610782818442, + "loss": 0.974, + "step": 291 + }, + { + "epoch": 0.2698706099815157, + "grad_norm": 0.8725601434707642, + "learning_rate": 0.00016775084978442955, + "loss": 1.0108, + "step": 292 + }, + { + "epoch": 0.2707948243992606, + "grad_norm": 0.7525449991226196, + "learning_rate": 0.0001675350098724911, + "loss": 1.0827, + "step": 293 + }, + { + "epoch": 0.27171903881700554, + "grad_norm": 0.8351470232009888, + "learning_rate": 0.00016731858994607838, + "loss": 1.1675, + "step": 294 + }, + { + "epoch": 0.27264325323475047, + "grad_norm": 0.6841646432876587, + "learning_rate": 0.00016710159186388203, + "loss": 1.0823, + "step": 295 + }, + { + "epoch": 0.2735674676524954, + "grad_norm": 0.6603293418884277, + "learning_rate": 0.00016688401748955802, + "loss": 0.9394, + "step": 296 + }, + { + "epoch": 0.2744916820702403, + "grad_norm": 0.5712734460830688, + "learning_rate": 0.0001666658686917118, + "loss": 0.9081, + "step": 297 + }, + { + "epoch": 0.2754158964879852, + "grad_norm": 0.6858909130096436, + "learning_rate": 0.00016644714734388217, + "loss": 1.1348, + "step": 298 + }, + { + "epoch": 0.27634011090573013, + "grad_norm": 0.7321335077285767, + "learning_rate": 0.0001662278553245252, + "loss": 1.0515, + "step": 299 + }, + { + "epoch": 0.27726432532347506, + "grad_norm": 0.7120572924613953, + "learning_rate": 0.00016600799451699802, + "loss": 1.3112, + "step": 300 + }, + { + "epoch": 0.27818853974122, + "grad_norm": 0.7827785611152649, + "learning_rate": 0.00016578756680954277, + "loss": 1.0183, + "step": 301 + }, + { + "epoch": 0.27911275415896486, + "grad_norm": 0.652125895023346, + "learning_rate": 0.0001655665740952703, + "loss": 1.144, + "step": 302 + }, + { + "epoch": 0.2800369685767098, + "grad_norm": 0.6406841278076172, + "learning_rate": 0.000165345018272144, + "loss": 0.8839, + "step": 303 + }, + { + "epoch": 0.2809611829944547, + "grad_norm": 0.6793238520622253, + "learning_rate": 0.00016512290124296336, + "loss": 0.9736, + "step": 304 + }, + { + "epoch": 0.28188539741219965, + "grad_norm": 0.6621578931808472, + "learning_rate": 0.00016490022491534768, + "loss": 1.1895, + "step": 305 + }, + { + "epoch": 0.2828096118299446, + "grad_norm": 0.5940911769866943, + "learning_rate": 0.00016467699120171987, + "loss": 0.8541, + "step": 306 + }, + { + "epoch": 0.28373382624768945, + "grad_norm": 0.7680670022964478, + "learning_rate": 0.0001644532020192897, + "loss": 1.0773, + "step": 307 + }, + { + "epoch": 0.2846580406654344, + "grad_norm": 0.6798003911972046, + "learning_rate": 0.00016422885929003758, + "loss": 0.997, + "step": 308 + }, + { + "epoch": 0.2855822550831793, + "grad_norm": 0.6679592728614807, + "learning_rate": 0.00016400396494069792, + "loss": 0.9759, + "step": 309 + }, + { + "epoch": 0.28650646950092423, + "grad_norm": 0.7058706283569336, + "learning_rate": 0.00016377852090274276, + "loss": 1.016, + "step": 310 + }, + { + "epoch": 0.2874306839186691, + "grad_norm": 0.7393819093704224, + "learning_rate": 0.00016355252911236492, + "loss": 1.009, + "step": 311 + }, + { + "epoch": 0.28835489833641403, + "grad_norm": 0.7212921380996704, + "learning_rate": 0.0001633259915104616, + "loss": 1.0256, + "step": 312 + }, + { + "epoch": 0.28927911275415896, + "grad_norm": 0.6295643448829651, + "learning_rate": 0.00016309891004261755, + "loss": 1.0031, + "step": 313 + }, + { + "epoch": 0.2902033271719039, + "grad_norm": 0.6600052118301392, + "learning_rate": 0.0001628712866590885, + "loss": 1.132, + "step": 314 + }, + { + "epoch": 0.2911275415896488, + "grad_norm": 0.7021812200546265, + "learning_rate": 0.0001626431233147843, + "loss": 0.9675, + "step": 315 + }, + { + "epoch": 0.2920517560073937, + "grad_norm": 0.6409662961959839, + "learning_rate": 0.00016241442196925223, + "loss": 1.0901, + "step": 316 + }, + { + "epoch": 0.2929759704251386, + "grad_norm": 0.6512841582298279, + "learning_rate": 0.00016218518458666008, + "loss": 0.9297, + "step": 317 + }, + { + "epoch": 0.29390018484288355, + "grad_norm": 0.7617345452308655, + "learning_rate": 0.00016195541313577923, + "loss": 1.0534, + "step": 318 + }, + { + "epoch": 0.2948243992606285, + "grad_norm": 0.8643571734428406, + "learning_rate": 0.00016172510958996795, + "loss": 1.1504, + "step": 319 + }, + { + "epoch": 0.2957486136783734, + "grad_norm": 0.6329824924468994, + "learning_rate": 0.00016149427592715432, + "loss": 0.9054, + "step": 320 + }, + { + "epoch": 0.2966728280961183, + "grad_norm": 0.8242455124855042, + "learning_rate": 0.0001612629141298192, + "loss": 0.9692, + "step": 321 + }, + { + "epoch": 0.2975970425138632, + "grad_norm": 0.668005108833313, + "learning_rate": 0.00016103102618497922, + "loss": 0.8778, + "step": 322 + }, + { + "epoch": 0.29852125693160814, + "grad_norm": 0.6514158248901367, + "learning_rate": 0.00016079861408416985, + "loss": 1.0045, + "step": 323 + }, + { + "epoch": 0.29944547134935307, + "grad_norm": 0.6487702131271362, + "learning_rate": 0.00016056567982342817, + "loss": 1.0774, + "step": 324 + }, + { + "epoch": 0.300369685767098, + "grad_norm": 0.6633313894271851, + "learning_rate": 0.00016033222540327567, + "loss": 1.0107, + "step": 325 + }, + { + "epoch": 0.30129390018484287, + "grad_norm": 0.6318841576576233, + "learning_rate": 0.00016009825282870126, + "loss": 1.0179, + "step": 326 + }, + { + "epoch": 0.3022181146025878, + "grad_norm": 0.8350809812545776, + "learning_rate": 0.00015986376410914388, + "loss": 1.1638, + "step": 327 + }, + { + "epoch": 0.3031423290203327, + "grad_norm": 0.6731352210044861, + "learning_rate": 0.00015962876125847535, + "loss": 0.9473, + "step": 328 + }, + { + "epoch": 0.30406654343807765, + "grad_norm": 0.735564112663269, + "learning_rate": 0.00015939324629498294, + "loss": 1.1567, + "step": 329 + }, + { + "epoch": 0.3049907578558225, + "grad_norm": 0.6614008545875549, + "learning_rate": 0.00015915722124135227, + "loss": 0.9406, + "step": 330 + }, + { + "epoch": 0.30591497227356745, + "grad_norm": 0.6943185329437256, + "learning_rate": 0.00015892068812464963, + "loss": 0.9803, + "step": 331 + }, + { + "epoch": 0.3068391866913124, + "grad_norm": 0.6823167204856873, + "learning_rate": 0.0001586836489763049, + "loss": 0.9948, + "step": 332 + }, + { + "epoch": 0.3077634011090573, + "grad_norm": 0.7684437036514282, + "learning_rate": 0.00015844610583209373, + "loss": 1.0388, + "step": 333 + }, + { + "epoch": 0.30868761552680224, + "grad_norm": 0.7264105677604675, + "learning_rate": 0.00015820806073212055, + "loss": 1.0603, + "step": 334 + }, + { + "epoch": 0.3096118299445471, + "grad_norm": 0.5998838543891907, + "learning_rate": 0.00015796951572080047, + "loss": 0.9361, + "step": 335 + }, + { + "epoch": 0.31053604436229204, + "grad_norm": 0.6764875650405884, + "learning_rate": 0.0001577304728468422, + "loss": 1.1427, + "step": 336 + }, + { + "epoch": 0.31146025878003697, + "grad_norm": 0.6622886061668396, + "learning_rate": 0.00015749093416323024, + "loss": 0.9299, + "step": 337 + }, + { + "epoch": 0.3123844731977819, + "grad_norm": 0.6334503889083862, + "learning_rate": 0.0001572509017272072, + "loss": 0.9834, + "step": 338 + }, + { + "epoch": 0.3133086876155268, + "grad_norm": 0.6898949146270752, + "learning_rate": 0.0001570103776002563, + "loss": 1.0888, + "step": 339 + }, + { + "epoch": 0.3142329020332717, + "grad_norm": 0.9040470719337463, + "learning_rate": 0.00015676936384808354, + "loss": 1.0985, + "step": 340 + }, + { + "epoch": 0.31515711645101663, + "grad_norm": 8.731682777404785, + "learning_rate": 0.00015652786254059998, + "loss": 1.0993, + "step": 341 + }, + { + "epoch": 0.31608133086876156, + "grad_norm": 0.6911963224411011, + "learning_rate": 0.00015628587575190395, + "loss": 0.9791, + "step": 342 + }, + { + "epoch": 0.3170055452865065, + "grad_norm": 0.6241868138313293, + "learning_rate": 0.0001560434055602634, + "loss": 0.9013, + "step": 343 + }, + { + "epoch": 0.3179297597042514, + "grad_norm": 0.6385155320167542, + "learning_rate": 0.00015580045404809772, + "loss": 1.0251, + "step": 344 + }, + { + "epoch": 0.3188539741219963, + "grad_norm": 0.7352232933044434, + "learning_rate": 0.00015555702330196023, + "loss": 0.8741, + "step": 345 + }, + { + "epoch": 0.3197781885397412, + "grad_norm": 0.6918854713439941, + "learning_rate": 0.00015531311541251995, + "loss": 1.0707, + "step": 346 + }, + { + "epoch": 0.32070240295748614, + "grad_norm": 0.5709326863288879, + "learning_rate": 0.00015506873247454384, + "loss": 0.844, + "step": 347 + }, + { + "epoch": 0.32162661737523107, + "grad_norm": 0.6701300144195557, + "learning_rate": 0.00015482387658687875, + "loss": 0.9212, + "step": 348 + }, + { + "epoch": 0.32255083179297594, + "grad_norm": 0.7593115568161011, + "learning_rate": 0.0001545785498524333, + "loss": 0.9909, + "step": 349 + }, + { + "epoch": 0.3234750462107209, + "grad_norm": 0.6131500005722046, + "learning_rate": 0.00015433275437816004, + "loss": 0.9188, + "step": 350 + }, + { + "epoch": 0.3243992606284658, + "grad_norm": 1.0076595544815063, + "learning_rate": 0.00015408649227503714, + "loss": 1.0796, + "step": 351 + }, + { + "epoch": 0.32532347504621073, + "grad_norm": 0.6694477200508118, + "learning_rate": 0.00015383976565805035, + "loss": 1.0788, + "step": 352 + }, + { + "epoch": 0.32624768946395566, + "grad_norm": 0.7312178611755371, + "learning_rate": 0.00015359257664617485, + "loss": 1.3597, + "step": 353 + }, + { + "epoch": 0.32717190388170053, + "grad_norm": 0.6947386264801025, + "learning_rate": 0.00015334492736235705, + "loss": 0.9992, + "step": 354 + }, + { + "epoch": 0.32809611829944546, + "grad_norm": 0.7701761722564697, + "learning_rate": 0.00015309681993349626, + "loss": 1.1137, + "step": 355 + }, + { + "epoch": 0.3290203327171904, + "grad_norm": 0.7784320116043091, + "learning_rate": 0.00015284825649042655, + "loss": 1.0823, + "step": 356 + }, + { + "epoch": 0.3299445471349353, + "grad_norm": 0.7495863437652588, + "learning_rate": 0.00015259923916789844, + "loss": 1.1035, + "step": 357 + }, + { + "epoch": 0.33086876155268025, + "grad_norm": 0.6265072226524353, + "learning_rate": 0.00015234977010456047, + "loss": 0.9308, + "step": 358 + }, + { + "epoch": 0.3317929759704251, + "grad_norm": 0.9008778929710388, + "learning_rate": 0.0001520998514429409, + "loss": 1.0843, + "step": 359 + }, + { + "epoch": 0.33271719038817005, + "grad_norm": 0.6419479250907898, + "learning_rate": 0.00015184948532942928, + "loss": 1.0143, + "step": 360 + }, + { + "epoch": 0.333641404805915, + "grad_norm": 0.7432788610458374, + "learning_rate": 0.0001515986739142581, + "loss": 1.1297, + "step": 361 + }, + { + "epoch": 0.3345656192236599, + "grad_norm": 0.659816324710846, + "learning_rate": 0.0001513474193514842, + "loss": 0.9952, + "step": 362 + }, + { + "epoch": 0.33548983364140483, + "grad_norm": 0.6711155772209167, + "learning_rate": 0.00015109572379897035, + "loss": 1.0346, + "step": 363 + }, + { + "epoch": 0.3364140480591497, + "grad_norm": 0.6461386680603027, + "learning_rate": 0.0001508435894183667, + "loss": 0.9187, + "step": 364 + }, + { + "epoch": 0.33733826247689463, + "grad_norm": 0.6412788033485413, + "learning_rate": 0.0001505910183750922, + "loss": 1.0141, + "step": 365 + }, + { + "epoch": 0.33826247689463956, + "grad_norm": 0.6505403518676758, + "learning_rate": 0.00015033801283831596, + "loss": 0.9602, + "step": 366 + }, + { + "epoch": 0.3391866913123845, + "grad_norm": 0.6988029479980469, + "learning_rate": 0.00015008457498093882, + "loss": 1.0409, + "step": 367 + }, + { + "epoch": 0.34011090573012936, + "grad_norm": 0.7848196029663086, + "learning_rate": 0.00014983070697957438, + "loss": 1.0929, + "step": 368 + }, + { + "epoch": 0.3410351201478743, + "grad_norm": 0.7550395131111145, + "learning_rate": 0.00014957641101453055, + "loss": 1.1429, + "step": 369 + }, + { + "epoch": 0.3419593345656192, + "grad_norm": 0.5989096760749817, + "learning_rate": 0.00014932168926979074, + "loss": 0.7855, + "step": 370 + }, + { + "epoch": 0.34288354898336415, + "grad_norm": 0.6776415109634399, + "learning_rate": 0.0001490665439329951, + "loss": 1.102, + "step": 371 + }, + { + "epoch": 0.3438077634011091, + "grad_norm": 0.6827484369277954, + "learning_rate": 0.00014881097719542173, + "loss": 0.8986, + "step": 372 + }, + { + "epoch": 0.34473197781885395, + "grad_norm": 0.6277738213539124, + "learning_rate": 0.00014855499125196784, + "loss": 0.9051, + "step": 373 + }, + { + "epoch": 0.3456561922365989, + "grad_norm": 0.6909447312355042, + "learning_rate": 0.000148298588301131, + "loss": 0.9459, + "step": 374 + }, + { + "epoch": 0.3465804066543438, + "grad_norm": 0.7616958022117615, + "learning_rate": 0.00014804177054499016, + "loss": 1.0556, + "step": 375 + }, + { + "epoch": 0.34750462107208874, + "grad_norm": 0.5968858599662781, + "learning_rate": 0.0001477845401891867, + "loss": 0.8668, + "step": 376 + }, + { + "epoch": 0.34842883548983367, + "grad_norm": 0.9198528528213501, + "learning_rate": 0.00014752689944290564, + "loss": 1.2407, + "step": 377 + }, + { + "epoch": 0.34935304990757854, + "grad_norm": 0.7038524746894836, + "learning_rate": 0.00014726885051885653, + "loss": 1.1057, + "step": 378 + }, + { + "epoch": 0.35027726432532347, + "grad_norm": 0.6649475693702698, + "learning_rate": 0.00014701039563325453, + "loss": 0.8731, + "step": 379 + }, + { + "epoch": 0.3512014787430684, + "grad_norm": 0.7103481888771057, + "learning_rate": 0.00014675153700580124, + "loss": 1.1009, + "step": 380 + }, + { + "epoch": 0.3521256931608133, + "grad_norm": 0.6922548413276672, + "learning_rate": 0.00014649227685966588, + "loss": 1.0636, + "step": 381 + }, + { + "epoch": 0.35304990757855825, + "grad_norm": 0.7133587002754211, + "learning_rate": 0.00014623261742146602, + "loss": 1.097, + "step": 382 + }, + { + "epoch": 0.3539741219963031, + "grad_norm": 0.581001877784729, + "learning_rate": 0.00014597256092124832, + "loss": 0.7147, + "step": 383 + }, + { + "epoch": 0.35489833641404805, + "grad_norm": 0.7503137588500977, + "learning_rate": 0.00014571210959246988, + "loss": 0.9929, + "step": 384 + }, + { + "epoch": 0.355822550831793, + "grad_norm": 0.6988643407821655, + "learning_rate": 0.00014545126567197834, + "loss": 1.1311, + "step": 385 + }, + { + "epoch": 0.3567467652495379, + "grad_norm": 0.6308209300041199, + "learning_rate": 0.00014519003139999337, + "loss": 1.0195, + "step": 386 + }, + { + "epoch": 0.3576709796672828, + "grad_norm": 0.6229655742645264, + "learning_rate": 0.000144928409020087, + "loss": 0.9711, + "step": 387 + }, + { + "epoch": 0.3585951940850277, + "grad_norm": 0.6783496737480164, + "learning_rate": 0.0001446664007791644, + "loss": 1.0427, + "step": 388 + }, + { + "epoch": 0.35951940850277264, + "grad_norm": 0.5948294401168823, + "learning_rate": 0.00014440400892744477, + "loss": 0.8694, + "step": 389 + }, + { + "epoch": 0.36044362292051757, + "grad_norm": 0.7050589323043823, + "learning_rate": 0.00014414123571844178, + "loss": 1.1643, + "step": 390 + }, + { + "epoch": 0.3613678373382625, + "grad_norm": 0.6779505610466003, + "learning_rate": 0.00014387808340894444, + "loss": 1.0825, + "step": 391 + }, + { + "epoch": 0.36229205175600737, + "grad_norm": 0.6414440870285034, + "learning_rate": 0.00014361455425899756, + "loss": 0.866, + "step": 392 + }, + { + "epoch": 0.3632162661737523, + "grad_norm": 0.7097681164741516, + "learning_rate": 0.0001433506505318823, + "loss": 0.9513, + "step": 393 + }, + { + "epoch": 0.36414048059149723, + "grad_norm": 0.675818145275116, + "learning_rate": 0.00014308637449409706, + "loss": 1.1676, + "step": 394 + }, + { + "epoch": 0.36506469500924216, + "grad_norm": 0.7372836470603943, + "learning_rate": 0.0001428217284153375, + "loss": 1.0957, + "step": 395 + }, + { + "epoch": 0.3659889094269871, + "grad_norm": 0.6622201800346375, + "learning_rate": 0.00014255671456847748, + "loss": 1.0058, + "step": 396 + }, + { + "epoch": 0.36691312384473196, + "grad_norm": 0.7442257404327393, + "learning_rate": 0.00014229133522954937, + "loss": 1.191, + "step": 397 + }, + { + "epoch": 0.3678373382624769, + "grad_norm": 1.1466059684753418, + "learning_rate": 0.00014202559267772444, + "loss": 1.2428, + "step": 398 + }, + { + "epoch": 0.3687615526802218, + "grad_norm": 0.7353366017341614, + "learning_rate": 0.0001417594891952935, + "loss": 0.9798, + "step": 399 + }, + { + "epoch": 0.36968576709796674, + "grad_norm": 0.5790997743606567, + "learning_rate": 0.00014149302706764697, + "loss": 1.054, + "step": 400 + }, + { + "epoch": 0.3706099815157117, + "grad_norm": 0.735878050327301, + "learning_rate": 0.00014122620858325564, + "loss": 0.9708, + "step": 401 + }, + { + "epoch": 0.37153419593345655, + "grad_norm": 0.7378543019294739, + "learning_rate": 0.00014095903603365066, + "loss": 1.1358, + "step": 402 + }, + { + "epoch": 0.3724584103512015, + "grad_norm": 0.8758243918418884, + "learning_rate": 0.0001406915117134041, + "loss": 1.097, + "step": 403 + }, + { + "epoch": 0.3733826247689464, + "grad_norm": 0.7630063891410828, + "learning_rate": 0.0001404236379201091, + "loss": 1.0084, + "step": 404 + }, + { + "epoch": 0.37430683918669133, + "grad_norm": 0.6138845086097717, + "learning_rate": 0.00014015541695436027, + "loss": 0.9815, + "step": 405 + }, + { + "epoch": 0.3752310536044362, + "grad_norm": 0.6343159079551697, + "learning_rate": 0.00013988685111973384, + "loss": 1.0915, + "step": 406 + }, + { + "epoch": 0.37615526802218113, + "grad_norm": 0.6707723736763, + "learning_rate": 0.00013961794272276788, + "loss": 1.0365, + "step": 407 + }, + { + "epoch": 0.37707948243992606, + "grad_norm": 0.605938732624054, + "learning_rate": 0.00013934869407294245, + "loss": 0.9873, + "step": 408 + }, + { + "epoch": 0.378003696857671, + "grad_norm": 0.6962162256240845, + "learning_rate": 0.0001390791074826599, + "loss": 0.9836, + "step": 409 + }, + { + "epoch": 0.3789279112754159, + "grad_norm": 0.6543064117431641, + "learning_rate": 0.00013880918526722497, + "loss": 0.9754, + "step": 410 + }, + { + "epoch": 0.3798521256931608, + "grad_norm": 0.6104928851127625, + "learning_rate": 0.00013853892974482476, + "loss": 0.8943, + "step": 411 + }, + { + "epoch": 0.3807763401109057, + "grad_norm": 0.6469597816467285, + "learning_rate": 0.000138268343236509, + "loss": 1.0806, + "step": 412 + }, + { + "epoch": 0.38170055452865065, + "grad_norm": 0.6169907450675964, + "learning_rate": 0.00013799742806616994, + "loss": 1.0747, + "step": 413 + }, + { + "epoch": 0.3826247689463956, + "grad_norm": 0.7273542881011963, + "learning_rate": 0.0001377261865605227, + "loss": 1.0315, + "step": 414 + }, + { + "epoch": 0.3835489833641405, + "grad_norm": 0.7549827098846436, + "learning_rate": 0.00013745462104908487, + "loss": 1.0283, + "step": 415 + }, + { + "epoch": 0.3844731977818854, + "grad_norm": 0.6836807131767273, + "learning_rate": 0.0001371827338641568, + "loss": 1.0632, + "step": 416 + }, + { + "epoch": 0.3853974121996303, + "grad_norm": 0.6877472400665283, + "learning_rate": 0.00013691052734080152, + "loss": 1.017, + "step": 417 + }, + { + "epoch": 0.38632162661737524, + "grad_norm": 0.7163965106010437, + "learning_rate": 0.00013663800381682464, + "loss": 1.1975, + "step": 418 + }, + { + "epoch": 0.38724584103512016, + "grad_norm": 0.7615072727203369, + "learning_rate": 0.00013636516563275418, + "loss": 1.028, + "step": 419 + }, + { + "epoch": 0.38817005545286504, + "grad_norm": 0.696151614189148, + "learning_rate": 0.00013609201513182075, + "loss": 0.9486, + "step": 420 + }, + { + "epoch": 0.38909426987060997, + "grad_norm": 0.6646804809570312, + "learning_rate": 0.0001358185546599371, + "loss": 1.1025, + "step": 421 + }, + { + "epoch": 0.3900184842883549, + "grad_norm": 0.7286469340324402, + "learning_rate": 0.00013554478656567818, + "loss": 1.0797, + "step": 422 + }, + { + "epoch": 0.3909426987060998, + "grad_norm": 0.6299863457679749, + "learning_rate": 0.0001352707132002609, + "loss": 0.94, + "step": 423 + }, + { + "epoch": 0.39186691312384475, + "grad_norm": 0.7259340882301331, + "learning_rate": 0.0001349963369175239, + "loss": 1.1681, + "step": 424 + }, + { + "epoch": 0.3927911275415896, + "grad_norm": 0.6097292900085449, + "learning_rate": 0.00013472166007390753, + "loss": 0.9657, + "step": 425 + }, + { + "epoch": 0.39371534195933455, + "grad_norm": 0.6440349817276001, + "learning_rate": 0.0001344466850284333, + "loss": 0.9537, + "step": 426 + }, + { + "epoch": 0.3946395563770795, + "grad_norm": 0.7018892765045166, + "learning_rate": 0.00013417141414268384, + "loss": 1.0847, + "step": 427 + }, + { + "epoch": 0.3955637707948244, + "grad_norm": 0.6364416480064392, + "learning_rate": 0.00013389584978078258, + "loss": 0.9768, + "step": 428 + }, + { + "epoch": 0.39648798521256934, + "grad_norm": 0.7663088440895081, + "learning_rate": 0.00013361999430937338, + "loss": 1.203, + "step": 429 + }, + { + "epoch": 0.3974121996303142, + "grad_norm": 0.6121943593025208, + "learning_rate": 0.00013334385009760032, + "loss": 1.0377, + "step": 430 + }, + { + "epoch": 0.39833641404805914, + "grad_norm": 0.7227523922920227, + "learning_rate": 0.00013306741951708723, + "loss": 1.1192, + "step": 431 + }, + { + "epoch": 0.39926062846580407, + "grad_norm": 0.7231754064559937, + "learning_rate": 0.00013279070494191737, + "loss": 1.1585, + "step": 432 + }, + { + "epoch": 0.400184842883549, + "grad_norm": 0.7629643678665161, + "learning_rate": 0.0001325137087486131, + "loss": 1.1305, + "step": 433 + }, + { + "epoch": 0.4011090573012939, + "grad_norm": 0.6507116556167603, + "learning_rate": 0.00013223643331611537, + "loss": 0.9322, + "step": 434 + }, + { + "epoch": 0.4020332717190388, + "grad_norm": 0.6998779773712158, + "learning_rate": 0.00013195888102576336, + "loss": 0.9471, + "step": 435 + }, + { + "epoch": 0.4029574861367837, + "grad_norm": 0.7487782835960388, + "learning_rate": 0.000131681054261274, + "loss": 1.1667, + "step": 436 + }, + { + "epoch": 0.40388170055452866, + "grad_norm": 0.6665334105491638, + "learning_rate": 0.0001314029554087214, + "loss": 1.0733, + "step": 437 + }, + { + "epoch": 0.4048059149722736, + "grad_norm": 0.8053660988807678, + "learning_rate": 0.00013112458685651668, + "loss": 1.0943, + "step": 438 + }, + { + "epoch": 0.40573012939001846, + "grad_norm": 0.7646785974502563, + "learning_rate": 0.00013084595099538705, + "loss": 1.2028, + "step": 439 + }, + { + "epoch": 0.4066543438077634, + "grad_norm": 0.6794949769973755, + "learning_rate": 0.00013056705021835546, + "loss": 1.0584, + "step": 440 + }, + { + "epoch": 0.4075785582255083, + "grad_norm": 0.8115156888961792, + "learning_rate": 0.00013028788692072025, + "loss": 0.9457, + "step": 441 + }, + { + "epoch": 0.40850277264325324, + "grad_norm": 0.6334480047225952, + "learning_rate": 0.0001300084635000341, + "loss": 0.9663, + "step": 442 + }, + { + "epoch": 0.40942698706099817, + "grad_norm": 0.6951630115509033, + "learning_rate": 0.0001297287823560839, + "loss": 0.9697, + "step": 443 + }, + { + "epoch": 0.41035120147874304, + "grad_norm": 0.7049196362495422, + "learning_rate": 0.00012944884589086993, + "loss": 1.2208, + "step": 444 + }, + { + "epoch": 0.411275415896488, + "grad_norm": 0.6848672032356262, + "learning_rate": 0.00012916865650858527, + "loss": 0.9456, + "step": 445 + }, + { + "epoch": 0.4121996303142329, + "grad_norm": 0.7070568203926086, + "learning_rate": 0.00012888821661559508, + "loss": 0.8987, + "step": 446 + }, + { + "epoch": 0.41312384473197783, + "grad_norm": 0.5700350403785706, + "learning_rate": 0.00012860752862041603, + "loss": 0.9219, + "step": 447 + }, + { + "epoch": 0.41404805914972276, + "grad_norm": 0.6008641719818115, + "learning_rate": 0.00012832659493369557, + "loss": 0.945, + "step": 448 + }, + { + "epoch": 0.41497227356746763, + "grad_norm": 0.8899533152580261, + "learning_rate": 0.0001280454179681913, + "loss": 1.2305, + "step": 449 + }, + { + "epoch": 0.41589648798521256, + "grad_norm": 0.6918568015098572, + "learning_rate": 0.00012776400013875006, + "loss": 1.0077, + "step": 450 + }, + { + "epoch": 0.4168207024029575, + "grad_norm": 0.8641760349273682, + "learning_rate": 0.00012748234386228746, + "loss": 1.0395, + "step": 451 + }, + { + "epoch": 0.4177449168207024, + "grad_norm": 0.7230687737464905, + "learning_rate": 0.0001272004515577668, + "loss": 0.8811, + "step": 452 + }, + { + "epoch": 0.41866913123844735, + "grad_norm": 0.7071014046669006, + "learning_rate": 0.00012691832564617865, + "loss": 1.1364, + "step": 453 + }, + { + "epoch": 0.4195933456561922, + "grad_norm": 0.7187917828559875, + "learning_rate": 0.0001266359685505198, + "loss": 1.0832, + "step": 454 + }, + { + "epoch": 0.42051756007393715, + "grad_norm": 0.6126280426979065, + "learning_rate": 0.00012635338269577248, + "loss": 1.0355, + "step": 455 + }, + { + "epoch": 0.4214417744916821, + "grad_norm": 0.6394835710525513, + "learning_rate": 0.0001260705705088837, + "loss": 1.0759, + "step": 456 + }, + { + "epoch": 0.422365988909427, + "grad_norm": 0.6926038265228271, + "learning_rate": 0.00012578753441874416, + "loss": 1.0639, + "step": 457 + }, + { + "epoch": 0.4232902033271719, + "grad_norm": 0.6955393552780151, + "learning_rate": 0.00012550427685616765, + "loss": 1.0496, + "step": 458 + }, + { + "epoch": 0.4242144177449168, + "grad_norm": 0.6321708559989929, + "learning_rate": 0.00012522080025386995, + "loss": 1.0173, + "step": 459 + }, + { + "epoch": 0.42513863216266173, + "grad_norm": 0.5871878862380981, + "learning_rate": 0.00012493710704644805, + "loss": 0.9312, + "step": 460 + }, + { + "epoch": 0.42606284658040666, + "grad_norm": 0.7447975277900696, + "learning_rate": 0.00012465319967035925, + "loss": 1.0089, + "step": 461 + }, + { + "epoch": 0.4269870609981516, + "grad_norm": 0.7522186040878296, + "learning_rate": 0.0001243690805639002, + "loss": 1.051, + "step": 462 + }, + { + "epoch": 0.42791127541589646, + "grad_norm": 0.882038414478302, + "learning_rate": 0.0001240847521671859, + "loss": 1.2995, + "step": 463 + }, + { + "epoch": 0.4288354898336414, + "grad_norm": 0.6197600960731506, + "learning_rate": 0.00012380021692212894, + "loss": 1.0778, + "step": 464 + }, + { + "epoch": 0.4297597042513863, + "grad_norm": 0.5964616537094116, + "learning_rate": 0.00012351547727241824, + "loss": 0.9789, + "step": 465 + }, + { + "epoch": 0.43068391866913125, + "grad_norm": 0.6916967630386353, + "learning_rate": 0.00012323053566349834, + "loss": 0.9791, + "step": 466 + }, + { + "epoch": 0.4316081330868762, + "grad_norm": 0.7413254976272583, + "learning_rate": 0.00012294539454254822, + "loss": 1.0098, + "step": 467 + }, + { + "epoch": 0.43253234750462105, + "grad_norm": 0.7243200540542603, + "learning_rate": 0.00012266005635846037, + "loss": 1.0558, + "step": 468 + }, + { + "epoch": 0.433456561922366, + "grad_norm": 0.6610310673713684, + "learning_rate": 0.00012237452356181968, + "loss": 1.0434, + "step": 469 + }, + { + "epoch": 0.4343807763401109, + "grad_norm": 0.6901101469993591, + "learning_rate": 0.0001220887986048825, + "loss": 0.9747, + "step": 470 + }, + { + "epoch": 0.43530499075785584, + "grad_norm": 0.676794707775116, + "learning_rate": 0.00012180288394155547, + "loss": 1.1224, + "step": 471 + }, + { + "epoch": 0.43622920517560076, + "grad_norm": 0.7171177268028259, + "learning_rate": 0.00012151678202737456, + "loss": 1.1443, + "step": 472 + }, + { + "epoch": 0.43715341959334564, + "grad_norm": 0.766306459903717, + "learning_rate": 0.00012123049531948381, + "loss": 0.8906, + "step": 473 + }, + { + "epoch": 0.43807763401109057, + "grad_norm": 0.7548720836639404, + "learning_rate": 0.00012094402627661447, + "loss": 1.0674, + "step": 474 + }, + { + "epoch": 0.4390018484288355, + "grad_norm": 0.7553579807281494, + "learning_rate": 0.00012065737735906361, + "loss": 0.9348, + "step": 475 + }, + { + "epoch": 0.4399260628465804, + "grad_norm": 0.6829140186309814, + "learning_rate": 0.00012037055102867321, + "loss": 1.0239, + "step": 476 + }, + { + "epoch": 0.4408502772643253, + "grad_norm": 0.6292818784713745, + "learning_rate": 0.00012008354974880896, + "loss": 1.0139, + "step": 477 + }, + { + "epoch": 0.4417744916820702, + "grad_norm": 0.6471489071846008, + "learning_rate": 0.00011979637598433899, + "loss": 1.0157, + "step": 478 + }, + { + "epoch": 0.44269870609981515, + "grad_norm": 0.7179769277572632, + "learning_rate": 0.00011950903220161285, + "loss": 1.1263, + "step": 479 + }, + { + "epoch": 0.4436229205175601, + "grad_norm": 0.6089410185813904, + "learning_rate": 0.00011922152086844023, + "loss": 1.0204, + "step": 480 + }, + { + "epoch": 0.444547134935305, + "grad_norm": 0.6487398743629456, + "learning_rate": 0.00011893384445406983, + "loss": 1.0548, + "step": 481 + }, + { + "epoch": 0.4454713493530499, + "grad_norm": 0.9229975938796997, + "learning_rate": 0.00011864600542916813, + "loss": 1.0672, + "step": 482 + }, + { + "epoch": 0.4463955637707948, + "grad_norm": 0.6691107153892517, + "learning_rate": 0.00011835800626579814, + "loss": 1.229, + "step": 483 + }, + { + "epoch": 0.44731977818853974, + "grad_norm": 0.7045539617538452, + "learning_rate": 0.00011806984943739821, + "loss": 1.0232, + "step": 484 + }, + { + "epoch": 0.44824399260628467, + "grad_norm": 0.5709706544876099, + "learning_rate": 0.00011778153741876081, + "loss": 0.8613, + "step": 485 + }, + { + "epoch": 0.4491682070240296, + "grad_norm": 0.6500287055969238, + "learning_rate": 0.00011749307268601111, + "loss": 1.0544, + "step": 486 + }, + { + "epoch": 0.45009242144177447, + "grad_norm": 0.671847403049469, + "learning_rate": 0.00011720445771658604, + "loss": 1.0416, + "step": 487 + }, + { + "epoch": 0.4510166358595194, + "grad_norm": 0.7322655916213989, + "learning_rate": 0.00011691569498921264, + "loss": 1.1538, + "step": 488 + }, + { + "epoch": 0.4519408502772643, + "grad_norm": 0.6459632515907288, + "learning_rate": 0.00011662678698388702, + "loss": 1.2394, + "step": 489 + }, + { + "epoch": 0.45286506469500926, + "grad_norm": 0.7091917395591736, + "learning_rate": 0.00011633773618185302, + "loss": 1.1348, + "step": 490 + }, + { + "epoch": 0.4537892791127542, + "grad_norm": 0.8139805793762207, + "learning_rate": 0.00011604854506558083, + "loss": 1.0216, + "step": 491 + }, + { + "epoch": 0.45471349353049906, + "grad_norm": 0.6534249186515808, + "learning_rate": 0.00011575921611874565, + "loss": 0.893, + "step": 492 + }, + { + "epoch": 0.455637707948244, + "grad_norm": 0.7494158744812012, + "learning_rate": 0.00011546975182620656, + "loss": 1.0213, + "step": 493 + }, + { + "epoch": 0.4565619223659889, + "grad_norm": 0.6042291522026062, + "learning_rate": 0.00011518015467398489, + "loss": 0.9325, + "step": 494 + }, + { + "epoch": 0.45748613678373384, + "grad_norm": 0.6219349503517151, + "learning_rate": 0.00011489042714924312, + "loss": 1.2748, + "step": 495 + }, + { + "epoch": 0.4584103512014787, + "grad_norm": 0.6438570618629456, + "learning_rate": 0.00011460057174026335, + "loss": 0.9067, + "step": 496 + }, + { + "epoch": 0.45933456561922364, + "grad_norm": 0.641319215297699, + "learning_rate": 0.00011431059093642605, + "loss": 0.8733, + "step": 497 + }, + { + "epoch": 0.4602587800369686, + "grad_norm": 0.735232949256897, + "learning_rate": 0.0001140204872281886, + "loss": 1.0106, + "step": 498 + }, + { + "epoch": 0.4611829944547135, + "grad_norm": 0.6639251112937927, + "learning_rate": 0.00011373026310706386, + "loss": 1.0014, + "step": 499 + }, + { + "epoch": 0.46210720887245843, + "grad_norm": 0.6597011089324951, + "learning_rate": 0.00011343992106559898, + "loss": 1.0411, + "step": 500 + }, + { + "epoch": 0.4630314232902033, + "grad_norm": 0.5826075673103333, + "learning_rate": 0.00011314946359735373, + "loss": 0.8324, + "step": 501 + }, + { + "epoch": 0.46395563770794823, + "grad_norm": 0.6745023131370544, + "learning_rate": 0.00011285889319687923, + "loss": 1.1705, + "step": 502 + }, + { + "epoch": 0.46487985212569316, + "grad_norm": 0.7299513816833496, + "learning_rate": 0.00011256821235969657, + "loss": 1.2169, + "step": 503 + }, + { + "epoch": 0.4658040665434381, + "grad_norm": 0.9269928336143494, + "learning_rate": 0.00011227742358227522, + "loss": 1.283, + "step": 504 + }, + { + "epoch": 0.466728280961183, + "grad_norm": 0.6437788605690002, + "learning_rate": 0.00011198652936201175, + "loss": 0.8839, + "step": 505 + }, + { + "epoch": 0.4676524953789279, + "grad_norm": 0.7397333383560181, + "learning_rate": 0.00011169553219720828, + "loss": 1.2167, + "step": 506 + }, + { + "epoch": 0.4685767097966728, + "grad_norm": 0.6261634230613708, + "learning_rate": 0.00011140443458705096, + "loss": 0.9214, + "step": 507 + }, + { + "epoch": 0.46950092421441775, + "grad_norm": 0.6012060046195984, + "learning_rate": 0.00011111323903158885, + "loss": 0.9218, + "step": 508 + }, + { + "epoch": 0.4704251386321627, + "grad_norm": 0.6968507766723633, + "learning_rate": 0.0001108219480317119, + "loss": 0.9261, + "step": 509 + }, + { + "epoch": 0.4713493530499076, + "grad_norm": 0.6878217458724976, + "learning_rate": 0.00011053056408913001, + "loss": 0.9844, + "step": 510 + }, + { + "epoch": 0.4722735674676525, + "grad_norm": 0.6315551996231079, + "learning_rate": 0.00011023908970635115, + "loss": 1.0012, + "step": 511 + }, + { + "epoch": 0.4731977818853974, + "grad_norm": 0.5573776960372925, + "learning_rate": 0.0001099475273866601, + "loss": 0.7817, + "step": 512 + }, + { + "epoch": 0.47412199630314233, + "grad_norm": 0.6290990710258484, + "learning_rate": 0.00010965587963409682, + "loss": 1.0575, + "step": 513 + }, + { + "epoch": 0.47504621072088726, + "grad_norm": 0.6656198501586914, + "learning_rate": 0.0001093641489534351, + "loss": 0.8813, + "step": 514 + }, + { + "epoch": 0.47597042513863214, + "grad_norm": 0.7537880539894104, + "learning_rate": 0.0001090723378501608, + "loss": 0.912, + "step": 515 + }, + { + "epoch": 0.47689463955637706, + "grad_norm": 0.6261211037635803, + "learning_rate": 0.0001087804488304506, + "loss": 0.8927, + "step": 516 + }, + { + "epoch": 0.477818853974122, + "grad_norm": 0.6701942682266235, + "learning_rate": 0.00010848848440115028, + "loss": 1.0891, + "step": 517 + }, + { + "epoch": 0.4787430683918669, + "grad_norm": 0.6747245788574219, + "learning_rate": 0.00010819644706975332, + "loss": 1.1814, + "step": 518 + }, + { + "epoch": 0.47966728280961185, + "grad_norm": 0.6291522979736328, + "learning_rate": 0.00010790433934437922, + "loss": 0.9048, + "step": 519 + }, + { + "epoch": 0.4805914972273567, + "grad_norm": 0.6312196254730225, + "learning_rate": 0.00010761216373375221, + "loss": 0.9192, + "step": 520 + }, + { + "epoch": 0.48151571164510165, + "grad_norm": 0.6374568343162537, + "learning_rate": 0.00010731992274717937, + "loss": 1.0835, + "step": 521 + }, + { + "epoch": 0.4824399260628466, + "grad_norm": 0.7122923731803894, + "learning_rate": 0.0001070276188945293, + "loss": 1.2499, + "step": 522 + }, + { + "epoch": 0.4833641404805915, + "grad_norm": 0.667521595954895, + "learning_rate": 0.0001067352546862106, + "loss": 0.9689, + "step": 523 + }, + { + "epoch": 0.48428835489833644, + "grad_norm": 0.6109979152679443, + "learning_rate": 0.00010644283263315014, + "loss": 0.9237, + "step": 524 + }, + { + "epoch": 0.4852125693160813, + "grad_norm": 0.6356685161590576, + "learning_rate": 0.0001061503552467716, + "loss": 1.0189, + "step": 525 + }, + { + "epoch": 0.48613678373382624, + "grad_norm": 0.6923561096191406, + "learning_rate": 0.00010585782503897388, + "loss": 1.1093, + "step": 526 + }, + { + "epoch": 0.48706099815157117, + "grad_norm": 0.6677191257476807, + "learning_rate": 0.00010556524452210952, + "loss": 0.9803, + "step": 527 + }, + { + "epoch": 0.4879852125693161, + "grad_norm": 0.5721224546432495, + "learning_rate": 0.00010527261620896323, + "loss": 0.8944, + "step": 528 + }, + { + "epoch": 0.488909426987061, + "grad_norm": 0.6035501956939697, + "learning_rate": 0.00010497994261273006, + "loss": 0.7696, + "step": 529 + }, + { + "epoch": 0.4898336414048059, + "grad_norm": 0.7504411935806274, + "learning_rate": 0.00010468722624699401, + "loss": 1.1699, + "step": 530 + }, + { + "epoch": 0.4907578558225508, + "grad_norm": 0.7169279456138611, + "learning_rate": 0.00010439446962570652, + "loss": 1.0684, + "step": 531 + }, + { + "epoch": 0.49168207024029575, + "grad_norm": 0.6567798852920532, + "learning_rate": 0.00010410167526316457, + "loss": 0.9579, + "step": 532 + }, + { + "epoch": 0.4926062846580407, + "grad_norm": 0.6132720112800598, + "learning_rate": 0.00010380884567398943, + "loss": 0.8562, + "step": 533 + }, + { + "epoch": 0.49353049907578556, + "grad_norm": 0.7070246934890747, + "learning_rate": 0.00010351598337310482, + "loss": 0.9984, + "step": 534 + }, + { + "epoch": 0.4944547134935305, + "grad_norm": 0.7589561343193054, + "learning_rate": 0.00010322309087571544, + "loss": 0.968, + "step": 535 + }, + { + "epoch": 0.4953789279112754, + "grad_norm": 0.7544695734977722, + "learning_rate": 0.00010293017069728535, + "loss": 1.0899, + "step": 536 + }, + { + "epoch": 0.49630314232902034, + "grad_norm": 0.6375424265861511, + "learning_rate": 0.00010263722535351626, + "loss": 0.9599, + "step": 537 + }, + { + "epoch": 0.49722735674676527, + "grad_norm": 0.5513481497764587, + "learning_rate": 0.00010234425736032607, + "loss": 0.7923, + "step": 538 + }, + { + "epoch": 0.49815157116451014, + "grad_norm": 0.6888278126716614, + "learning_rate": 0.0001020512692338272, + "loss": 0.9802, + "step": 539 + }, + { + "epoch": 0.49907578558225507, + "grad_norm": 0.7270799875259399, + "learning_rate": 0.00010175826349030496, + "loss": 1.0539, + "step": 540 + }, + { + "epoch": 0.5, + "grad_norm": 0.6513544917106628, + "learning_rate": 0.00010146524264619601, + "loss": 0.9056, + "step": 541 + }, + { + "epoch": 0.5009242144177449, + "grad_norm": 0.6731391549110413, + "learning_rate": 0.00010117220921806664, + "loss": 0.9946, + "step": 542 + }, + { + "epoch": 0.5009242144177449, + "eval_loss": 1.0004768371582031, + "eval_runtime": 282.1159, + "eval_samples_per_second": 17.822, + "eval_steps_per_second": 8.911, + "step": 542 + }, + { + "epoch": 0.5018484288354899, + "grad_norm": 0.6968109011650085, + "learning_rate": 0.00010087916572259124, + "loss": 0.8846, + "step": 543 + }, + { + "epoch": 0.5027726432532348, + "grad_norm": 0.643092155456543, + "learning_rate": 0.00010058611467653066, + "loss": 1.0187, + "step": 544 + }, + { + "epoch": 0.5036968576709797, + "grad_norm": 0.7097816467285156, + "learning_rate": 0.00010029305859671061, + "loss": 0.9641, + "step": 545 + }, + { + "epoch": 0.5046210720887245, + "grad_norm": 0.7045507431030273, + "learning_rate": 0.0001, + "loss": 1.0526, + "step": 546 + }, + { + "epoch": 0.5055452865064695, + "grad_norm": 0.8612992167472839, + "learning_rate": 9.970694140328941e-05, + "loss": 1.1912, + "step": 547 + }, + { + "epoch": 0.5064695009242144, + "grad_norm": 0.5835033655166626, + "learning_rate": 9.941388532346934e-05, + "loss": 0.8381, + "step": 548 + }, + { + "epoch": 0.5073937153419593, + "grad_norm": 0.6675471663475037, + "learning_rate": 9.912083427740877e-05, + "loss": 1.0843, + "step": 549 + }, + { + "epoch": 0.5083179297597042, + "grad_norm": 0.617229700088501, + "learning_rate": 9.882779078193339e-05, + "loss": 1.0496, + "step": 550 + }, + { + "epoch": 0.5092421441774492, + "grad_norm": 0.6543481945991516, + "learning_rate": 9.853475735380402e-05, + "loss": 1.0677, + "step": 551 + }, + { + "epoch": 0.5101663585951941, + "grad_norm": 0.6298348307609558, + "learning_rate": 9.824173650969507e-05, + "loss": 0.9764, + "step": 552 + }, + { + "epoch": 0.511090573012939, + "grad_norm": 0.6333633661270142, + "learning_rate": 9.794873076617283e-05, + "loss": 0.8255, + "step": 553 + }, + { + "epoch": 0.512014787430684, + "grad_norm": 0.7238278985023499, + "learning_rate": 9.765574263967396e-05, + "loss": 1.0533, + "step": 554 + }, + { + "epoch": 0.5129390018484289, + "grad_norm": 0.8976675271987915, + "learning_rate": 9.736277464648378e-05, + "loss": 1.0276, + "step": 555 + }, + { + "epoch": 0.5138632162661737, + "grad_norm": 0.8237775564193726, + "learning_rate": 9.706982930271465e-05, + "loss": 1.1503, + "step": 556 + }, + { + "epoch": 0.5147874306839186, + "grad_norm": 0.7562410235404968, + "learning_rate": 9.677690912428458e-05, + "loss": 1.0876, + "step": 557 + }, + { + "epoch": 0.5157116451016636, + "grad_norm": 0.6200810670852661, + "learning_rate": 9.648401662689521e-05, + "loss": 0.8214, + "step": 558 + }, + { + "epoch": 0.5166358595194085, + "grad_norm": 0.6449149250984192, + "learning_rate": 9.61911543260106e-05, + "loss": 0.8984, + "step": 559 + }, + { + "epoch": 0.5175600739371534, + "grad_norm": 0.6706170439720154, + "learning_rate": 9.589832473683547e-05, + "loss": 1.0512, + "step": 560 + }, + { + "epoch": 0.5184842883548983, + "grad_norm": 0.6909142136573792, + "learning_rate": 9.56055303742935e-05, + "loss": 1.1424, + "step": 561 + }, + { + "epoch": 0.5194085027726433, + "grad_norm": 0.6504043936729431, + "learning_rate": 9.531277375300599e-05, + "loss": 0.9437, + "step": 562 + }, + { + "epoch": 0.5203327171903882, + "grad_norm": 0.787574052810669, + "learning_rate": 9.502005738727e-05, + "loss": 1.0838, + "step": 563 + }, + { + "epoch": 0.5212569316081331, + "grad_norm": 0.6789671778678894, + "learning_rate": 9.472738379103682e-05, + "loss": 0.969, + "step": 564 + }, + { + "epoch": 0.522181146025878, + "grad_norm": 1.0221126079559326, + "learning_rate": 9.44347554778905e-05, + "loss": 1.0748, + "step": 565 + }, + { + "epoch": 0.5231053604436229, + "grad_norm": 0.6313021779060364, + "learning_rate": 9.414217496102614e-05, + "loss": 0.825, + "step": 566 + }, + { + "epoch": 0.5240295748613678, + "grad_norm": 0.6178043484687805, + "learning_rate": 9.384964475322843e-05, + "loss": 0.9409, + "step": 567 + }, + { + "epoch": 0.5249537892791127, + "grad_norm": 0.6451977491378784, + "learning_rate": 9.355716736684988e-05, + "loss": 1.1681, + "step": 568 + }, + { + "epoch": 0.5258780036968577, + "grad_norm": 0.6314733624458313, + "learning_rate": 9.32647453137894e-05, + "loss": 0.9459, + "step": 569 + }, + { + "epoch": 0.5268022181146026, + "grad_norm": 0.6225720643997192, + "learning_rate": 9.297238110547074e-05, + "loss": 0.9653, + "step": 570 + }, + { + "epoch": 0.5277264325323475, + "grad_norm": 0.6059353947639465, + "learning_rate": 9.268007725282068e-05, + "loss": 0.9645, + "step": 571 + }, + { + "epoch": 0.5286506469500925, + "grad_norm": 0.6386352777481079, + "learning_rate": 9.238783626624781e-05, + "loss": 0.8254, + "step": 572 + }, + { + "epoch": 0.5295748613678374, + "grad_norm": 0.6776155233383179, + "learning_rate": 9.209566065562079e-05, + "loss": 1.0594, + "step": 573 + }, + { + "epoch": 0.5304990757855823, + "grad_norm": 0.720901608467102, + "learning_rate": 9.180355293024669e-05, + "loss": 1.0062, + "step": 574 + }, + { + "epoch": 0.5314232902033271, + "grad_norm": 0.7126890420913696, + "learning_rate": 9.151151559884973e-05, + "loss": 1.1316, + "step": 575 + }, + { + "epoch": 0.532347504621072, + "grad_norm": 0.622345507144928, + "learning_rate": 9.121955116954942e-05, + "loss": 1.0716, + "step": 576 + }, + { + "epoch": 0.533271719038817, + "grad_norm": 0.5974884033203125, + "learning_rate": 9.092766214983924e-05, + "loss": 1.0135, + "step": 577 + }, + { + "epoch": 0.5341959334565619, + "grad_norm": 0.6549699306488037, + "learning_rate": 9.063585104656493e-05, + "loss": 0.8832, + "step": 578 + }, + { + "epoch": 0.5351201478743068, + "grad_norm": 0.6487647294998169, + "learning_rate": 9.03441203659032e-05, + "loss": 0.9431, + "step": 579 + }, + { + "epoch": 0.5360443622920518, + "grad_norm": 0.6896703243255615, + "learning_rate": 9.005247261333993e-05, + "loss": 1.1277, + "step": 580 + }, + { + "epoch": 0.5369685767097967, + "grad_norm": 0.7238255143165588, + "learning_rate": 8.976091029364888e-05, + "loss": 1.1282, + "step": 581 + }, + { + "epoch": 0.5378927911275416, + "grad_norm": 0.6330609321594238, + "learning_rate": 8.946943591087e-05, + "loss": 0.9379, + "step": 582 + }, + { + "epoch": 0.5388170055452866, + "grad_norm": 0.7297467589378357, + "learning_rate": 8.917805196828812e-05, + "loss": 1.1116, + "step": 583 + }, + { + "epoch": 0.5397412199630314, + "grad_norm": 0.7108982801437378, + "learning_rate": 8.88867609684112e-05, + "loss": 0.9791, + "step": 584 + }, + { + "epoch": 0.5406654343807763, + "grad_norm": 0.7811503410339355, + "learning_rate": 8.859556541294905e-05, + "loss": 0.9603, + "step": 585 + }, + { + "epoch": 0.5415896487985212, + "grad_norm": 0.6512707471847534, + "learning_rate": 8.830446780279176e-05, + "loss": 0.9618, + "step": 586 + }, + { + "epoch": 0.5425138632162662, + "grad_norm": 0.6759016513824463, + "learning_rate": 8.801347063798826e-05, + "loss": 1.1068, + "step": 587 + }, + { + "epoch": 0.5434380776340111, + "grad_norm": 0.5693769454956055, + "learning_rate": 8.772257641772479e-05, + "loss": 0.8665, + "step": 588 + }, + { + "epoch": 0.544362292051756, + "grad_norm": 0.707755446434021, + "learning_rate": 8.743178764030343e-05, + "loss": 1.0552, + "step": 589 + }, + { + "epoch": 0.5452865064695009, + "grad_norm": 0.6849241256713867, + "learning_rate": 8.71411068031208e-05, + "loss": 1.1891, + "step": 590 + }, + { + "epoch": 0.5462107208872459, + "grad_norm": 0.6496337652206421, + "learning_rate": 8.685053640264631e-05, + "loss": 0.8846, + "step": 591 + }, + { + "epoch": 0.5471349353049908, + "grad_norm": 0.6156033277511597, + "learning_rate": 8.656007893440106e-05, + "loss": 0.9913, + "step": 592 + }, + { + "epoch": 0.5480591497227357, + "grad_norm": 0.5575032234191895, + "learning_rate": 8.626973689293615e-05, + "loss": 0.8094, + "step": 593 + }, + { + "epoch": 0.5489833641404805, + "grad_norm": 0.6274961233139038, + "learning_rate": 8.597951277181142e-05, + "loss": 1.003, + "step": 594 + }, + { + "epoch": 0.5499075785582255, + "grad_norm": 0.7292885780334473, + "learning_rate": 8.568940906357396e-05, + "loss": 1.1647, + "step": 595 + }, + { + "epoch": 0.5508317929759704, + "grad_norm": 0.7404693961143494, + "learning_rate": 8.539942825973666e-05, + "loss": 0.9162, + "step": 596 + }, + { + "epoch": 0.5517560073937153, + "grad_norm": 0.7311802506446838, + "learning_rate": 8.510957285075692e-05, + "loss": 0.9075, + "step": 597 + }, + { + "epoch": 0.5526802218114603, + "grad_norm": 0.7017751932144165, + "learning_rate": 8.481984532601515e-05, + "loss": 0.8433, + "step": 598 + }, + { + "epoch": 0.5536044362292052, + "grad_norm": 0.63099604845047, + "learning_rate": 8.453024817379347e-05, + "loss": 1.0791, + "step": 599 + }, + { + "epoch": 0.5545286506469501, + "grad_norm": 0.6764926910400391, + "learning_rate": 8.424078388125436e-05, + "loss": 1.0771, + "step": 600 + }, + { + "epoch": 0.555452865064695, + "grad_norm": 0.6191377639770508, + "learning_rate": 8.395145493441921e-05, + "loss": 0.8244, + "step": 601 + }, + { + "epoch": 0.55637707948244, + "grad_norm": 0.602446436882019, + "learning_rate": 8.366226381814697e-05, + "loss": 0.906, + "step": 602 + }, + { + "epoch": 0.5573012939001848, + "grad_norm": 0.8746786713600159, + "learning_rate": 8.337321301611301e-05, + "loss": 1.0911, + "step": 603 + }, + { + "epoch": 0.5582255083179297, + "grad_norm": 0.6535836458206177, + "learning_rate": 8.308430501078739e-05, + "loss": 1.0041, + "step": 604 + }, + { + "epoch": 0.5591497227356746, + "grad_norm": 0.6471948623657227, + "learning_rate": 8.2795542283414e-05, + "loss": 1.0256, + "step": 605 + }, + { + "epoch": 0.5600739371534196, + "grad_norm": 0.6511720418930054, + "learning_rate": 8.25069273139889e-05, + "loss": 1.0453, + "step": 606 + }, + { + "epoch": 0.5609981515711645, + "grad_norm": 0.6590318083763123, + "learning_rate": 8.221846258123921e-05, + "loss": 1.2217, + "step": 607 + }, + { + "epoch": 0.5619223659889094, + "grad_norm": 0.7107011675834656, + "learning_rate": 8.19301505626018e-05, + "loss": 0.8713, + "step": 608 + }, + { + "epoch": 0.5628465804066544, + "grad_norm": 0.913418710231781, + "learning_rate": 8.164199373420187e-05, + "loss": 0.9954, + "step": 609 + }, + { + "epoch": 0.5637707948243993, + "grad_norm": 0.7569400668144226, + "learning_rate": 8.13539945708319e-05, + "loss": 1.0716, + "step": 610 + }, + { + "epoch": 0.5646950092421442, + "grad_norm": 0.6535944938659668, + "learning_rate": 8.106615554593021e-05, + "loss": 0.9248, + "step": 611 + }, + { + "epoch": 0.5656192236598891, + "grad_norm": 0.7175112962722778, + "learning_rate": 8.07784791315598e-05, + "loss": 0.9487, + "step": 612 + }, + { + "epoch": 0.566543438077634, + "grad_norm": 0.6683740019798279, + "learning_rate": 8.049096779838719e-05, + "loss": 0.9359, + "step": 613 + }, + { + "epoch": 0.5674676524953789, + "grad_norm": 0.653056263923645, + "learning_rate": 8.020362401566103e-05, + "loss": 1.0392, + "step": 614 + }, + { + "epoch": 0.5683918669131238, + "grad_norm": 0.6295396685600281, + "learning_rate": 7.991645025119104e-05, + "loss": 0.9685, + "step": 615 + }, + { + "epoch": 0.5693160813308688, + "grad_norm": 0.6374639868736267, + "learning_rate": 7.962944897132678e-05, + "loss": 0.9158, + "step": 616 + }, + { + "epoch": 0.5702402957486137, + "grad_norm": 0.6164650917053223, + "learning_rate": 7.934262264093642e-05, + "loss": 0.9224, + "step": 617 + }, + { + "epoch": 0.5711645101663586, + "grad_norm": 0.6264497637748718, + "learning_rate": 7.905597372338558e-05, + "loss": 0.9791, + "step": 618 + }, + { + "epoch": 0.5720887245841035, + "grad_norm": 0.6655760407447815, + "learning_rate": 7.876950468051623e-05, + "loss": 1.0106, + "step": 619 + }, + { + "epoch": 0.5730129390018485, + "grad_norm": 0.5880747437477112, + "learning_rate": 7.848321797262548e-05, + "loss": 0.9451, + "step": 620 + }, + { + "epoch": 0.5739371534195934, + "grad_norm": 0.6468313336372375, + "learning_rate": 7.819711605844454e-05, + "loss": 0.9605, + "step": 621 + }, + { + "epoch": 0.5748613678373382, + "grad_norm": 0.6386004090309143, + "learning_rate": 7.791120139511752e-05, + "loss": 0.8029, + "step": 622 + }, + { + "epoch": 0.5757855822550831, + "grad_norm": 0.7496598958969116, + "learning_rate": 7.762547643818032e-05, + "loss": 0.8361, + "step": 623 + }, + { + "epoch": 0.5767097966728281, + "grad_norm": 0.5908403992652893, + "learning_rate": 7.733994364153969e-05, + "loss": 0.9715, + "step": 624 + }, + { + "epoch": 0.577634011090573, + "grad_norm": 0.6351504325866699, + "learning_rate": 7.705460545745182e-05, + "loss": 1.0116, + "step": 625 + }, + { + "epoch": 0.5785582255083179, + "grad_norm": 0.5675674080848694, + "learning_rate": 7.67694643365017e-05, + "loss": 0.8919, + "step": 626 + }, + { + "epoch": 0.5794824399260629, + "grad_norm": 0.6759780645370483, + "learning_rate": 7.64845227275818e-05, + "loss": 0.9947, + "step": 627 + }, + { + "epoch": 0.5804066543438078, + "grad_norm": 0.6936529278755188, + "learning_rate": 7.619978307787108e-05, + "loss": 1.0098, + "step": 628 + }, + { + "epoch": 0.5813308687615527, + "grad_norm": 0.6837719678878784, + "learning_rate": 7.591524783281409e-05, + "loss": 1.0243, + "step": 629 + }, + { + "epoch": 0.5822550831792976, + "grad_norm": 0.7080698609352112, + "learning_rate": 7.563091943609984e-05, + "loss": 1.1453, + "step": 630 + }, + { + "epoch": 0.5831792975970426, + "grad_norm": 0.6386309266090393, + "learning_rate": 7.534680032964078e-05, + "loss": 0.9835, + "step": 631 + }, + { + "epoch": 0.5841035120147874, + "grad_norm": 0.7057380676269531, + "learning_rate": 7.506289295355198e-05, + "loss": 1.0487, + "step": 632 + }, + { + "epoch": 0.5850277264325323, + "grad_norm": 0.6716188192367554, + "learning_rate": 7.477919974613007e-05, + "loss": 1.0586, + "step": 633 + }, + { + "epoch": 0.5859519408502772, + "grad_norm": 0.6709314584732056, + "learning_rate": 7.449572314383237e-05, + "loss": 0.9115, + "step": 634 + }, + { + "epoch": 0.5868761552680222, + "grad_norm": 0.7409635186195374, + "learning_rate": 7.421246558125585e-05, + "loss": 0.9316, + "step": 635 + }, + { + "epoch": 0.5878003696857671, + "grad_norm": 0.6120432615280151, + "learning_rate": 7.392942949111633e-05, + "loss": 0.9654, + "step": 636 + }, + { + "epoch": 0.588724584103512, + "grad_norm": 0.6360400915145874, + "learning_rate": 7.364661730422756e-05, + "loss": 0.9467, + "step": 637 + }, + { + "epoch": 0.589648798521257, + "grad_norm": 0.6966155767440796, + "learning_rate": 7.336403144948022e-05, + "loss": 1.3489, + "step": 638 + }, + { + "epoch": 0.5905730129390019, + "grad_norm": 0.6508221626281738, + "learning_rate": 7.308167435382137e-05, + "loss": 0.8993, + "step": 639 + }, + { + "epoch": 0.5914972273567468, + "grad_norm": 0.7364124059677124, + "learning_rate": 7.279954844223323e-05, + "loss": 1.1851, + "step": 640 + }, + { + "epoch": 0.5924214417744916, + "grad_norm": 0.672358512878418, + "learning_rate": 7.251765613771257e-05, + "loss": 1.0678, + "step": 641 + }, + { + "epoch": 0.5933456561922366, + "grad_norm": 0.6131464838981628, + "learning_rate": 7.223599986124994e-05, + "loss": 0.9678, + "step": 642 + }, + { + "epoch": 0.5942698706099815, + "grad_norm": 0.645045816898346, + "learning_rate": 7.195458203180872e-05, + "loss": 1.1014, + "step": 643 + }, + { + "epoch": 0.5951940850277264, + "grad_norm": 0.5539612174034119, + "learning_rate": 7.167340506630445e-05, + "loss": 0.7922, + "step": 644 + }, + { + "epoch": 0.5961182994454713, + "grad_norm": 0.7109516263008118, + "learning_rate": 7.1392471379584e-05, + "loss": 1.1021, + "step": 645 + }, + { + "epoch": 0.5970425138632163, + "grad_norm": 0.7719130516052246, + "learning_rate": 7.111178338440496e-05, + "loss": 1.1807, + "step": 646 + }, + { + "epoch": 0.5979667282809612, + "grad_norm": 0.7706263661384583, + "learning_rate": 7.083134349141475e-05, + "loss": 0.9217, + "step": 647 + }, + { + "epoch": 0.5988909426987061, + "grad_norm": 0.7011818885803223, + "learning_rate": 7.055115410913009e-05, + "loss": 1.0691, + "step": 648 + }, + { + "epoch": 0.5998151571164511, + "grad_norm": 0.6436817646026611, + "learning_rate": 7.02712176439161e-05, + "loss": 1.0856, + "step": 649 + }, + { + "epoch": 0.600739371534196, + "grad_norm": 0.6470714807510376, + "learning_rate": 6.999153649996595e-05, + "loss": 1.0449, + "step": 650 + }, + { + "epoch": 0.6016635859519408, + "grad_norm": 0.6648321747779846, + "learning_rate": 6.971211307927981e-05, + "loss": 0.9638, + "step": 651 + }, + { + "epoch": 0.6025878003696857, + "grad_norm": 0.6020437479019165, + "learning_rate": 6.943294978164454e-05, + "loss": 0.9265, + "step": 652 + }, + { + "epoch": 0.6035120147874307, + "grad_norm": 0.645083487033844, + "learning_rate": 6.915404900461297e-05, + "loss": 0.9318, + "step": 653 + }, + { + "epoch": 0.6044362292051756, + "grad_norm": 0.7505110502243042, + "learning_rate": 6.887541314348333e-05, + "loss": 1.192, + "step": 654 + }, + { + "epoch": 0.6053604436229205, + "grad_norm": 0.6432421803474426, + "learning_rate": 6.85970445912786e-05, + "loss": 1.0113, + "step": 655 + }, + { + "epoch": 0.6062846580406654, + "grad_norm": 0.71739262342453, + "learning_rate": 6.831894573872601e-05, + "loss": 0.8764, + "step": 656 + }, + { + "epoch": 0.6072088724584104, + "grad_norm": 0.6454412341117859, + "learning_rate": 6.804111897423667e-05, + "loss": 1.1637, + "step": 657 + }, + { + "epoch": 0.6081330868761553, + "grad_norm": 0.6732053160667419, + "learning_rate": 6.776356668388464e-05, + "loss": 1.2789, + "step": 658 + }, + { + "epoch": 0.6090573012939002, + "grad_norm": 0.631897509098053, + "learning_rate": 6.748629125138691e-05, + "loss": 0.9074, + "step": 659 + }, + { + "epoch": 0.609981515711645, + "grad_norm": 0.6378685832023621, + "learning_rate": 6.720929505808265e-05, + "loss": 0.924, + "step": 660 + }, + { + "epoch": 0.61090573012939, + "grad_norm": 0.620550811290741, + "learning_rate": 6.693258048291277e-05, + "loss": 0.8763, + "step": 661 + }, + { + "epoch": 0.6118299445471349, + "grad_norm": 0.6288255453109741, + "learning_rate": 6.665614990239969e-05, + "loss": 0.7988, + "step": 662 + }, + { + "epoch": 0.6127541589648798, + "grad_norm": 0.698802649974823, + "learning_rate": 6.638000569062664e-05, + "loss": 1.0702, + "step": 663 + }, + { + "epoch": 0.6136783733826248, + "grad_norm": 0.6344176530838013, + "learning_rate": 6.610415021921747e-05, + "loss": 0.9284, + "step": 664 + }, + { + "epoch": 0.6146025878003697, + "grad_norm": 0.8580932021141052, + "learning_rate": 6.58285858573162e-05, + "loss": 1.0768, + "step": 665 + }, + { + "epoch": 0.6155268022181146, + "grad_norm": 0.6669883728027344, + "learning_rate": 6.555331497156672e-05, + "loss": 0.8689, + "step": 666 + }, + { + "epoch": 0.6164510166358595, + "grad_norm": 0.6564050912857056, + "learning_rate": 6.527833992609248e-05, + "loss": 1.0659, + "step": 667 + }, + { + "epoch": 0.6173752310536045, + "grad_norm": 0.6515840291976929, + "learning_rate": 6.50036630824761e-05, + "loss": 0.9726, + "step": 668 + }, + { + "epoch": 0.6182994454713494, + "grad_norm": 0.6124295592308044, + "learning_rate": 6.472928679973912e-05, + "loss": 0.9613, + "step": 669 + }, + { + "epoch": 0.6192236598890942, + "grad_norm": 0.6839505434036255, + "learning_rate": 6.445521343432188e-05, + "loss": 0.9285, + "step": 670 + }, + { + "epoch": 0.6201478743068392, + "grad_norm": 0.7414712905883789, + "learning_rate": 6.418144534006292e-05, + "loss": 1.0718, + "step": 671 + }, + { + "epoch": 0.6210720887245841, + "grad_norm": 0.6556944251060486, + "learning_rate": 6.390798486817929e-05, + "loss": 0.9938, + "step": 672 + }, + { + "epoch": 0.621996303142329, + "grad_norm": 0.6394513249397278, + "learning_rate": 6.363483436724583e-05, + "loss": 1.0882, + "step": 673 + }, + { + "epoch": 0.6229205175600739, + "grad_norm": 0.8331225514411926, + "learning_rate": 6.336199618317537e-05, + "loss": 1.0052, + "step": 674 + }, + { + "epoch": 0.6238447319778189, + "grad_norm": 0.6852128505706787, + "learning_rate": 6.308947265919849e-05, + "loss": 0.9406, + "step": 675 + }, + { + "epoch": 0.6247689463955638, + "grad_norm": 0.9205944538116455, + "learning_rate": 6.281726613584321e-05, + "loss": 1.0855, + "step": 676 + }, + { + "epoch": 0.6256931608133087, + "grad_norm": 0.6449244618415833, + "learning_rate": 6.254537895091518e-05, + "loss": 1.0464, + "step": 677 + }, + { + "epoch": 0.6266173752310537, + "grad_norm": 0.7212557196617126, + "learning_rate": 6.227381343947733e-05, + "loss": 0.9969, + "step": 678 + }, + { + "epoch": 0.6275415896487985, + "grad_norm": 0.6240119338035583, + "learning_rate": 6.200257193383006e-05, + "loss": 0.8405, + "step": 679 + }, + { + "epoch": 0.6284658040665434, + "grad_norm": 0.6850358247756958, + "learning_rate": 6.173165676349103e-05, + "loss": 1.1017, + "step": 680 + }, + { + "epoch": 0.6293900184842883, + "grad_norm": 0.6329962015151978, + "learning_rate": 6.146107025517524e-05, + "loss": 0.9296, + "step": 681 + }, + { + "epoch": 0.6303142329020333, + "grad_norm": 0.6645837426185608, + "learning_rate": 6.119081473277501e-05, + "loss": 0.8614, + "step": 682 + }, + { + "epoch": 0.6312384473197782, + "grad_norm": 0.64887934923172, + "learning_rate": 6.0920892517340077e-05, + "loss": 0.9993, + "step": 683 + }, + { + "epoch": 0.6321626617375231, + "grad_norm": 0.607116162776947, + "learning_rate": 6.065130592705759e-05, + "loss": 0.8521, + "step": 684 + }, + { + "epoch": 0.633086876155268, + "grad_norm": 0.6553835272789001, + "learning_rate": 6.0382057277232184e-05, + "loss": 0.8903, + "step": 685 + }, + { + "epoch": 0.634011090573013, + "grad_norm": 0.6603589057922363, + "learning_rate": 6.01131488802662e-05, + "loss": 1.0705, + "step": 686 + }, + { + "epoch": 0.6349353049907579, + "grad_norm": 0.7158153057098389, + "learning_rate": 5.9844583045639734e-05, + "loss": 1.0088, + "step": 687 + }, + { + "epoch": 0.6358595194085028, + "grad_norm": 0.6813520789146423, + "learning_rate": 5.9576362079890925e-05, + "loss": 0.9132, + "step": 688 + }, + { + "epoch": 0.6367837338262476, + "grad_norm": 0.6426079869270325, + "learning_rate": 5.930848828659594e-05, + "loss": 1.1032, + "step": 689 + }, + { + "epoch": 0.6377079482439926, + "grad_norm": 0.7008043527603149, + "learning_rate": 5.904096396634935e-05, + "loss": 1.0981, + "step": 690 + }, + { + "epoch": 0.6386321626617375, + "grad_norm": 0.8796435594558716, + "learning_rate": 5.87737914167444e-05, + "loss": 1.1305, + "step": 691 + }, + { + "epoch": 0.6395563770794824, + "grad_norm": 0.5850383043289185, + "learning_rate": 5.8506972932353035e-05, + "loss": 0.9795, + "step": 692 + }, + { + "epoch": 0.6404805914972274, + "grad_norm": 0.667824923992157, + "learning_rate": 5.824051080470654e-05, + "loss": 0.9297, + "step": 693 + }, + { + "epoch": 0.6414048059149723, + "grad_norm": 0.660589337348938, + "learning_rate": 5.797440732227555e-05, + "loss": 1.2153, + "step": 694 + }, + { + "epoch": 0.6423290203327172, + "grad_norm": 0.5866970419883728, + "learning_rate": 5.770866477045067e-05, + "loss": 0.8339, + "step": 695 + }, + { + "epoch": 0.6432532347504621, + "grad_norm": 0.7400878667831421, + "learning_rate": 5.744328543152253e-05, + "loss": 0.9622, + "step": 696 + }, + { + "epoch": 0.6441774491682071, + "grad_norm": 0.6008949875831604, + "learning_rate": 5.7178271584662535e-05, + "loss": 0.9428, + "step": 697 + }, + { + "epoch": 0.6451016635859519, + "grad_norm": 0.671312689781189, + "learning_rate": 5.691362550590297e-05, + "loss": 1.169, + "step": 698 + }, + { + "epoch": 0.6460258780036968, + "grad_norm": 0.6563397645950317, + "learning_rate": 5.6649349468117706e-05, + "loss": 1.0908, + "step": 699 + }, + { + "epoch": 0.6469500924214417, + "grad_norm": 0.6253775954246521, + "learning_rate": 5.638544574100249e-05, + "loss": 1.0465, + "step": 700 + }, + { + "epoch": 0.6478743068391867, + "grad_norm": 0.7953898906707764, + "learning_rate": 5.6121916591055565e-05, + "loss": 1.1457, + "step": 701 + }, + { + "epoch": 0.6487985212569316, + "grad_norm": 0.6638981103897095, + "learning_rate": 5.585876428155824e-05, + "loss": 0.7782, + "step": 702 + }, + { + "epoch": 0.6497227356746765, + "grad_norm": 0.62986159324646, + "learning_rate": 5.559599107255524e-05, + "loss": 0.9059, + "step": 703 + }, + { + "epoch": 0.6506469500924215, + "grad_norm": 0.6704596281051636, + "learning_rate": 5.533359922083562e-05, + "loss": 1.0248, + "step": 704 + }, + { + "epoch": 0.6515711645101664, + "grad_norm": 0.778503954410553, + "learning_rate": 5.5071590979913046e-05, + "loss": 1.0199, + "step": 705 + }, + { + "epoch": 0.6524953789279113, + "grad_norm": 0.698898196220398, + "learning_rate": 5.4809968600006635e-05, + "loss": 0.9157, + "step": 706 + }, + { + "epoch": 0.6534195933456562, + "grad_norm": 0.6358421444892883, + "learning_rate": 5.4548734328021655e-05, + "loss": 1.0301, + "step": 707 + }, + { + "epoch": 0.6543438077634011, + "grad_norm": 0.6946456432342529, + "learning_rate": 5.4287890407530175e-05, + "loss": 0.9616, + "step": 708 + }, + { + "epoch": 0.655268022181146, + "grad_norm": 0.6511873602867126, + "learning_rate": 5.4027439078751666e-05, + "loss": 1.1326, + "step": 709 + }, + { + "epoch": 0.6561922365988909, + "grad_norm": 0.6874746084213257, + "learning_rate": 5.3767382578534e-05, + "loss": 1.0983, + "step": 710 + }, + { + "epoch": 0.6571164510166358, + "grad_norm": 0.6438902616500854, + "learning_rate": 5.350772314033412e-05, + "loss": 1.0382, + "step": 711 + }, + { + "epoch": 0.6580406654343808, + "grad_norm": 0.7720553278923035, + "learning_rate": 5.324846299419879e-05, + "loss": 1.0427, + "step": 712 + }, + { + "epoch": 0.6589648798521257, + "grad_norm": 0.5832366347312927, + "learning_rate": 5.2989604366745516e-05, + "loss": 0.7421, + "step": 713 + }, + { + "epoch": 0.6598890942698706, + "grad_norm": 0.7342596650123596, + "learning_rate": 5.273114948114346e-05, + "loss": 1.0528, + "step": 714 + }, + { + "epoch": 0.6608133086876156, + "grad_norm": 0.7269583940505981, + "learning_rate": 5.247310055709438e-05, + "loss": 1.1798, + "step": 715 + }, + { + "epoch": 0.6617375231053605, + "grad_norm": 0.5938032865524292, + "learning_rate": 5.2215459810813306e-05, + "loss": 0.9166, + "step": 716 + }, + { + "epoch": 0.6626617375231053, + "grad_norm": 0.7352185249328613, + "learning_rate": 5.1958229455009876e-05, + "loss": 0.9892, + "step": 717 + }, + { + "epoch": 0.6635859519408502, + "grad_norm": 0.671506941318512, + "learning_rate": 5.170141169886904e-05, + "loss": 0.8764, + "step": 718 + }, + { + "epoch": 0.6645101663585952, + "grad_norm": 0.6787394881248474, + "learning_rate": 5.1445008748032185e-05, + "loss": 1.013, + "step": 719 + }, + { + "epoch": 0.6654343807763401, + "grad_norm": 0.7591982483863831, + "learning_rate": 5.118902280457829e-05, + "loss": 1.1755, + "step": 720 + }, + { + "epoch": 0.666358595194085, + "grad_norm": 0.6659126281738281, + "learning_rate": 5.093345606700494e-05, + "loss": 1.0561, + "step": 721 + }, + { + "epoch": 0.66728280961183, + "grad_norm": 0.7177993059158325, + "learning_rate": 5.0678310730209275e-05, + "loss": 1.0653, + "step": 722 + }, + { + "epoch": 0.6682070240295749, + "grad_norm": 0.7012899518013, + "learning_rate": 5.042358898546944e-05, + "loss": 0.9921, + "step": 723 + }, + { + "epoch": 0.6691312384473198, + "grad_norm": 0.6190822124481201, + "learning_rate": 5.016929302042563e-05, + "loss": 0.8716, + "step": 724 + }, + { + "epoch": 0.6700554528650647, + "grad_norm": 0.6341177821159363, + "learning_rate": 4.9915425019061224e-05, + "loss": 0.9633, + "step": 725 + }, + { + "epoch": 0.6709796672828097, + "grad_norm": 0.6331872940063477, + "learning_rate": 4.9661987161684045e-05, + "loss": 1.0124, + "step": 726 + }, + { + "epoch": 0.6719038817005545, + "grad_norm": 0.7523507475852966, + "learning_rate": 4.9408981624907815e-05, + "loss": 1.1916, + "step": 727 + }, + { + "epoch": 0.6728280961182994, + "grad_norm": 0.7019216418266296, + "learning_rate": 4.9156410581633317e-05, + "loss": 0.8977, + "step": 728 + }, + { + "epoch": 0.6737523105360443, + "grad_norm": 0.6230003833770752, + "learning_rate": 4.890427620102964e-05, + "loss": 1.1459, + "step": 729 + }, + { + "epoch": 0.6746765249537893, + "grad_norm": 0.6378732919692993, + "learning_rate": 4.865258064851579e-05, + "loss": 0.9107, + "step": 730 + }, + { + "epoch": 0.6756007393715342, + "grad_norm": 0.6095065474510193, + "learning_rate": 4.840132608574195e-05, + "loss": 1.0198, + "step": 731 + }, + { + "epoch": 0.6765249537892791, + "grad_norm": 0.6388225555419922, + "learning_rate": 4.8150514670570754e-05, + "loss": 0.7815, + "step": 732 + }, + { + "epoch": 0.677449168207024, + "grad_norm": 0.7553613781929016, + "learning_rate": 4.790014855705913e-05, + "loss": 1.1486, + "step": 733 + }, + { + "epoch": 0.678373382624769, + "grad_norm": 0.7106791138648987, + "learning_rate": 4.765022989543958e-05, + "loss": 1.1144, + "step": 734 + }, + { + "epoch": 0.6792975970425139, + "grad_norm": 0.7457973957061768, + "learning_rate": 4.740076083210158e-05, + "loss": 1.1348, + "step": 735 + }, + { + "epoch": 0.6802218114602587, + "grad_norm": 0.7399399280548096, + "learning_rate": 4.7151743509573444e-05, + "loss": 0.917, + "step": 736 + }, + { + "epoch": 0.6811460258780037, + "grad_norm": 0.699985146522522, + "learning_rate": 4.690318006650377e-05, + "loss": 1.1504, + "step": 737 + }, + { + "epoch": 0.6820702402957486, + "grad_norm": 0.7044080495834351, + "learning_rate": 4.665507263764299e-05, + "loss": 0.9508, + "step": 738 + }, + { + "epoch": 0.6829944547134935, + "grad_norm": 0.8153591752052307, + "learning_rate": 4.6407423353825166e-05, + "loss": 0.8932, + "step": 739 + }, + { + "epoch": 0.6839186691312384, + "grad_norm": 0.7146044969558716, + "learning_rate": 4.6160234341949646e-05, + "loss": 1.1081, + "step": 740 + }, + { + "epoch": 0.6848428835489834, + "grad_norm": 0.6797007918357849, + "learning_rate": 4.591350772496289e-05, + "loss": 0.941, + "step": 741 + }, + { + "epoch": 0.6857670979667283, + "grad_norm": 0.6130033135414124, + "learning_rate": 4.5667245621839974e-05, + "loss": 0.95, + "step": 742 + }, + { + "epoch": 0.6866913123844732, + "grad_norm": 0.6531005501747131, + "learning_rate": 4.5421450147566694e-05, + "loss": 0.8511, + "step": 743 + }, + { + "epoch": 0.6876155268022182, + "grad_norm": 0.6590886116027832, + "learning_rate": 4.5176123413121284e-05, + "loss": 0.9066, + "step": 744 + }, + { + "epoch": 0.6885397412199631, + "grad_norm": 0.6011480093002319, + "learning_rate": 4.493126752545619e-05, + "loss": 0.945, + "step": 745 + }, + { + "epoch": 0.6894639556377079, + "grad_norm": 0.6411030292510986, + "learning_rate": 4.468688458748006e-05, + "loss": 0.9913, + "step": 746 + }, + { + "epoch": 0.6903881700554528, + "grad_norm": 0.7760118246078491, + "learning_rate": 4.444297669803981e-05, + "loss": 1.0182, + "step": 747 + }, + { + "epoch": 0.6913123844731978, + "grad_norm": 0.6233190894126892, + "learning_rate": 4.4199545951902286e-05, + "loss": 1.0258, + "step": 748 + }, + { + "epoch": 0.6922365988909427, + "grad_norm": 0.6713120937347412, + "learning_rate": 4.395659443973661e-05, + "loss": 1.0924, + "step": 749 + }, + { + "epoch": 0.6931608133086876, + "grad_norm": 0.6258964538574219, + "learning_rate": 4.3714124248096067e-05, + "loss": 1.0035, + "step": 750 + }, + { + "epoch": 0.6940850277264325, + "grad_norm": 0.590328574180603, + "learning_rate": 4.3472137459400084e-05, + "loss": 0.8453, + "step": 751 + }, + { + "epoch": 0.6950092421441775, + "grad_norm": 0.5925545692443848, + "learning_rate": 4.3230636151916495e-05, + "loss": 0.8251, + "step": 752 + }, + { + "epoch": 0.6959334565619224, + "grad_norm": 0.6110025644302368, + "learning_rate": 4.2989622399743714e-05, + "loss": 0.9407, + "step": 753 + }, + { + "epoch": 0.6968576709796673, + "grad_norm": 0.7311002612113953, + "learning_rate": 4.274909827279283e-05, + "loss": 1.0881, + "step": 754 + }, + { + "epoch": 0.6977818853974121, + "grad_norm": 0.643674373626709, + "learning_rate": 4.250906583676978e-05, + "loss": 0.9522, + "step": 755 + }, + { + "epoch": 0.6987060998151571, + "grad_norm": 0.6032654047012329, + "learning_rate": 4.226952715315779e-05, + "loss": 0.9511, + "step": 756 + }, + { + "epoch": 0.699630314232902, + "grad_norm": 0.698158323764801, + "learning_rate": 4.203048427919954e-05, + "loss": 1.0076, + "step": 757 + }, + { + "epoch": 0.7005545286506469, + "grad_norm": 0.6970823407173157, + "learning_rate": 4.17919392678795e-05, + "loss": 1.0957, + "step": 758 + }, + { + "epoch": 0.7014787430683919, + "grad_norm": 0.8319569826126099, + "learning_rate": 4.155389416790627e-05, + "loss": 0.9885, + "step": 759 + }, + { + "epoch": 0.7024029574861368, + "grad_norm": 0.7980078458786011, + "learning_rate": 4.131635102369513e-05, + "loss": 0.972, + "step": 760 + }, + { + "epoch": 0.7033271719038817, + "grad_norm": 0.669026792049408, + "learning_rate": 4.10793118753504e-05, + "loss": 1.0931, + "step": 761 + }, + { + "epoch": 0.7042513863216266, + "grad_norm": 0.6558079719543457, + "learning_rate": 4.084277875864776e-05, + "loss": 0.9718, + "step": 762 + }, + { + "epoch": 0.7051756007393716, + "grad_norm": 0.6405876278877258, + "learning_rate": 4.0606753705017056e-05, + "loss": 1.1769, + "step": 763 + }, + { + "epoch": 0.7060998151571165, + "grad_norm": 0.6441617608070374, + "learning_rate": 4.037123874152472e-05, + "loss": 0.9154, + "step": 764 + }, + { + "epoch": 0.7070240295748613, + "grad_norm": 0.6752880811691284, + "learning_rate": 4.0136235890856155e-05, + "loss": 0.986, + "step": 765 + }, + { + "epoch": 0.7079482439926063, + "grad_norm": 0.7309328317642212, + "learning_rate": 3.9901747171298766e-05, + "loss": 1.0553, + "step": 766 + }, + { + "epoch": 0.7088724584103512, + "grad_norm": 0.6735094785690308, + "learning_rate": 3.966777459672437e-05, + "loss": 0.9524, + "step": 767 + }, + { + "epoch": 0.7097966728280961, + "grad_norm": 0.604626476764679, + "learning_rate": 3.943432017657186e-05, + "loss": 0.915, + "step": 768 + }, + { + "epoch": 0.710720887245841, + "grad_norm": 0.6388301253318787, + "learning_rate": 3.920138591583015e-05, + "loss": 0.861, + "step": 769 + }, + { + "epoch": 0.711645101663586, + "grad_norm": 0.6684121489524841, + "learning_rate": 3.8968973815020806e-05, + "loss": 0.9381, + "step": 770 + }, + { + "epoch": 0.7125693160813309, + "grad_norm": 0.5873221755027771, + "learning_rate": 3.873708587018086e-05, + "loss": 0.9364, + "step": 771 + }, + { + "epoch": 0.7134935304990758, + "grad_norm": 0.722499668598175, + "learning_rate": 3.850572407284569e-05, + "loss": 1.0897, + "step": 772 + }, + { + "epoch": 0.7144177449168208, + "grad_norm": 0.6424623727798462, + "learning_rate": 3.8274890410032035e-05, + "loss": 0.9891, + "step": 773 + }, + { + "epoch": 0.7153419593345656, + "grad_norm": 0.6826554536819458, + "learning_rate": 3.80445868642208e-05, + "loss": 1.0774, + "step": 774 + }, + { + "epoch": 0.7162661737523105, + "grad_norm": 0.6395336389541626, + "learning_rate": 3.7814815413339946e-05, + "loss": 0.9662, + "step": 775 + }, + { + "epoch": 0.7171903881700554, + "grad_norm": 0.5855088233947754, + "learning_rate": 3.7585578030747744e-05, + "loss": 0.9061, + "step": 776 + }, + { + "epoch": 0.7181146025878004, + "grad_norm": 0.7400807738304138, + "learning_rate": 3.7356876685215694e-05, + "loss": 1.0149, + "step": 777 + }, + { + "epoch": 0.7190388170055453, + "grad_norm": 0.6602422595024109, + "learning_rate": 3.7128713340911535e-05, + "loss": 0.9968, + "step": 778 + }, + { + "epoch": 0.7199630314232902, + "grad_norm": 0.6822240948677063, + "learning_rate": 3.690108995738247e-05, + "loss": 1.093, + "step": 779 + }, + { + "epoch": 0.7208872458410351, + "grad_norm": 0.5872569680213928, + "learning_rate": 3.667400848953845e-05, + "loss": 0.9131, + "step": 780 + }, + { + "epoch": 0.7218114602587801, + "grad_norm": 0.6617823243141174, + "learning_rate": 3.6447470887635096e-05, + "loss": 1.0699, + "step": 781 + }, + { + "epoch": 0.722735674676525, + "grad_norm": 0.8193539977073669, + "learning_rate": 3.622147909725724e-05, + "loss": 1.0984, + "step": 782 + }, + { + "epoch": 0.7236598890942699, + "grad_norm": 0.6807657480239868, + "learning_rate": 3.599603505930208e-05, + "loss": 0.9152, + "step": 783 + }, + { + "epoch": 0.7245841035120147, + "grad_norm": 0.7339295148849487, + "learning_rate": 3.577114070996247e-05, + "loss": 0.9734, + "step": 784 + }, + { + "epoch": 0.7255083179297597, + "grad_norm": 0.6891688704490662, + "learning_rate": 3.554679798071032e-05, + "loss": 0.8566, + "step": 785 + }, + { + "epoch": 0.7264325323475046, + "grad_norm": 0.7526949048042297, + "learning_rate": 3.532300879828013e-05, + "loss": 1.0738, + "step": 786 + }, + { + "epoch": 0.7273567467652495, + "grad_norm": 0.6762762665748596, + "learning_rate": 3.509977508465232e-05, + "loss": 0.9664, + "step": 787 + }, + { + "epoch": 0.7282809611829945, + "grad_norm": 0.651971697807312, + "learning_rate": 3.4877098757036665e-05, + "loss": 1.0146, + "step": 788 + }, + { + "epoch": 0.7292051756007394, + "grad_norm": 0.6197048425674438, + "learning_rate": 3.4654981727855995e-05, + "loss": 1.0939, + "step": 789 + }, + { + "epoch": 0.7301293900184843, + "grad_norm": 0.8312708139419556, + "learning_rate": 3.44334259047297e-05, + "loss": 1.0341, + "step": 790 + }, + { + "epoch": 0.7310536044362292, + "grad_norm": 0.6660441160202026, + "learning_rate": 3.421243319045727e-05, + "loss": 0.9819, + "step": 791 + }, + { + "epoch": 0.7319778188539742, + "grad_norm": 0.7062389254570007, + "learning_rate": 3.3992005483002e-05, + "loss": 1.1187, + "step": 792 + }, + { + "epoch": 0.732902033271719, + "grad_norm": 0.6733192801475525, + "learning_rate": 3.3772144675474835e-05, + "loss": 1.15, + "step": 793 + }, + { + "epoch": 0.7338262476894639, + "grad_norm": 0.7980753779411316, + "learning_rate": 3.355285265611784e-05, + "loss": 1.0291, + "step": 794 + }, + { + "epoch": 0.7347504621072088, + "grad_norm": 0.6174155473709106, + "learning_rate": 3.333413130828821e-05, + "loss": 1.01, + "step": 795 + }, + { + "epoch": 0.7356746765249538, + "grad_norm": 0.6196701526641846, + "learning_rate": 3.3115982510442014e-05, + "loss": 0.9446, + "step": 796 + }, + { + "epoch": 0.7365988909426987, + "grad_norm": 0.7163506150245667, + "learning_rate": 3.289840813611798e-05, + "loss": 0.8818, + "step": 797 + }, + { + "epoch": 0.7375231053604436, + "grad_norm": 0.6571723222732544, + "learning_rate": 3.268141005392163e-05, + "loss": 1.114, + "step": 798 + }, + { + "epoch": 0.7384473197781886, + "grad_norm": 0.6892349720001221, + "learning_rate": 3.24649901275089e-05, + "loss": 1.0825, + "step": 799 + }, + { + "epoch": 0.7393715341959335, + "grad_norm": 0.7074698805809021, + "learning_rate": 3.224915021557049e-05, + "loss": 0.7768, + "step": 800 + }, + { + "epoch": 0.7402957486136784, + "grad_norm": 0.6193884015083313, + "learning_rate": 3.2033892171815595e-05, + "loss": 0.9241, + "step": 801 + }, + { + "epoch": 0.7412199630314233, + "grad_norm": 0.6747057437896729, + "learning_rate": 3.1819217844956214e-05, + "loss": 1.1506, + "step": 802 + }, + { + "epoch": 0.7421441774491682, + "grad_norm": 0.7257170081138611, + "learning_rate": 3.1605129078691196e-05, + "loss": 1.1098, + "step": 803 + }, + { + "epoch": 0.7430683918669131, + "grad_norm": 0.7377750873565674, + "learning_rate": 3.1391627711690296e-05, + "loss": 0.9052, + "step": 804 + }, + { + "epoch": 0.743992606284658, + "grad_norm": 0.6712821125984192, + "learning_rate": 3.1178715577578486e-05, + "loss": 0.9886, + "step": 805 + }, + { + "epoch": 0.744916820702403, + "grad_norm": 0.6065827012062073, + "learning_rate": 3.0966394504920317e-05, + "loss": 0.7971, + "step": 806 + }, + { + "epoch": 0.7458410351201479, + "grad_norm": 0.6996814608573914, + "learning_rate": 3.075466631720394e-05, + "loss": 0.8769, + "step": 807 + }, + { + "epoch": 0.7467652495378928, + "grad_norm": 0.642219066619873, + "learning_rate": 3.0543532832825715e-05, + "loss": 0.9904, + "step": 808 + }, + { + "epoch": 0.7476894639556377, + "grad_norm": 0.5811806917190552, + "learning_rate": 3.0332995865074467e-05, + "loss": 0.8123, + "step": 809 + }, + { + "epoch": 0.7486136783733827, + "grad_norm": 0.6480695605278015, + "learning_rate": 3.0123057222115836e-05, + "loss": 1.0713, + "step": 810 + }, + { + "epoch": 0.7495378927911276, + "grad_norm": 0.6193245649337769, + "learning_rate": 2.9913718706976968e-05, + "loss": 0.8875, + "step": 811 + }, + { + "epoch": 0.7504621072088724, + "grad_norm": 0.7080812454223633, + "learning_rate": 2.9704982117530777e-05, + "loss": 1.0016, + "step": 812 + }, + { + "epoch": 0.7513863216266173, + "grad_norm": 0.6948220729827881, + "learning_rate": 2.949684924648073e-05, + "loss": 0.916, + "step": 813 + }, + { + "epoch": 0.7513863216266173, + "eval_loss": 0.9761598110198975, + "eval_runtime": 283.949, + "eval_samples_per_second": 17.707, + "eval_steps_per_second": 8.854, + "step": 813 + }, + { + "epoch": 0.7523105360443623, + "grad_norm": 0.6497867703437805, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.9617, + "step": 814 + }, + { + "epoch": 0.7532347504621072, + "grad_norm": 0.6489603519439697, + "learning_rate": 2.908240180444255e-05, + "loss": 0.9084, + "step": 815 + }, + { + "epoch": 0.7541589648798521, + "grad_norm": 0.5857310891151428, + "learning_rate": 2.887609079287521e-05, + "loss": 0.8453, + "step": 816 + }, + { + "epoch": 0.755083179297597, + "grad_norm": 0.6517936587333679, + "learning_rate": 2.8670390618514898e-05, + "loss": 0.9456, + "step": 817 + }, + { + "epoch": 0.756007393715342, + "grad_norm": 0.5996655821800232, + "learning_rate": 2.8465303047987267e-05, + "loss": 0.9636, + "step": 818 + }, + { + "epoch": 0.7569316081330869, + "grad_norm": 0.6206315159797668, + "learning_rate": 2.826082984265662e-05, + "loss": 0.8866, + "step": 819 + }, + { + "epoch": 0.7578558225508318, + "grad_norm": 0.5903576612472534, + "learning_rate": 2.805697275861101e-05, + "loss": 0.8831, + "step": 820 + }, + { + "epoch": 0.7587800369685767, + "grad_norm": 0.6713946461677551, + "learning_rate": 2.7853733546646866e-05, + "loss": 0.9299, + "step": 821 + }, + { + "epoch": 0.7597042513863216, + "grad_norm": 0.7191796898841858, + "learning_rate": 2.765111395225424e-05, + "loss": 1.0731, + "step": 822 + }, + { + "epoch": 0.7606284658040665, + "grad_norm": 0.7199848890304565, + "learning_rate": 2.744911571560165e-05, + "loss": 0.8607, + "step": 823 + }, + { + "epoch": 0.7615526802218114, + "grad_norm": 0.5861576795578003, + "learning_rate": 2.7247740571521118e-05, + "loss": 0.8636, + "step": 824 + }, + { + "epoch": 0.7624768946395564, + "grad_norm": 0.9151943325996399, + "learning_rate": 2.7046990249493443e-05, + "loss": 1.0415, + "step": 825 + }, + { + "epoch": 0.7634011090573013, + "grad_norm": 0.6626155972480774, + "learning_rate": 2.6846866473633125e-05, + "loss": 0.9483, + "step": 826 + }, + { + "epoch": 0.7643253234750462, + "grad_norm": 0.6170555949211121, + "learning_rate": 2.664737096267368e-05, + "loss": 0.9418, + "step": 827 + }, + { + "epoch": 0.7652495378927912, + "grad_norm": 0.6758602261543274, + "learning_rate": 2.6448505429952917e-05, + "loss": 0.9937, + "step": 828 + }, + { + "epoch": 0.7661737523105361, + "grad_norm": 0.6960089802742004, + "learning_rate": 2.6250271583398166e-05, + "loss": 1.0431, + "step": 829 + }, + { + "epoch": 0.767097966728281, + "grad_norm": 0.6415757536888123, + "learning_rate": 2.605267112551154e-05, + "loss": 0.9279, + "step": 830 + }, + { + "epoch": 0.7680221811460258, + "grad_norm": 0.7221881151199341, + "learning_rate": 2.58557057533555e-05, + "loss": 1.0798, + "step": 831 + }, + { + "epoch": 0.7689463955637708, + "grad_norm": 0.6958536505699158, + "learning_rate": 2.5659377158538012e-05, + "loss": 1.0316, + "step": 832 + }, + { + "epoch": 0.7698706099815157, + "grad_norm": 0.6595907807350159, + "learning_rate": 2.5463687027198356e-05, + "loss": 1.1578, + "step": 833 + }, + { + "epoch": 0.7707948243992606, + "grad_norm": 0.6337928771972656, + "learning_rate": 2.5268637039992293e-05, + "loss": 0.9473, + "step": 834 + }, + { + "epoch": 0.7717190388170055, + "grad_norm": 0.6634678244590759, + "learning_rate": 2.507422887207792e-05, + "loss": 1.1073, + "step": 835 + }, + { + "epoch": 0.7726432532347505, + "grad_norm": 0.640538215637207, + "learning_rate": 2.488046419310114e-05, + "loss": 0.9584, + "step": 836 + }, + { + "epoch": 0.7735674676524954, + "grad_norm": 0.6693369150161743, + "learning_rate": 2.4687344667181276e-05, + "loss": 1.2819, + "step": 837 + }, + { + "epoch": 0.7744916820702403, + "grad_norm": 0.714787483215332, + "learning_rate": 2.4494871952896947e-05, + "loss": 1.004, + "step": 838 + }, + { + "epoch": 0.7754158964879853, + "grad_norm": 0.5982414484024048, + "learning_rate": 2.4303047703271676e-05, + "loss": 0.9905, + "step": 839 + }, + { + "epoch": 0.7763401109057301, + "grad_norm": 0.7579019665718079, + "learning_rate": 2.411187356575969e-05, + "loss": 0.9345, + "step": 840 + }, + { + "epoch": 0.777264325323475, + "grad_norm": 0.6938247680664062, + "learning_rate": 2.3921351182231932e-05, + "loss": 1.0321, + "step": 841 + }, + { + "epoch": 0.7781885397412199, + "grad_norm": 0.6298626661300659, + "learning_rate": 2.3731482188961818e-05, + "loss": 1.0062, + "step": 842 + }, + { + "epoch": 0.7791127541589649, + "grad_norm": 0.5831551551818848, + "learning_rate": 2.354226821661114e-05, + "loss": 0.8964, + "step": 843 + }, + { + "epoch": 0.7800369685767098, + "grad_norm": 0.6120291948318481, + "learning_rate": 2.335371089021623e-05, + "loss": 0.87, + "step": 844 + }, + { + "epoch": 0.7809611829944547, + "grad_norm": 0.7116260528564453, + "learning_rate": 2.3165811829173923e-05, + "loss": 0.9359, + "step": 845 + }, + { + "epoch": 0.7818853974121996, + "grad_norm": 0.6599657535552979, + "learning_rate": 2.297857264722756e-05, + "loss": 0.9178, + "step": 846 + }, + { + "epoch": 0.7828096118299446, + "grad_norm": 0.6972012519836426, + "learning_rate": 2.2791994952453223e-05, + "loss": 0.7967, + "step": 847 + }, + { + "epoch": 0.7837338262476895, + "grad_norm": 0.698115348815918, + "learning_rate": 2.260608034724595e-05, + "loss": 0.9709, + "step": 848 + }, + { + "epoch": 0.7846580406654344, + "grad_norm": 0.5562407374382019, + "learning_rate": 2.242083042830595e-05, + "loss": 0.8812, + "step": 849 + }, + { + "epoch": 0.7855822550831792, + "grad_norm": 0.7722761631011963, + "learning_rate": 2.2236246786624792e-05, + "loss": 0.9895, + "step": 850 + }, + { + "epoch": 0.7865064695009242, + "grad_norm": 0.6705393195152283, + "learning_rate": 2.2052331007471915e-05, + "loss": 0.8756, + "step": 851 + }, + { + "epoch": 0.7874306839186691, + "grad_norm": 0.7007122039794922, + "learning_rate": 2.1869084670380835e-05, + "loss": 1.1128, + "step": 852 + }, + { + "epoch": 0.788354898336414, + "grad_norm": 0.5517435073852539, + "learning_rate": 2.1686509349135666e-05, + "loss": 0.8259, + "step": 853 + }, + { + "epoch": 0.789279112754159, + "grad_norm": 0.6680299043655396, + "learning_rate": 2.150460661175768e-05, + "loss": 0.9461, + "step": 854 + }, + { + "epoch": 0.7902033271719039, + "grad_norm": 0.9063950181007385, + "learning_rate": 2.1323378020491736e-05, + "loss": 0.9831, + "step": 855 + }, + { + "epoch": 0.7911275415896488, + "grad_norm": 0.626876711845398, + "learning_rate": 2.114282513179281e-05, + "loss": 1.0165, + "step": 856 + }, + { + "epoch": 0.7920517560073937, + "grad_norm": 0.5945959687232971, + "learning_rate": 2.096294949631278e-05, + "loss": 0.8775, + "step": 857 + }, + { + "epoch": 0.7929759704251387, + "grad_norm": 0.5666338205337524, + "learning_rate": 2.0783752658887066e-05, + "loss": 0.7815, + "step": 858 + }, + { + "epoch": 0.7939001848428835, + "grad_norm": 0.7798132300376892, + "learning_rate": 2.0605236158521256e-05, + "loss": 1.0216, + "step": 859 + }, + { + "epoch": 0.7948243992606284, + "grad_norm": 0.6861162781715393, + "learning_rate": 2.0427401528377953e-05, + "loss": 1.0544, + "step": 860 + }, + { + "epoch": 0.7957486136783734, + "grad_norm": 0.5692929625511169, + "learning_rate": 2.0250250295763683e-05, + "loss": 0.9492, + "step": 861 + }, + { + "epoch": 0.7966728280961183, + "grad_norm": 0.6331738829612732, + "learning_rate": 2.0073783982115723e-05, + "loss": 1.059, + "step": 862 + }, + { + "epoch": 0.7975970425138632, + "grad_norm": 0.597899317741394, + "learning_rate": 1.9898004102988933e-05, + "loss": 0.902, + "step": 863 + }, + { + "epoch": 0.7985212569316081, + "grad_norm": 0.6317451596260071, + "learning_rate": 1.9722912168042897e-05, + "loss": 0.9995, + "step": 864 + }, + { + "epoch": 0.7994454713493531, + "grad_norm": 0.6468867063522339, + "learning_rate": 1.954850968102895e-05, + "loss": 1.1114, + "step": 865 + }, + { + "epoch": 0.800369685767098, + "grad_norm": 0.7402564287185669, + "learning_rate": 1.937479813977703e-05, + "loss": 1.0089, + "step": 866 + }, + { + "epoch": 0.8012939001848429, + "grad_norm": 0.6300607323646545, + "learning_rate": 1.9201779036183142e-05, + "loss": 0.9192, + "step": 867 + }, + { + "epoch": 0.8022181146025879, + "grad_norm": 0.6695246696472168, + "learning_rate": 1.9029453856196376e-05, + "loss": 1.0276, + "step": 868 + }, + { + "epoch": 0.8031423290203327, + "grad_norm": 0.6602306962013245, + "learning_rate": 1.8857824079806086e-05, + "loss": 0.9744, + "step": 869 + }, + { + "epoch": 0.8040665434380776, + "grad_norm": 0.6365578770637512, + "learning_rate": 1.868689118102931e-05, + "loss": 0.9094, + "step": 870 + }, + { + "epoch": 0.8049907578558225, + "grad_norm": 0.7565975189208984, + "learning_rate": 1.8516656627898078e-05, + "loss": 0.8729, + "step": 871 + }, + { + "epoch": 0.8059149722735675, + "grad_norm": 0.7196916937828064, + "learning_rate": 1.8347121882446717e-05, + "loss": 0.9134, + "step": 872 + }, + { + "epoch": 0.8068391866913124, + "grad_norm": 0.6568743586540222, + "learning_rate": 1.8178288400699373e-05, + "loss": 1.0021, + "step": 873 + }, + { + "epoch": 0.8077634011090573, + "grad_norm": 0.6259042620658875, + "learning_rate": 1.8010157632657543e-05, + "loss": 0.9428, + "step": 874 + }, + { + "epoch": 0.8086876155268022, + "grad_norm": 0.6161400675773621, + "learning_rate": 1.784273102228754e-05, + "loss": 0.771, + "step": 875 + }, + { + "epoch": 0.8096118299445472, + "grad_norm": 0.6403729319572449, + "learning_rate": 1.7676010007508092e-05, + "loss": 0.8493, + "step": 876 + }, + { + "epoch": 0.8105360443622921, + "grad_norm": 0.6279831528663635, + "learning_rate": 1.7509996020178065e-05, + "loss": 1.0006, + "step": 877 + }, + { + "epoch": 0.8114602587800369, + "grad_norm": 0.7798029780387878, + "learning_rate": 1.7344690486084137e-05, + "loss": 1.228, + "step": 878 + }, + { + "epoch": 0.8123844731977818, + "grad_norm": 0.5822129845619202, + "learning_rate": 1.7180094824928493e-05, + "loss": 0.8871, + "step": 879 + }, + { + "epoch": 0.8133086876155268, + "grad_norm": 0.6040850877761841, + "learning_rate": 1.701621045031666e-05, + "loss": 0.8844, + "step": 880 + }, + { + "epoch": 0.8142329020332717, + "grad_norm": 0.6440210342407227, + "learning_rate": 1.6853038769745467e-05, + "loss": 1.1657, + "step": 881 + }, + { + "epoch": 0.8151571164510166, + "grad_norm": 0.7323306202888489, + "learning_rate": 1.6690581184590858e-05, + "loss": 1.0426, + "step": 882 + }, + { + "epoch": 0.8160813308687616, + "grad_norm": 0.6303192377090454, + "learning_rate": 1.652883909009578e-05, + "loss": 0.7934, + "step": 883 + }, + { + "epoch": 0.8170055452865065, + "grad_norm": 0.6480567455291748, + "learning_rate": 1.636781387535843e-05, + "loss": 1.0154, + "step": 884 + }, + { + "epoch": 0.8179297597042514, + "grad_norm": 0.636605441570282, + "learning_rate": 1.6207506923320092e-05, + "loss": 0.8304, + "step": 885 + }, + { + "epoch": 0.8188539741219963, + "grad_norm": 0.747488796710968, + "learning_rate": 1.604791961075336e-05, + "loss": 1.2283, + "step": 886 + }, + { + "epoch": 0.8197781885397413, + "grad_norm": 0.6267029643058777, + "learning_rate": 1.5889053308250368e-05, + "loss": 0.9121, + "step": 887 + }, + { + "epoch": 0.8207024029574861, + "grad_norm": 0.6464413404464722, + "learning_rate": 1.5730909380210945e-05, + "loss": 1.1461, + "step": 888 + }, + { + "epoch": 0.821626617375231, + "grad_norm": 0.5881391763687134, + "learning_rate": 1.557348918483086e-05, + "loss": 0.992, + "step": 889 + }, + { + "epoch": 0.822550831792976, + "grad_norm": 0.7358114123344421, + "learning_rate": 1.5416794074090258e-05, + "loss": 0.9164, + "step": 890 + }, + { + "epoch": 0.8234750462107209, + "grad_norm": 0.6415307521820068, + "learning_rate": 1.526082539374203e-05, + "loss": 0.9177, + "step": 891 + }, + { + "epoch": 0.8243992606284658, + "grad_norm": 0.6181014180183411, + "learning_rate": 1.5105584483300162e-05, + "loss": 1.0209, + "step": 892 + }, + { + "epoch": 0.8253234750462107, + "grad_norm": 0.6032791137695312, + "learning_rate": 1.495107267602831e-05, + "loss": 0.9944, + "step": 893 + }, + { + "epoch": 0.8262476894639557, + "grad_norm": 0.6096965670585632, + "learning_rate": 1.479729129892835e-05, + "loss": 0.9559, + "step": 894 + }, + { + "epoch": 0.8271719038817006, + "grad_norm": 0.7166401147842407, + "learning_rate": 1.4644241672729008e-05, + "loss": 1.0887, + "step": 895 + }, + { + "epoch": 0.8280961182994455, + "grad_norm": 0.7282501459121704, + "learning_rate": 1.4491925111874383e-05, + "loss": 1.1423, + "step": 896 + }, + { + "epoch": 0.8290203327171903, + "grad_norm": 0.7395532727241516, + "learning_rate": 1.4340342924512806e-05, + "loss": 1.1972, + "step": 897 + }, + { + "epoch": 0.8299445471349353, + "grad_norm": 0.5643945932388306, + "learning_rate": 1.4189496412485592e-05, + "loss": 0.7635, + "step": 898 + }, + { + "epoch": 0.8308687615526802, + "grad_norm": 0.598728358745575, + "learning_rate": 1.4039386871315696e-05, + "loss": 0.8482, + "step": 899 + }, + { + "epoch": 0.8317929759704251, + "grad_norm": 0.6027209758758545, + "learning_rate": 1.3890015590196803e-05, + "loss": 0.8689, + "step": 900 + }, + { + "epoch": 0.83271719038817, + "grad_norm": 0.7764828205108643, + "learning_rate": 1.374138385198217e-05, + "loss": 1.1675, + "step": 901 + }, + { + "epoch": 0.833641404805915, + "grad_norm": 0.7254951596260071, + "learning_rate": 1.3593492933173512e-05, + "loss": 1.0654, + "step": 902 + }, + { + "epoch": 0.8345656192236599, + "grad_norm": 0.7342492938041687, + "learning_rate": 1.3446344103910203e-05, + "loss": 0.9426, + "step": 903 + }, + { + "epoch": 0.8354898336414048, + "grad_norm": 0.6517652273178101, + "learning_rate": 1.3299938627958297e-05, + "loss": 0.9824, + "step": 904 + }, + { + "epoch": 0.8364140480591498, + "grad_norm": 0.7110716104507446, + "learning_rate": 1.3154277762699607e-05, + "loss": 1.0306, + "step": 905 + }, + { + "epoch": 0.8373382624768947, + "grad_norm": 0.7344281673431396, + "learning_rate": 1.300936275912098e-05, + "loss": 0.9224, + "step": 906 + }, + { + "epoch": 0.8382624768946395, + "grad_norm": 0.6213476061820984, + "learning_rate": 1.2865194861803564e-05, + "loss": 0.8624, + "step": 907 + }, + { + "epoch": 0.8391866913123844, + "grad_norm": 0.6333045363426208, + "learning_rate": 1.2721775308912132e-05, + "loss": 0.9168, + "step": 908 + }, + { + "epoch": 0.8401109057301294, + "grad_norm": 0.7005438804626465, + "learning_rate": 1.2579105332184304e-05, + "loss": 1.0093, + "step": 909 + }, + { + "epoch": 0.8410351201478743, + "grad_norm": 0.7070121169090271, + "learning_rate": 1.2437186156920167e-05, + "loss": 0.909, + "step": 910 + }, + { + "epoch": 0.8419593345656192, + "grad_norm": 0.6689184904098511, + "learning_rate": 1.229601900197166e-05, + "loss": 0.983, + "step": 911 + }, + { + "epoch": 0.8428835489833642, + "grad_norm": 0.6764459609985352, + "learning_rate": 1.2155605079732046e-05, + "loss": 0.9389, + "step": 912 + }, + { + "epoch": 0.8438077634011091, + "grad_norm": 0.5936344861984253, + "learning_rate": 1.2015945596125567e-05, + "loss": 0.9702, + "step": 913 + }, + { + "epoch": 0.844731977818854, + "grad_norm": 0.615927517414093, + "learning_rate": 1.1877041750597173e-05, + "loss": 0.9069, + "step": 914 + }, + { + "epoch": 0.8456561922365989, + "grad_norm": 0.6893014907836914, + "learning_rate": 1.1738894736101991e-05, + "loss": 1.0167, + "step": 915 + }, + { + "epoch": 0.8465804066543438, + "grad_norm": 0.7017319798469543, + "learning_rate": 1.160150573909532e-05, + "loss": 1.0938, + "step": 916 + }, + { + "epoch": 0.8475046210720887, + "grad_norm": 0.6698352694511414, + "learning_rate": 1.1464875939522312e-05, + "loss": 0.9357, + "step": 917 + }, + { + "epoch": 0.8484288354898336, + "grad_norm": 0.5998057723045349, + "learning_rate": 1.132900651080785e-05, + "loss": 0.8941, + "step": 918 + }, + { + "epoch": 0.8493530499075785, + "grad_norm": 0.6822893023490906, + "learning_rate": 1.1193898619846465e-05, + "loss": 1.0767, + "step": 919 + }, + { + "epoch": 0.8502772643253235, + "grad_norm": 0.6629787087440491, + "learning_rate": 1.1059553426992365e-05, + "loss": 0.7995, + "step": 920 + }, + { + "epoch": 0.8512014787430684, + "grad_norm": 0.677261233329773, + "learning_rate": 1.0925972086049452e-05, + "loss": 1.0926, + "step": 921 + }, + { + "epoch": 0.8521256931608133, + "grad_norm": 0.635703980922699, + "learning_rate": 1.0793155744261351e-05, + "loss": 0.9412, + "step": 922 + }, + { + "epoch": 0.8530499075785583, + "grad_norm": 0.6668652296066284, + "learning_rate": 1.0661105542301642e-05, + "loss": 1.0261, + "step": 923 + }, + { + "epoch": 0.8539741219963032, + "grad_norm": 0.639267086982727, + "learning_rate": 1.0529822614264018e-05, + "loss": 1.0228, + "step": 924 + }, + { + "epoch": 0.8548983364140481, + "grad_norm": 0.6785367131233215, + "learning_rate": 1.039930808765256e-05, + "loss": 0.9784, + "step": 925 + }, + { + "epoch": 0.8558225508317929, + "grad_norm": 0.7142111659049988, + "learning_rate": 1.026956308337199e-05, + "loss": 1.0, + "step": 926 + }, + { + "epoch": 0.8567467652495379, + "grad_norm": 0.7111828327178955, + "learning_rate": 1.0140588715718191e-05, + "loss": 1.1785, + "step": 927 + }, + { + "epoch": 0.8576709796672828, + "grad_norm": 0.732770562171936, + "learning_rate": 1.0012386092368475e-05, + "loss": 1.1113, + "step": 928 + }, + { + "epoch": 0.8585951940850277, + "grad_norm": 0.7839651107788086, + "learning_rate": 9.884956314372174e-06, + "loss": 1.107, + "step": 929 + }, + { + "epoch": 0.8595194085027726, + "grad_norm": 0.6124250292778015, + "learning_rate": 9.75830047614117e-06, + "loss": 0.9972, + "step": 930 + }, + { + "epoch": 0.8604436229205176, + "grad_norm": 0.6591629981994629, + "learning_rate": 9.632419665440428e-06, + "loss": 1.0742, + "step": 931 + }, + { + "epoch": 0.8613678373382625, + "grad_norm": 0.8186694979667664, + "learning_rate": 9.507314963378745e-06, + "loss": 1.24, + "step": 932 + }, + { + "epoch": 0.8622920517560074, + "grad_norm": 0.6867371201515198, + "learning_rate": 9.382987444399394e-06, + "loss": 1.0846, + "step": 933 + }, + { + "epoch": 0.8632162661737524, + "grad_norm": 0.6072695851325989, + "learning_rate": 9.259438176270962e-06, + "loss": 1.0114, + "step": 934 + }, + { + "epoch": 0.8641404805914972, + "grad_norm": 0.7523977160453796, + "learning_rate": 9.136668220078094e-06, + "loss": 1.2716, + "step": 935 + }, + { + "epoch": 0.8650646950092421, + "grad_norm": 0.6731902956962585, + "learning_rate": 9.014678630212469e-06, + "loss": 0.981, + "step": 936 + }, + { + "epoch": 0.865988909426987, + "grad_norm": 0.5975137948989868, + "learning_rate": 8.893470454363706e-06, + "loss": 0.9502, + "step": 937 + }, + { + "epoch": 0.866913123844732, + "grad_norm": 0.7133376598358154, + "learning_rate": 8.773044733510338e-06, + "loss": 0.9886, + "step": 938 + }, + { + "epoch": 0.8678373382624769, + "grad_norm": 0.5946080684661865, + "learning_rate": 8.65340250191089e-06, + "loss": 0.8133, + "step": 939 + }, + { + "epoch": 0.8687615526802218, + "grad_norm": 0.68995201587677, + "learning_rate": 8.534544787095078e-06, + "loss": 1.207, + "step": 940 + }, + { + "epoch": 0.8696857670979667, + "grad_norm": 0.6029350161552429, + "learning_rate": 8.416472609854808e-06, + "loss": 0.8217, + "step": 941 + }, + { + "epoch": 0.8706099815157117, + "grad_norm": 0.6694270968437195, + "learning_rate": 8.299186984235585e-06, + "loss": 1.1204, + "step": 942 + }, + { + "epoch": 0.8715341959334566, + "grad_norm": 0.6296148300170898, + "learning_rate": 8.182688917527726e-06, + "loss": 0.8465, + "step": 943 + }, + { + "epoch": 0.8724584103512015, + "grad_norm": 0.733870804309845, + "learning_rate": 8.06697941025768e-06, + "loss": 1.1511, + "step": 944 + }, + { + "epoch": 0.8733826247689463, + "grad_norm": 0.6709319353103638, + "learning_rate": 7.952059456179507e-06, + "loss": 1.0078, + "step": 945 + }, + { + "epoch": 0.8743068391866913, + "grad_norm": 0.6589116454124451, + "learning_rate": 7.837930042266262e-06, + "loss": 1.0012, + "step": 946 + }, + { + "epoch": 0.8752310536044362, + "grad_norm": 0.6906008124351501, + "learning_rate": 7.724592148701615e-06, + "loss": 0.8909, + "step": 947 + }, + { + "epoch": 0.8761552680221811, + "grad_norm": 0.6157503128051758, + "learning_rate": 7.612046748871327e-06, + "loss": 0.9777, + "step": 948 + }, + { + "epoch": 0.8770794824399261, + "grad_norm": 0.6792111992835999, + "learning_rate": 7.50029480935498e-06, + "loss": 1.0109, + "step": 949 + }, + { + "epoch": 0.878003696857671, + "grad_norm": 0.7034531235694885, + "learning_rate": 7.389337289917652e-06, + "loss": 1.0279, + "step": 950 + }, + { + "epoch": 0.8789279112754159, + "grad_norm": 0.6194853186607361, + "learning_rate": 7.279175143501604e-06, + "loss": 0.9406, + "step": 951 + }, + { + "epoch": 0.8798521256931608, + "grad_norm": 0.6334475874900818, + "learning_rate": 7.1698093162182125e-06, + "loss": 0.9443, + "step": 952 + }, + { + "epoch": 0.8807763401109058, + "grad_norm": 0.6958860754966736, + "learning_rate": 7.061240747339737e-06, + "loss": 1.0342, + "step": 953 + }, + { + "epoch": 0.8817005545286506, + "grad_norm": 0.6618797183036804, + "learning_rate": 6.953470369291348e-06, + "loss": 0.9296, + "step": 954 + }, + { + "epoch": 0.8826247689463955, + "grad_norm": 0.6090981960296631, + "learning_rate": 6.846499107643012e-06, + "loss": 0.8807, + "step": 955 + }, + { + "epoch": 0.8835489833641405, + "grad_norm": 0.6621676087379456, + "learning_rate": 6.74032788110166e-06, + "loss": 0.948, + "step": 956 + }, + { + "epoch": 0.8844731977818854, + "grad_norm": 0.6716128587722778, + "learning_rate": 6.634957601503233e-06, + "loss": 1.0541, + "step": 957 + }, + { + "epoch": 0.8853974121996303, + "grad_norm": 0.7759876847267151, + "learning_rate": 6.530389173804807e-06, + "loss": 1.214, + "step": 958 + }, + { + "epoch": 0.8863216266173752, + "grad_norm": 0.7628396153450012, + "learning_rate": 6.4266234960769226e-06, + "loss": 1.0514, + "step": 959 + }, + { + "epoch": 0.8872458410351202, + "grad_norm": 0.6943473219871521, + "learning_rate": 6.323661459495811e-06, + "loss": 0.8367, + "step": 960 + }, + { + "epoch": 0.8881700554528651, + "grad_norm": 0.7233004570007324, + "learning_rate": 6.221503948335705e-06, + "loss": 1.0337, + "step": 961 + }, + { + "epoch": 0.88909426987061, + "grad_norm": 0.625541627407074, + "learning_rate": 6.1201518399613635e-06, + "loss": 0.9529, + "step": 962 + }, + { + "epoch": 0.890018484288355, + "grad_norm": 0.7505578398704529, + "learning_rate": 6.019606004820422e-06, + "loss": 0.8923, + "step": 963 + }, + { + "epoch": 0.8909426987060998, + "grad_norm": 0.8915427923202515, + "learning_rate": 5.919867306435934e-06, + "loss": 1.1187, + "step": 964 + }, + { + "epoch": 0.8918669131238447, + "grad_norm": 0.6175352931022644, + "learning_rate": 5.820936601399029e-06, + "loss": 0.9357, + "step": 965 + }, + { + "epoch": 0.8927911275415896, + "grad_norm": 0.6581393480300903, + "learning_rate": 5.722814739361459e-06, + "loss": 0.9969, + "step": 966 + }, + { + "epoch": 0.8937153419593346, + "grad_norm": 0.5962516665458679, + "learning_rate": 5.625502563028407e-06, + "loss": 0.9355, + "step": 967 + }, + { + "epoch": 0.8946395563770795, + "grad_norm": 0.6447887420654297, + "learning_rate": 5.529000908151105e-06, + "loss": 1.0156, + "step": 968 + }, + { + "epoch": 0.8955637707948244, + "grad_norm": 0.6103833317756653, + "learning_rate": 5.4333106035198035e-06, + "loss": 0.8779, + "step": 969 + }, + { + "epoch": 0.8964879852125693, + "grad_norm": 0.6438865065574646, + "learning_rate": 5.338432470956589e-06, + "loss": 1.0516, + "step": 970 + }, + { + "epoch": 0.8974121996303143, + "grad_norm": 0.6992761492729187, + "learning_rate": 5.244367325308286e-06, + "loss": 1.1486, + "step": 971 + }, + { + "epoch": 0.8983364140480592, + "grad_norm": 0.676099419593811, + "learning_rate": 5.151115974439569e-06, + "loss": 1.0771, + "step": 972 + }, + { + "epoch": 0.899260628465804, + "grad_norm": 0.6803602576255798, + "learning_rate": 5.058679219225881e-06, + "loss": 1.0922, + "step": 973 + }, + { + "epoch": 0.9001848428835489, + "grad_norm": 0.5867182016372681, + "learning_rate": 4.967057853546653e-06, + "loss": 0.8814, + "step": 974 + }, + { + "epoch": 0.9011090573012939, + "grad_norm": 0.6458511352539062, + "learning_rate": 4.876252664278502e-06, + "loss": 1.0786, + "step": 975 + }, + { + "epoch": 0.9020332717190388, + "grad_norm": 0.6744295358657837, + "learning_rate": 4.786264431288423e-06, + "loss": 1.0097, + "step": 976 + }, + { + "epoch": 0.9029574861367837, + "grad_norm": 0.7309542298316956, + "learning_rate": 4.697093927427032e-06, + "loss": 1.0585, + "step": 977 + }, + { + "epoch": 0.9038817005545287, + "grad_norm": 0.6538105607032776, + "learning_rate": 4.608741918522097e-06, + "loss": 0.9625, + "step": 978 + }, + { + "epoch": 0.9048059149722736, + "grad_norm": 0.6101495623588562, + "learning_rate": 4.521209163371809e-06, + "loss": 1.0183, + "step": 979 + }, + { + "epoch": 0.9057301293900185, + "grad_norm": 0.7289119958877563, + "learning_rate": 4.434496413738332e-06, + "loss": 1.1892, + "step": 980 + }, + { + "epoch": 0.9066543438077634, + "grad_norm": 0.7019684314727783, + "learning_rate": 4.348604414341306e-06, + "loss": 0.9988, + "step": 981 + }, + { + "epoch": 0.9075785582255084, + "grad_norm": 0.8499860167503357, + "learning_rate": 4.263533902851535e-06, + "loss": 1.1156, + "step": 982 + }, + { + "epoch": 0.9085027726432532, + "grad_norm": 0.677656888961792, + "learning_rate": 4.179285609884554e-06, + "loss": 0.9712, + "step": 983 + }, + { + "epoch": 0.9094269870609981, + "grad_norm": 0.6847138404846191, + "learning_rate": 4.095860258994388e-06, + "loss": 1.1126, + "step": 984 + }, + { + "epoch": 0.910351201478743, + "grad_norm": 0.6552377343177795, + "learning_rate": 4.013258566667388e-06, + "loss": 0.9818, + "step": 985 + }, + { + "epoch": 0.911275415896488, + "grad_norm": 0.6576426029205322, + "learning_rate": 3.931481242315993e-06, + "loss": 0.8636, + "step": 986 + }, + { + "epoch": 0.9121996303142329, + "grad_norm": 1.0121254920959473, + "learning_rate": 3.850528988272684e-06, + "loss": 1.3646, + "step": 987 + }, + { + "epoch": 0.9131238447319778, + "grad_norm": 0.7985464334487915, + "learning_rate": 3.770402499783976e-06, + "loss": 1.1095, + "step": 988 + }, + { + "epoch": 0.9140480591497228, + "grad_norm": 0.6861594915390015, + "learning_rate": 3.691102465004415e-06, + "loss": 1.0093, + "step": 989 + }, + { + "epoch": 0.9149722735674677, + "grad_norm": 0.6798709630966187, + "learning_rate": 3.6126295649906216e-06, + "loss": 1.1888, + "step": 990 + }, + { + "epoch": 0.9158964879852126, + "grad_norm": 0.6571061015129089, + "learning_rate": 3.534984473695535e-06, + "loss": 0.9303, + "step": 991 + }, + { + "epoch": 0.9168207024029574, + "grad_norm": 0.5712608098983765, + "learning_rate": 3.458167857962613e-06, + "loss": 0.8497, + "step": 992 + }, + { + "epoch": 0.9177449168207024, + "grad_norm": 0.7173097133636475, + "learning_rate": 3.3821803775199832e-06, + "loss": 0.9325, + "step": 993 + }, + { + "epoch": 0.9186691312384473, + "grad_norm": 0.6583461165428162, + "learning_rate": 3.3070226849749366e-06, + "loss": 0.9897, + "step": 994 + }, + { + "epoch": 0.9195933456561922, + "grad_norm": 0.6946638226509094, + "learning_rate": 3.2326954258082408e-06, + "loss": 0.9596, + "step": 995 + }, + { + "epoch": 0.9205175600739371, + "grad_norm": 0.6112401485443115, + "learning_rate": 3.159199238368593e-06, + "loss": 0.8923, + "step": 996 + }, + { + "epoch": 0.9214417744916821, + "grad_norm": 0.7291046977043152, + "learning_rate": 3.086534753867154e-06, + "loss": 0.9113, + "step": 997 + }, + { + "epoch": 0.922365988909427, + "grad_norm": 0.7725870609283447, + "learning_rate": 3.0147025963721433e-06, + "loss": 1.0421, + "step": 998 + }, + { + "epoch": 0.9232902033271719, + "grad_norm": 0.6525258421897888, + "learning_rate": 2.9437033828034866e-06, + "loss": 0.9522, + "step": 999 + }, + { + "epoch": 0.9242144177449169, + "grad_norm": 0.7263964414596558, + "learning_rate": 2.8735377229273998e-06, + "loss": 1.2127, + "step": 1000 + }, + { + "epoch": 0.9251386321626618, + "grad_norm": 0.6609349846839905, + "learning_rate": 2.8042062193513353e-06, + "loss": 0.8555, + "step": 1001 + }, + { + "epoch": 0.9260628465804066, + "grad_norm": 0.6099725961685181, + "learning_rate": 2.735709467518699e-06, + "loss": 0.8729, + "step": 1002 + }, + { + "epoch": 0.9269870609981515, + "grad_norm": 0.6820710897445679, + "learning_rate": 2.668048055703731e-06, + "loss": 0.985, + "step": 1003 + }, + { + "epoch": 0.9279112754158965, + "grad_norm": 0.615996241569519, + "learning_rate": 2.6012225650064893e-06, + "loss": 1.0151, + "step": 1004 + }, + { + "epoch": 0.9288354898336414, + "grad_norm": 0.6031765937805176, + "learning_rate": 2.535233569347861e-06, + "loss": 0.9829, + "step": 1005 + }, + { + "epoch": 0.9297597042513863, + "grad_norm": 0.6296886801719666, + "learning_rate": 2.470081635464594e-06, + "loss": 0.9831, + "step": 1006 + }, + { + "epoch": 0.9306839186691312, + "grad_norm": 0.6596071720123291, + "learning_rate": 2.405767322904462e-06, + "loss": 0.9571, + "step": 1007 + }, + { + "epoch": 0.9316081330868762, + "grad_norm": 0.6262056827545166, + "learning_rate": 2.342291184021461e-06, + "loss": 0.9011, + "step": 1008 + }, + { + "epoch": 0.9325323475046211, + "grad_norm": 0.6384697556495667, + "learning_rate": 2.279653763971057e-06, + "loss": 0.8355, + "step": 1009 + }, + { + "epoch": 0.933456561922366, + "grad_norm": 0.8274184465408325, + "learning_rate": 2.2178556007054872e-06, + "loss": 1.1984, + "step": 1010 + }, + { + "epoch": 0.9343807763401109, + "grad_norm": 0.6482036113739014, + "learning_rate": 2.1568972249691566e-06, + "loss": 1.0143, + "step": 1011 + }, + { + "epoch": 0.9353049907578558, + "grad_norm": 0.6427212953567505, + "learning_rate": 2.0967791602941154e-06, + "loss": 0.9057, + "step": 1012 + }, + { + "epoch": 0.9362292051756007, + "grad_norm": 0.6750584840774536, + "learning_rate": 2.037501922995477e-06, + "loss": 1.065, + "step": 1013 + }, + { + "epoch": 0.9371534195933456, + "grad_norm": 0.5887543559074402, + "learning_rate": 1.979066022167042e-06, + "loss": 0.8667, + "step": 1014 + }, + { + "epoch": 0.9380776340110906, + "grad_norm": 0.6207574605941772, + "learning_rate": 1.921471959676957e-06, + "loss": 0.9039, + "step": 1015 + }, + { + "epoch": 0.9390018484288355, + "grad_norm": 0.6421559453010559, + "learning_rate": 1.8647202301633194e-06, + "loss": 0.8849, + "step": 1016 + }, + { + "epoch": 0.9399260628465804, + "grad_norm": 0.6757624745368958, + "learning_rate": 1.8088113210299795e-06, + "loss": 0.82, + "step": 1017 + }, + { + "epoch": 0.9408502772643254, + "grad_norm": 0.6625667810440063, + "learning_rate": 1.7537457124423895e-06, + "loss": 1.1408, + "step": 1018 + }, + { + "epoch": 0.9417744916820703, + "grad_norm": 0.5866093039512634, + "learning_rate": 1.6995238773233835e-06, + "loss": 0.7394, + "step": 1019 + }, + { + "epoch": 0.9426987060998152, + "grad_norm": 0.6744884252548218, + "learning_rate": 1.6461462813492034e-06, + "loss": 0.8599, + "step": 1020 + }, + { + "epoch": 0.94362292051756, + "grad_norm": 0.6200700998306274, + "learning_rate": 1.5936133829454691e-06, + "loss": 0.9052, + "step": 1021 + }, + { + "epoch": 0.944547134935305, + "grad_norm": 0.7099212408065796, + "learning_rate": 1.5419256332832255e-06, + "loss": 1.0159, + "step": 1022 + }, + { + "epoch": 0.9454713493530499, + "grad_norm": 0.6735338568687439, + "learning_rate": 1.4910834762750902e-06, + "loss": 0.7598, + "step": 1023 + }, + { + "epoch": 0.9463955637707948, + "grad_norm": 0.6843352317810059, + "learning_rate": 1.4410873485714238e-06, + "loss": 0.9318, + "step": 1024 + }, + { + "epoch": 0.9473197781885397, + "grad_norm": 0.6304380297660828, + "learning_rate": 1.3919376795566209e-06, + "loss": 0.9991, + "step": 1025 + }, + { + "epoch": 0.9482439926062847, + "grad_norm": 0.6792097091674805, + "learning_rate": 1.3436348913453578e-06, + "loss": 1.0106, + "step": 1026 + }, + { + "epoch": 0.9491682070240296, + "grad_norm": 0.6038525104522705, + "learning_rate": 1.296179398778985e-06, + "loss": 0.8326, + "step": 1027 + }, + { + "epoch": 0.9500924214417745, + "grad_norm": 0.6601313948631287, + "learning_rate": 1.249571609422029e-06, + "loss": 1.0222, + "step": 1028 + }, + { + "epoch": 0.9510166358595195, + "grad_norm": 0.655646026134491, + "learning_rate": 1.2038119235586176e-06, + "loss": 0.9135, + "step": 1029 + }, + { + "epoch": 0.9519408502772643, + "grad_norm": 0.7122117280960083, + "learning_rate": 1.158900734189039e-06, + "loss": 0.9461, + "step": 1030 + }, + { + "epoch": 0.9528650646950092, + "grad_norm": 0.6188072562217712, + "learning_rate": 1.1148384270264544e-06, + "loss": 0.9335, + "step": 1031 + }, + { + "epoch": 0.9537892791127541, + "grad_norm": 0.6677936911582947, + "learning_rate": 1.0716253804934795e-06, + "loss": 1.0937, + "step": 1032 + }, + { + "epoch": 0.9547134935304991, + "grad_norm": 0.6385655403137207, + "learning_rate": 1.0292619657189751e-06, + "loss": 0.9609, + "step": 1033 + }, + { + "epoch": 0.955637707948244, + "grad_norm": 0.6372730135917664, + "learning_rate": 9.877485465349058e-07, + "loss": 1.0493, + "step": 1034 + }, + { + "epoch": 0.9565619223659889, + "grad_norm": 0.7411220073699951, + "learning_rate": 9.470854794731421e-07, + "loss": 0.9944, + "step": 1035 + }, + { + "epoch": 0.9574861367837338, + "grad_norm": 0.7947145104408264, + "learning_rate": 9.072731137624413e-07, + "loss": 1.0633, + "step": 1036 + }, + { + "epoch": 0.9584103512014788, + "grad_norm": 0.6582641005516052, + "learning_rate": 8.683117913254268e-07, + "loss": 0.989, + "step": 1037 + }, + { + "epoch": 0.9593345656192237, + "grad_norm": 0.5844302177429199, + "learning_rate": 8.3020184677568e-07, + "loss": 0.8765, + "step": 1038 + }, + { + "epoch": 0.9602587800369686, + "grad_norm": 0.5882364511489868, + "learning_rate": 7.929436074148533e-07, + "loss": 0.7434, + "step": 1039 + }, + { + "epoch": 0.9611829944547134, + "grad_norm": 0.6744895577430725, + "learning_rate": 7.56537393229817e-07, + "loss": 0.9731, + "step": 1040 + }, + { + "epoch": 0.9621072088724584, + "grad_norm": 0.6401616930961609, + "learning_rate": 7.209835168899836e-07, + "loss": 0.917, + "step": 1041 + }, + { + "epoch": 0.9630314232902033, + "grad_norm": 0.6167658567428589, + "learning_rate": 6.862822837445881e-07, + "loss": 0.8884, + "step": 1042 + }, + { + "epoch": 0.9639556377079482, + "grad_norm": 0.6094913482666016, + "learning_rate": 6.524339918200339e-07, + "loss": 0.9436, + "step": 1043 + }, + { + "epoch": 0.9648798521256932, + "grad_norm": 0.6686248183250427, + "learning_rate": 6.194389318173954e-07, + "loss": 0.9989, + "step": 1044 + }, + { + "epoch": 0.9658040665434381, + "grad_norm": 0.6612945795059204, + "learning_rate": 5.872973871098975e-07, + "loss": 1.0624, + "step": 1045 + }, + { + "epoch": 0.966728280961183, + "grad_norm": 0.6373345851898193, + "learning_rate": 5.560096337404397e-07, + "loss": 0.8676, + "step": 1046 + }, + { + "epoch": 0.967652495378928, + "grad_norm": 0.6785854697227478, + "learning_rate": 5.255759404192651e-07, + "loss": 0.8961, + "step": 1047 + }, + { + "epoch": 0.9685767097966729, + "grad_norm": 0.743614137172699, + "learning_rate": 4.959965685216949e-07, + "loss": 0.9324, + "step": 1048 + }, + { + "epoch": 0.9695009242144177, + "grad_norm": 0.7066333889961243, + "learning_rate": 4.6727177208577553e-07, + "loss": 0.7884, + "step": 1049 + }, + { + "epoch": 0.9704251386321626, + "grad_norm": 0.5863295793533325, + "learning_rate": 4.3940179781019055e-07, + "loss": 0.8154, + "step": 1050 + }, + { + "epoch": 0.9713493530499075, + "grad_norm": 0.6181464195251465, + "learning_rate": 4.1238688505211845e-07, + "loss": 0.8859, + "step": 1051 + }, + { + "epoch": 0.9722735674676525, + "grad_norm": 0.6221921443939209, + "learning_rate": 3.8622726582514537e-07, + "loss": 0.9566, + "step": 1052 + }, + { + "epoch": 0.9731977818853974, + "grad_norm": 0.6333780288696289, + "learning_rate": 3.609231647972999e-07, + "loss": 0.9461, + "step": 1053 + }, + { + "epoch": 0.9741219963031423, + "grad_norm": 0.6009412407875061, + "learning_rate": 3.364747992891215e-07, + "loss": 0.9492, + "step": 1054 + }, + { + "epoch": 0.9750462107208873, + "grad_norm": 0.6830005645751953, + "learning_rate": 3.128823792718061e-07, + "loss": 0.9962, + "step": 1055 + }, + { + "epoch": 0.9759704251386322, + "grad_norm": 0.6181619167327881, + "learning_rate": 2.901461073653633e-07, + "loss": 0.9007, + "step": 1056 + }, + { + "epoch": 0.9768946395563771, + "grad_norm": 0.6359338164329529, + "learning_rate": 2.6826617883690673e-07, + "loss": 0.9266, + "step": 1057 + }, + { + "epoch": 0.977818853974122, + "grad_norm": 0.6645272374153137, + "learning_rate": 2.472427815989886e-07, + "loss": 0.8317, + "step": 1058 + }, + { + "epoch": 0.9787430683918669, + "grad_norm": 0.6051627993583679, + "learning_rate": 2.2707609620795656e-07, + "loss": 0.9413, + "step": 1059 + }, + { + "epoch": 0.9796672828096118, + "grad_norm": 0.6268156170845032, + "learning_rate": 2.0776629586239936e-07, + "loss": 0.9499, + "step": 1060 + }, + { + "epoch": 0.9805914972273567, + "grad_norm": 0.7013781666755676, + "learning_rate": 1.8931354640171484e-07, + "loss": 1.0324, + "step": 1061 + }, + { + "epoch": 0.9815157116451017, + "grad_norm": 0.6652680039405823, + "learning_rate": 1.7171800630458868e-07, + "loss": 0.8782, + "step": 1062 + }, + { + "epoch": 0.9824399260628466, + "grad_norm": 0.6849194765090942, + "learning_rate": 1.5497982668775113e-07, + "loss": 1.0386, + "step": 1063 + }, + { + "epoch": 0.9833641404805915, + "grad_norm": 0.6166237592697144, + "learning_rate": 1.3909915130457808e-07, + "loss": 0.9326, + "step": 1064 + }, + { + "epoch": 0.9842883548983364, + "grad_norm": 0.6930829882621765, + "learning_rate": 1.2407611654390305e-07, + "loss": 1.0126, + "step": 1065 + }, + { + "epoch": 0.9852125693160814, + "grad_norm": 0.6337749361991882, + "learning_rate": 1.0991085142886271e-07, + "loss": 0.8914, + "step": 1066 + }, + { + "epoch": 0.9861367837338263, + "grad_norm": 0.7357750535011292, + "learning_rate": 9.660347761571986e-08, + "loss": 0.9678, + "step": 1067 + }, + { + "epoch": 0.9870609981515711, + "grad_norm": 0.6513310670852661, + "learning_rate": 8.41541093929199e-08, + "loss": 1.1726, + "step": 1068 + }, + { + "epoch": 0.987985212569316, + "grad_norm": 0.5771944522857666, + "learning_rate": 7.256285368001381e-08, + "loss": 0.8207, + "step": 1069 + }, + { + "epoch": 0.988909426987061, + "grad_norm": 0.6897679567337036, + "learning_rate": 6.182981002679223e-08, + "loss": 1.0962, + "step": 1070 + }, + { + "epoch": 0.9898336414048059, + "grad_norm": 0.6474549174308777, + "learning_rate": 5.195507061240834e-08, + "loss": 1.0342, + "step": 1071 + }, + { + "epoch": 0.9907578558225508, + "grad_norm": 0.6482800245285034, + "learning_rate": 4.293872024463408e-08, + "loss": 0.9149, + "step": 1072 + }, + { + "epoch": 0.9916820702402958, + "grad_norm": 0.6202918887138367, + "learning_rate": 3.4780836359038505e-08, + "loss": 0.8358, + "step": 1073 + }, + { + "epoch": 0.9926062846580407, + "grad_norm": 0.6662231087684631, + "learning_rate": 2.7481489018410523e-08, + "loss": 1.0221, + "step": 1074 + }, + { + "epoch": 0.9935304990757856, + "grad_norm": 0.5381618142127991, + "learning_rate": 2.1040740912126046e-08, + "loss": 0.8288, + "step": 1075 + }, + { + "epoch": 0.9944547134935305, + "grad_norm": 0.6266561150550842, + "learning_rate": 1.545864735558178e-08, + "loss": 1.0198, + "step": 1076 + }, + { + "epoch": 0.9953789279112755, + "grad_norm": 0.7117410898208618, + "learning_rate": 1.0735256289795548e-08, + "loss": 1.1612, + "step": 1077 + }, + { + "epoch": 0.9963031423290203, + "grad_norm": 0.6411619782447815, + "learning_rate": 6.8706082808955855e-09, + "loss": 0.9829, + "step": 1078 + }, + { + "epoch": 0.9972273567467652, + "grad_norm": 0.6944364309310913, + "learning_rate": 3.8647365198429815e-09, + "loss": 1.1419, + "step": 1079 + }, + { + "epoch": 0.9981515711645101, + "grad_norm": 0.6280131340026855, + "learning_rate": 1.7176668221208225e-09, + "loss": 0.9486, + "step": 1080 + }, + { + "epoch": 0.9990757855822551, + "grad_norm": 0.5805585980415344, + "learning_rate": 4.294176275232431e-10, + "loss": 0.8376, + "step": 1081 + }, + { + "epoch": 1.0, + "grad_norm": 0.7144219875335693, + "learning_rate": 0.0, + "loss": 0.8613, + "step": 1082 + } + ], + "logging_steps": 1, + "max_steps": 1082, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.338302143660032e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}