{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02333926539662164, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001166963269831082, "grad_norm": 0.5096844434738159, "learning_rate": 4e-05, "loss": 0.5061, "step": 1 }, { "epoch": 0.0002333926539662164, "grad_norm": 0.5573846697807312, "learning_rate": 8e-05, "loss": 0.536, "step": 2 }, { "epoch": 0.0003500889809493246, "grad_norm": 0.5168461203575134, "learning_rate": 0.00012, "loss": 0.5209, "step": 3 }, { "epoch": 0.0004667853079324328, "grad_norm": 0.4047943651676178, "learning_rate": 0.00016, "loss": 0.4604, "step": 4 }, { "epoch": 0.000583481634915541, "grad_norm": 0.20825760066509247, "learning_rate": 0.0002, "loss": 0.3089, "step": 5 }, { "epoch": 0.0007001779618986492, "grad_norm": 0.20194634795188904, "learning_rate": 0.00019997664642690334, "loss": 0.3358, "step": 6 }, { "epoch": 0.0008168742888817575, "grad_norm": 0.193731427192688, "learning_rate": 0.00019995329285380664, "loss": 0.3042, "step": 7 }, { "epoch": 0.0009335706158648656, "grad_norm": 0.21419042348861694, "learning_rate": 0.00019992993928070996, "loss": 0.341, "step": 8 }, { "epoch": 0.001050266942847974, "grad_norm": 0.18434806168079376, "learning_rate": 0.00019990658570761326, "loss": 0.2665, "step": 9 }, { "epoch": 0.001166963269831082, "grad_norm": 0.16945861279964447, "learning_rate": 0.0001998832321345166, "loss": 0.3113, "step": 10 }, { "epoch": 0.0012836595968141902, "grad_norm": 0.1379944235086441, "learning_rate": 0.0001998598785614199, "loss": 0.213, "step": 11 }, { "epoch": 0.0014003559237972985, "grad_norm": 0.12341434508562088, "learning_rate": 0.00019983652498832322, "loss": 0.1954, "step": 12 }, { "epoch": 0.0015170522507804067, "grad_norm": 0.12021443247795105, "learning_rate": 0.00019981317141522654, "loss": 0.2118, "step": 13 }, { "epoch": 0.001633748577763515, "grad_norm": 0.16578464210033417, "learning_rate": 0.00019978981784212984, "loss": 0.2335, "step": 14 }, { "epoch": 0.0017504449047466232, "grad_norm": 0.13460949063301086, "learning_rate": 0.00019976646426903317, "loss": 0.1664, "step": 15 }, { "epoch": 0.0018671412317297312, "grad_norm": 0.13434286415576935, "learning_rate": 0.0001997431106959365, "loss": 0.2334, "step": 16 }, { "epoch": 0.0019838375587128395, "grad_norm": 0.15617124736309052, "learning_rate": 0.00019971975712283982, "loss": 0.267, "step": 17 }, { "epoch": 0.002100533885695948, "grad_norm": 0.12410859763622284, "learning_rate": 0.00019969640354974312, "loss": 0.2141, "step": 18 }, { "epoch": 0.002217230212679056, "grad_norm": 0.10960312187671661, "learning_rate": 0.00019967304997664645, "loss": 0.1625, "step": 19 }, { "epoch": 0.002333926539662164, "grad_norm": 0.1421806961297989, "learning_rate": 0.00019964969640354975, "loss": 0.2529, "step": 20 }, { "epoch": 0.0024506228666452724, "grad_norm": 0.1390838772058487, "learning_rate": 0.00019962634283045308, "loss": 0.1938, "step": 21 }, { "epoch": 0.0025673191936283805, "grad_norm": 0.11065292358398438, "learning_rate": 0.00019960298925735637, "loss": 0.1892, "step": 22 }, { "epoch": 0.002684015520611489, "grad_norm": 0.12330306321382523, "learning_rate": 0.0001995796356842597, "loss": 0.2334, "step": 23 }, { "epoch": 0.002800711847594597, "grad_norm": 0.10883963108062744, "learning_rate": 0.00019955628211116303, "loss": 0.1953, "step": 24 }, { "epoch": 0.002917408174577705, "grad_norm": 0.11904892325401306, "learning_rate": 0.00019953292853806633, "loss": 0.191, "step": 25 }, { "epoch": 0.0030341045015608134, "grad_norm": 0.09328188002109528, "learning_rate": 0.00019950957496496965, "loss": 0.1517, "step": 26 }, { "epoch": 0.0031508008285439214, "grad_norm": 0.10651596635580063, "learning_rate": 0.00019948622139187295, "loss": 0.1778, "step": 27 }, { "epoch": 0.00326749715552703, "grad_norm": 0.09824363142251968, "learning_rate": 0.00019946286781877628, "loss": 0.1412, "step": 28 }, { "epoch": 0.003384193482510138, "grad_norm": 0.0911468043923378, "learning_rate": 0.00019943951424567958, "loss": 0.1413, "step": 29 }, { "epoch": 0.0035008898094932464, "grad_norm": 0.10707972198724747, "learning_rate": 0.0001994161606725829, "loss": 0.166, "step": 30 }, { "epoch": 0.0036175861364763544, "grad_norm": 0.1010749414563179, "learning_rate": 0.00019939280709948623, "loss": 0.1762, "step": 31 }, { "epoch": 0.0037342824634594624, "grad_norm": 0.11201727390289307, "learning_rate": 0.00019936945352638953, "loss": 0.1691, "step": 32 }, { "epoch": 0.003850978790442571, "grad_norm": 0.10435190051794052, "learning_rate": 0.00019934609995329286, "loss": 0.1615, "step": 33 }, { "epoch": 0.003967675117425679, "grad_norm": 0.10385840386152267, "learning_rate": 0.0001993227463801962, "loss": 0.1395, "step": 34 }, { "epoch": 0.004084371444408787, "grad_norm": 0.10580045729875565, "learning_rate": 0.00019929939280709951, "loss": 0.188, "step": 35 }, { "epoch": 0.004201067771391896, "grad_norm": 0.10571294277906418, "learning_rate": 0.00019927603923400281, "loss": 0.1778, "step": 36 }, { "epoch": 0.004317764098375003, "grad_norm": 0.11938229203224182, "learning_rate": 0.00019925268566090614, "loss": 0.2181, "step": 37 }, { "epoch": 0.004434460425358112, "grad_norm": 0.09765143692493439, "learning_rate": 0.00019922933208780944, "loss": 0.13, "step": 38 }, { "epoch": 0.00455115675234122, "grad_norm": 0.11270410567522049, "learning_rate": 0.00019920597851471277, "loss": 0.1988, "step": 39 }, { "epoch": 0.004667853079324328, "grad_norm": 0.09199155867099762, "learning_rate": 0.0001991826249416161, "loss": 0.1616, "step": 40 }, { "epoch": 0.004784549406307436, "grad_norm": 0.10620560497045517, "learning_rate": 0.0001991592713685194, "loss": 0.1728, "step": 41 }, { "epoch": 0.004901245733290545, "grad_norm": 0.09054724127054214, "learning_rate": 0.00019913591779542272, "loss": 0.1369, "step": 42 }, { "epoch": 0.005017942060273653, "grad_norm": 0.10327401012182236, "learning_rate": 0.00019911256422232602, "loss": 0.1708, "step": 43 }, { "epoch": 0.005134638387256761, "grad_norm": 0.09473054111003876, "learning_rate": 0.00019908921064922935, "loss": 0.1589, "step": 44 }, { "epoch": 0.005251334714239869, "grad_norm": 0.10536584258079529, "learning_rate": 0.00019906585707613265, "loss": 0.1962, "step": 45 }, { "epoch": 0.005368031041222978, "grad_norm": 0.10594025254249573, "learning_rate": 0.00019904250350303597, "loss": 0.1893, "step": 46 }, { "epoch": 0.005484727368206085, "grad_norm": 0.09798738360404968, "learning_rate": 0.00019901914992993927, "loss": 0.1759, "step": 47 }, { "epoch": 0.005601423695189194, "grad_norm": 0.08874180912971497, "learning_rate": 0.0001989957963568426, "loss": 0.1558, "step": 48 }, { "epoch": 0.005718120022172302, "grad_norm": 0.11178728193044662, "learning_rate": 0.00019897244278374593, "loss": 0.1745, "step": 49 }, { "epoch": 0.00583481634915541, "grad_norm": 0.0964571163058281, "learning_rate": 0.00019894908921064923, "loss": 0.1751, "step": 50 }, { "epoch": 0.005951512676138518, "grad_norm": 0.10600943863391876, "learning_rate": 0.00019892573563755255, "loss": 0.1656, "step": 51 }, { "epoch": 0.006068209003121627, "grad_norm": 0.10203580558300018, "learning_rate": 0.00019890238206445585, "loss": 0.1621, "step": 52 }, { "epoch": 0.006184905330104735, "grad_norm": 0.11010047048330307, "learning_rate": 0.0001988790284913592, "loss": 0.1789, "step": 53 }, { "epoch": 0.006301601657087843, "grad_norm": 0.11551900953054428, "learning_rate": 0.0001988556749182625, "loss": 0.1794, "step": 54 }, { "epoch": 0.006418297984070951, "grad_norm": 0.11391794681549072, "learning_rate": 0.00019883232134516583, "loss": 0.1765, "step": 55 }, { "epoch": 0.00653499431105406, "grad_norm": 0.11572562158107758, "learning_rate": 0.00019880896777206913, "loss": 0.2161, "step": 56 }, { "epoch": 0.006651690638037167, "grad_norm": 0.09810175001621246, "learning_rate": 0.00019878561419897246, "loss": 0.1534, "step": 57 }, { "epoch": 0.006768386965020276, "grad_norm": 0.10156040638685226, "learning_rate": 0.00019876226062587579, "loss": 0.1514, "step": 58 }, { "epoch": 0.006885083292003384, "grad_norm": 0.09523003548383713, "learning_rate": 0.00019873890705277909, "loss": 0.1345, "step": 59 }, { "epoch": 0.007001779618986493, "grad_norm": 0.11223362386226654, "learning_rate": 0.0001987155534796824, "loss": 0.1995, "step": 60 }, { "epoch": 0.0071184759459696, "grad_norm": 0.09502169489860535, "learning_rate": 0.0001986921999065857, "loss": 0.1428, "step": 61 }, { "epoch": 0.007235172272952709, "grad_norm": 0.11790277063846588, "learning_rate": 0.00019866884633348904, "loss": 0.1592, "step": 62 }, { "epoch": 0.007351868599935817, "grad_norm": 0.0922728031873703, "learning_rate": 0.00019864549276039234, "loss": 0.1264, "step": 63 }, { "epoch": 0.007468564926918925, "grad_norm": 0.09835848957300186, "learning_rate": 0.00019862213918729566, "loss": 0.131, "step": 64 }, { "epoch": 0.007585261253902033, "grad_norm": 0.1120564341545105, "learning_rate": 0.000198598785614199, "loss": 0.1605, "step": 65 }, { "epoch": 0.007701957580885142, "grad_norm": 0.10365966707468033, "learning_rate": 0.0001985754320411023, "loss": 0.1458, "step": 66 }, { "epoch": 0.00781865390786825, "grad_norm": 0.12009326368570328, "learning_rate": 0.00019855207846800562, "loss": 0.1818, "step": 67 }, { "epoch": 0.007935350234851358, "grad_norm": 0.10382229834794998, "learning_rate": 0.00019852872489490892, "loss": 0.1695, "step": 68 }, { "epoch": 0.008052046561834466, "grad_norm": 0.12162943184375763, "learning_rate": 0.00019850537132181224, "loss": 0.1916, "step": 69 }, { "epoch": 0.008168742888817575, "grad_norm": 0.1090371385216713, "learning_rate": 0.00019848201774871554, "loss": 0.1733, "step": 70 }, { "epoch": 0.008285439215800683, "grad_norm": 0.1108129546046257, "learning_rate": 0.00019845866417561887, "loss": 0.1533, "step": 71 }, { "epoch": 0.008402135542783792, "grad_norm": 0.10778584331274033, "learning_rate": 0.0001984353106025222, "loss": 0.1729, "step": 72 }, { "epoch": 0.008518831869766898, "grad_norm": 0.14670732617378235, "learning_rate": 0.00019841195702942552, "loss": 0.1499, "step": 73 }, { "epoch": 0.008635528196750007, "grad_norm": 0.10100234299898148, "learning_rate": 0.00019838860345632882, "loss": 0.1556, "step": 74 }, { "epoch": 0.008752224523733115, "grad_norm": 0.11347773671150208, "learning_rate": 0.00019836524988323215, "loss": 0.1783, "step": 75 }, { "epoch": 0.008868920850716224, "grad_norm": 0.09582630544900894, "learning_rate": 0.00019834189631013548, "loss": 0.1516, "step": 76 }, { "epoch": 0.008985617177699332, "grad_norm": 0.09301317483186722, "learning_rate": 0.00019831854273703878, "loss": 0.1491, "step": 77 }, { "epoch": 0.00910231350468244, "grad_norm": 0.12455611675977707, "learning_rate": 0.0001982951891639421, "loss": 0.1654, "step": 78 }, { "epoch": 0.00921900983166555, "grad_norm": 0.11573786288499832, "learning_rate": 0.0001982718355908454, "loss": 0.2097, "step": 79 }, { "epoch": 0.009335706158648656, "grad_norm": 0.09937581419944763, "learning_rate": 0.00019824848201774873, "loss": 0.1563, "step": 80 }, { "epoch": 0.009452402485631764, "grad_norm": 0.11743341386318207, "learning_rate": 0.00019822512844465203, "loss": 0.1656, "step": 81 }, { "epoch": 0.009569098812614873, "grad_norm": 0.10934270918369293, "learning_rate": 0.00019820177487155536, "loss": 0.1468, "step": 82 }, { "epoch": 0.009685795139597981, "grad_norm": 0.11555736511945724, "learning_rate": 0.00019817842129845868, "loss": 0.1815, "step": 83 }, { "epoch": 0.00980249146658109, "grad_norm": 0.11791291832923889, "learning_rate": 0.00019815506772536198, "loss": 0.1807, "step": 84 }, { "epoch": 0.009919187793564198, "grad_norm": 0.1130499318242073, "learning_rate": 0.0001981317141522653, "loss": 0.1698, "step": 85 }, { "epoch": 0.010035884120547307, "grad_norm": 0.10540090501308441, "learning_rate": 0.0001981083605791686, "loss": 0.1582, "step": 86 }, { "epoch": 0.010152580447530413, "grad_norm": 0.09527558833360672, "learning_rate": 0.00019808500700607194, "loss": 0.1387, "step": 87 }, { "epoch": 0.010269276774513522, "grad_norm": 0.11643368750810623, "learning_rate": 0.00019806165343297524, "loss": 0.1842, "step": 88 }, { "epoch": 0.01038597310149663, "grad_norm": 0.11340148001909256, "learning_rate": 0.00019803829985987856, "loss": 0.1616, "step": 89 }, { "epoch": 0.010502669428479739, "grad_norm": 0.12303619831800461, "learning_rate": 0.0001980149462867819, "loss": 0.1872, "step": 90 }, { "epoch": 0.010619365755462847, "grad_norm": 0.09786645323038101, "learning_rate": 0.0001979915927136852, "loss": 0.1567, "step": 91 }, { "epoch": 0.010736062082445956, "grad_norm": 0.10433386266231537, "learning_rate": 0.00019796823914058854, "loss": 0.1783, "step": 92 }, { "epoch": 0.010852758409429062, "grad_norm": 0.31017664074897766, "learning_rate": 0.00019794488556749184, "loss": 0.1546, "step": 93 }, { "epoch": 0.01096945473641217, "grad_norm": 0.10538630187511444, "learning_rate": 0.00019792153199439517, "loss": 0.1562, "step": 94 }, { "epoch": 0.01108615106339528, "grad_norm": 0.10152962803840637, "learning_rate": 0.00019789817842129847, "loss": 0.1573, "step": 95 }, { "epoch": 0.011202847390378388, "grad_norm": 0.1028379276394844, "learning_rate": 0.0001978748248482018, "loss": 0.151, "step": 96 }, { "epoch": 0.011319543717361496, "grad_norm": 0.10292468219995499, "learning_rate": 0.0001978514712751051, "loss": 0.1642, "step": 97 }, { "epoch": 0.011436240044344605, "grad_norm": 0.10416844487190247, "learning_rate": 0.00019782811770200842, "loss": 0.1701, "step": 98 }, { "epoch": 0.011552936371327713, "grad_norm": 0.10852757841348648, "learning_rate": 0.00019780476412891172, "loss": 0.1474, "step": 99 }, { "epoch": 0.01166963269831082, "grad_norm": 0.10577951371669769, "learning_rate": 0.00019778141055581505, "loss": 0.18, "step": 100 }, { "epoch": 0.011786329025293928, "grad_norm": 0.08033058792352676, "learning_rate": 0.00019775805698271838, "loss": 0.1211, "step": 101 }, { "epoch": 0.011903025352277037, "grad_norm": 0.1296456754207611, "learning_rate": 0.00019773470340962167, "loss": 0.1822, "step": 102 }, { "epoch": 0.012019721679260145, "grad_norm": 0.114451102912426, "learning_rate": 0.000197711349836525, "loss": 0.1756, "step": 103 }, { "epoch": 0.012136418006243254, "grad_norm": 0.10711831599473953, "learning_rate": 0.0001976879962634283, "loss": 0.1622, "step": 104 }, { "epoch": 0.012253114333226362, "grad_norm": 0.11008073389530182, "learning_rate": 0.00019766464269033163, "loss": 0.1554, "step": 105 }, { "epoch": 0.01236981066020947, "grad_norm": 0.1160978451371193, "learning_rate": 0.00019764128911723493, "loss": 0.1667, "step": 106 }, { "epoch": 0.012486506987192577, "grad_norm": 0.1413351148366928, "learning_rate": 0.00019761793554413825, "loss": 0.191, "step": 107 }, { "epoch": 0.012603203314175686, "grad_norm": 0.08713728189468384, "learning_rate": 0.00019759458197104158, "loss": 0.1353, "step": 108 }, { "epoch": 0.012719899641158794, "grad_norm": 0.09029239416122437, "learning_rate": 0.00019757122839794488, "loss": 0.1078, "step": 109 }, { "epoch": 0.012836595968141903, "grad_norm": 0.09928172081708908, "learning_rate": 0.0001975478748248482, "loss": 0.136, "step": 110 }, { "epoch": 0.012953292295125011, "grad_norm": 0.11607584357261658, "learning_rate": 0.00019752452125175153, "loss": 0.1861, "step": 111 }, { "epoch": 0.01306998862210812, "grad_norm": 0.12424415349960327, "learning_rate": 0.00019750116767865486, "loss": 0.1987, "step": 112 }, { "epoch": 0.013186684949091228, "grad_norm": 0.12206408381462097, "learning_rate": 0.00019747781410555816, "loss": 0.2091, "step": 113 }, { "epoch": 0.013303381276074335, "grad_norm": 0.09520223736763, "learning_rate": 0.0001974544605324615, "loss": 0.1318, "step": 114 }, { "epoch": 0.013420077603057443, "grad_norm": 0.10052375495433807, "learning_rate": 0.0001974311069593648, "loss": 0.1675, "step": 115 }, { "epoch": 0.013536773930040552, "grad_norm": 0.09986284375190735, "learning_rate": 0.00019740775338626811, "loss": 0.1496, "step": 116 }, { "epoch": 0.01365347025702366, "grad_norm": 0.09899864345788956, "learning_rate": 0.00019738439981317144, "loss": 0.1427, "step": 117 }, { "epoch": 0.013770166584006769, "grad_norm": 0.09922472387552261, "learning_rate": 0.00019736104624007474, "loss": 0.1412, "step": 118 }, { "epoch": 0.013886862910989877, "grad_norm": 0.08671886473894119, "learning_rate": 0.00019733769266697807, "loss": 0.1305, "step": 119 }, { "epoch": 0.014003559237972986, "grad_norm": 0.10284463316202164, "learning_rate": 0.00019731433909388137, "loss": 0.1455, "step": 120 }, { "epoch": 0.014120255564956092, "grad_norm": 0.12423279136419296, "learning_rate": 0.0001972909855207847, "loss": 0.1984, "step": 121 }, { "epoch": 0.0142369518919392, "grad_norm": 0.12210292369127274, "learning_rate": 0.000197267631947688, "loss": 0.1854, "step": 122 }, { "epoch": 0.01435364821892231, "grad_norm": 0.10566538572311401, "learning_rate": 0.00019724427837459132, "loss": 0.1347, "step": 123 }, { "epoch": 0.014470344545905418, "grad_norm": 0.09597185254096985, "learning_rate": 0.00019722092480149465, "loss": 0.133, "step": 124 }, { "epoch": 0.014587040872888526, "grad_norm": 0.11847853660583496, "learning_rate": 0.00019719757122839795, "loss": 0.1788, "step": 125 }, { "epoch": 0.014703737199871635, "grad_norm": 0.10845934599637985, "learning_rate": 0.00019717421765530127, "loss": 0.1509, "step": 126 }, { "epoch": 0.014820433526854743, "grad_norm": 0.09963663667440414, "learning_rate": 0.00019715086408220457, "loss": 0.1677, "step": 127 }, { "epoch": 0.01493712985383785, "grad_norm": 0.09440722316503525, "learning_rate": 0.0001971275105091079, "loss": 0.1396, "step": 128 }, { "epoch": 0.015053826180820958, "grad_norm": 0.11402937024831772, "learning_rate": 0.0001971041569360112, "loss": 0.18, "step": 129 }, { "epoch": 0.015170522507804067, "grad_norm": 0.1282823383808136, "learning_rate": 0.00019708080336291455, "loss": 0.1817, "step": 130 }, { "epoch": 0.015287218834787175, "grad_norm": 0.09704853594303131, "learning_rate": 0.00019705744978981785, "loss": 0.1502, "step": 131 }, { "epoch": 0.015403915161770284, "grad_norm": 0.09895353019237518, "learning_rate": 0.00019703409621672118, "loss": 0.1477, "step": 132 }, { "epoch": 0.015520611488753392, "grad_norm": 0.10989242792129517, "learning_rate": 0.00019701074264362448, "loss": 0.1483, "step": 133 }, { "epoch": 0.0156373078157365, "grad_norm": 0.11348774284124374, "learning_rate": 0.0001969873890705278, "loss": 0.1787, "step": 134 }, { "epoch": 0.01575400414271961, "grad_norm": 0.10849590599536896, "learning_rate": 0.00019696403549743113, "loss": 0.1564, "step": 135 }, { "epoch": 0.015870700469702716, "grad_norm": 0.10929839313030243, "learning_rate": 0.00019694068192433443, "loss": 0.1481, "step": 136 }, { "epoch": 0.015987396796685826, "grad_norm": 0.09660619497299194, "learning_rate": 0.00019691732835123776, "loss": 0.1433, "step": 137 }, { "epoch": 0.016104093123668933, "grad_norm": 0.11423259973526001, "learning_rate": 0.00019689397477814106, "loss": 0.1342, "step": 138 }, { "epoch": 0.01622078945065204, "grad_norm": 0.10947205871343613, "learning_rate": 0.00019687062120504439, "loss": 0.1825, "step": 139 }, { "epoch": 0.01633748577763515, "grad_norm": 0.11746672540903091, "learning_rate": 0.00019684726763194768, "loss": 0.1766, "step": 140 }, { "epoch": 0.016454182104618256, "grad_norm": 0.1152237206697464, "learning_rate": 0.000196823914058851, "loss": 0.161, "step": 141 }, { "epoch": 0.016570878431601366, "grad_norm": 0.10848015546798706, "learning_rate": 0.00019680056048575434, "loss": 0.1541, "step": 142 }, { "epoch": 0.016687574758584473, "grad_norm": 0.12663570046424866, "learning_rate": 0.00019677720691265764, "loss": 0.1891, "step": 143 }, { "epoch": 0.016804271085567583, "grad_norm": 0.1088947057723999, "learning_rate": 0.00019675385333956096, "loss": 0.1505, "step": 144 }, { "epoch": 0.01692096741255069, "grad_norm": 0.10418037325143814, "learning_rate": 0.00019673049976646426, "loss": 0.151, "step": 145 }, { "epoch": 0.017037663739533797, "grad_norm": 0.08672403544187546, "learning_rate": 0.0001967071461933676, "loss": 0.1213, "step": 146 }, { "epoch": 0.017154360066516907, "grad_norm": 0.10863472521305084, "learning_rate": 0.0001966837926202709, "loss": 0.1552, "step": 147 }, { "epoch": 0.017271056393500014, "grad_norm": 0.10580800473690033, "learning_rate": 0.00019666043904717422, "loss": 0.1589, "step": 148 }, { "epoch": 0.017387752720483124, "grad_norm": 0.09545203298330307, "learning_rate": 0.00019663708547407754, "loss": 0.139, "step": 149 }, { "epoch": 0.01750444904746623, "grad_norm": 0.14016349613666534, "learning_rate": 0.00019661373190098087, "loss": 0.2126, "step": 150 }, { "epoch": 0.01762114537444934, "grad_norm": 0.11033914983272552, "learning_rate": 0.00019659037832788417, "loss": 0.1644, "step": 151 }, { "epoch": 0.017737841701432448, "grad_norm": 0.10455331206321716, "learning_rate": 0.0001965670247547875, "loss": 0.1499, "step": 152 }, { "epoch": 0.017854538028415554, "grad_norm": 0.11884409189224243, "learning_rate": 0.00019654367118169082, "loss": 0.1731, "step": 153 }, { "epoch": 0.017971234355398664, "grad_norm": 0.11076351255178452, "learning_rate": 0.00019652031760859412, "loss": 0.1782, "step": 154 }, { "epoch": 0.01808793068238177, "grad_norm": 0.11540203541517258, "learning_rate": 0.00019649696403549745, "loss": 0.1731, "step": 155 }, { "epoch": 0.01820462700936488, "grad_norm": 0.09334211051464081, "learning_rate": 0.00019647361046240075, "loss": 0.1576, "step": 156 }, { "epoch": 0.018321323336347988, "grad_norm": 0.11943213641643524, "learning_rate": 0.00019645025688930408, "loss": 0.1715, "step": 157 }, { "epoch": 0.0184380196633311, "grad_norm": 0.08858149498701096, "learning_rate": 0.00019642690331620738, "loss": 0.1282, "step": 158 }, { "epoch": 0.018554715990314205, "grad_norm": 0.10284683853387833, "learning_rate": 0.0001964035497431107, "loss": 0.1727, "step": 159 }, { "epoch": 0.018671412317297312, "grad_norm": 0.10812927782535553, "learning_rate": 0.00019638019617001403, "loss": 0.1741, "step": 160 }, { "epoch": 0.018788108644280422, "grad_norm": 0.08129740506410599, "learning_rate": 0.00019635684259691733, "loss": 0.107, "step": 161 }, { "epoch": 0.01890480497126353, "grad_norm": 0.11642193049192429, "learning_rate": 0.00019633348902382066, "loss": 0.1768, "step": 162 }, { "epoch": 0.01902150129824664, "grad_norm": 0.10036379098892212, "learning_rate": 0.00019631013545072396, "loss": 0.1458, "step": 163 }, { "epoch": 0.019138197625229746, "grad_norm": 0.0957493856549263, "learning_rate": 0.00019628678187762728, "loss": 0.143, "step": 164 }, { "epoch": 0.019254893952212856, "grad_norm": 0.12155473232269287, "learning_rate": 0.00019626342830453058, "loss": 0.1807, "step": 165 }, { "epoch": 0.019371590279195963, "grad_norm": 0.11061038821935654, "learning_rate": 0.0001962400747314339, "loss": 0.1699, "step": 166 }, { "epoch": 0.01948828660617907, "grad_norm": 0.10963009297847748, "learning_rate": 0.00019621672115833724, "loss": 0.1638, "step": 167 }, { "epoch": 0.01960498293316218, "grad_norm": 0.11229882389307022, "learning_rate": 0.00019619336758524056, "loss": 0.1788, "step": 168 }, { "epoch": 0.019721679260145286, "grad_norm": 0.10003431886434555, "learning_rate": 0.0001961700140121439, "loss": 0.1547, "step": 169 }, { "epoch": 0.019838375587128396, "grad_norm": 0.09698428958654404, "learning_rate": 0.0001961466604390472, "loss": 0.1193, "step": 170 }, { "epoch": 0.019955071914111503, "grad_norm": 0.10166200250387192, "learning_rate": 0.00019612330686595052, "loss": 0.1511, "step": 171 }, { "epoch": 0.020071768241094613, "grad_norm": 0.10210378468036652, "learning_rate": 0.00019609995329285382, "loss": 0.1563, "step": 172 }, { "epoch": 0.02018846456807772, "grad_norm": 0.09979470074176788, "learning_rate": 0.00019607659971975714, "loss": 0.1317, "step": 173 }, { "epoch": 0.020305160895060827, "grad_norm": 0.12395334988832474, "learning_rate": 0.00019605324614666044, "loss": 0.1791, "step": 174 }, { "epoch": 0.020421857222043937, "grad_norm": 0.11577446013689041, "learning_rate": 0.00019602989257356377, "loss": 0.1681, "step": 175 }, { "epoch": 0.020538553549027044, "grad_norm": 0.10168549418449402, "learning_rate": 0.00019600653900046707, "loss": 0.1404, "step": 176 }, { "epoch": 0.020655249876010154, "grad_norm": 0.1242406889796257, "learning_rate": 0.0001959831854273704, "loss": 0.1673, "step": 177 }, { "epoch": 0.02077194620299326, "grad_norm": 0.09891916811466217, "learning_rate": 0.00019595983185427372, "loss": 0.1298, "step": 178 }, { "epoch": 0.020888642529976367, "grad_norm": 0.15590757131576538, "learning_rate": 0.00019593647828117702, "loss": 0.1705, "step": 179 }, { "epoch": 0.021005338856959477, "grad_norm": 0.08277418464422226, "learning_rate": 0.00019591312470808035, "loss": 0.1104, "step": 180 }, { "epoch": 0.021122035183942584, "grad_norm": 0.10521771758794785, "learning_rate": 0.00019588977113498365, "loss": 0.1442, "step": 181 }, { "epoch": 0.021238731510925694, "grad_norm": 0.10389945656061172, "learning_rate": 0.00019586641756188698, "loss": 0.1448, "step": 182 }, { "epoch": 0.0213554278379088, "grad_norm": 0.11086277663707733, "learning_rate": 0.00019584306398879027, "loss": 0.1555, "step": 183 }, { "epoch": 0.02147212416489191, "grad_norm": 0.1056290939450264, "learning_rate": 0.0001958197104156936, "loss": 0.161, "step": 184 }, { "epoch": 0.021588820491875018, "grad_norm": 0.11331475526094437, "learning_rate": 0.00019579635684259693, "loss": 0.1236, "step": 185 }, { "epoch": 0.021705516818858125, "grad_norm": 0.09584867209196091, "learning_rate": 0.00019577300326950023, "loss": 0.1282, "step": 186 }, { "epoch": 0.021822213145841235, "grad_norm": 0.11787907034158707, "learning_rate": 0.00019574964969640358, "loss": 0.1843, "step": 187 }, { "epoch": 0.02193890947282434, "grad_norm": 0.10822898149490356, "learning_rate": 0.00019572629612330688, "loss": 0.1638, "step": 188 }, { "epoch": 0.022055605799807452, "grad_norm": 0.25267189741134644, "learning_rate": 0.0001957029425502102, "loss": 0.1718, "step": 189 }, { "epoch": 0.02217230212679056, "grad_norm": 0.1412399560213089, "learning_rate": 0.0001956795889771135, "loss": 0.1803, "step": 190 }, { "epoch": 0.02228899845377367, "grad_norm": 0.11052538454532623, "learning_rate": 0.00019565623540401683, "loss": 0.1638, "step": 191 }, { "epoch": 0.022405694780756776, "grad_norm": 0.11876215785741806, "learning_rate": 0.00019563288183092013, "loss": 0.2179, "step": 192 }, { "epoch": 0.022522391107739882, "grad_norm": 0.10292577743530273, "learning_rate": 0.00019560952825782346, "loss": 0.1568, "step": 193 }, { "epoch": 0.022639087434722992, "grad_norm": 0.10467737168073654, "learning_rate": 0.0001955861746847268, "loss": 0.156, "step": 194 }, { "epoch": 0.0227557837617061, "grad_norm": 0.116755910217762, "learning_rate": 0.0001955628211116301, "loss": 0.1665, "step": 195 }, { "epoch": 0.02287248008868921, "grad_norm": 0.09144476056098938, "learning_rate": 0.00019553946753853341, "loss": 0.1356, "step": 196 }, { "epoch": 0.022989176415672316, "grad_norm": 0.09073708951473236, "learning_rate": 0.00019551611396543671, "loss": 0.14, "step": 197 }, { "epoch": 0.023105872742655426, "grad_norm": 0.10096915066242218, "learning_rate": 0.00019549276039234004, "loss": 0.1595, "step": 198 }, { "epoch": 0.023222569069638533, "grad_norm": 0.10241077095270157, "learning_rate": 0.00019546940681924334, "loss": 0.1479, "step": 199 }, { "epoch": 0.02333926539662164, "grad_norm": 0.10773292183876038, "learning_rate": 0.00019544605324614667, "loss": 0.1548, "step": 200 } ], "logging_steps": 1, "max_steps": 8569, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.4879936776325523e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }