{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.02333926539662164,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0001166963269831082,
      "grad_norm": 0.5096844434738159,
      "learning_rate": 4e-05,
      "loss": 0.5061,
      "step": 1
    },
    {
      "epoch": 0.0002333926539662164,
      "grad_norm": 0.5573846697807312,
      "learning_rate": 8e-05,
      "loss": 0.536,
      "step": 2
    },
    {
      "epoch": 0.0003500889809493246,
      "grad_norm": 0.5168461203575134,
      "learning_rate": 0.00012,
      "loss": 0.5209,
      "step": 3
    },
    {
      "epoch": 0.0004667853079324328,
      "grad_norm": 0.4047943651676178,
      "learning_rate": 0.00016,
      "loss": 0.4604,
      "step": 4
    },
    {
      "epoch": 0.000583481634915541,
      "grad_norm": 0.20825760066509247,
      "learning_rate": 0.0002,
      "loss": 0.3089,
      "step": 5
    },
    {
      "epoch": 0.0007001779618986492,
      "grad_norm": 0.20194634795188904,
      "learning_rate": 0.00019997664642690334,
      "loss": 0.3358,
      "step": 6
    },
    {
      "epoch": 0.0008168742888817575,
      "grad_norm": 0.193731427192688,
      "learning_rate": 0.00019995329285380664,
      "loss": 0.3042,
      "step": 7
    },
    {
      "epoch": 0.0009335706158648656,
      "grad_norm": 0.21419042348861694,
      "learning_rate": 0.00019992993928070996,
      "loss": 0.341,
      "step": 8
    },
    {
      "epoch": 0.001050266942847974,
      "grad_norm": 0.18434806168079376,
      "learning_rate": 0.00019990658570761326,
      "loss": 0.2665,
      "step": 9
    },
    {
      "epoch": 0.001166963269831082,
      "grad_norm": 0.16945861279964447,
      "learning_rate": 0.0001998832321345166,
      "loss": 0.3113,
      "step": 10
    },
    {
      "epoch": 0.0012836595968141902,
      "grad_norm": 0.1379944235086441,
      "learning_rate": 0.0001998598785614199,
      "loss": 0.213,
      "step": 11
    },
    {
      "epoch": 0.0014003559237972985,
      "grad_norm": 0.12341434508562088,
      "learning_rate": 0.00019983652498832322,
      "loss": 0.1954,
      "step": 12
    },
    {
      "epoch": 0.0015170522507804067,
      "grad_norm": 0.12021443247795105,
      "learning_rate": 0.00019981317141522654,
      "loss": 0.2118,
      "step": 13
    },
    {
      "epoch": 0.001633748577763515,
      "grad_norm": 0.16578464210033417,
      "learning_rate": 0.00019978981784212984,
      "loss": 0.2335,
      "step": 14
    },
    {
      "epoch": 0.0017504449047466232,
      "grad_norm": 0.13460949063301086,
      "learning_rate": 0.00019976646426903317,
      "loss": 0.1664,
      "step": 15
    },
    {
      "epoch": 0.0018671412317297312,
      "grad_norm": 0.13434286415576935,
      "learning_rate": 0.0001997431106959365,
      "loss": 0.2334,
      "step": 16
    },
    {
      "epoch": 0.0019838375587128395,
      "grad_norm": 0.15617124736309052,
      "learning_rate": 0.00019971975712283982,
      "loss": 0.267,
      "step": 17
    },
    {
      "epoch": 0.002100533885695948,
      "grad_norm": 0.12410859763622284,
      "learning_rate": 0.00019969640354974312,
      "loss": 0.2141,
      "step": 18
    },
    {
      "epoch": 0.002217230212679056,
      "grad_norm": 0.10960312187671661,
      "learning_rate": 0.00019967304997664645,
      "loss": 0.1625,
      "step": 19
    },
    {
      "epoch": 0.002333926539662164,
      "grad_norm": 0.1421806961297989,
      "learning_rate": 0.00019964969640354975,
      "loss": 0.2529,
      "step": 20
    },
    {
      "epoch": 0.0024506228666452724,
      "grad_norm": 0.1390838772058487,
      "learning_rate": 0.00019962634283045308,
      "loss": 0.1938,
      "step": 21
    },
    {
      "epoch": 0.0025673191936283805,
      "grad_norm": 0.11065292358398438,
      "learning_rate": 0.00019960298925735637,
      "loss": 0.1892,
      "step": 22
    },
    {
      "epoch": 0.002684015520611489,
      "grad_norm": 0.12330306321382523,
      "learning_rate": 0.0001995796356842597,
      "loss": 0.2334,
      "step": 23
    },
    {
      "epoch": 0.002800711847594597,
      "grad_norm": 0.10883963108062744,
      "learning_rate": 0.00019955628211116303,
      "loss": 0.1953,
      "step": 24
    },
    {
      "epoch": 0.002917408174577705,
      "grad_norm": 0.11904892325401306,
      "learning_rate": 0.00019953292853806633,
      "loss": 0.191,
      "step": 25
    },
    {
      "epoch": 0.0030341045015608134,
      "grad_norm": 0.09328188002109528,
      "learning_rate": 0.00019950957496496965,
      "loss": 0.1517,
      "step": 26
    },
    {
      "epoch": 0.0031508008285439214,
      "grad_norm": 0.10651596635580063,
      "learning_rate": 0.00019948622139187295,
      "loss": 0.1778,
      "step": 27
    },
    {
      "epoch": 0.00326749715552703,
      "grad_norm": 0.09824363142251968,
      "learning_rate": 0.00019946286781877628,
      "loss": 0.1412,
      "step": 28
    },
    {
      "epoch": 0.003384193482510138,
      "grad_norm": 0.0911468043923378,
      "learning_rate": 0.00019943951424567958,
      "loss": 0.1413,
      "step": 29
    },
    {
      "epoch": 0.0035008898094932464,
      "grad_norm": 0.10707972198724747,
      "learning_rate": 0.0001994161606725829,
      "loss": 0.166,
      "step": 30
    },
    {
      "epoch": 0.0036175861364763544,
      "grad_norm": 0.1010749414563179,
      "learning_rate": 0.00019939280709948623,
      "loss": 0.1762,
      "step": 31
    },
    {
      "epoch": 0.0037342824634594624,
      "grad_norm": 0.11201727390289307,
      "learning_rate": 0.00019936945352638953,
      "loss": 0.1691,
      "step": 32
    },
    {
      "epoch": 0.003850978790442571,
      "grad_norm": 0.10435190051794052,
      "learning_rate": 0.00019934609995329286,
      "loss": 0.1615,
      "step": 33
    },
    {
      "epoch": 0.003967675117425679,
      "grad_norm": 0.10385840386152267,
      "learning_rate": 0.0001993227463801962,
      "loss": 0.1395,
      "step": 34
    },
    {
      "epoch": 0.004084371444408787,
      "grad_norm": 0.10580045729875565,
      "learning_rate": 0.00019929939280709951,
      "loss": 0.188,
      "step": 35
    },
    {
      "epoch": 0.004201067771391896,
      "grad_norm": 0.10571294277906418,
      "learning_rate": 0.00019927603923400281,
      "loss": 0.1778,
      "step": 36
    },
    {
      "epoch": 0.004317764098375003,
      "grad_norm": 0.11938229203224182,
      "learning_rate": 0.00019925268566090614,
      "loss": 0.2181,
      "step": 37
    },
    {
      "epoch": 0.004434460425358112,
      "grad_norm": 0.09765143692493439,
      "learning_rate": 0.00019922933208780944,
      "loss": 0.13,
      "step": 38
    },
    {
      "epoch": 0.00455115675234122,
      "grad_norm": 0.11270410567522049,
      "learning_rate": 0.00019920597851471277,
      "loss": 0.1988,
      "step": 39
    },
    {
      "epoch": 0.004667853079324328,
      "grad_norm": 0.09199155867099762,
      "learning_rate": 0.0001991826249416161,
      "loss": 0.1616,
      "step": 40
    },
    {
      "epoch": 0.004784549406307436,
      "grad_norm": 0.10620560497045517,
      "learning_rate": 0.0001991592713685194,
      "loss": 0.1728,
      "step": 41
    },
    {
      "epoch": 0.004901245733290545,
      "grad_norm": 0.09054724127054214,
      "learning_rate": 0.00019913591779542272,
      "loss": 0.1369,
      "step": 42
    },
    {
      "epoch": 0.005017942060273653,
      "grad_norm": 0.10327401012182236,
      "learning_rate": 0.00019911256422232602,
      "loss": 0.1708,
      "step": 43
    },
    {
      "epoch": 0.005134638387256761,
      "grad_norm": 0.09473054111003876,
      "learning_rate": 0.00019908921064922935,
      "loss": 0.1589,
      "step": 44
    },
    {
      "epoch": 0.005251334714239869,
      "grad_norm": 0.10536584258079529,
      "learning_rate": 0.00019906585707613265,
      "loss": 0.1962,
      "step": 45
    },
    {
      "epoch": 0.005368031041222978,
      "grad_norm": 0.10594025254249573,
      "learning_rate": 0.00019904250350303597,
      "loss": 0.1893,
      "step": 46
    },
    {
      "epoch": 0.005484727368206085,
      "grad_norm": 0.09798738360404968,
      "learning_rate": 0.00019901914992993927,
      "loss": 0.1759,
      "step": 47
    },
    {
      "epoch": 0.005601423695189194,
      "grad_norm": 0.08874180912971497,
      "learning_rate": 0.0001989957963568426,
      "loss": 0.1558,
      "step": 48
    },
    {
      "epoch": 0.005718120022172302,
      "grad_norm": 0.11178728193044662,
      "learning_rate": 0.00019897244278374593,
      "loss": 0.1745,
      "step": 49
    },
    {
      "epoch": 0.00583481634915541,
      "grad_norm": 0.0964571163058281,
      "learning_rate": 0.00019894908921064923,
      "loss": 0.1751,
      "step": 50
    },
    {
      "epoch": 0.005951512676138518,
      "grad_norm": 0.10600943863391876,
      "learning_rate": 0.00019892573563755255,
      "loss": 0.1656,
      "step": 51
    },
    {
      "epoch": 0.006068209003121627,
      "grad_norm": 0.10203580558300018,
      "learning_rate": 0.00019890238206445585,
      "loss": 0.1621,
      "step": 52
    },
    {
      "epoch": 0.006184905330104735,
      "grad_norm": 0.11010047048330307,
      "learning_rate": 0.0001988790284913592,
      "loss": 0.1789,
      "step": 53
    },
    {
      "epoch": 0.006301601657087843,
      "grad_norm": 0.11551900953054428,
      "learning_rate": 0.0001988556749182625,
      "loss": 0.1794,
      "step": 54
    },
    {
      "epoch": 0.006418297984070951,
      "grad_norm": 0.11391794681549072,
      "learning_rate": 0.00019883232134516583,
      "loss": 0.1765,
      "step": 55
    },
    {
      "epoch": 0.00653499431105406,
      "grad_norm": 0.11572562158107758,
      "learning_rate": 0.00019880896777206913,
      "loss": 0.2161,
      "step": 56
    },
    {
      "epoch": 0.006651690638037167,
      "grad_norm": 0.09810175001621246,
      "learning_rate": 0.00019878561419897246,
      "loss": 0.1534,
      "step": 57
    },
    {
      "epoch": 0.006768386965020276,
      "grad_norm": 0.10156040638685226,
      "learning_rate": 0.00019876226062587579,
      "loss": 0.1514,
      "step": 58
    },
    {
      "epoch": 0.006885083292003384,
      "grad_norm": 0.09523003548383713,
      "learning_rate": 0.00019873890705277909,
      "loss": 0.1345,
      "step": 59
    },
    {
      "epoch": 0.007001779618986493,
      "grad_norm": 0.11223362386226654,
      "learning_rate": 0.0001987155534796824,
      "loss": 0.1995,
      "step": 60
    },
    {
      "epoch": 0.0071184759459696,
      "grad_norm": 0.09502169489860535,
      "learning_rate": 0.0001986921999065857,
      "loss": 0.1428,
      "step": 61
    },
    {
      "epoch": 0.007235172272952709,
      "grad_norm": 0.11790277063846588,
      "learning_rate": 0.00019866884633348904,
      "loss": 0.1592,
      "step": 62
    },
    {
      "epoch": 0.007351868599935817,
      "grad_norm": 0.0922728031873703,
      "learning_rate": 0.00019864549276039234,
      "loss": 0.1264,
      "step": 63
    },
    {
      "epoch": 0.007468564926918925,
      "grad_norm": 0.09835848957300186,
      "learning_rate": 0.00019862213918729566,
      "loss": 0.131,
      "step": 64
    },
    {
      "epoch": 0.007585261253902033,
      "grad_norm": 0.1120564341545105,
      "learning_rate": 0.000198598785614199,
      "loss": 0.1605,
      "step": 65
    },
    {
      "epoch": 0.007701957580885142,
      "grad_norm": 0.10365966707468033,
      "learning_rate": 0.0001985754320411023,
      "loss": 0.1458,
      "step": 66
    },
    {
      "epoch": 0.00781865390786825,
      "grad_norm": 0.12009326368570328,
      "learning_rate": 0.00019855207846800562,
      "loss": 0.1818,
      "step": 67
    },
    {
      "epoch": 0.007935350234851358,
      "grad_norm": 0.10382229834794998,
      "learning_rate": 0.00019852872489490892,
      "loss": 0.1695,
      "step": 68
    },
    {
      "epoch": 0.008052046561834466,
      "grad_norm": 0.12162943184375763,
      "learning_rate": 0.00019850537132181224,
      "loss": 0.1916,
      "step": 69
    },
    {
      "epoch": 0.008168742888817575,
      "grad_norm": 0.1090371385216713,
      "learning_rate": 0.00019848201774871554,
      "loss": 0.1733,
      "step": 70
    },
    {
      "epoch": 0.008285439215800683,
      "grad_norm": 0.1108129546046257,
      "learning_rate": 0.00019845866417561887,
      "loss": 0.1533,
      "step": 71
    },
    {
      "epoch": 0.008402135542783792,
      "grad_norm": 0.10778584331274033,
      "learning_rate": 0.0001984353106025222,
      "loss": 0.1729,
      "step": 72
    },
    {
      "epoch": 0.008518831869766898,
      "grad_norm": 0.14670732617378235,
      "learning_rate": 0.00019841195702942552,
      "loss": 0.1499,
      "step": 73
    },
    {
      "epoch": 0.008635528196750007,
      "grad_norm": 0.10100234299898148,
      "learning_rate": 0.00019838860345632882,
      "loss": 0.1556,
      "step": 74
    },
    {
      "epoch": 0.008752224523733115,
      "grad_norm": 0.11347773671150208,
      "learning_rate": 0.00019836524988323215,
      "loss": 0.1783,
      "step": 75
    },
    {
      "epoch": 0.008868920850716224,
      "grad_norm": 0.09582630544900894,
      "learning_rate": 0.00019834189631013548,
      "loss": 0.1516,
      "step": 76
    },
    {
      "epoch": 0.008985617177699332,
      "grad_norm": 0.09301317483186722,
      "learning_rate": 0.00019831854273703878,
      "loss": 0.1491,
      "step": 77
    },
    {
      "epoch": 0.00910231350468244,
      "grad_norm": 0.12455611675977707,
      "learning_rate": 0.0001982951891639421,
      "loss": 0.1654,
      "step": 78
    },
    {
      "epoch": 0.00921900983166555,
      "grad_norm": 0.11573786288499832,
      "learning_rate": 0.0001982718355908454,
      "loss": 0.2097,
      "step": 79
    },
    {
      "epoch": 0.009335706158648656,
      "grad_norm": 0.09937581419944763,
      "learning_rate": 0.00019824848201774873,
      "loss": 0.1563,
      "step": 80
    },
    {
      "epoch": 0.009452402485631764,
      "grad_norm": 0.11743341386318207,
      "learning_rate": 0.00019822512844465203,
      "loss": 0.1656,
      "step": 81
    },
    {
      "epoch": 0.009569098812614873,
      "grad_norm": 0.10934270918369293,
      "learning_rate": 0.00019820177487155536,
      "loss": 0.1468,
      "step": 82
    },
    {
      "epoch": 0.009685795139597981,
      "grad_norm": 0.11555736511945724,
      "learning_rate": 0.00019817842129845868,
      "loss": 0.1815,
      "step": 83
    },
    {
      "epoch": 0.00980249146658109,
      "grad_norm": 0.11791291832923889,
      "learning_rate": 0.00019815506772536198,
      "loss": 0.1807,
      "step": 84
    },
    {
      "epoch": 0.009919187793564198,
      "grad_norm": 0.1130499318242073,
      "learning_rate": 0.0001981317141522653,
      "loss": 0.1698,
      "step": 85
    },
    {
      "epoch": 0.010035884120547307,
      "grad_norm": 0.10540090501308441,
      "learning_rate": 0.0001981083605791686,
      "loss": 0.1582,
      "step": 86
    },
    {
      "epoch": 0.010152580447530413,
      "grad_norm": 0.09527558833360672,
      "learning_rate": 0.00019808500700607194,
      "loss": 0.1387,
      "step": 87
    },
    {
      "epoch": 0.010269276774513522,
      "grad_norm": 0.11643368750810623,
      "learning_rate": 0.00019806165343297524,
      "loss": 0.1842,
      "step": 88
    },
    {
      "epoch": 0.01038597310149663,
      "grad_norm": 0.11340148001909256,
      "learning_rate": 0.00019803829985987856,
      "loss": 0.1616,
      "step": 89
    },
    {
      "epoch": 0.010502669428479739,
      "grad_norm": 0.12303619831800461,
      "learning_rate": 0.0001980149462867819,
      "loss": 0.1872,
      "step": 90
    },
    {
      "epoch": 0.010619365755462847,
      "grad_norm": 0.09786645323038101,
      "learning_rate": 0.0001979915927136852,
      "loss": 0.1567,
      "step": 91
    },
    {
      "epoch": 0.010736062082445956,
      "grad_norm": 0.10433386266231537,
      "learning_rate": 0.00019796823914058854,
      "loss": 0.1783,
      "step": 92
    },
    {
      "epoch": 0.010852758409429062,
      "grad_norm": 0.31017664074897766,
      "learning_rate": 0.00019794488556749184,
      "loss": 0.1546,
      "step": 93
    },
    {
      "epoch": 0.01096945473641217,
      "grad_norm": 0.10538630187511444,
      "learning_rate": 0.00019792153199439517,
      "loss": 0.1562,
      "step": 94
    },
    {
      "epoch": 0.01108615106339528,
      "grad_norm": 0.10152962803840637,
      "learning_rate": 0.00019789817842129847,
      "loss": 0.1573,
      "step": 95
    },
    {
      "epoch": 0.011202847390378388,
      "grad_norm": 0.1028379276394844,
      "learning_rate": 0.0001978748248482018,
      "loss": 0.151,
      "step": 96
    },
    {
      "epoch": 0.011319543717361496,
      "grad_norm": 0.10292468219995499,
      "learning_rate": 0.0001978514712751051,
      "loss": 0.1642,
      "step": 97
    },
    {
      "epoch": 0.011436240044344605,
      "grad_norm": 0.10416844487190247,
      "learning_rate": 0.00019782811770200842,
      "loss": 0.1701,
      "step": 98
    },
    {
      "epoch": 0.011552936371327713,
      "grad_norm": 0.10852757841348648,
      "learning_rate": 0.00019780476412891172,
      "loss": 0.1474,
      "step": 99
    },
    {
      "epoch": 0.01166963269831082,
      "grad_norm": 0.10577951371669769,
      "learning_rate": 0.00019778141055581505,
      "loss": 0.18,
      "step": 100
    },
    {
      "epoch": 0.011786329025293928,
      "grad_norm": 0.08033058792352676,
      "learning_rate": 0.00019775805698271838,
      "loss": 0.1211,
      "step": 101
    },
    {
      "epoch": 0.011903025352277037,
      "grad_norm": 0.1296456754207611,
      "learning_rate": 0.00019773470340962167,
      "loss": 0.1822,
      "step": 102
    },
    {
      "epoch": 0.012019721679260145,
      "grad_norm": 0.114451102912426,
      "learning_rate": 0.000197711349836525,
      "loss": 0.1756,
      "step": 103
    },
    {
      "epoch": 0.012136418006243254,
      "grad_norm": 0.10711831599473953,
      "learning_rate": 0.0001976879962634283,
      "loss": 0.1622,
      "step": 104
    },
    {
      "epoch": 0.012253114333226362,
      "grad_norm": 0.11008073389530182,
      "learning_rate": 0.00019766464269033163,
      "loss": 0.1554,
      "step": 105
    },
    {
      "epoch": 0.01236981066020947,
      "grad_norm": 0.1160978451371193,
      "learning_rate": 0.00019764128911723493,
      "loss": 0.1667,
      "step": 106
    },
    {
      "epoch": 0.012486506987192577,
      "grad_norm": 0.1413351148366928,
      "learning_rate": 0.00019761793554413825,
      "loss": 0.191,
      "step": 107
    },
    {
      "epoch": 0.012603203314175686,
      "grad_norm": 0.08713728189468384,
      "learning_rate": 0.00019759458197104158,
      "loss": 0.1353,
      "step": 108
    },
    {
      "epoch": 0.012719899641158794,
      "grad_norm": 0.09029239416122437,
      "learning_rate": 0.00019757122839794488,
      "loss": 0.1078,
      "step": 109
    },
    {
      "epoch": 0.012836595968141903,
      "grad_norm": 0.09928172081708908,
      "learning_rate": 0.0001975478748248482,
      "loss": 0.136,
      "step": 110
    },
    {
      "epoch": 0.012953292295125011,
      "grad_norm": 0.11607584357261658,
      "learning_rate": 0.00019752452125175153,
      "loss": 0.1861,
      "step": 111
    },
    {
      "epoch": 0.01306998862210812,
      "grad_norm": 0.12424415349960327,
      "learning_rate": 0.00019750116767865486,
      "loss": 0.1987,
      "step": 112
    },
    {
      "epoch": 0.013186684949091228,
      "grad_norm": 0.12206408381462097,
      "learning_rate": 0.00019747781410555816,
      "loss": 0.2091,
      "step": 113
    },
    {
      "epoch": 0.013303381276074335,
      "grad_norm": 0.09520223736763,
      "learning_rate": 0.0001974544605324615,
      "loss": 0.1318,
      "step": 114
    },
    {
      "epoch": 0.013420077603057443,
      "grad_norm": 0.10052375495433807,
      "learning_rate": 0.0001974311069593648,
      "loss": 0.1675,
      "step": 115
    },
    {
      "epoch": 0.013536773930040552,
      "grad_norm": 0.09986284375190735,
      "learning_rate": 0.00019740775338626811,
      "loss": 0.1496,
      "step": 116
    },
    {
      "epoch": 0.01365347025702366,
      "grad_norm": 0.09899864345788956,
      "learning_rate": 0.00019738439981317144,
      "loss": 0.1427,
      "step": 117
    },
    {
      "epoch": 0.013770166584006769,
      "grad_norm": 0.09922472387552261,
      "learning_rate": 0.00019736104624007474,
      "loss": 0.1412,
      "step": 118
    },
    {
      "epoch": 0.013886862910989877,
      "grad_norm": 0.08671886473894119,
      "learning_rate": 0.00019733769266697807,
      "loss": 0.1305,
      "step": 119
    },
    {
      "epoch": 0.014003559237972986,
      "grad_norm": 0.10284463316202164,
      "learning_rate": 0.00019731433909388137,
      "loss": 0.1455,
      "step": 120
    },
    {
      "epoch": 0.014120255564956092,
      "grad_norm": 0.12423279136419296,
      "learning_rate": 0.0001972909855207847,
      "loss": 0.1984,
      "step": 121
    },
    {
      "epoch": 0.0142369518919392,
      "grad_norm": 0.12210292369127274,
      "learning_rate": 0.000197267631947688,
      "loss": 0.1854,
      "step": 122
    },
    {
      "epoch": 0.01435364821892231,
      "grad_norm": 0.10566538572311401,
      "learning_rate": 0.00019724427837459132,
      "loss": 0.1347,
      "step": 123
    },
    {
      "epoch": 0.014470344545905418,
      "grad_norm": 0.09597185254096985,
      "learning_rate": 0.00019722092480149465,
      "loss": 0.133,
      "step": 124
    },
    {
      "epoch": 0.014587040872888526,
      "grad_norm": 0.11847853660583496,
      "learning_rate": 0.00019719757122839795,
      "loss": 0.1788,
      "step": 125
    },
    {
      "epoch": 0.014703737199871635,
      "grad_norm": 0.10845934599637985,
      "learning_rate": 0.00019717421765530127,
      "loss": 0.1509,
      "step": 126
    },
    {
      "epoch": 0.014820433526854743,
      "grad_norm": 0.09963663667440414,
      "learning_rate": 0.00019715086408220457,
      "loss": 0.1677,
      "step": 127
    },
    {
      "epoch": 0.01493712985383785,
      "grad_norm": 0.09440722316503525,
      "learning_rate": 0.0001971275105091079,
      "loss": 0.1396,
      "step": 128
    },
    {
      "epoch": 0.015053826180820958,
      "grad_norm": 0.11402937024831772,
      "learning_rate": 0.0001971041569360112,
      "loss": 0.18,
      "step": 129
    },
    {
      "epoch": 0.015170522507804067,
      "grad_norm": 0.1282823383808136,
      "learning_rate": 0.00019708080336291455,
      "loss": 0.1817,
      "step": 130
    },
    {
      "epoch": 0.015287218834787175,
      "grad_norm": 0.09704853594303131,
      "learning_rate": 0.00019705744978981785,
      "loss": 0.1502,
      "step": 131
    },
    {
      "epoch": 0.015403915161770284,
      "grad_norm": 0.09895353019237518,
      "learning_rate": 0.00019703409621672118,
      "loss": 0.1477,
      "step": 132
    },
    {
      "epoch": 0.015520611488753392,
      "grad_norm": 0.10989242792129517,
      "learning_rate": 0.00019701074264362448,
      "loss": 0.1483,
      "step": 133
    },
    {
      "epoch": 0.0156373078157365,
      "grad_norm": 0.11348774284124374,
      "learning_rate": 0.0001969873890705278,
      "loss": 0.1787,
      "step": 134
    },
    {
      "epoch": 0.01575400414271961,
      "grad_norm": 0.10849590599536896,
      "learning_rate": 0.00019696403549743113,
      "loss": 0.1564,
      "step": 135
    },
    {
      "epoch": 0.015870700469702716,
      "grad_norm": 0.10929839313030243,
      "learning_rate": 0.00019694068192433443,
      "loss": 0.1481,
      "step": 136
    },
    {
      "epoch": 0.015987396796685826,
      "grad_norm": 0.09660619497299194,
      "learning_rate": 0.00019691732835123776,
      "loss": 0.1433,
      "step": 137
    },
    {
      "epoch": 0.016104093123668933,
      "grad_norm": 0.11423259973526001,
      "learning_rate": 0.00019689397477814106,
      "loss": 0.1342,
      "step": 138
    },
    {
      "epoch": 0.01622078945065204,
      "grad_norm": 0.10947205871343613,
      "learning_rate": 0.00019687062120504439,
      "loss": 0.1825,
      "step": 139
    },
    {
      "epoch": 0.01633748577763515,
      "grad_norm": 0.11746672540903091,
      "learning_rate": 0.00019684726763194768,
      "loss": 0.1766,
      "step": 140
    },
    {
      "epoch": 0.016454182104618256,
      "grad_norm": 0.1152237206697464,
      "learning_rate": 0.000196823914058851,
      "loss": 0.161,
      "step": 141
    },
    {
      "epoch": 0.016570878431601366,
      "grad_norm": 0.10848015546798706,
      "learning_rate": 0.00019680056048575434,
      "loss": 0.1541,
      "step": 142
    },
    {
      "epoch": 0.016687574758584473,
      "grad_norm": 0.12663570046424866,
      "learning_rate": 0.00019677720691265764,
      "loss": 0.1891,
      "step": 143
    },
    {
      "epoch": 0.016804271085567583,
      "grad_norm": 0.1088947057723999,
      "learning_rate": 0.00019675385333956096,
      "loss": 0.1505,
      "step": 144
    },
    {
      "epoch": 0.01692096741255069,
      "grad_norm": 0.10418037325143814,
      "learning_rate": 0.00019673049976646426,
      "loss": 0.151,
      "step": 145
    },
    {
      "epoch": 0.017037663739533797,
      "grad_norm": 0.08672403544187546,
      "learning_rate": 0.0001967071461933676,
      "loss": 0.1213,
      "step": 146
    },
    {
      "epoch": 0.017154360066516907,
      "grad_norm": 0.10863472521305084,
      "learning_rate": 0.0001966837926202709,
      "loss": 0.1552,
      "step": 147
    },
    {
      "epoch": 0.017271056393500014,
      "grad_norm": 0.10580800473690033,
      "learning_rate": 0.00019666043904717422,
      "loss": 0.1589,
      "step": 148
    },
    {
      "epoch": 0.017387752720483124,
      "grad_norm": 0.09545203298330307,
      "learning_rate": 0.00019663708547407754,
      "loss": 0.139,
      "step": 149
    },
    {
      "epoch": 0.01750444904746623,
      "grad_norm": 0.14016349613666534,
      "learning_rate": 0.00019661373190098087,
      "loss": 0.2126,
      "step": 150
    },
    {
      "epoch": 0.01762114537444934,
      "grad_norm": 0.11033914983272552,
      "learning_rate": 0.00019659037832788417,
      "loss": 0.1644,
      "step": 151
    },
    {
      "epoch": 0.017737841701432448,
      "grad_norm": 0.10455331206321716,
      "learning_rate": 0.0001965670247547875,
      "loss": 0.1499,
      "step": 152
    },
    {
      "epoch": 0.017854538028415554,
      "grad_norm": 0.11884409189224243,
      "learning_rate": 0.00019654367118169082,
      "loss": 0.1731,
      "step": 153
    },
    {
      "epoch": 0.017971234355398664,
      "grad_norm": 0.11076351255178452,
      "learning_rate": 0.00019652031760859412,
      "loss": 0.1782,
      "step": 154
    },
    {
      "epoch": 0.01808793068238177,
      "grad_norm": 0.11540203541517258,
      "learning_rate": 0.00019649696403549745,
      "loss": 0.1731,
      "step": 155
    },
    {
      "epoch": 0.01820462700936488,
      "grad_norm": 0.09334211051464081,
      "learning_rate": 0.00019647361046240075,
      "loss": 0.1576,
      "step": 156
    },
    {
      "epoch": 0.018321323336347988,
      "grad_norm": 0.11943213641643524,
      "learning_rate": 0.00019645025688930408,
      "loss": 0.1715,
      "step": 157
    },
    {
      "epoch": 0.0184380196633311,
      "grad_norm": 0.08858149498701096,
      "learning_rate": 0.00019642690331620738,
      "loss": 0.1282,
      "step": 158
    },
    {
      "epoch": 0.018554715990314205,
      "grad_norm": 0.10284683853387833,
      "learning_rate": 0.0001964035497431107,
      "loss": 0.1727,
      "step": 159
    },
    {
      "epoch": 0.018671412317297312,
      "grad_norm": 0.10812927782535553,
      "learning_rate": 0.00019638019617001403,
      "loss": 0.1741,
      "step": 160
    },
    {
      "epoch": 0.018788108644280422,
      "grad_norm": 0.08129740506410599,
      "learning_rate": 0.00019635684259691733,
      "loss": 0.107,
      "step": 161
    },
    {
      "epoch": 0.01890480497126353,
      "grad_norm": 0.11642193049192429,
      "learning_rate": 0.00019633348902382066,
      "loss": 0.1768,
      "step": 162
    },
    {
      "epoch": 0.01902150129824664,
      "grad_norm": 0.10036379098892212,
      "learning_rate": 0.00019631013545072396,
      "loss": 0.1458,
      "step": 163
    },
    {
      "epoch": 0.019138197625229746,
      "grad_norm": 0.0957493856549263,
      "learning_rate": 0.00019628678187762728,
      "loss": 0.143,
      "step": 164
    },
    {
      "epoch": 0.019254893952212856,
      "grad_norm": 0.12155473232269287,
      "learning_rate": 0.00019626342830453058,
      "loss": 0.1807,
      "step": 165
    },
    {
      "epoch": 0.019371590279195963,
      "grad_norm": 0.11061038821935654,
      "learning_rate": 0.0001962400747314339,
      "loss": 0.1699,
      "step": 166
    },
    {
      "epoch": 0.01948828660617907,
      "grad_norm": 0.10963009297847748,
      "learning_rate": 0.00019621672115833724,
      "loss": 0.1638,
      "step": 167
    },
    {
      "epoch": 0.01960498293316218,
      "grad_norm": 0.11229882389307022,
      "learning_rate": 0.00019619336758524056,
      "loss": 0.1788,
      "step": 168
    },
    {
      "epoch": 0.019721679260145286,
      "grad_norm": 0.10003431886434555,
      "learning_rate": 0.0001961700140121439,
      "loss": 0.1547,
      "step": 169
    },
    {
      "epoch": 0.019838375587128396,
      "grad_norm": 0.09698428958654404,
      "learning_rate": 0.0001961466604390472,
      "loss": 0.1193,
      "step": 170
    },
    {
      "epoch": 0.019955071914111503,
      "grad_norm": 0.10166200250387192,
      "learning_rate": 0.00019612330686595052,
      "loss": 0.1511,
      "step": 171
    },
    {
      "epoch": 0.020071768241094613,
      "grad_norm": 0.10210378468036652,
      "learning_rate": 0.00019609995329285382,
      "loss": 0.1563,
      "step": 172
    },
    {
      "epoch": 0.02018846456807772,
      "grad_norm": 0.09979470074176788,
      "learning_rate": 0.00019607659971975714,
      "loss": 0.1317,
      "step": 173
    },
    {
      "epoch": 0.020305160895060827,
      "grad_norm": 0.12395334988832474,
      "learning_rate": 0.00019605324614666044,
      "loss": 0.1791,
      "step": 174
    },
    {
      "epoch": 0.020421857222043937,
      "grad_norm": 0.11577446013689041,
      "learning_rate": 0.00019602989257356377,
      "loss": 0.1681,
      "step": 175
    },
    {
      "epoch": 0.020538553549027044,
      "grad_norm": 0.10168549418449402,
      "learning_rate": 0.00019600653900046707,
      "loss": 0.1404,
      "step": 176
    },
    {
      "epoch": 0.020655249876010154,
      "grad_norm": 0.1242406889796257,
      "learning_rate": 0.0001959831854273704,
      "loss": 0.1673,
      "step": 177
    },
    {
      "epoch": 0.02077194620299326,
      "grad_norm": 0.09891916811466217,
      "learning_rate": 0.00019595983185427372,
      "loss": 0.1298,
      "step": 178
    },
    {
      "epoch": 0.020888642529976367,
      "grad_norm": 0.15590757131576538,
      "learning_rate": 0.00019593647828117702,
      "loss": 0.1705,
      "step": 179
    },
    {
      "epoch": 0.021005338856959477,
      "grad_norm": 0.08277418464422226,
      "learning_rate": 0.00019591312470808035,
      "loss": 0.1104,
      "step": 180
    },
    {
      "epoch": 0.021122035183942584,
      "grad_norm": 0.10521771758794785,
      "learning_rate": 0.00019588977113498365,
      "loss": 0.1442,
      "step": 181
    },
    {
      "epoch": 0.021238731510925694,
      "grad_norm": 0.10389945656061172,
      "learning_rate": 0.00019586641756188698,
      "loss": 0.1448,
      "step": 182
    },
    {
      "epoch": 0.0213554278379088,
      "grad_norm": 0.11086277663707733,
      "learning_rate": 0.00019584306398879027,
      "loss": 0.1555,
      "step": 183
    },
    {
      "epoch": 0.02147212416489191,
      "grad_norm": 0.1056290939450264,
      "learning_rate": 0.0001958197104156936,
      "loss": 0.161,
      "step": 184
    },
    {
      "epoch": 0.021588820491875018,
      "grad_norm": 0.11331475526094437,
      "learning_rate": 0.00019579635684259693,
      "loss": 0.1236,
      "step": 185
    },
    {
      "epoch": 0.021705516818858125,
      "grad_norm": 0.09584867209196091,
      "learning_rate": 0.00019577300326950023,
      "loss": 0.1282,
      "step": 186
    },
    {
      "epoch": 0.021822213145841235,
      "grad_norm": 0.11787907034158707,
      "learning_rate": 0.00019574964969640358,
      "loss": 0.1843,
      "step": 187
    },
    {
      "epoch": 0.02193890947282434,
      "grad_norm": 0.10822898149490356,
      "learning_rate": 0.00019572629612330688,
      "loss": 0.1638,
      "step": 188
    },
    {
      "epoch": 0.022055605799807452,
      "grad_norm": 0.25267189741134644,
      "learning_rate": 0.0001957029425502102,
      "loss": 0.1718,
      "step": 189
    },
    {
      "epoch": 0.02217230212679056,
      "grad_norm": 0.1412399560213089,
      "learning_rate": 0.0001956795889771135,
      "loss": 0.1803,
      "step": 190
    },
    {
      "epoch": 0.02228899845377367,
      "grad_norm": 0.11052538454532623,
      "learning_rate": 0.00019565623540401683,
      "loss": 0.1638,
      "step": 191
    },
    {
      "epoch": 0.022405694780756776,
      "grad_norm": 0.11876215785741806,
      "learning_rate": 0.00019563288183092013,
      "loss": 0.2179,
      "step": 192
    },
    {
      "epoch": 0.022522391107739882,
      "grad_norm": 0.10292577743530273,
      "learning_rate": 0.00019560952825782346,
      "loss": 0.1568,
      "step": 193
    },
    {
      "epoch": 0.022639087434722992,
      "grad_norm": 0.10467737168073654,
      "learning_rate": 0.0001955861746847268,
      "loss": 0.156,
      "step": 194
    },
    {
      "epoch": 0.0227557837617061,
      "grad_norm": 0.116755910217762,
      "learning_rate": 0.0001955628211116301,
      "loss": 0.1665,
      "step": 195
    },
    {
      "epoch": 0.02287248008868921,
      "grad_norm": 0.09144476056098938,
      "learning_rate": 0.00019553946753853341,
      "loss": 0.1356,
      "step": 196
    },
    {
      "epoch": 0.022989176415672316,
      "grad_norm": 0.09073708951473236,
      "learning_rate": 0.00019551611396543671,
      "loss": 0.14,
      "step": 197
    },
    {
      "epoch": 0.023105872742655426,
      "grad_norm": 0.10096915066242218,
      "learning_rate": 0.00019549276039234004,
      "loss": 0.1595,
      "step": 198
    },
    {
      "epoch": 0.023222569069638533,
      "grad_norm": 0.10241077095270157,
      "learning_rate": 0.00019546940681924334,
      "loss": 0.1479,
      "step": 199
    },
    {
      "epoch": 0.02333926539662164,
      "grad_norm": 0.10773292183876038,
      "learning_rate": 0.00019544605324614667,
      "loss": 0.1548,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 8569,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.4879936776325523e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}