{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8396305625524769, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0041981528127623844, "grad_norm": 5.267488956451416, "learning_rate": 9e-06, "loss": 1.0193, "step": 10 }, { "epoch": 0.008396305625524769, "grad_norm": 2.3370044231414795, "learning_rate": 1.9e-05, "loss": 0.3362, "step": 20 }, { "epoch": 0.012594458438287154, "grad_norm": 2.1865203380584717, "learning_rate": 2.9e-05, "loss": 0.2021, "step": 30 }, { "epoch": 0.016792611251049538, "grad_norm": 1.625726342201233, "learning_rate": 3.9000000000000006e-05, "loss": 0.1466, "step": 40 }, { "epoch": 0.020990764063811923, "grad_norm": 1.1913143396377563, "learning_rate": 4.9e-05, "loss": 0.1184, "step": 50 }, { "epoch": 0.02518891687657431, "grad_norm": 1.6674305200576782, "learning_rate": 5.9e-05, "loss": 0.1028, "step": 60 }, { "epoch": 0.02938706968933669, "grad_norm": 0.8989318013191223, "learning_rate": 6.9e-05, "loss": 0.0836, "step": 70 }, { "epoch": 0.033585222502099076, "grad_norm": 0.8966989517211914, "learning_rate": 7.900000000000001e-05, "loss": 0.0851, "step": 80 }, { "epoch": 0.037783375314861464, "grad_norm": 0.7644183039665222, "learning_rate": 8.900000000000001e-05, "loss": 0.0717, "step": 90 }, { "epoch": 0.041981528127623846, "grad_norm": 1.272506833076477, "learning_rate": 9.900000000000001e-05, "loss": 0.0774, "step": 100 }, { "epoch": 0.04617968094038623, "grad_norm": 1.4885015487670898, "learning_rate": 9.999446382823013e-05, "loss": 0.0712, "step": 110 }, { "epoch": 0.05037783375314862, "grad_norm": 1.201401710510254, "learning_rate": 9.997532801828658e-05, "loss": 0.072, "step": 120 }, { "epoch": 0.054575986565911, "grad_norm": 1.367532730102539, "learning_rate": 9.99425294526634e-05, "loss": 0.0682, "step": 130 }, { "epoch": 0.05877413937867338, "grad_norm": 0.9127843976020813, "learning_rate": 9.989607709816091e-05, "loss": 0.0659, "step": 140 }, { "epoch": 0.06297229219143577, "grad_norm": 0.7713141441345215, "learning_rate": 9.983598365438902e-05, "loss": 0.0528, "step": 150 }, { "epoch": 0.06717044500419815, "grad_norm": 0.9186705946922302, "learning_rate": 9.976226555029522e-05, "loss": 0.0495, "step": 160 }, { "epoch": 0.07136859781696053, "grad_norm": 0.5967724919319153, "learning_rate": 9.967494293967312e-05, "loss": 0.0522, "step": 170 }, { "epoch": 0.07556675062972293, "grad_norm": 1.0857634544372559, "learning_rate": 9.95740396956525e-05, "loss": 0.0578, "step": 180 }, { "epoch": 0.07976490344248531, "grad_norm": 0.5992318987846375, "learning_rate": 9.945958340417283e-05, "loss": 0.0478, "step": 190 }, { "epoch": 0.08396305625524769, "grad_norm": 0.5689753293991089, "learning_rate": 9.93316053564413e-05, "loss": 0.0478, "step": 200 }, { "epoch": 0.08816120906801007, "grad_norm": 0.9272230863571167, "learning_rate": 9.919014054037836e-05, "loss": 0.0527, "step": 210 }, { "epoch": 0.09235936188077246, "grad_norm": 1.0262973308563232, "learning_rate": 9.903522763105218e-05, "loss": 0.051, "step": 220 }, { "epoch": 0.09655751469353484, "grad_norm": 0.6636225581169128, "learning_rate": 9.886690898010535e-05, "loss": 0.0429, "step": 230 }, { "epoch": 0.10075566750629723, "grad_norm": 0.6513682007789612, "learning_rate": 9.868523060417646e-05, "loss": 0.0443, "step": 240 }, { "epoch": 0.10495382031905962, "grad_norm": 0.7679399251937866, "learning_rate": 9.849024217231935e-05, "loss": 0.0406, "step": 250 }, { "epoch": 0.109151973131822, "grad_norm": 0.5520310401916504, "learning_rate": 9.82819969924244e-05, "loss": 0.0345, "step": 260 }, { "epoch": 0.11335012594458438, "grad_norm": 0.7136860489845276, "learning_rate": 9.806055199664446e-05, "loss": 0.0487, "step": 270 }, { "epoch": 0.11754827875734676, "grad_norm": 0.5643390417098999, "learning_rate": 9.782596772583026e-05, "loss": 0.0419, "step": 280 }, { "epoch": 0.12174643157010916, "grad_norm": 0.6041691899299622, "learning_rate": 9.757830831297914e-05, "loss": 0.0479, "step": 290 }, { "epoch": 0.12594458438287154, "grad_norm": 0.5313841700553894, "learning_rate": 9.731764146570173e-05, "loss": 0.037, "step": 300 }, { "epoch": 0.13014273719563393, "grad_norm": 0.4854077100753784, "learning_rate": 9.704403844771128e-05, "loss": 0.0368, "step": 310 }, { "epoch": 0.1343408900083963, "grad_norm": 0.44807952642440796, "learning_rate": 9.675757405934103e-05, "loss": 0.0363, "step": 320 }, { "epoch": 0.1385390428211587, "grad_norm": 0.51119464635849, "learning_rate": 9.645832661709444e-05, "loss": 0.0308, "step": 330 }, { "epoch": 0.14273719563392107, "grad_norm": 0.5140641927719116, "learning_rate": 9.614637793223425e-05, "loss": 0.0316, "step": 340 }, { "epoch": 0.14693534844668346, "grad_norm": 0.5101410150527954, "learning_rate": 9.582181328841611e-05, "loss": 0.0307, "step": 350 }, { "epoch": 0.15113350125944586, "grad_norm": 0.6274232864379883, "learning_rate": 9.548472141837286e-05, "loss": 0.0324, "step": 360 }, { "epoch": 0.15533165407220823, "grad_norm": 0.6747106313705444, "learning_rate": 9.513519447965595e-05, "loss": 0.0353, "step": 370 }, { "epoch": 0.15952980688497062, "grad_norm": 0.38412219285964966, "learning_rate": 9.477332802944044e-05, "loss": 0.0311, "step": 380 }, { "epoch": 0.163727959697733, "grad_norm": 0.3586186468601227, "learning_rate": 9.439922099840054e-05, "loss": 0.032, "step": 390 }, { "epoch": 0.16792611251049538, "grad_norm": 0.38126131892204285, "learning_rate": 9.401297566366318e-05, "loss": 0.0266, "step": 400 }, { "epoch": 0.17212426532325778, "grad_norm": 0.3684011995792389, "learning_rate": 9.36146976208462e-05, "loss": 0.0375, "step": 410 }, { "epoch": 0.17632241813602015, "grad_norm": 0.5006753206253052, "learning_rate": 9.320449575518972e-05, "loss": 0.028, "step": 420 }, { "epoch": 0.18052057094878254, "grad_norm": 0.5954798460006714, "learning_rate": 9.278248221178798e-05, "loss": 0.0329, "step": 430 }, { "epoch": 0.1847187237615449, "grad_norm": 0.745754063129425, "learning_rate": 9.234877236492997e-05, "loss": 0.0339, "step": 440 }, { "epoch": 0.1889168765743073, "grad_norm": 0.6667306423187256, "learning_rate": 9.190348478655724e-05, "loss": 0.0316, "step": 450 }, { "epoch": 0.19311502938706968, "grad_norm": 0.7456135153770447, "learning_rate": 9.144674121384757e-05, "loss": 0.0394, "step": 460 }, { "epoch": 0.19731318219983207, "grad_norm": 0.4713476002216339, "learning_rate": 9.097866651593317e-05, "loss": 0.031, "step": 470 }, { "epoch": 0.20151133501259447, "grad_norm": 0.9486730098724365, "learning_rate": 9.049938865976275e-05, "loss": 0.0335, "step": 480 }, { "epoch": 0.20570948782535683, "grad_norm": 0.8080450296401978, "learning_rate": 9.000903867511666e-05, "loss": 0.0326, "step": 490 }, { "epoch": 0.20990764063811923, "grad_norm": 0.6074867844581604, "learning_rate": 8.950775061878453e-05, "loss": 0.0326, "step": 500 }, { "epoch": 0.2141057934508816, "grad_norm": 0.4722858965396881, "learning_rate": 8.899566153791566e-05, "loss": 0.0362, "step": 510 }, { "epoch": 0.218303946263644, "grad_norm": 0.3163154721260071, "learning_rate": 8.84729114325516e-05, "loss": 0.0312, "step": 520 }, { "epoch": 0.2225020990764064, "grad_norm": 0.45854002237319946, "learning_rate": 8.79396432173515e-05, "loss": 0.0319, "step": 530 }, { "epoch": 0.22670025188916876, "grad_norm": 0.5331682562828064, "learning_rate": 8.739600268252078e-05, "loss": 0.0302, "step": 540 }, { "epoch": 0.23089840470193115, "grad_norm": 0.44824129343032837, "learning_rate": 8.684213845395339e-05, "loss": 0.0256, "step": 550 }, { "epoch": 0.23509655751469352, "grad_norm": 0.38797256350517273, "learning_rate": 8.627820195259918e-05, "loss": 0.024, "step": 560 }, { "epoch": 0.23929471032745592, "grad_norm": 0.44604647159576416, "learning_rate": 8.570434735306671e-05, "loss": 0.0265, "step": 570 }, { "epoch": 0.2434928631402183, "grad_norm": 0.46292269229888916, "learning_rate": 8.512073154147362e-05, "loss": 0.0238, "step": 580 }, { "epoch": 0.24769101595298068, "grad_norm": 0.6517325043678284, "learning_rate": 8.452751407255541e-05, "loss": 0.0283, "step": 590 }, { "epoch": 0.2518891687657431, "grad_norm": 0.3638853430747986, "learning_rate": 8.392485712604483e-05, "loss": 0.0269, "step": 600 }, { "epoch": 0.25608732157850544, "grad_norm": 0.3911242187023163, "learning_rate": 8.331292546233362e-05, "loss": 0.0298, "step": 610 }, { "epoch": 0.26028547439126787, "grad_norm": 0.4468225836753845, "learning_rate": 8.269188637742846e-05, "loss": 0.0235, "step": 620 }, { "epoch": 0.26448362720403024, "grad_norm": 0.4399245083332062, "learning_rate": 8.206190965721419e-05, "loss": 0.0217, "step": 630 }, { "epoch": 0.2686817800167926, "grad_norm": 0.34588077664375305, "learning_rate": 8.14231675310358e-05, "loss": 0.0212, "step": 640 }, { "epoch": 0.27287993282955497, "grad_norm": 0.3927046060562134, "learning_rate": 8.077583462461283e-05, "loss": 0.0228, "step": 650 }, { "epoch": 0.2770780856423174, "grad_norm": 0.42579185962677, "learning_rate": 8.012008791229826e-05, "loss": 0.0239, "step": 660 }, { "epoch": 0.28127623845507976, "grad_norm": 0.37947288155555725, "learning_rate": 7.945610666869568e-05, "loss": 0.0302, "step": 670 }, { "epoch": 0.28547439126784213, "grad_norm": 0.46608966588974, "learning_rate": 7.878407241964729e-05, "loss": 0.0269, "step": 680 }, { "epoch": 0.28967254408060455, "grad_norm": 0.4380934536457062, "learning_rate": 7.810416889260653e-05, "loss": 0.0259, "step": 690 }, { "epoch": 0.2938706968933669, "grad_norm": 0.4817158281803131, "learning_rate": 7.741658196640892e-05, "loss": 0.0246, "step": 700 }, { "epoch": 0.2980688497061293, "grad_norm": 0.3391236960887909, "learning_rate": 7.672149962045457e-05, "loss": 0.0285, "step": 710 }, { "epoch": 0.3022670025188917, "grad_norm": 0.5921253561973572, "learning_rate": 7.60191118833165e-05, "loss": 0.0192, "step": 720 }, { "epoch": 0.3064651553316541, "grad_norm": 0.3677942156791687, "learning_rate": 7.530961078078873e-05, "loss": 0.0259, "step": 730 }, { "epoch": 0.31066330814441645, "grad_norm": 0.4081045985221863, "learning_rate": 7.45931902833884e-05, "loss": 0.025, "step": 740 }, { "epoch": 0.3148614609571788, "grad_norm": 0.4675191342830658, "learning_rate": 7.387004625332608e-05, "loss": 0.0223, "step": 750 }, { "epoch": 0.31905961376994124, "grad_norm": 0.40352970361709595, "learning_rate": 7.3140376390959e-05, "loss": 0.0177, "step": 760 }, { "epoch": 0.3232577665827036, "grad_norm": 0.4271659851074219, "learning_rate": 7.240438018074189e-05, "loss": 0.0187, "step": 770 }, { "epoch": 0.327455919395466, "grad_norm": 0.4077198803424835, "learning_rate": 7.166225883668969e-05, "loss": 0.0198, "step": 780 }, { "epoch": 0.3316540722082284, "grad_norm": 0.24861399829387665, "learning_rate": 7.091421524736784e-05, "loss": 0.0182, "step": 790 }, { "epoch": 0.33585222502099077, "grad_norm": 0.3340529501438141, "learning_rate": 7.016045392042452e-05, "loss": 0.0195, "step": 800 }, { "epoch": 0.34005037783375314, "grad_norm": 0.30124542117118835, "learning_rate": 6.940118092668022e-05, "loss": 0.0202, "step": 810 }, { "epoch": 0.34424853064651556, "grad_norm": 0.28010880947113037, "learning_rate": 6.863660384379017e-05, "loss": 0.0192, "step": 820 }, { "epoch": 0.34844668345927793, "grad_norm": 0.34497565031051636, "learning_rate": 6.786693169949455e-05, "loss": 0.0188, "step": 830 }, { "epoch": 0.3526448362720403, "grad_norm": 0.38020557165145874, "learning_rate": 6.709237491447249e-05, "loss": 0.0159, "step": 840 }, { "epoch": 0.35684298908480266, "grad_norm": 0.677865207195282, "learning_rate": 6.631314524481513e-05, "loss": 0.0214, "step": 850 }, { "epoch": 0.3610411418975651, "grad_norm": 0.741203784942627, "learning_rate": 6.552945572413358e-05, "loss": 0.0203, "step": 860 }, { "epoch": 0.36523929471032746, "grad_norm": 0.35501229763031006, "learning_rate": 6.474152060531768e-05, "loss": 0.0193, "step": 870 }, { "epoch": 0.3694374475230898, "grad_norm": 0.3309725522994995, "learning_rate": 6.394955530196147e-05, "loss": 0.0152, "step": 880 }, { "epoch": 0.37363560033585225, "grad_norm": 0.36004316806793213, "learning_rate": 6.315377632947115e-05, "loss": 0.0151, "step": 890 }, { "epoch": 0.3778337531486146, "grad_norm": 0.35332006216049194, "learning_rate": 6.235440124587198e-05, "loss": 0.0174, "step": 900 }, { "epoch": 0.382031905961377, "grad_norm": 0.3157385587692261, "learning_rate": 6.155164859233012e-05, "loss": 0.0125, "step": 910 }, { "epoch": 0.38623005877413935, "grad_norm": 0.2917327284812927, "learning_rate": 6.074573783340562e-05, "loss": 0.0185, "step": 920 }, { "epoch": 0.3904282115869018, "grad_norm": 0.20623797178268433, "learning_rate": 5.9936889297052986e-05, "loss": 0.0141, "step": 930 }, { "epoch": 0.39462636439966414, "grad_norm": 0.30338916182518005, "learning_rate": 5.912532411438576e-05, "loss": 0.0124, "step": 940 }, { "epoch": 0.3988245172124265, "grad_norm": 0.46562105417251587, "learning_rate": 5.831126415922148e-05, "loss": 0.0187, "step": 950 }, { "epoch": 0.40302267002518893, "grad_norm": 0.303715318441391, "learning_rate": 5.74949319874235e-05, "loss": 0.013, "step": 960 }, { "epoch": 0.4072208228379513, "grad_norm": 0.33936807513237, "learning_rate": 5.667655077605659e-05, "loss": 0.0147, "step": 970 }, { "epoch": 0.41141897565071367, "grad_norm": 0.26057168841362, "learning_rate": 5.585634426237246e-05, "loss": 0.0147, "step": 980 }, { "epoch": 0.4156171284634761, "grad_norm": 0.30878400802612305, "learning_rate": 5.5034536682642224e-05, "loss": 0.0208, "step": 990 }, { "epoch": 0.41981528127623846, "grad_norm": 0.32619133591651917, "learning_rate": 5.4211352710852495e-05, "loss": 0.0147, "step": 1000 }, { "epoch": 0.42401343408900083, "grad_norm": 0.4049901068210602, "learning_rate": 5.3387017397281704e-05, "loss": 0.0136, "step": 1010 }, { "epoch": 0.4282115869017632, "grad_norm": 0.263507217168808, "learning_rate": 5.2561756106973656e-05, "loss": 0.0142, "step": 1020 }, { "epoch": 0.4324097397145256, "grad_norm": 0.3729311525821686, "learning_rate": 5.1735794458124956e-05, "loss": 0.0143, "step": 1030 }, { "epoch": 0.436607892527288, "grad_norm": 0.433525413274765, "learning_rate": 5.0909358260403186e-05, "loss": 0.0159, "step": 1040 }, { "epoch": 0.44080604534005036, "grad_norm": 0.20972338318824768, "learning_rate": 5.0082673453212914e-05, "loss": 0.0125, "step": 1050 }, { "epoch": 0.4450041981528128, "grad_norm": 0.44978395104408264, "learning_rate": 4.925596604392603e-05, "loss": 0.0201, "step": 1060 }, { "epoch": 0.44920235096557515, "grad_norm": 0.33291032910346985, "learning_rate": 4.8429462046093585e-05, "loss": 0.0145, "step": 1070 }, { "epoch": 0.4534005037783375, "grad_norm": 0.3551059663295746, "learning_rate": 4.7603387417656026e-05, "loss": 0.0164, "step": 1080 }, { "epoch": 0.45759865659109994, "grad_norm": 0.2902749478816986, "learning_rate": 4.677796799916845e-05, "loss": 0.0088, "step": 1090 }, { "epoch": 0.4617968094038623, "grad_norm": 0.4253701865673065, "learning_rate": 4.5953429452058135e-05, "loss": 0.0149, "step": 1100 }, { "epoch": 0.4659949622166247, "grad_norm": 0.25408294796943665, "learning_rate": 4.5129997196930845e-05, "loss": 0.014, "step": 1110 }, { "epoch": 0.47019311502938704, "grad_norm": 0.3801613450050354, "learning_rate": 4.430789635194324e-05, "loss": 0.0135, "step": 1120 }, { "epoch": 0.47439126784214947, "grad_norm": 0.3434455990791321, "learning_rate": 4.348735167125771e-05, "loss": 0.0175, "step": 1130 }, { "epoch": 0.47858942065491183, "grad_norm": 0.38031941652297974, "learning_rate": 4.2668587483596864e-05, "loss": 0.0131, "step": 1140 }, { "epoch": 0.4827875734676742, "grad_norm": 0.2573353350162506, "learning_rate": 4.1851827630914305e-05, "loss": 0.0137, "step": 1150 }, { "epoch": 0.4869857262804366, "grad_norm": 0.31360694766044617, "learning_rate": 4.103729540719847e-05, "loss": 0.0146, "step": 1160 }, { "epoch": 0.491183879093199, "grad_norm": 0.32134002447128296, "learning_rate": 4.0225213497426276e-05, "loss": 0.0133, "step": 1170 }, { "epoch": 0.49538203190596136, "grad_norm": 0.31462499499320984, "learning_rate": 3.9415803916683224e-05, "loss": 0.0124, "step": 1180 }, { "epoch": 0.4995801847187238, "grad_norm": 0.3015807569026947, "learning_rate": 3.860928794946682e-05, "loss": 0.0147, "step": 1190 }, { "epoch": 0.5037783375314862, "grad_norm": 0.26400843262672424, "learning_rate": 3.780588608918947e-05, "loss": 0.0125, "step": 1200 }, { "epoch": 0.5079764903442485, "grad_norm": 0.27983877062797546, "learning_rate": 3.700581797789786e-05, "loss": 0.0139, "step": 1210 }, { "epoch": 0.5121746431570109, "grad_norm": 0.34057655930519104, "learning_rate": 3.6209302346225006e-05, "loss": 0.0125, "step": 1220 }, { "epoch": 0.5163727959697733, "grad_norm": 0.3145323097705841, "learning_rate": 3.541655695359142e-05, "loss": 0.0141, "step": 1230 }, { "epoch": 0.5205709487825357, "grad_norm": 0.2706829905509949, "learning_rate": 3.462779852867197e-05, "loss": 0.0112, "step": 1240 }, { "epoch": 0.5247691015952981, "grad_norm": 0.3848113417625427, "learning_rate": 3.384324271014429e-05, "loss": 0.0147, "step": 1250 }, { "epoch": 0.5289672544080605, "grad_norm": 0.32747682929039, "learning_rate": 3.3063103987735433e-05, "loss": 0.0114, "step": 1260 }, { "epoch": 0.5331654072208228, "grad_norm": 0.2991146445274353, "learning_rate": 3.228759564358248e-05, "loss": 0.0141, "step": 1270 }, { "epoch": 0.5373635600335852, "grad_norm": 0.3067378103733063, "learning_rate": 3.1516929693923315e-05, "loss": 0.0129, "step": 1280 }, { "epoch": 0.5415617128463476, "grad_norm": 0.15431927144527435, "learning_rate": 3.075131683113352e-05, "loss": 0.013, "step": 1290 }, { "epoch": 0.5457598656591099, "grad_norm": 0.1479986160993576, "learning_rate": 2.999096636612518e-05, "loss": 0.0121, "step": 1300 }, { "epoch": 0.5499580184718724, "grad_norm": 0.1798468679189682, "learning_rate": 2.9236086171123404e-05, "loss": 0.0107, "step": 1310 }, { "epoch": 0.5541561712846348, "grad_norm": 0.21686813235282898, "learning_rate": 2.8486882622836026e-05, "loss": 0.0124, "step": 1320 }, { "epoch": 0.5583543240973972, "grad_norm": 0.40006551146507263, "learning_rate": 2.774356054603243e-05, "loss": 0.0103, "step": 1330 }, { "epoch": 0.5625524769101595, "grad_norm": 0.39905092120170593, "learning_rate": 2.7006323157546386e-05, "loss": 0.0146, "step": 1340 }, { "epoch": 0.5667506297229219, "grad_norm": 0.156532883644104, "learning_rate": 2.6275372010718635e-05, "loss": 0.0083, "step": 1350 }, { "epoch": 0.5709487825356843, "grad_norm": 0.418869286775589, "learning_rate": 2.555090694029421e-05, "loss": 0.0112, "step": 1360 }, { "epoch": 0.5751469353484466, "grad_norm": 0.3597627878189087, "learning_rate": 2.4833126007789653e-05, "loss": 0.0071, "step": 1370 }, { "epoch": 0.5793450881612091, "grad_norm": 0.3350473940372467, "learning_rate": 2.4122225447344875e-05, "loss": 0.0094, "step": 1380 }, { "epoch": 0.5835432409739715, "grad_norm": 0.2572272717952728, "learning_rate": 2.341839961207482e-05, "loss": 0.0086, "step": 1390 }, { "epoch": 0.5877413937867338, "grad_norm": 0.44453227519989014, "learning_rate": 2.2721840920935196e-05, "loss": 0.0104, "step": 1400 }, { "epoch": 0.5919395465994962, "grad_norm": 0.23089422285556793, "learning_rate": 2.2032739806117058e-05, "loss": 0.0075, "step": 1410 }, { "epoch": 0.5961376994122586, "grad_norm": 0.2570338547229767, "learning_rate": 2.1351284660984572e-05, "loss": 0.0128, "step": 1420 }, { "epoch": 0.600335852225021, "grad_norm": 0.4198639690876007, "learning_rate": 2.067766178857013e-05, "loss": 0.0133, "step": 1430 }, { "epoch": 0.6045340050377834, "grad_norm": 0.4144093990325928, "learning_rate": 2.0012055350640986e-05, "loss": 0.0095, "step": 1440 }, { "epoch": 0.6087321578505458, "grad_norm": 0.25941231846809387, "learning_rate": 1.9354647317351188e-05, "loss": 0.0105, "step": 1450 }, { "epoch": 0.6129303106633082, "grad_norm": 0.25724703073501587, "learning_rate": 1.8705617417492883e-05, "loss": 0.0096, "step": 1460 }, { "epoch": 0.6171284634760705, "grad_norm": 0.2537188231945038, "learning_rate": 1.8065143089360172e-05, "loss": 0.0094, "step": 1470 }, { "epoch": 0.6213266162888329, "grad_norm": 0.25936058163642883, "learning_rate": 1.743339943223926e-05, "loss": 0.0132, "step": 1480 }, { "epoch": 0.6255247691015953, "grad_norm": 0.27446359395980835, "learning_rate": 1.6810559158538092e-05, "loss": 0.0084, "step": 1490 }, { "epoch": 0.6297229219143576, "grad_norm": 0.27379536628723145, "learning_rate": 1.6196792546568472e-05, "loss": 0.0126, "step": 1500 }, { "epoch": 0.6339210747271201, "grad_norm": 0.16973455250263214, "learning_rate": 1.5592267393993716e-05, "loss": 0.0082, "step": 1510 }, { "epoch": 0.6381192275398825, "grad_norm": 0.19682660698890686, "learning_rate": 1.4997148971954344e-05, "loss": 0.0119, "step": 1520 }, { "epoch": 0.6423173803526449, "grad_norm": 0.16340869665145874, "learning_rate": 1.4411599979884744e-05, "loss": 0.0101, "step": 1530 }, { "epoch": 0.6465155331654072, "grad_norm": 0.27933576703071594, "learning_rate": 1.383578050103268e-05, "loss": 0.0117, "step": 1540 }, { "epoch": 0.6507136859781696, "grad_norm": 0.10343077033758163, "learning_rate": 1.3269847958694148e-05, "loss": 0.0101, "step": 1550 }, { "epoch": 0.654911838790932, "grad_norm": 0.15868137776851654, "learning_rate": 1.2713957073175425e-05, "loss": 0.0084, "step": 1560 }, { "epoch": 0.6591099916036943, "grad_norm": 0.22004792094230652, "learning_rate": 1.2168259819494066e-05, "loss": 0.0087, "step": 1570 }, { "epoch": 0.6633081444164568, "grad_norm": 0.11181829124689102, "learning_rate": 1.1632905385830484e-05, "loss": 0.0098, "step": 1580 }, { "epoch": 0.6675062972292192, "grad_norm": 0.1479823738336563, "learning_rate": 1.1108040132741354e-05, "loss": 0.0093, "step": 1590 }, { "epoch": 0.6717044500419815, "grad_norm": 0.2433646023273468, "learning_rate": 1.059380755314613e-05, "loss": 0.0088, "step": 1600 }, { "epoch": 0.6759026028547439, "grad_norm": 0.212657168507576, "learning_rate": 1.009034823309749e-05, "loss": 0.0061, "step": 1610 }, { "epoch": 0.6801007556675063, "grad_norm": 0.10653030127286911, "learning_rate": 9.597799813346525e-06, "loss": 0.0068, "step": 1620 }, { "epoch": 0.6842989084802686, "grad_norm": 0.10436981171369553, "learning_rate": 9.116296951713133e-06, "loss": 0.0064, "step": 1630 }, { "epoch": 0.6884970612930311, "grad_norm": 0.28674691915512085, "learning_rate": 8.645971286271904e-06, "loss": 0.0083, "step": 1640 }, { "epoch": 0.6926952141057935, "grad_norm": 0.2074134796857834, "learning_rate": 8.186951399363613e-06, "loss": 0.0103, "step": 1650 }, { "epoch": 0.6968933669185559, "grad_norm": 0.160618856549263, "learning_rate": 7.739362782442021e-06, "loss": 0.0051, "step": 1660 }, { "epoch": 0.7010915197313182, "grad_norm": 0.2147001475095749, "learning_rate": 7.30332780176588e-06, "loss": 0.0085, "step": 1670 }, { "epoch": 0.7052896725440806, "grad_norm": 0.27866339683532715, "learning_rate": 6.878965664945108e-06, "loss": 0.0089, "step": 1680 }, { "epoch": 0.709487825356843, "grad_norm": 0.22506844997406006, "learning_rate": 6.466392388350695e-06, "loss": 0.009, "step": 1690 }, { "epoch": 0.7136859781696053, "grad_norm": 0.2676888406276703, "learning_rate": 6.0657207653969315e-06, "loss": 0.0074, "step": 1700 }, { "epoch": 0.7178841309823678, "grad_norm": 0.19724851846694946, "learning_rate": 5.67706033570487e-06, "loss": 0.0074, "step": 1710 }, { "epoch": 0.7220822837951302, "grad_norm": 0.18323613703250885, "learning_rate": 5.300517355155215e-06, "loss": 0.0093, "step": 1720 }, { "epoch": 0.7262804366078925, "grad_norm": 0.2785090208053589, "learning_rate": 4.936194766839103e-06, "loss": 0.0093, "step": 1730 }, { "epoch": 0.7304785894206549, "grad_norm": 0.16792871057987213, "learning_rate": 4.5841921729144424e-06, "loss": 0.0063, "step": 1740 }, { "epoch": 0.7346767422334173, "grad_norm": 0.1223788782954216, "learning_rate": 4.244605807375679e-06, "loss": 0.009, "step": 1750 }, { "epoch": 0.7388748950461796, "grad_norm": 0.13132202625274658, "learning_rate": 3.917528509744412e-06, "loss": 0.0093, "step": 1760 }, { "epoch": 0.743073047858942, "grad_norm": 0.2555523216724396, "learning_rate": 3.60304969968796e-06, "loss": 0.0078, "step": 1770 }, { "epoch": 0.7472712006717045, "grad_norm": 0.16517667472362518, "learning_rate": 3.301255352572946e-06, "loss": 0.0074, "step": 1780 }, { "epoch": 0.7514693534844669, "grad_norm": 0.1717490404844284, "learning_rate": 3.0122279759604745e-06, "loss": 0.0116, "step": 1790 }, { "epoch": 0.7556675062972292, "grad_norm": 0.1151094138622284, "learning_rate": 2.73604658704939e-06, "loss": 0.0075, "step": 1800 }, { "epoch": 0.7598656591099916, "grad_norm": 0.19842703640460968, "learning_rate": 2.4727866910737583e-06, "loss": 0.0065, "step": 1810 }, { "epoch": 0.764063811922754, "grad_norm": 0.2035660296678543, "learning_rate": 2.222520260660521e-06, "loss": 0.0057, "step": 1820 }, { "epoch": 0.7682619647355163, "grad_norm": 0.16106460988521576, "learning_rate": 1.985315716152847e-06, "loss": 0.0058, "step": 1830 }, { "epoch": 0.7724601175482787, "grad_norm": 0.20101721584796906, "learning_rate": 1.7612379069047335e-06, "loss": 0.0059, "step": 1840 }, { "epoch": 0.7766582703610412, "grad_norm": 0.11036290228366852, "learning_rate": 1.550348093551829e-06, "loss": 0.0069, "step": 1850 }, { "epoch": 0.7808564231738035, "grad_norm": 0.14676934480667114, "learning_rate": 1.3527039312633827e-06, "loss": 0.0082, "step": 1860 }, { "epoch": 0.7850545759865659, "grad_norm": 0.21743223071098328, "learning_rate": 1.1683594539798893e-06, "loss": 0.0093, "step": 1870 }, { "epoch": 0.7892527287993283, "grad_norm": 0.09522661566734314, "learning_rate": 9.97365059640787e-07, "loss": 0.0058, "step": 1880 }, { "epoch": 0.7934508816120907, "grad_norm": 0.12366088479757309, "learning_rate": 8.397674964061075e-07, "loss": 0.0077, "step": 1890 }, { "epoch": 0.797649034424853, "grad_norm": 0.17567159235477448, "learning_rate": 6.956098498760389e-07, "loss": 0.0105, "step": 1900 }, { "epoch": 0.8018471872376155, "grad_norm": 0.17301349341869354, "learning_rate": 5.64931531311741e-07, "loss": 0.0072, "step": 1910 }, { "epoch": 0.8060453400503779, "grad_norm": 0.13477115333080292, "learning_rate": 4.4776826686069305e-07, "loss": 0.008, "step": 1920 }, { "epoch": 0.8102434928631402, "grad_norm": 0.16665753722190857, "learning_rate": 3.441520877895288e-07, "loss": 0.0078, "step": 1930 }, { "epoch": 0.8144416456759026, "grad_norm": 0.20697897672653198, "learning_rate": 2.5411132172700194e-07, "loss": 0.0079, "step": 1940 }, { "epoch": 0.818639798488665, "grad_norm": 0.23099485039710999, "learning_rate": 1.776705849195037e-07, "loss": 0.0104, "step": 1950 }, { "epoch": 0.8228379513014273, "grad_norm": 0.16362063586711884, "learning_rate": 1.1485077550122402e-07, "loss": 0.0057, "step": 1960 }, { "epoch": 0.8270361041141897, "grad_norm": 0.12974146008491516, "learning_rate": 6.566906778079917e-08, "loss": 0.0079, "step": 1970 }, { "epoch": 0.8312342569269522, "grad_norm": 0.1990516036748886, "learning_rate": 3.01389075460512e-08, "loss": 0.0101, "step": 1980 }, { "epoch": 0.8354324097397146, "grad_norm": 0.15594905614852905, "learning_rate": 8.270008388022721e-09, "loss": 0.0065, "step": 1990 }, { "epoch": 0.8396305625524769, "grad_norm": 0.07478626072406769, "learning_rate": 6.834904537900144e-11, "loss": 0.0054, "step": 2000 }, { "epoch": 0.8396305625524769, "step": 2000, "total_flos": 0.0, "train_loss": 0.030227677578106522, "train_runtime": 2758.5082, "train_samples_per_second": 23.201, "train_steps_per_second": 0.725 } ], "logging_steps": 10, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }