{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8575851393188856, "eval_steps": 500, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010319917440660475, "grad_norm": 2.8247148990631104, "learning_rate": 2.2727272727272728e-06, "loss": 0.8288, "step": 10 }, { "epoch": 0.02063983488132095, "grad_norm": 1.1619349718093872, "learning_rate": 4.5454545454545455e-06, "loss": 0.7902, "step": 20 }, { "epoch": 0.030959752321981424, "grad_norm": 0.7691543698310852, "learning_rate": 6.818181818181818e-06, "loss": 0.7388, "step": 30 }, { "epoch": 0.0412796697626419, "grad_norm": 0.687256395816803, "learning_rate": 9.090909090909091e-06, "loss": 0.7177, "step": 40 }, { "epoch": 0.05159958720330237, "grad_norm": 0.6163066029548645, "learning_rate": 1.1363636363636366e-05, "loss": 0.701, "step": 50 }, { "epoch": 0.06191950464396285, "grad_norm": 0.6468276381492615, "learning_rate": 1.3636363636363637e-05, "loss": 0.6853, "step": 60 }, { "epoch": 0.07223942208462332, "grad_norm": 0.9129849672317505, "learning_rate": 1.590909090909091e-05, "loss": 0.6749, "step": 70 }, { "epoch": 0.0825593395252838, "grad_norm": 0.9610547423362732, "learning_rate": 1.8181818181818182e-05, "loss": 0.664, "step": 80 }, { "epoch": 0.09287925696594428, "grad_norm": 0.9436660408973694, "learning_rate": 1.9999975160696756e-05, "loss": 0.6637, "step": 90 }, { "epoch": 0.10319917440660474, "grad_norm": 0.828860878944397, "learning_rate": 1.999910579803988e-05, "loss": 0.6578, "step": 100 }, { "epoch": 0.11351909184726522, "grad_norm": 0.8615094423294067, "learning_rate": 1.9996994593616145e-05, "loss": 0.6473, "step": 110 }, { "epoch": 0.1238390092879257, "grad_norm": 0.8153389096260071, "learning_rate": 1.9993641809627166e-05, "loss": 0.6402, "step": 120 }, { "epoch": 0.13415892672858618, "grad_norm": 0.8015602827072144, "learning_rate": 1.9989047862472904e-05, "loss": 0.6378, "step": 130 }, { "epoch": 0.14447884416924664, "grad_norm": 0.7367793321609497, "learning_rate": 1.9983213322699926e-05, "loss": 0.6346, "step": 140 }, { "epoch": 0.15479876160990713, "grad_norm": 0.913837194442749, "learning_rate": 1.997613891493054e-05, "loss": 0.6322, "step": 150 }, { "epoch": 0.1651186790505676, "grad_norm": 0.7812018990516663, "learning_rate": 1.996782551777282e-05, "loss": 0.6206, "step": 160 }, { "epoch": 0.17543859649122806, "grad_norm": 0.7282320857048035, "learning_rate": 1.995827416371147e-05, "loss": 0.6127, "step": 170 }, { "epoch": 0.18575851393188855, "grad_norm": 0.7379522919654846, "learning_rate": 1.9947486038979606e-05, "loss": 0.6098, "step": 180 }, { "epoch": 0.19607843137254902, "grad_norm": 0.7425850033760071, "learning_rate": 1.993546248341142e-05, "loss": 0.6079, "step": 190 }, { "epoch": 0.20639834881320948, "grad_norm": 0.6972795724868774, "learning_rate": 1.9922204990275788e-05, "loss": 0.6006, "step": 200 }, { "epoch": 0.21671826625386997, "grad_norm": 0.7257381677627563, "learning_rate": 1.9907715206090817e-05, "loss": 0.6042, "step": 210 }, { "epoch": 0.22703818369453044, "grad_norm": 0.6420859098434448, "learning_rate": 1.989199493041935e-05, "loss": 0.593, "step": 220 }, { "epoch": 0.23735810113519093, "grad_norm": 0.7002107501029968, "learning_rate": 1.9875046115645443e-05, "loss": 0.5931, "step": 230 }, { "epoch": 0.2476780185758514, "grad_norm": 0.7185678482055664, "learning_rate": 1.9856870866731946e-05, "loss": 0.5926, "step": 240 }, { "epoch": 0.2579979360165119, "grad_norm": 0.6459465026855469, "learning_rate": 1.983747144095902e-05, "loss": 0.5878, "step": 250 }, { "epoch": 0.26831785345717235, "grad_norm": 0.6379982233047485, "learning_rate": 1.9816850247643834e-05, "loss": 0.5796, "step": 260 }, { "epoch": 0.2786377708978328, "grad_norm": 0.7094199061393738, "learning_rate": 1.97950098478413e-05, "loss": 0.5771, "step": 270 }, { "epoch": 0.2889576883384933, "grad_norm": 0.6646308302879333, "learning_rate": 1.9771952954026038e-05, "loss": 0.5767, "step": 280 }, { "epoch": 0.29927760577915374, "grad_norm": 0.6392974257469177, "learning_rate": 1.9747682429755493e-05, "loss": 0.5737, "step": 290 }, { "epoch": 0.30959752321981426, "grad_norm": 0.5905966758728027, "learning_rate": 1.972220128931427e-05, "loss": 0.576, "step": 300 }, { "epoch": 0.31991744066047473, "grad_norm": 0.8001016974449158, "learning_rate": 1.9695512697339797e-05, "loss": 0.5698, "step": 310 }, { "epoch": 0.3302373581011352, "grad_norm": 0.5997283458709717, "learning_rate": 1.966761996842929e-05, "loss": 0.5703, "step": 320 }, { "epoch": 0.34055727554179566, "grad_norm": 0.6440294981002808, "learning_rate": 1.9638526566728088e-05, "loss": 0.5584, "step": 330 }, { "epoch": 0.3508771929824561, "grad_norm": 0.7667876482009888, "learning_rate": 1.960823610549943e-05, "loss": 0.5585, "step": 340 }, { "epoch": 0.36119711042311664, "grad_norm": 0.6358545422554016, "learning_rate": 1.9576752346675692e-05, "loss": 0.5578, "step": 350 }, { "epoch": 0.3715170278637771, "grad_norm": 0.6375100612640381, "learning_rate": 1.954407920039119e-05, "loss": 0.5621, "step": 360 }, { "epoch": 0.38183694530443757, "grad_norm": 0.7324113845825195, "learning_rate": 1.951022072449655e-05, "loss": 0.5527, "step": 370 }, { "epoch": 0.39215686274509803, "grad_norm": 0.658400297164917, "learning_rate": 1.9475181124054742e-05, "loss": 0.5538, "step": 380 }, { "epoch": 0.4024767801857585, "grad_norm": 0.7300146222114563, "learning_rate": 1.9438964750818833e-05, "loss": 0.5494, "step": 390 }, { "epoch": 0.41279669762641896, "grad_norm": 0.7315788865089417, "learning_rate": 1.940157610269152e-05, "loss": 0.5493, "step": 400 }, { "epoch": 0.4231166150670795, "grad_norm": 0.6689688563346863, "learning_rate": 1.9363019823166506e-05, "loss": 0.5509, "step": 410 }, { "epoch": 0.43343653250773995, "grad_norm": 0.6882718205451965, "learning_rate": 1.9323300700751816e-05, "loss": 0.5473, "step": 420 }, { "epoch": 0.4437564499484004, "grad_norm": 0.6466957330703735, "learning_rate": 1.9282423668375064e-05, "loss": 0.5435, "step": 430 }, { "epoch": 0.4540763673890609, "grad_norm": 0.6492331624031067, "learning_rate": 1.9240393802770824e-05, "loss": 0.5449, "step": 440 }, { "epoch": 0.46439628482972134, "grad_norm": 0.5815872550010681, "learning_rate": 1.9197216323850122e-05, "loss": 0.5398, "step": 450 }, { "epoch": 0.47471620227038186, "grad_norm": 0.6003971099853516, "learning_rate": 1.9152896594052134e-05, "loss": 0.533, "step": 460 }, { "epoch": 0.4850361197110423, "grad_norm": 0.5987655520439148, "learning_rate": 1.910744011767821e-05, "loss": 0.5309, "step": 470 }, { "epoch": 0.4953560371517028, "grad_norm": 0.6432524919509888, "learning_rate": 1.9060852540208277e-05, "loss": 0.5344, "step": 480 }, { "epoch": 0.5056759545923633, "grad_norm": 0.5650415420532227, "learning_rate": 1.9013139647599656e-05, "loss": 0.5333, "step": 490 }, { "epoch": 0.5159958720330238, "grad_norm": 0.6225659847259521, "learning_rate": 1.8964307365568513e-05, "loss": 0.5231, "step": 500 }, { "epoch": 0.5263157894736842, "grad_norm": 0.6020525097846985, "learning_rate": 1.89143617588539e-05, "loss": 0.5241, "step": 510 }, { "epoch": 0.5366357069143447, "grad_norm": 0.5726006031036377, "learning_rate": 1.886330903046454e-05, "loss": 0.5278, "step": 520 }, { "epoch": 0.5469556243550051, "grad_norm": 0.5783742666244507, "learning_rate": 1.8811155520908445e-05, "loss": 0.5253, "step": 530 }, { "epoch": 0.5572755417956656, "grad_norm": 0.5478541254997253, "learning_rate": 1.8757907707405456e-05, "loss": 0.5166, "step": 540 }, { "epoch": 0.5675954592363261, "grad_norm": 0.5668419003486633, "learning_rate": 1.8703572203082795e-05, "loss": 0.5206, "step": 550 }, { "epoch": 0.5779153766769866, "grad_norm": 0.5729948282241821, "learning_rate": 1.8648155756153768e-05, "loss": 0.516, "step": 560 }, { "epoch": 0.5882352941176471, "grad_norm": 0.651300311088562, "learning_rate": 1.859166524907963e-05, "loss": 0.5183, "step": 570 }, { "epoch": 0.5985552115583075, "grad_norm": 0.6236013174057007, "learning_rate": 1.8534107697714864e-05, "loss": 0.5242, "step": 580 }, { "epoch": 0.608875128998968, "grad_norm": 0.5427743196487427, "learning_rate": 1.84754902504358e-05, "loss": 0.5291, "step": 590 }, { "epoch": 0.6191950464396285, "grad_norm": 0.5849993824958801, "learning_rate": 1.8415820187252847e-05, "loss": 0.5213, "step": 600 }, { "epoch": 0.6295149638802889, "grad_norm": 0.6405364274978638, "learning_rate": 1.8355104918906353e-05, "loss": 0.5187, "step": 610 }, { "epoch": 0.6398348813209495, "grad_norm": 0.5616128444671631, "learning_rate": 1.8293351985946194e-05, "loss": 0.5108, "step": 620 }, { "epoch": 0.6501547987616099, "grad_norm": 0.5770090222358704, "learning_rate": 1.823056905779532e-05, "loss": 0.5172, "step": 630 }, { "epoch": 0.6604747162022704, "grad_norm": 0.5251275300979614, "learning_rate": 1.816676393179721e-05, "loss": 0.5116, "step": 640 }, { "epoch": 0.6707946336429309, "grad_norm": 0.5879736542701721, "learning_rate": 1.8101944532247495e-05, "loss": 0.5157, "step": 650 }, { "epoch": 0.6811145510835913, "grad_norm": 0.5661890506744385, "learning_rate": 1.80361189094098e-05, "loss": 0.5088, "step": 660 }, { "epoch": 0.6914344685242518, "grad_norm": 0.5618740916252136, "learning_rate": 1.796929523851593e-05, "loss": 0.5111, "step": 670 }, { "epoch": 0.7017543859649122, "grad_norm": 0.5378845930099487, "learning_rate": 1.790148181875055e-05, "loss": 0.5118, "step": 680 }, { "epoch": 0.7120743034055728, "grad_norm": 0.5547090172767639, "learning_rate": 1.783268707222048e-05, "loss": 0.5088, "step": 690 }, { "epoch": 0.7223942208462333, "grad_norm": 0.5933310389518738, "learning_rate": 1.776291954290867e-05, "loss": 0.5063, "step": 700 }, { "epoch": 0.7327141382868937, "grad_norm": 0.5393312573432922, "learning_rate": 1.769218789561312e-05, "loss": 0.5014, "step": 710 }, { "epoch": 0.7430340557275542, "grad_norm": 0.5515422821044922, "learning_rate": 1.7620500914870734e-05, "loss": 0.5116, "step": 720 }, { "epoch": 0.7533539731682146, "grad_norm": 0.5601432919502258, "learning_rate": 1.7547867503866315e-05, "loss": 0.5024, "step": 730 }, { "epoch": 0.7636738906088751, "grad_norm": 0.5876237154006958, "learning_rate": 1.7474296683326844e-05, "loss": 0.5098, "step": 740 }, { "epoch": 0.7739938080495357, "grad_norm": 0.518947184085846, "learning_rate": 1.739979759040114e-05, "loss": 0.5017, "step": 750 }, { "epoch": 0.7843137254901961, "grad_norm": 0.5550107955932617, "learning_rate": 1.7324379477525086e-05, "loss": 0.5044, "step": 760 }, { "epoch": 0.7946336429308566, "grad_norm": 0.5430490374565125, "learning_rate": 1.724805171127249e-05, "loss": 0.5029, "step": 770 }, { "epoch": 0.804953560371517, "grad_norm": 0.5498166680335999, "learning_rate": 1.7170823771191824e-05, "loss": 0.499, "step": 780 }, { "epoch": 0.8152734778121775, "grad_norm": 0.5843333601951599, "learning_rate": 1.709270524862891e-05, "loss": 0.4968, "step": 790 }, { "epoch": 0.8255933952528379, "grad_norm": 0.5710884928703308, "learning_rate": 1.7013705845535704e-05, "loss": 0.5024, "step": 800 }, { "epoch": 0.8359133126934984, "grad_norm": 0.5185025930404663, "learning_rate": 1.6933835373265373e-05, "loss": 0.503, "step": 810 }, { "epoch": 0.846233230134159, "grad_norm": 0.5252718329429626, "learning_rate": 1.685310375135376e-05, "loss": 0.5028, "step": 820 }, { "epoch": 0.8565531475748194, "grad_norm": 0.5351059436798096, "learning_rate": 1.6771521006287442e-05, "loss": 0.4927, "step": 830 }, { "epoch": 0.8668730650154799, "grad_norm": 0.5176792740821838, "learning_rate": 1.6689097270258463e-05, "loss": 0.5012, "step": 840 }, { "epoch": 0.8771929824561403, "grad_norm": 0.5016619563102722, "learning_rate": 1.6605842779905984e-05, "loss": 0.4941, "step": 850 }, { "epoch": 0.8875128998968008, "grad_norm": 0.536718487739563, "learning_rate": 1.6521767875044935e-05, "loss": 0.488, "step": 860 }, { "epoch": 0.8978328173374613, "grad_norm": 0.49594587087631226, "learning_rate": 1.643688299738186e-05, "loss": 0.4901, "step": 870 }, { "epoch": 0.9081527347781218, "grad_norm": 0.5281170606613159, "learning_rate": 1.635119868921809e-05, "loss": 0.4979, "step": 880 }, { "epoch": 0.9184726522187823, "grad_norm": 0.5000081658363342, "learning_rate": 1.6264725592140468e-05, "loss": 0.4935, "step": 890 }, { "epoch": 0.9287925696594427, "grad_norm": 0.5359088182449341, "learning_rate": 1.6177474445699695e-05, "loss": 0.4854, "step": 900 }, { "epoch": 0.9391124871001032, "grad_norm": 0.5657668709754944, "learning_rate": 1.6089456086076527e-05, "loss": 0.4877, "step": 910 }, { "epoch": 0.9494324045407637, "grad_norm": 0.507234513759613, "learning_rate": 1.6000681444735976e-05, "loss": 0.4903, "step": 920 }, { "epoch": 0.9597523219814241, "grad_norm": 0.5578757524490356, "learning_rate": 1.5911161547069688e-05, "loss": 0.4884, "step": 930 }, { "epoch": 0.9700722394220846, "grad_norm": 0.5635477304458618, "learning_rate": 1.582090751102662e-05, "loss": 0.4973, "step": 940 }, { "epoch": 0.9803921568627451, "grad_norm": 0.5168154835700989, "learning_rate": 1.5729930545732247e-05, "loss": 0.4818, "step": 950 }, { "epoch": 0.9907120743034056, "grad_norm": 0.5357134342193604, "learning_rate": 1.5638241950096458e-05, "loss": 0.4863, "step": 960 }, { "epoch": 1.001031991744066, "grad_norm": 1.1038967370986938, "learning_rate": 1.554585311141027e-05, "loss": 0.4791, "step": 970 }, { "epoch": 1.0113519091847265, "grad_norm": 0.6728698015213013, "learning_rate": 1.5452775503931566e-05, "loss": 0.4229, "step": 980 }, { "epoch": 1.021671826625387, "grad_norm": 0.5582284331321716, "learning_rate": 1.5359020687460096e-05, "loss": 0.4193, "step": 990 }, { "epoch": 1.0319917440660475, "grad_norm": 0.5344264507293701, "learning_rate": 1.5264600305901744e-05, "loss": 0.4241, "step": 1000 }, { "epoch": 1.0423116615067078, "grad_norm": 0.5118332505226135, "learning_rate": 1.5169526085822451e-05, "loss": 0.4178, "step": 1010 }, { "epoch": 1.0526315789473684, "grad_norm": 0.54106605052948, "learning_rate": 1.5073809834991816e-05, "loss": 0.4167, "step": 1020 }, { "epoch": 1.0629514963880289, "grad_norm": 0.591042697429657, "learning_rate": 1.4977463440916621e-05, "loss": 0.4154, "step": 1030 }, { "epoch": 1.0732714138286894, "grad_norm": 0.5546119809150696, "learning_rate": 1.4880498869364482e-05, "loss": 0.4211, "step": 1040 }, { "epoch": 1.08359133126935, "grad_norm": 0.5102314352989197, "learning_rate": 1.4782928162877722e-05, "loss": 0.4187, "step": 1050 }, { "epoch": 1.0939112487100102, "grad_norm": 0.5234063863754272, "learning_rate": 1.468476343927778e-05, "loss": 0.4177, "step": 1060 }, { "epoch": 1.1042311661506707, "grad_norm": 0.5099871158599854, "learning_rate": 1.4586016890160208e-05, "loss": 0.4213, "step": 1070 }, { "epoch": 1.1145510835913313, "grad_norm": 0.5453868508338928, "learning_rate": 1.4486700779380547e-05, "loss": 0.4192, "step": 1080 }, { "epoch": 1.1248710010319918, "grad_norm": 0.5475857257843018, "learning_rate": 1.4386827441531202e-05, "loss": 0.4178, "step": 1090 }, { "epoch": 1.1351909184726523, "grad_norm": 0.5636183619499207, "learning_rate": 1.4286409280409558e-05, "loss": 0.4167, "step": 1100 }, { "epoch": 1.1455108359133126, "grad_norm": 0.5477967262268066, "learning_rate": 1.4185458767477487e-05, "loss": 0.4184, "step": 1110 }, { "epoch": 1.1558307533539731, "grad_norm": 0.5478163361549377, "learning_rate": 1.4083988440312429e-05, "loss": 0.419, "step": 1120 }, { "epoch": 1.1661506707946336, "grad_norm": 0.5689426064491272, "learning_rate": 1.3982010901050305e-05, "loss": 0.4239, "step": 1130 }, { "epoch": 1.1764705882352942, "grad_norm": 0.5106656551361084, "learning_rate": 1.3879538814820395e-05, "loss": 0.4135, "step": 1140 }, { "epoch": 1.1867905056759547, "grad_norm": 0.5251624584197998, "learning_rate": 1.3776584908172364e-05, "loss": 0.4202, "step": 1150 }, { "epoch": 1.197110423116615, "grad_norm": 0.5535441040992737, "learning_rate": 1.3673161967495708e-05, "loss": 0.4181, "step": 1160 }, { "epoch": 1.2074303405572755, "grad_norm": 0.5619220733642578, "learning_rate": 1.3569282837431737e-05, "loss": 0.4202, "step": 1170 }, { "epoch": 1.217750257997936, "grad_norm": 0.5495029091835022, "learning_rate": 1.3464960419278332e-05, "loss": 0.4135, "step": 1180 }, { "epoch": 1.2280701754385965, "grad_norm": 0.5409591197967529, "learning_rate": 1.336020766938766e-05, "loss": 0.4099, "step": 1190 }, { "epoch": 1.238390092879257, "grad_norm": 0.5582126379013062, "learning_rate": 1.3255037597557057e-05, "loss": 0.4168, "step": 1200 }, { "epoch": 1.2487100103199174, "grad_norm": 0.5315924882888794, "learning_rate": 1.3149463265413282e-05, "loss": 0.4163, "step": 1210 }, { "epoch": 1.2590299277605779, "grad_norm": 0.5000606775283813, "learning_rate": 1.3043497784790315e-05, "loss": 0.4155, "step": 1220 }, { "epoch": 1.2693498452012384, "grad_norm": 0.5188019275665283, "learning_rate": 1.2937154316100927e-05, "loss": 0.4155, "step": 1230 }, { "epoch": 1.279669762641899, "grad_norm": 0.5054394006729126, "learning_rate": 1.283044606670223e-05, "loss": 0.4079, "step": 1240 }, { "epoch": 1.2899896800825594, "grad_norm": 0.5096462368965149, "learning_rate": 1.2723386289255374e-05, "loss": 0.4149, "step": 1250 }, { "epoch": 1.3003095975232197, "grad_norm": 0.5191652178764343, "learning_rate": 1.2615988280079645e-05, "loss": 0.4103, "step": 1260 }, { "epoch": 1.3106295149638802, "grad_norm": 0.4963880777359009, "learning_rate": 1.2508265377501102e-05, "loss": 0.4117, "step": 1270 }, { "epoch": 1.3209494324045408, "grad_norm": 0.5644184947013855, "learning_rate": 1.240023096019603e-05, "loss": 0.4139, "step": 1280 }, { "epoch": 1.3312693498452013, "grad_norm": 0.521536111831665, "learning_rate": 1.2291898445529384e-05, "loss": 0.4107, "step": 1290 }, { "epoch": 1.3415892672858618, "grad_norm": 0.5256720781326294, "learning_rate": 1.2183281287888398e-05, "loss": 0.4104, "step": 1300 }, { "epoch": 1.351909184726522, "grad_norm": 0.531589686870575, "learning_rate": 1.2074392977011629e-05, "loss": 0.4111, "step": 1310 }, { "epoch": 1.3622291021671826, "grad_norm": 0.534598171710968, "learning_rate": 1.1965247036313573e-05, "loss": 0.416, "step": 1320 }, { "epoch": 1.3725490196078431, "grad_norm": 0.5281124711036682, "learning_rate": 1.185585702120515e-05, "loss": 0.4041, "step": 1330 }, { "epoch": 1.3828689370485037, "grad_norm": 0.5332800149917603, "learning_rate": 1.1746236517410155e-05, "loss": 0.4076, "step": 1340 }, { "epoch": 1.3931888544891642, "grad_norm": 0.4961317181587219, "learning_rate": 1.1636399139277998e-05, "loss": 0.4067, "step": 1350 }, { "epoch": 1.4035087719298245, "grad_norm": 0.5210182070732117, "learning_rate": 1.1526358528092861e-05, "loss": 0.4071, "step": 1360 }, { "epoch": 1.413828689370485, "grad_norm": 0.518181324005127, "learning_rate": 1.1416128350379503e-05, "loss": 0.4118, "step": 1370 }, { "epoch": 1.4241486068111455, "grad_norm": 0.5396980047225952, "learning_rate": 1.1305722296205968e-05, "loss": 0.4073, "step": 1380 }, { "epoch": 1.434468524251806, "grad_norm": 0.5073665976524353, "learning_rate": 1.1195154077483313e-05, "loss": 0.4083, "step": 1390 }, { "epoch": 1.4447884416924666, "grad_norm": 0.5103346705436707, "learning_rate": 1.1084437426262666e-05, "loss": 0.4094, "step": 1400 }, { "epoch": 1.4551083591331269, "grad_norm": 0.5441737174987793, "learning_rate": 1.097358609302978e-05, "loss": 0.4124, "step": 1410 }, { "epoch": 1.4654282765737874, "grad_norm": 0.49091413617134094, "learning_rate": 1.0862613844997272e-05, "loss": 0.4059, "step": 1420 }, { "epoch": 1.475748194014448, "grad_norm": 0.49451103806495667, "learning_rate": 1.0751534464394809e-05, "loss": 0.4028, "step": 1430 }, { "epoch": 1.4860681114551084, "grad_norm": 0.5205165147781372, "learning_rate": 1.0640361746757413e-05, "loss": 0.4038, "step": 1440 }, { "epoch": 1.496388028895769, "grad_norm": 0.5233325958251953, "learning_rate": 1.0529109499212137e-05, "loss": 0.4097, "step": 1450 }, { "epoch": 1.5067079463364292, "grad_norm": 0.5237818956375122, "learning_rate": 1.0417791538763269e-05, "loss": 0.4059, "step": 1460 }, { "epoch": 1.5170278637770898, "grad_norm": 0.5263275504112244, "learning_rate": 1.0306421690576318e-05, "loss": 0.4074, "step": 1470 }, { "epoch": 1.5273477812177503, "grad_norm": 0.5042173862457275, "learning_rate": 1.0195013786261017e-05, "loss": 0.4061, "step": 1480 }, { "epoch": 1.5376676986584106, "grad_norm": 0.48727792501449585, "learning_rate": 1.0083581662153488e-05, "loss": 0.4021, "step": 1490 }, { "epoch": 1.5479876160990713, "grad_norm": 0.5014871954917908, "learning_rate": 9.972139157597836e-06, "loss": 0.411, "step": 1500 }, { "epoch": 1.5583075335397316, "grad_norm": 0.49665823578834534, "learning_rate": 9.86070011322737e-06, "loss": 0.4069, "step": 1510 }, { "epoch": 1.5686274509803921, "grad_norm": 0.48189592361450195, "learning_rate": 9.749278369245658e-06, "loss": 0.4055, "step": 1520 }, { "epoch": 1.5789473684210527, "grad_norm": 0.5003267526626587, "learning_rate": 9.637887763707649e-06, "loss": 0.4023, "step": 1530 }, { "epoch": 1.589267285861713, "grad_norm": 0.4762038290500641, "learning_rate": 9.52654213080103e-06, "loss": 0.4063, "step": 1540 }, { "epoch": 1.5995872033023737, "grad_norm": 0.48036977648735046, "learning_rate": 9.415255299128115e-06, "loss": 0.3991, "step": 1550 }, { "epoch": 1.609907120743034, "grad_norm": 1.7054091691970825, "learning_rate": 9.304041089988367e-06, "loss": 0.4099, "step": 1560 }, { "epoch": 1.6202270381836945, "grad_norm": 0.5128041505813599, "learning_rate": 9.192913315661887e-06, "loss": 0.4093, "step": 1570 }, { "epoch": 1.630546955624355, "grad_norm": 0.5168408751487732, "learning_rate": 9.081885777693969e-06, "loss": 0.4012, "step": 1580 }, { "epoch": 1.6408668730650153, "grad_norm": 0.4789281189441681, "learning_rate": 8.97097226518103e-06, "loss": 0.4024, "step": 1590 }, { "epoch": 1.651186790505676, "grad_norm": 0.4675295650959015, "learning_rate": 8.860186553058066e-06, "loss": 0.3992, "step": 1600 }, { "epoch": 1.6615067079463364, "grad_norm": 0.4954163730144501, "learning_rate": 8.749542400387861e-06, "loss": 0.3986, "step": 1610 }, { "epoch": 1.671826625386997, "grad_norm": 0.4895382523536682, "learning_rate": 8.639053548652183e-06, "loss": 0.3949, "step": 1620 }, { "epoch": 1.6821465428276574, "grad_norm": 0.49679800868034363, "learning_rate": 8.528733720045162e-06, "loss": 0.4042, "step": 1630 }, { "epoch": 1.6924664602683177, "grad_norm": 0.470292866230011, "learning_rate": 8.418596615769048e-06, "loss": 0.3977, "step": 1640 }, { "epoch": 1.7027863777089784, "grad_norm": 0.46729475259780884, "learning_rate": 8.308655914332599e-06, "loss": 0.4022, "step": 1650 }, { "epoch": 1.7131062951496387, "grad_norm": 0.49843648076057434, "learning_rate": 8.198925269852251e-06, "loss": 0.3953, "step": 1660 }, { "epoch": 1.7234262125902993, "grad_norm": 0.4577590227127075, "learning_rate": 8.089418310356379e-06, "loss": 0.398, "step": 1670 }, { "epoch": 1.7337461300309598, "grad_norm": 0.45520010590553284, "learning_rate": 7.980148636092719e-06, "loss": 0.3986, "step": 1680 }, { "epoch": 1.74406604747162, "grad_norm": 0.48741379380226135, "learning_rate": 7.871129817839304e-06, "loss": 0.3926, "step": 1690 }, { "epoch": 1.7543859649122808, "grad_norm": 0.47943034768104553, "learning_rate": 7.762375395219045e-06, "loss": 0.403, "step": 1700 }, { "epoch": 1.7647058823529411, "grad_norm": 0.4822390675544739, "learning_rate": 7.653898875018151e-06, "loss": 0.3967, "step": 1710 }, { "epoch": 1.7750257997936016, "grad_norm": 0.47492411732673645, "learning_rate": 7.545713729508673e-06, "loss": 0.3955, "step": 1720 }, { "epoch": 1.7853457172342622, "grad_norm": 0.48685282468795776, "learning_rate": 7.437833394775283e-06, "loss": 0.3974, "step": 1730 }, { "epoch": 1.7956656346749225, "grad_norm": 0.47495120763778687, "learning_rate": 7.330271269046614e-06, "loss": 0.3997, "step": 1740 }, { "epoch": 1.8059855521155832, "grad_norm": 0.4861559271812439, "learning_rate": 7.223040711031225e-06, "loss": 0.3972, "step": 1750 }, { "epoch": 1.8163054695562435, "grad_norm": 0.4717768728733063, "learning_rate": 7.116155038258531e-06, "loss": 0.3963, "step": 1760 }, { "epoch": 1.826625386996904, "grad_norm": 0.47078821063041687, "learning_rate": 7.009627525424836e-06, "loss": 0.3962, "step": 1770 }, { "epoch": 1.8369453044375645, "grad_norm": 0.4606710374355316, "learning_rate": 6.903471402744662e-06, "loss": 0.3929, "step": 1780 }, { "epoch": 1.8472652218782248, "grad_norm": 0.45694735646247864, "learning_rate": 6.797699854307631e-06, "loss": 0.3897, "step": 1790 }, { "epoch": 1.8575851393188856, "grad_norm": 0.4747222661972046, "learning_rate": 6.692326016441054e-06, "loss": 0.3904, "step": 1800 } ], "logging_steps": 10, "max_steps": 2907, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.247296645602371e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }