akhanriz's picture
initial commit
e112632 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9998697407841606,
"eval_steps": 500,
"global_step": 34545,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004341973861317355,
"grad_norm": 32.08115005493164,
"learning_rate": 9.98697351280938e-06,
"loss": 2.7201,
"step": 50
},
{
"epoch": 0.00868394772263471,
"grad_norm": 11.602313041687012,
"learning_rate": 9.972499638153135e-06,
"loss": 0.8616,
"step": 100
},
{
"epoch": 0.013025921583952065,
"grad_norm": 13.351936340332031,
"learning_rate": 9.95802576349689e-06,
"loss": 0.3809,
"step": 150
},
{
"epoch": 0.01736789544526942,
"grad_norm": 11.841660499572754,
"learning_rate": 9.943551888840644e-06,
"loss": 0.3299,
"step": 200
},
{
"epoch": 0.021709869306586773,
"grad_norm": 13.042312622070312,
"learning_rate": 9.929078014184398e-06,
"loss": 0.3422,
"step": 250
},
{
"epoch": 0.02605184316790413,
"grad_norm": 12.432320594787598,
"learning_rate": 9.914604139528152e-06,
"loss": 0.3427,
"step": 300
},
{
"epoch": 0.030393817029221483,
"grad_norm": 11.445119857788086,
"learning_rate": 9.900130264871906e-06,
"loss": 0.3318,
"step": 350
},
{
"epoch": 0.03473579089053884,
"grad_norm": 16.4143009185791,
"learning_rate": 9.885656390215662e-06,
"loss": 0.2945,
"step": 400
},
{
"epoch": 0.039077764751856196,
"grad_norm": 10.369935989379883,
"learning_rate": 9.871182515559416e-06,
"loss": 0.3232,
"step": 450
},
{
"epoch": 0.043419738613173546,
"grad_norm": 10.7285737991333,
"learning_rate": 9.856708640903172e-06,
"loss": 0.3065,
"step": 500
},
{
"epoch": 0.0477617124744909,
"grad_norm": 9.987811088562012,
"learning_rate": 9.842234766246926e-06,
"loss": 0.2889,
"step": 550
},
{
"epoch": 0.05210368633580826,
"grad_norm": 13.18855094909668,
"learning_rate": 9.82776089159068e-06,
"loss": 0.359,
"step": 600
},
{
"epoch": 0.056445660197125616,
"grad_norm": 7.523106098175049,
"learning_rate": 9.813287016934434e-06,
"loss": 0.3753,
"step": 650
},
{
"epoch": 0.060787634058442966,
"grad_norm": 8.611462593078613,
"learning_rate": 9.798813142278188e-06,
"loss": 0.327,
"step": 700
},
{
"epoch": 0.06512960791976033,
"grad_norm": 9.731183052062988,
"learning_rate": 9.784339267621942e-06,
"loss": 0.3218,
"step": 750
},
{
"epoch": 0.06947158178107768,
"grad_norm": 14.622904777526855,
"learning_rate": 9.769865392965696e-06,
"loss": 0.3116,
"step": 800
},
{
"epoch": 0.07381355564239503,
"grad_norm": 14.486369132995605,
"learning_rate": 9.755391518309452e-06,
"loss": 0.302,
"step": 850
},
{
"epoch": 0.07815552950371239,
"grad_norm": 9.997029304504395,
"learning_rate": 9.740917643653206e-06,
"loss": 0.3332,
"step": 900
},
{
"epoch": 0.08249750336502974,
"grad_norm": 11.677199363708496,
"learning_rate": 9.726443768996962e-06,
"loss": 0.2776,
"step": 950
},
{
"epoch": 0.08683947722634709,
"grad_norm": 15.453819274902344,
"learning_rate": 9.711969894340716e-06,
"loss": 0.304,
"step": 1000
},
{
"epoch": 0.09118145108766446,
"grad_norm": 10.257116317749023,
"learning_rate": 9.69749601968447e-06,
"loss": 0.3241,
"step": 1050
},
{
"epoch": 0.0955234249489818,
"grad_norm": 12.801012992858887,
"learning_rate": 9.683022145028224e-06,
"loss": 0.3245,
"step": 1100
},
{
"epoch": 0.09986539881029916,
"grad_norm": 9.938461303710938,
"learning_rate": 9.668548270371979e-06,
"loss": 0.3146,
"step": 1150
},
{
"epoch": 0.10420737267161652,
"grad_norm": 7.490509986877441,
"learning_rate": 9.654074395715733e-06,
"loss": 0.2823,
"step": 1200
},
{
"epoch": 0.10854934653293387,
"grad_norm": 13.965841293334961,
"learning_rate": 9.639600521059488e-06,
"loss": 0.3684,
"step": 1250
},
{
"epoch": 0.11289132039425123,
"grad_norm": 12.257745742797852,
"learning_rate": 9.625126646403243e-06,
"loss": 0.3316,
"step": 1300
},
{
"epoch": 0.11723329425556858,
"grad_norm": 13.204375267028809,
"learning_rate": 9.610652771746998e-06,
"loss": 0.2866,
"step": 1350
},
{
"epoch": 0.12157526811688593,
"grad_norm": 14.752259254455566,
"learning_rate": 9.596178897090753e-06,
"loss": 0.2965,
"step": 1400
},
{
"epoch": 0.12591724197820328,
"grad_norm": 14.065028190612793,
"learning_rate": 9.581705022434507e-06,
"loss": 0.2953,
"step": 1450
},
{
"epoch": 0.13025921583952066,
"grad_norm": 7.947408676147461,
"learning_rate": 9.56723114777826e-06,
"loss": 0.3143,
"step": 1500
},
{
"epoch": 0.134601189700838,
"grad_norm": 14.14394474029541,
"learning_rate": 9.552757273122015e-06,
"loss": 0.2843,
"step": 1550
},
{
"epoch": 0.13894316356215536,
"grad_norm": 8.776320457458496,
"learning_rate": 9.538283398465769e-06,
"loss": 0.2435,
"step": 1600
},
{
"epoch": 0.1432851374234727,
"grad_norm": 8.089349746704102,
"learning_rate": 9.523809523809525e-06,
"loss": 0.3011,
"step": 1650
},
{
"epoch": 0.14762711128479006,
"grad_norm": 9.590496063232422,
"learning_rate": 9.509335649153279e-06,
"loss": 0.2947,
"step": 1700
},
{
"epoch": 0.1519690851461074,
"grad_norm": 9.659972190856934,
"learning_rate": 9.494861774497035e-06,
"loss": 0.3485,
"step": 1750
},
{
"epoch": 0.15631105900742479,
"grad_norm": 10.331656455993652,
"learning_rate": 9.480387899840789e-06,
"loss": 0.2765,
"step": 1800
},
{
"epoch": 0.16065303286874213,
"grad_norm": 12.779083251953125,
"learning_rate": 9.465914025184543e-06,
"loss": 0.2817,
"step": 1850
},
{
"epoch": 0.16499500673005948,
"grad_norm": 9.097893714904785,
"learning_rate": 9.451440150528297e-06,
"loss": 0.3581,
"step": 1900
},
{
"epoch": 0.16933698059137683,
"grad_norm": 18.47726058959961,
"learning_rate": 9.436966275872051e-06,
"loss": 0.2617,
"step": 1950
},
{
"epoch": 0.17367895445269418,
"grad_norm": 7.92081356048584,
"learning_rate": 9.422492401215805e-06,
"loss": 0.3314,
"step": 2000
},
{
"epoch": 0.17802092831401156,
"grad_norm": 11.863414764404297,
"learning_rate": 9.408018526559561e-06,
"loss": 0.3495,
"step": 2050
},
{
"epoch": 0.1823629021753289,
"grad_norm": 12.667139053344727,
"learning_rate": 9.393544651903315e-06,
"loss": 0.318,
"step": 2100
},
{
"epoch": 0.18670487603664626,
"grad_norm": 13.5122652053833,
"learning_rate": 9.379070777247071e-06,
"loss": 0.3017,
"step": 2150
},
{
"epoch": 0.1910468498979636,
"grad_norm": 6.8720526695251465,
"learning_rate": 9.364596902590825e-06,
"loss": 0.2728,
"step": 2200
},
{
"epoch": 0.19538882375928096,
"grad_norm": 15.49057388305664,
"learning_rate": 9.350123027934579e-06,
"loss": 0.2858,
"step": 2250
},
{
"epoch": 0.1997307976205983,
"grad_norm": 13.840463638305664,
"learning_rate": 9.335649153278333e-06,
"loss": 0.2733,
"step": 2300
},
{
"epoch": 0.2040727714819157,
"grad_norm": 12.819256782531738,
"learning_rate": 9.321175278622087e-06,
"loss": 0.3099,
"step": 2350
},
{
"epoch": 0.20841474534323304,
"grad_norm": 14.073617935180664,
"learning_rate": 9.306701403965841e-06,
"loss": 0.3347,
"step": 2400
},
{
"epoch": 0.2127567192045504,
"grad_norm": 9.651225090026855,
"learning_rate": 9.292227529309597e-06,
"loss": 0.3033,
"step": 2450
},
{
"epoch": 0.21709869306586774,
"grad_norm": 8.687524795532227,
"learning_rate": 9.277753654653351e-06,
"loss": 0.2807,
"step": 2500
},
{
"epoch": 0.2214406669271851,
"grad_norm": 10.235136032104492,
"learning_rate": 9.263279779997107e-06,
"loss": 0.2196,
"step": 2550
},
{
"epoch": 0.22578264078850246,
"grad_norm": 12.453109741210938,
"learning_rate": 9.248805905340861e-06,
"loss": 0.336,
"step": 2600
},
{
"epoch": 0.23012461464981981,
"grad_norm": 11.39977741241455,
"learning_rate": 9.234332030684615e-06,
"loss": 0.287,
"step": 2650
},
{
"epoch": 0.23446658851113716,
"grad_norm": 15.636773109436035,
"learning_rate": 9.21985815602837e-06,
"loss": 0.2747,
"step": 2700
},
{
"epoch": 0.2388085623724545,
"grad_norm": 9.498825073242188,
"learning_rate": 9.205384281372124e-06,
"loss": 0.2462,
"step": 2750
},
{
"epoch": 0.24315053623377186,
"grad_norm": 3.69565486907959,
"learning_rate": 9.190910406715878e-06,
"loss": 0.2816,
"step": 2800
},
{
"epoch": 0.24749251009508924,
"grad_norm": 12.666276931762695,
"learning_rate": 9.176436532059632e-06,
"loss": 0.3002,
"step": 2850
},
{
"epoch": 0.25183448395640656,
"grad_norm": 16.72841453552246,
"learning_rate": 9.161962657403388e-06,
"loss": 0.2584,
"step": 2900
},
{
"epoch": 0.2561764578177239,
"grad_norm": 11.932791709899902,
"learning_rate": 9.147488782747142e-06,
"loss": 0.311,
"step": 2950
},
{
"epoch": 0.2605184316790413,
"grad_norm": 7.06049919128418,
"learning_rate": 9.133014908090898e-06,
"loss": 0.2333,
"step": 3000
},
{
"epoch": 0.26486040554035867,
"grad_norm": 13.139076232910156,
"learning_rate": 9.118541033434652e-06,
"loss": 0.2429,
"step": 3050
},
{
"epoch": 0.269202379401676,
"grad_norm": 4.549722671508789,
"learning_rate": 9.104067158778406e-06,
"loss": 0.2384,
"step": 3100
},
{
"epoch": 0.27354435326299337,
"grad_norm": 15.118318557739258,
"learning_rate": 9.08959328412216e-06,
"loss": 0.3034,
"step": 3150
},
{
"epoch": 0.2778863271243107,
"grad_norm": 11.380433082580566,
"learning_rate": 9.075119409465914e-06,
"loss": 0.2701,
"step": 3200
},
{
"epoch": 0.28222830098562807,
"grad_norm": 8.537070274353027,
"learning_rate": 9.060645534809668e-06,
"loss": 0.2854,
"step": 3250
},
{
"epoch": 0.2865702748469454,
"grad_norm": 15.4993314743042,
"learning_rate": 9.046171660153424e-06,
"loss": 0.3197,
"step": 3300
},
{
"epoch": 0.29091224870826277,
"grad_norm": 39.80760192871094,
"learning_rate": 9.031697785497178e-06,
"loss": 0.3389,
"step": 3350
},
{
"epoch": 0.2952542225695801,
"grad_norm": 11.710318565368652,
"learning_rate": 9.017223910840934e-06,
"loss": 0.2886,
"step": 3400
},
{
"epoch": 0.29959619643089747,
"grad_norm": 10.475316047668457,
"learning_rate": 9.002750036184688e-06,
"loss": 0.285,
"step": 3450
},
{
"epoch": 0.3039381702922148,
"grad_norm": 7.522524833679199,
"learning_rate": 8.988276161528442e-06,
"loss": 0.298,
"step": 3500
},
{
"epoch": 0.3082801441535322,
"grad_norm": 5.4467244148254395,
"learning_rate": 8.973802286872196e-06,
"loss": 0.2792,
"step": 3550
},
{
"epoch": 0.31262211801484957,
"grad_norm": 12.941061973571777,
"learning_rate": 8.95932841221595e-06,
"loss": 0.2212,
"step": 3600
},
{
"epoch": 0.3169640918761669,
"grad_norm": 13.501566886901855,
"learning_rate": 8.944854537559704e-06,
"loss": 0.2792,
"step": 3650
},
{
"epoch": 0.32130606573748427,
"grad_norm": 5.546044826507568,
"learning_rate": 8.93038066290346e-06,
"loss": 0.3064,
"step": 3700
},
{
"epoch": 0.3256480395988016,
"grad_norm": 9.782098770141602,
"learning_rate": 8.915906788247214e-06,
"loss": 0.2601,
"step": 3750
},
{
"epoch": 0.32999001346011897,
"grad_norm": 4.493556022644043,
"learning_rate": 8.90143291359097e-06,
"loss": 0.2417,
"step": 3800
},
{
"epoch": 0.3343319873214363,
"grad_norm": 10.674678802490234,
"learning_rate": 8.886959038934724e-06,
"loss": 0.2525,
"step": 3850
},
{
"epoch": 0.33867396118275367,
"grad_norm": 12.951682090759277,
"learning_rate": 8.872485164278478e-06,
"loss": 0.2882,
"step": 3900
},
{
"epoch": 0.343015935044071,
"grad_norm": 15.413273811340332,
"learning_rate": 8.858011289622232e-06,
"loss": 0.268,
"step": 3950
},
{
"epoch": 0.34735790890538837,
"grad_norm": 6.657174110412598,
"learning_rate": 8.843537414965987e-06,
"loss": 0.2952,
"step": 4000
},
{
"epoch": 0.3516998827667057,
"grad_norm": 6.247474670410156,
"learning_rate": 8.82906354030974e-06,
"loss": 0.2185,
"step": 4050
},
{
"epoch": 0.3560418566280231,
"grad_norm": 14.013585090637207,
"learning_rate": 8.814589665653496e-06,
"loss": 0.2911,
"step": 4100
},
{
"epoch": 0.3603838304893405,
"grad_norm": 4.614453315734863,
"learning_rate": 8.80011579099725e-06,
"loss": 0.2627,
"step": 4150
},
{
"epoch": 0.3647258043506578,
"grad_norm": 18.471010208129883,
"learning_rate": 8.785641916341006e-06,
"loss": 0.2986,
"step": 4200
},
{
"epoch": 0.3690677782119752,
"grad_norm": 20.08515167236328,
"learning_rate": 8.771457519177886e-06,
"loss": 0.2659,
"step": 4250
},
{
"epoch": 0.3734097520732925,
"grad_norm": 11.523843765258789,
"learning_rate": 8.75698364452164e-06,
"loss": 0.2879,
"step": 4300
},
{
"epoch": 0.37775172593460987,
"grad_norm": 12.521258354187012,
"learning_rate": 8.742509769865394e-06,
"loss": 0.2479,
"step": 4350
},
{
"epoch": 0.3820936997959272,
"grad_norm": 4.8445587158203125,
"learning_rate": 8.728035895209148e-06,
"loss": 0.3093,
"step": 4400
},
{
"epoch": 0.38643567365724457,
"grad_norm": 16.926706314086914,
"learning_rate": 8.713562020552902e-06,
"loss": 0.2764,
"step": 4450
},
{
"epoch": 0.3907776475185619,
"grad_norm": 12.894941329956055,
"learning_rate": 8.699088145896656e-06,
"loss": 0.326,
"step": 4500
},
{
"epoch": 0.39511962137987927,
"grad_norm": 7.567624568939209,
"learning_rate": 8.684614271240412e-06,
"loss": 0.2442,
"step": 4550
},
{
"epoch": 0.3994615952411966,
"grad_norm": 0.5340330600738525,
"learning_rate": 8.670140396584166e-06,
"loss": 0.2505,
"step": 4600
},
{
"epoch": 0.403803569102514,
"grad_norm": 2.206448554992676,
"learning_rate": 8.655666521927922e-06,
"loss": 0.2872,
"step": 4650
},
{
"epoch": 0.4081455429638314,
"grad_norm": 3.9290964603424072,
"learning_rate": 8.641192647271676e-06,
"loss": 0.264,
"step": 4700
},
{
"epoch": 0.4124875168251487,
"grad_norm": 10.728494644165039,
"learning_rate": 8.62671877261543e-06,
"loss": 0.3088,
"step": 4750
},
{
"epoch": 0.4168294906864661,
"grad_norm": 1.5482255220413208,
"learning_rate": 8.612244897959184e-06,
"loss": 0.2577,
"step": 4800
},
{
"epoch": 0.4211714645477834,
"grad_norm": 12.54301929473877,
"learning_rate": 8.597771023302938e-06,
"loss": 0.2649,
"step": 4850
},
{
"epoch": 0.4255134384091008,
"grad_norm": 8.942083358764648,
"learning_rate": 8.583297148646693e-06,
"loss": 0.3254,
"step": 4900
},
{
"epoch": 0.4298554122704181,
"grad_norm": 6.604653358459473,
"learning_rate": 8.568823273990448e-06,
"loss": 0.3005,
"step": 4950
},
{
"epoch": 0.4341973861317355,
"grad_norm": 4.268127918243408,
"learning_rate": 8.554349399334202e-06,
"loss": 0.2745,
"step": 5000
},
{
"epoch": 0.4385393599930528,
"grad_norm": 6.471729755401611,
"learning_rate": 8.539875524677957e-06,
"loss": 0.2735,
"step": 5050
},
{
"epoch": 0.4428813338543702,
"grad_norm": 15.852076530456543,
"learning_rate": 8.525401650021712e-06,
"loss": 0.3188,
"step": 5100
},
{
"epoch": 0.4472233077156876,
"grad_norm": 9.784584999084473,
"learning_rate": 8.510927775365467e-06,
"loss": 0.2664,
"step": 5150
},
{
"epoch": 0.45156528157700493,
"grad_norm": 7.76936674118042,
"learning_rate": 8.49645390070922e-06,
"loss": 0.2571,
"step": 5200
},
{
"epoch": 0.4559072554383223,
"grad_norm": 11.905091285705566,
"learning_rate": 8.481980026052975e-06,
"loss": 0.3088,
"step": 5250
},
{
"epoch": 0.46024922929963963,
"grad_norm": 7.778932094573975,
"learning_rate": 8.467506151396729e-06,
"loss": 0.2754,
"step": 5300
},
{
"epoch": 0.464591203160957,
"grad_norm": 7.64968729019165,
"learning_rate": 8.453032276740483e-06,
"loss": 0.253,
"step": 5350
},
{
"epoch": 0.46893317702227433,
"grad_norm": 7.020500183105469,
"learning_rate": 8.438558402084239e-06,
"loss": 0.2032,
"step": 5400
},
{
"epoch": 0.4732751508835917,
"grad_norm": 15.488313674926758,
"learning_rate": 8.424084527427993e-06,
"loss": 0.2675,
"step": 5450
},
{
"epoch": 0.477617124744909,
"grad_norm": 6.211950302124023,
"learning_rate": 8.409610652771749e-06,
"loss": 0.2203,
"step": 5500
},
{
"epoch": 0.4819590986062264,
"grad_norm": 11.131321907043457,
"learning_rate": 8.395136778115503e-06,
"loss": 0.323,
"step": 5550
},
{
"epoch": 0.4863010724675437,
"grad_norm": 7.9977126121521,
"learning_rate": 8.380662903459257e-06,
"loss": 0.2231,
"step": 5600
},
{
"epoch": 0.4906430463288611,
"grad_norm": 10.198436737060547,
"learning_rate": 8.366189028803011e-06,
"loss": 0.2465,
"step": 5650
},
{
"epoch": 0.4949850201901785,
"grad_norm": 14.98998737335205,
"learning_rate": 8.351715154146765e-06,
"loss": 0.2235,
"step": 5700
},
{
"epoch": 0.49932699405149583,
"grad_norm": 13.291168212890625,
"learning_rate": 8.33724127949052e-06,
"loss": 0.2082,
"step": 5750
},
{
"epoch": 0.5036689679128131,
"grad_norm": 11.353480339050293,
"learning_rate": 8.322767404834275e-06,
"loss": 0.2813,
"step": 5800
},
{
"epoch": 0.5080109417741305,
"grad_norm": 4.9565815925598145,
"learning_rate": 8.308293530178029e-06,
"loss": 0.2223,
"step": 5850
},
{
"epoch": 0.5123529156354478,
"grad_norm": 10.184609413146973,
"learning_rate": 8.293819655521785e-06,
"loss": 0.2622,
"step": 5900
},
{
"epoch": 0.5166948894967652,
"grad_norm": 11.807438850402832,
"learning_rate": 8.279345780865539e-06,
"loss": 0.221,
"step": 5950
},
{
"epoch": 0.5210368633580826,
"grad_norm": 10.200774192810059,
"learning_rate": 8.264871906209293e-06,
"loss": 0.2987,
"step": 6000
},
{
"epoch": 0.5253788372193999,
"grad_norm": 12.71509838104248,
"learning_rate": 8.250398031553047e-06,
"loss": 0.281,
"step": 6050
},
{
"epoch": 0.5297208110807173,
"grad_norm": 6.515493392944336,
"learning_rate": 8.235924156896801e-06,
"loss": 0.2731,
"step": 6100
},
{
"epoch": 0.5340627849420346,
"grad_norm": 12.207219123840332,
"learning_rate": 8.221450282240555e-06,
"loss": 0.1996,
"step": 6150
},
{
"epoch": 0.538404758803352,
"grad_norm": 14.700127601623535,
"learning_rate": 8.206976407584311e-06,
"loss": 0.2901,
"step": 6200
},
{
"epoch": 0.5427467326646693,
"grad_norm": 13.294961929321289,
"learning_rate": 8.192502532928065e-06,
"loss": 0.2529,
"step": 6250
},
{
"epoch": 0.5470887065259867,
"grad_norm": 7.7809295654296875,
"learning_rate": 8.178028658271821e-06,
"loss": 0.2201,
"step": 6300
},
{
"epoch": 0.551430680387304,
"grad_norm": 9.105611801147461,
"learning_rate": 8.163554783615575e-06,
"loss": 0.2144,
"step": 6350
},
{
"epoch": 0.5557726542486214,
"grad_norm": 9.768695831298828,
"learning_rate": 8.149370386452455e-06,
"loss": 0.2631,
"step": 6400
},
{
"epoch": 0.5601146281099387,
"grad_norm": 15.364813804626465,
"learning_rate": 8.134896511796209e-06,
"loss": 0.2801,
"step": 6450
},
{
"epoch": 0.5644566019712561,
"grad_norm": 11.26169204711914,
"learning_rate": 8.120422637139963e-06,
"loss": 0.2521,
"step": 6500
},
{
"epoch": 0.5687985758325735,
"grad_norm": 14.16651725769043,
"learning_rate": 8.105948762483717e-06,
"loss": 0.2911,
"step": 6550
},
{
"epoch": 0.5731405496938908,
"grad_norm": 7.423428535461426,
"learning_rate": 8.091474887827471e-06,
"loss": 0.2962,
"step": 6600
},
{
"epoch": 0.5774825235552082,
"grad_norm": 12.954950332641602,
"learning_rate": 8.077001013171227e-06,
"loss": 0.2628,
"step": 6650
},
{
"epoch": 0.5818244974165255,
"grad_norm": 0.6374977827072144,
"learning_rate": 8.062527138514981e-06,
"loss": 0.2317,
"step": 6700
},
{
"epoch": 0.5861664712778429,
"grad_norm": 14.547224998474121,
"learning_rate": 8.048053263858737e-06,
"loss": 0.2576,
"step": 6750
},
{
"epoch": 0.5905084451391602,
"grad_norm": 15.81212043762207,
"learning_rate": 8.033579389202491e-06,
"loss": 0.2389,
"step": 6800
},
{
"epoch": 0.5948504190004776,
"grad_norm": 6.094769477844238,
"learning_rate": 8.019105514546245e-06,
"loss": 0.2705,
"step": 6850
},
{
"epoch": 0.5991923928617949,
"grad_norm": 12.700167655944824,
"learning_rate": 8.00463163989e-06,
"loss": 0.2271,
"step": 6900
},
{
"epoch": 0.6035343667231123,
"grad_norm": 9.446319580078125,
"learning_rate": 7.990157765233753e-06,
"loss": 0.1957,
"step": 6950
},
{
"epoch": 0.6078763405844296,
"grad_norm": 10.391845703125,
"learning_rate": 7.975683890577507e-06,
"loss": 0.2764,
"step": 7000
},
{
"epoch": 0.612218314445747,
"grad_norm": 0.8525072932243347,
"learning_rate": 7.961210015921263e-06,
"loss": 0.2413,
"step": 7050
},
{
"epoch": 0.6165602883070644,
"grad_norm": 13.354928970336914,
"learning_rate": 7.946736141265017e-06,
"loss": 0.2644,
"step": 7100
},
{
"epoch": 0.6209022621683817,
"grad_norm": 10.913607597351074,
"learning_rate": 7.932262266608773e-06,
"loss": 0.2478,
"step": 7150
},
{
"epoch": 0.6252442360296991,
"grad_norm": 10.41576099395752,
"learning_rate": 7.917788391952527e-06,
"loss": 0.2523,
"step": 7200
},
{
"epoch": 0.6295862098910164,
"grad_norm": 9.93879508972168,
"learning_rate": 7.903314517296281e-06,
"loss": 0.2313,
"step": 7250
},
{
"epoch": 0.6339281837523338,
"grad_norm": 11.395098686218262,
"learning_rate": 7.888840642640035e-06,
"loss": 0.2674,
"step": 7300
},
{
"epoch": 0.6382701576136511,
"grad_norm": 13.613016128540039,
"learning_rate": 7.87436676798379e-06,
"loss": 0.2442,
"step": 7350
},
{
"epoch": 0.6426121314749685,
"grad_norm": 7.089151859283447,
"learning_rate": 7.859892893327544e-06,
"loss": 0.2313,
"step": 7400
},
{
"epoch": 0.6469541053362858,
"grad_norm": 9.300673484802246,
"learning_rate": 7.845419018671298e-06,
"loss": 0.2631,
"step": 7450
},
{
"epoch": 0.6512960791976032,
"grad_norm": 12.743701934814453,
"learning_rate": 7.830945144015054e-06,
"loss": 0.2909,
"step": 7500
},
{
"epoch": 0.6556380530589205,
"grad_norm": 7.693624019622803,
"learning_rate": 7.816471269358808e-06,
"loss": 0.3017,
"step": 7550
},
{
"epoch": 0.6599800269202379,
"grad_norm": 10.756463050842285,
"learning_rate": 7.801997394702564e-06,
"loss": 0.2438,
"step": 7600
},
{
"epoch": 0.6643220007815553,
"grad_norm": 9.87155818939209,
"learning_rate": 7.787523520046318e-06,
"loss": 0.2453,
"step": 7650
},
{
"epoch": 0.6686639746428726,
"grad_norm": 4.115950107574463,
"learning_rate": 7.773049645390072e-06,
"loss": 0.2131,
"step": 7700
},
{
"epoch": 0.67300594850419,
"grad_norm": 0.4190388023853302,
"learning_rate": 7.758575770733826e-06,
"loss": 0.2383,
"step": 7750
},
{
"epoch": 0.6773479223655073,
"grad_norm": 10.512711524963379,
"learning_rate": 7.74410189607758e-06,
"loss": 0.2181,
"step": 7800
},
{
"epoch": 0.6816898962268247,
"grad_norm": 9.304518699645996,
"learning_rate": 7.729628021421334e-06,
"loss": 0.2388,
"step": 7850
},
{
"epoch": 0.686031870088142,
"grad_norm": 6.525651931762695,
"learning_rate": 7.71515414676509e-06,
"loss": 0.2645,
"step": 7900
},
{
"epoch": 0.6903738439494594,
"grad_norm": 9.819262504577637,
"learning_rate": 7.700680272108844e-06,
"loss": 0.247,
"step": 7950
},
{
"epoch": 0.6947158178107767,
"grad_norm": 14.438730239868164,
"learning_rate": 7.6862063974526e-06,
"loss": 0.2389,
"step": 8000
},
{
"epoch": 0.6990577916720941,
"grad_norm": 12.934776306152344,
"learning_rate": 7.671732522796354e-06,
"loss": 0.2695,
"step": 8050
},
{
"epoch": 0.7033997655334114,
"grad_norm": 10.648255348205566,
"learning_rate": 7.657258648140108e-06,
"loss": 0.2091,
"step": 8100
},
{
"epoch": 0.7077417393947288,
"grad_norm": 7.91797399520874,
"learning_rate": 7.642784773483862e-06,
"loss": 0.2358,
"step": 8150
},
{
"epoch": 0.7120837132560462,
"grad_norm": 8.622334480285645,
"learning_rate": 7.628310898827617e-06,
"loss": 0.2107,
"step": 8200
},
{
"epoch": 0.7164256871173635,
"grad_norm": 10.815421104431152,
"learning_rate": 7.613837024171371e-06,
"loss": 0.2201,
"step": 8250
},
{
"epoch": 0.720767660978681,
"grad_norm": 12.649157524108887,
"learning_rate": 7.599363149515125e-06,
"loss": 0.258,
"step": 8300
},
{
"epoch": 0.7251096348399982,
"grad_norm": 13.133886337280273,
"learning_rate": 7.58488927485888e-06,
"loss": 0.2818,
"step": 8350
},
{
"epoch": 0.7294516087013156,
"grad_norm": 11.359635353088379,
"learning_rate": 7.570415400202635e-06,
"loss": 0.2722,
"step": 8400
},
{
"epoch": 0.7337935825626329,
"grad_norm": 17.86835479736328,
"learning_rate": 7.555941525546389e-06,
"loss": 0.2276,
"step": 8450
},
{
"epoch": 0.7381355564239503,
"grad_norm": 20.632299423217773,
"learning_rate": 7.541467650890144e-06,
"loss": 0.291,
"step": 8500
},
{
"epoch": 0.7424775302852676,
"grad_norm": 8.320937156677246,
"learning_rate": 7.526993776233898e-06,
"loss": 0.2789,
"step": 8550
},
{
"epoch": 0.746819504146585,
"grad_norm": 10.321586608886719,
"learning_rate": 7.5125199015776525e-06,
"loss": 0.2535,
"step": 8600
},
{
"epoch": 0.7511614780079023,
"grad_norm": 3.8393821716308594,
"learning_rate": 7.4980460269214074e-06,
"loss": 0.2108,
"step": 8650
},
{
"epoch": 0.7555034518692197,
"grad_norm": 8.907423973083496,
"learning_rate": 7.4835721522651616e-06,
"loss": 0.2765,
"step": 8700
},
{
"epoch": 0.7598454257305371,
"grad_norm": 3.0093770027160645,
"learning_rate": 7.469098277608916e-06,
"loss": 0.2116,
"step": 8750
},
{
"epoch": 0.7641873995918544,
"grad_norm": 2.4784209728240967,
"learning_rate": 7.4546244029526715e-06,
"loss": 0.2738,
"step": 8800
},
{
"epoch": 0.7685293734531718,
"grad_norm": 4.4596686363220215,
"learning_rate": 7.440150528296426e-06,
"loss": 0.2195,
"step": 8850
},
{
"epoch": 0.7728713473144891,
"grad_norm": 3.5557031631469727,
"learning_rate": 7.4256766536401805e-06,
"loss": 0.2738,
"step": 8900
},
{
"epoch": 0.7772133211758065,
"grad_norm": 10.603437423706055,
"learning_rate": 7.411202778983935e-06,
"loss": 0.2311,
"step": 8950
},
{
"epoch": 0.7815552950371238,
"grad_norm": 14.556783676147461,
"learning_rate": 7.396728904327689e-06,
"loss": 0.2405,
"step": 9000
},
{
"epoch": 0.7858972688984412,
"grad_norm": 10.350156784057617,
"learning_rate": 7.382255029671444e-06,
"loss": 0.2749,
"step": 9050
},
{
"epoch": 0.7902392427597585,
"grad_norm": 8.201991081237793,
"learning_rate": 7.367781155015198e-06,
"loss": 0.2591,
"step": 9100
},
{
"epoch": 0.794581216621076,
"grad_norm": 2.724789619445801,
"learning_rate": 7.353307280358952e-06,
"loss": 0.2934,
"step": 9150
},
{
"epoch": 0.7989231904823932,
"grad_norm": 5.613165855407715,
"learning_rate": 7.338833405702708e-06,
"loss": 0.2337,
"step": 9200
},
{
"epoch": 0.8032651643437106,
"grad_norm": 3.3908209800720215,
"learning_rate": 7.324359531046462e-06,
"loss": 0.2413,
"step": 9250
},
{
"epoch": 0.807607138205028,
"grad_norm": 9.619438171386719,
"learning_rate": 7.309885656390217e-06,
"loss": 0.2663,
"step": 9300
},
{
"epoch": 0.8119491120663453,
"grad_norm": 6.928030967712402,
"learning_rate": 7.295411781733971e-06,
"loss": 0.2713,
"step": 9350
},
{
"epoch": 0.8162910859276628,
"grad_norm": 11.707024574279785,
"learning_rate": 7.280937907077725e-06,
"loss": 0.2465,
"step": 9400
},
{
"epoch": 0.82063305978898,
"grad_norm": 9.797958374023438,
"learning_rate": 7.26646403242148e-06,
"loss": 0.2931,
"step": 9450
},
{
"epoch": 0.8249750336502975,
"grad_norm": 11.354745864868164,
"learning_rate": 7.251990157765234e-06,
"loss": 0.1794,
"step": 9500
},
{
"epoch": 0.8293170075116147,
"grad_norm": 13.923330307006836,
"learning_rate": 7.237516283108988e-06,
"loss": 0.2276,
"step": 9550
},
{
"epoch": 0.8336589813729322,
"grad_norm": 6.454158306121826,
"learning_rate": 7.223042408452743e-06,
"loss": 0.2378,
"step": 9600
},
{
"epoch": 0.8380009552342494,
"grad_norm": 6.767538547515869,
"learning_rate": 7.208568533796498e-06,
"loss": 0.2194,
"step": 9650
},
{
"epoch": 0.8423429290955669,
"grad_norm": 4.610179424285889,
"learning_rate": 7.194094659140253e-06,
"loss": 0.1802,
"step": 9700
},
{
"epoch": 0.8466849029568841,
"grad_norm": 6.4949870109558105,
"learning_rate": 7.179620784484007e-06,
"loss": 0.2525,
"step": 9750
},
{
"epoch": 0.8510268768182015,
"grad_norm": 6.070311069488525,
"learning_rate": 7.165146909827761e-06,
"loss": 0.2412,
"step": 9800
},
{
"epoch": 0.855368850679519,
"grad_norm": 11.405903816223145,
"learning_rate": 7.150673035171516e-06,
"loss": 0.2727,
"step": 9850
},
{
"epoch": 0.8597108245408362,
"grad_norm": 13.383929252624512,
"learning_rate": 7.13619916051527e-06,
"loss": 0.2882,
"step": 9900
},
{
"epoch": 0.8640527984021537,
"grad_norm": 13.404510498046875,
"learning_rate": 7.1217252858590245e-06,
"loss": 0.2219,
"step": 9950
},
{
"epoch": 0.868394772263471,
"grad_norm": 9.866695404052734,
"learning_rate": 7.107251411202779e-06,
"loss": 0.2302,
"step": 10000
},
{
"epoch": 0.8727367461247884,
"grad_norm": 13.462258338928223,
"learning_rate": 7.092777536546534e-06,
"loss": 0.244,
"step": 10050
},
{
"epoch": 0.8770787199861056,
"grad_norm": 4.718477249145508,
"learning_rate": 7.078303661890289e-06,
"loss": 0.2321,
"step": 10100
},
{
"epoch": 0.881420693847423,
"grad_norm": 6.899251937866211,
"learning_rate": 7.0638297872340434e-06,
"loss": 0.2126,
"step": 10150
},
{
"epoch": 0.8857626677087403,
"grad_norm": 7.1671576499938965,
"learning_rate": 7.0493559125777976e-06,
"loss": 0.2159,
"step": 10200
},
{
"epoch": 0.8901046415700578,
"grad_norm": 13.115740776062012,
"learning_rate": 7.0348820379215525e-06,
"loss": 0.244,
"step": 10250
},
{
"epoch": 0.8944466154313752,
"grad_norm": 12.494568824768066,
"learning_rate": 7.020408163265307e-06,
"loss": 0.2641,
"step": 10300
},
{
"epoch": 0.8987885892926925,
"grad_norm": 9.132339477539062,
"learning_rate": 7.005934288609061e-06,
"loss": 0.2483,
"step": 10350
},
{
"epoch": 0.9031305631540099,
"grad_norm": 8.740550994873047,
"learning_rate": 6.991460413952816e-06,
"loss": 0.1968,
"step": 10400
},
{
"epoch": 0.9074725370153272,
"grad_norm": 6.999782562255859,
"learning_rate": 6.976986539296571e-06,
"loss": 0.2513,
"step": 10450
},
{
"epoch": 0.9118145108766446,
"grad_norm": 10.487621307373047,
"learning_rate": 6.96280214213345e-06,
"loss": 0.2685,
"step": 10500
},
{
"epoch": 0.9161564847379619,
"grad_norm": 10.34904670715332,
"learning_rate": 6.948328267477204e-06,
"loss": 0.2173,
"step": 10550
},
{
"epoch": 0.9204984585992793,
"grad_norm": 10.263396263122559,
"learning_rate": 6.933854392820959e-06,
"loss": 0.2394,
"step": 10600
},
{
"epoch": 0.9248404324605966,
"grad_norm": 7.749691963195801,
"learning_rate": 6.919380518164713e-06,
"loss": 0.3045,
"step": 10650
},
{
"epoch": 0.929182406321914,
"grad_norm": 7.5561017990112305,
"learning_rate": 6.904906643508467e-06,
"loss": 0.2006,
"step": 10700
},
{
"epoch": 0.9335243801832313,
"grad_norm": 9.14453125,
"learning_rate": 6.890432768852222e-06,
"loss": 0.246,
"step": 10750
},
{
"epoch": 0.9378663540445487,
"grad_norm": 12.95663833618164,
"learning_rate": 6.875958894195976e-06,
"loss": 0.2269,
"step": 10800
},
{
"epoch": 0.9422083279058661,
"grad_norm": 7.384112358093262,
"learning_rate": 6.8614850195397305e-06,
"loss": 0.2313,
"step": 10850
},
{
"epoch": 0.9465503017671834,
"grad_norm": 11.830831527709961,
"learning_rate": 6.847011144883486e-06,
"loss": 0.2247,
"step": 10900
},
{
"epoch": 0.9508922756285008,
"grad_norm": 11.668469429016113,
"learning_rate": 6.832826747720366e-06,
"loss": 0.3188,
"step": 10950
},
{
"epoch": 0.955234249489818,
"grad_norm": 3.9130442142486572,
"learning_rate": 6.81835287306412e-06,
"loss": 0.2325,
"step": 11000
},
{
"epoch": 0.9595762233511355,
"grad_norm": 5.120070457458496,
"learning_rate": 6.803878998407875e-06,
"loss": 0.2465,
"step": 11050
},
{
"epoch": 0.9639181972124528,
"grad_norm": 8.14548397064209,
"learning_rate": 6.789405123751629e-06,
"loss": 0.1962,
"step": 11100
},
{
"epoch": 0.9682601710737702,
"grad_norm": 8.161116600036621,
"learning_rate": 6.774931249095383e-06,
"loss": 0.2031,
"step": 11150
},
{
"epoch": 0.9726021449350875,
"grad_norm": 6.700616359710693,
"learning_rate": 6.760457374439138e-06,
"loss": 0.2406,
"step": 11200
},
{
"epoch": 0.9769441187964049,
"grad_norm": 9.01518440246582,
"learning_rate": 6.745983499782892e-06,
"loss": 0.2324,
"step": 11250
},
{
"epoch": 0.9812860926577222,
"grad_norm": 8.013750076293945,
"learning_rate": 6.731509625126646e-06,
"loss": 0.2223,
"step": 11300
},
{
"epoch": 0.9856280665190396,
"grad_norm": 3.7864232063293457,
"learning_rate": 6.717035750470402e-06,
"loss": 0.253,
"step": 11350
},
{
"epoch": 0.989970040380357,
"grad_norm": 10.278191566467285,
"learning_rate": 6.702561875814156e-06,
"loss": 0.2362,
"step": 11400
},
{
"epoch": 0.9943120142416743,
"grad_norm": 6.585808753967285,
"learning_rate": 6.688088001157911e-06,
"loss": 0.2489,
"step": 11450
},
{
"epoch": 0.9986539881029917,
"grad_norm": 7.475987911224365,
"learning_rate": 6.673614126501665e-06,
"loss": 0.2333,
"step": 11500
},
{
"epoch": 0.9999565802613868,
"eval_loss": 0.22973021864891052,
"eval_runtime": 158.8031,
"eval_samples_per_second": 64.457,
"eval_steps_per_second": 16.114,
"step": 11515
},
{
"epoch": 1.002995961964309,
"grad_norm": 2.4895355701446533,
"learning_rate": 6.659140251845419e-06,
"loss": 0.1887,
"step": 11550
},
{
"epoch": 1.0073379358256263,
"grad_norm": 6.796756267547607,
"learning_rate": 6.644666377189174e-06,
"loss": 0.175,
"step": 11600
},
{
"epoch": 1.0116799096869438,
"grad_norm": 6.118704319000244,
"learning_rate": 6.630192502532928e-06,
"loss": 0.1286,
"step": 11650
},
{
"epoch": 1.016021883548261,
"grad_norm": 2.102407693862915,
"learning_rate": 6.6157186278766824e-06,
"loss": 0.1484,
"step": 11700
},
{
"epoch": 1.0203638574095784,
"grad_norm": 6.9978156089782715,
"learning_rate": 6.601244753220437e-06,
"loss": 0.1353,
"step": 11750
},
{
"epoch": 1.0247058312708957,
"grad_norm": 12.549331665039062,
"learning_rate": 6.586770878564192e-06,
"loss": 0.1731,
"step": 11800
},
{
"epoch": 1.0290478051322132,
"grad_norm": 0.8738523125648499,
"learning_rate": 6.572297003907947e-06,
"loss": 0.1472,
"step": 11850
},
{
"epoch": 1.0333897789935305,
"grad_norm": 12.091641426086426,
"learning_rate": 6.557823129251701e-06,
"loss": 0.12,
"step": 11900
},
{
"epoch": 1.0377317528548478,
"grad_norm": 4.691054344177246,
"learning_rate": 6.5433492545954555e-06,
"loss": 0.1343,
"step": 11950
},
{
"epoch": 1.0420737267161653,
"grad_norm": 11.15015983581543,
"learning_rate": 6.5288753799392105e-06,
"loss": 0.1486,
"step": 12000
},
{
"epoch": 1.0464157005774826,
"grad_norm": 19.46121597290039,
"learning_rate": 6.514401505282965e-06,
"loss": 0.1872,
"step": 12050
},
{
"epoch": 1.0507576744387999,
"grad_norm": 8.74670696258545,
"learning_rate": 6.499927630626719e-06,
"loss": 0.1351,
"step": 12100
},
{
"epoch": 1.0550996483001172,
"grad_norm": 1.0911568403244019,
"learning_rate": 6.485453755970474e-06,
"loss": 0.1493,
"step": 12150
},
{
"epoch": 1.0594416221614347,
"grad_norm": 15.545437812805176,
"learning_rate": 6.470979881314229e-06,
"loss": 0.1405,
"step": 12200
},
{
"epoch": 1.063783596022752,
"grad_norm": 16.014005661010742,
"learning_rate": 6.4565060066579836e-06,
"loss": 0.1594,
"step": 12250
},
{
"epoch": 1.0681255698840693,
"grad_norm": 10.566229820251465,
"learning_rate": 6.442032132001738e-06,
"loss": 0.1568,
"step": 12300
},
{
"epoch": 1.0724675437453866,
"grad_norm": 5.546818733215332,
"learning_rate": 6.427558257345492e-06,
"loss": 0.162,
"step": 12350
},
{
"epoch": 1.076809517606704,
"grad_norm": 12.857758522033691,
"learning_rate": 6.413084382689247e-06,
"loss": 0.1412,
"step": 12400
},
{
"epoch": 1.0811514914680214,
"grad_norm": 4.669007778167725,
"learning_rate": 6.398610508033001e-06,
"loss": 0.1747,
"step": 12450
},
{
"epoch": 1.0854934653293387,
"grad_norm": 7.152470588684082,
"learning_rate": 6.384136633376755e-06,
"loss": 0.1562,
"step": 12500
},
{
"epoch": 1.0898354391906562,
"grad_norm": 11.267487525939941,
"learning_rate": 6.36966275872051e-06,
"loss": 0.1583,
"step": 12550
},
{
"epoch": 1.0941774130519735,
"grad_norm": 12.031243324279785,
"learning_rate": 6.355188884064265e-06,
"loss": 0.1545,
"step": 12600
},
{
"epoch": 1.0985193869132908,
"grad_norm": 7.165139198303223,
"learning_rate": 6.34071500940802e-06,
"loss": 0.1658,
"step": 12650
},
{
"epoch": 1.102861360774608,
"grad_norm": 9.312381744384766,
"learning_rate": 6.326241134751774e-06,
"loss": 0.1689,
"step": 12700
},
{
"epoch": 1.1072033346359256,
"grad_norm": 6.647734642028809,
"learning_rate": 6.311767260095528e-06,
"loss": 0.149,
"step": 12750
},
{
"epoch": 1.1115453084972429,
"grad_norm": 0.020784372463822365,
"learning_rate": 6.297293385439283e-06,
"loss": 0.1683,
"step": 12800
},
{
"epoch": 1.1158872823585602,
"grad_norm": 8.196714401245117,
"learning_rate": 6.282819510783037e-06,
"loss": 0.1611,
"step": 12850
},
{
"epoch": 1.1202292562198775,
"grad_norm": 12.446895599365234,
"learning_rate": 6.268345636126791e-06,
"loss": 0.1875,
"step": 12900
},
{
"epoch": 1.124571230081195,
"grad_norm": 14.737337112426758,
"learning_rate": 6.253871761470545e-06,
"loss": 0.1379,
"step": 12950
},
{
"epoch": 1.1289132039425123,
"grad_norm": 7.295111179351807,
"learning_rate": 6.239397886814301e-06,
"loss": 0.1062,
"step": 13000
},
{
"epoch": 1.1332551778038296,
"grad_norm": 4.94767427444458,
"learning_rate": 6.224924012158055e-06,
"loss": 0.1716,
"step": 13050
},
{
"epoch": 1.137597151665147,
"grad_norm": 3.360748767852783,
"learning_rate": 6.21045013750181e-06,
"loss": 0.1346,
"step": 13100
},
{
"epoch": 1.1419391255264644,
"grad_norm": 8.84126091003418,
"learning_rate": 6.195976262845564e-06,
"loss": 0.1176,
"step": 13150
},
{
"epoch": 1.1462810993877817,
"grad_norm": 5.007967472076416,
"learning_rate": 6.1815023881893184e-06,
"loss": 0.1754,
"step": 13200
},
{
"epoch": 1.150623073249099,
"grad_norm": 1.278257131576538,
"learning_rate": 6.167028513533073e-06,
"loss": 0.1639,
"step": 13250
},
{
"epoch": 1.1549650471104165,
"grad_norm": 4.407376766204834,
"learning_rate": 6.1525546388768275e-06,
"loss": 0.1605,
"step": 13300
},
{
"epoch": 1.1593070209717338,
"grad_norm": 11.04604721069336,
"learning_rate": 6.138080764220582e-06,
"loss": 0.1222,
"step": 13350
},
{
"epoch": 1.163648994833051,
"grad_norm": 12.066889762878418,
"learning_rate": 6.1236068895643366e-06,
"loss": 0.1046,
"step": 13400
},
{
"epoch": 1.1679909686943684,
"grad_norm": 8.209555625915527,
"learning_rate": 6.1091330149080915e-06,
"loss": 0.1586,
"step": 13450
},
{
"epoch": 1.1723329425556859,
"grad_norm": 7.170156955718994,
"learning_rate": 6.0946591402518465e-06,
"loss": 0.1218,
"step": 13500
},
{
"epoch": 1.1766749164170032,
"grad_norm": 14.487652778625488,
"learning_rate": 6.080185265595601e-06,
"loss": 0.1915,
"step": 13550
},
{
"epoch": 1.1810168902783205,
"grad_norm": 6.8529839515686035,
"learning_rate": 6.065711390939355e-06,
"loss": 0.1129,
"step": 13600
},
{
"epoch": 1.185358864139638,
"grad_norm": 9.348575592041016,
"learning_rate": 6.05123751628311e-06,
"loss": 0.1528,
"step": 13650
},
{
"epoch": 1.1897008380009553,
"grad_norm": 5.676689624786377,
"learning_rate": 6.036763641626864e-06,
"loss": 0.1625,
"step": 13700
},
{
"epoch": 1.1940428118622726,
"grad_norm": 13.035611152648926,
"learning_rate": 6.022289766970618e-06,
"loss": 0.1763,
"step": 13750
},
{
"epoch": 1.1983847857235899,
"grad_norm": 11.174485206604004,
"learning_rate": 6.007815892314373e-06,
"loss": 0.1428,
"step": 13800
},
{
"epoch": 1.2027267595849074,
"grad_norm": 1.6729018688201904,
"learning_rate": 5.993342017658128e-06,
"loss": 0.1718,
"step": 13850
},
{
"epoch": 1.2070687334462247,
"grad_norm": 6.937321662902832,
"learning_rate": 5.978868143001883e-06,
"loss": 0.1085,
"step": 13900
},
{
"epoch": 1.211410707307542,
"grad_norm": 2.6701741218566895,
"learning_rate": 5.964394268345637e-06,
"loss": 0.1343,
"step": 13950
},
{
"epoch": 1.2157526811688593,
"grad_norm": 6.4982380867004395,
"learning_rate": 5.949920393689391e-06,
"loss": 0.1584,
"step": 14000
},
{
"epoch": 1.2200946550301768,
"grad_norm": 8.566705703735352,
"learning_rate": 5.935446519033146e-06,
"loss": 0.1593,
"step": 14050
},
{
"epoch": 1.224436628891494,
"grad_norm": 2.0202713012695312,
"learning_rate": 5.9209726443769e-06,
"loss": 0.1566,
"step": 14100
},
{
"epoch": 1.2287786027528114,
"grad_norm": 5.631886005401611,
"learning_rate": 5.906498769720654e-06,
"loss": 0.1839,
"step": 14150
},
{
"epoch": 1.2331205766141289,
"grad_norm": 7.408547878265381,
"learning_rate": 5.892024895064409e-06,
"loss": 0.1288,
"step": 14200
},
{
"epoch": 1.2374625504754462,
"grad_norm": 3.71185040473938,
"learning_rate": 5.877551020408164e-06,
"loss": 0.1153,
"step": 14250
},
{
"epoch": 1.2418045243367635,
"grad_norm": 5.404465675354004,
"learning_rate": 5.863077145751919e-06,
"loss": 0.1844,
"step": 14300
},
{
"epoch": 1.2461464981980808,
"grad_norm": 12.6980619430542,
"learning_rate": 5.848603271095673e-06,
"loss": 0.1846,
"step": 14350
},
{
"epoch": 1.2504884720593983,
"grad_norm": 0.8423546552658081,
"learning_rate": 5.834129396439427e-06,
"loss": 0.135,
"step": 14400
},
{
"epoch": 1.2548304459207156,
"grad_norm": 5.374547004699707,
"learning_rate": 5.819655521783182e-06,
"loss": 0.1453,
"step": 14450
},
{
"epoch": 1.2591724197820329,
"grad_norm": 7.896478176116943,
"learning_rate": 5.805181647126936e-06,
"loss": 0.1445,
"step": 14500
},
{
"epoch": 1.2635143936433502,
"grad_norm": 5.153639793395996,
"learning_rate": 5.79070777247069e-06,
"loss": 0.1264,
"step": 14550
},
{
"epoch": 1.2678563675046677,
"grad_norm": 7.638331413269043,
"learning_rate": 5.776233897814445e-06,
"loss": 0.1268,
"step": 14600
},
{
"epoch": 1.272198341365985,
"grad_norm": 5.552703380584717,
"learning_rate": 5.7617600231581995e-06,
"loss": 0.1677,
"step": 14650
},
{
"epoch": 1.2765403152273023,
"grad_norm": 8.722034454345703,
"learning_rate": 5.747286148501955e-06,
"loss": 0.1498,
"step": 14700
},
{
"epoch": 1.2808822890886198,
"grad_norm": 5.28520393371582,
"learning_rate": 5.732812273845709e-06,
"loss": 0.1263,
"step": 14750
},
{
"epoch": 1.285224262949937,
"grad_norm": 0.4377736747264862,
"learning_rate": 5.7183383991894635e-06,
"loss": 0.1699,
"step": 14800
},
{
"epoch": 1.2895662368112544,
"grad_norm": 0.9347131848335266,
"learning_rate": 5.7038645245332185e-06,
"loss": 0.1209,
"step": 14850
},
{
"epoch": 1.2939082106725717,
"grad_norm": 9.414371490478516,
"learning_rate": 5.6893906498769726e-06,
"loss": 0.1452,
"step": 14900
},
{
"epoch": 1.2982501845338892,
"grad_norm": 8.528615951538086,
"learning_rate": 5.674916775220727e-06,
"loss": 0.1239,
"step": 14950
},
{
"epoch": 1.3025921583952065,
"grad_norm": 7.305627346038818,
"learning_rate": 5.660442900564481e-06,
"loss": 0.1117,
"step": 15000
},
{
"epoch": 1.3069341322565238,
"grad_norm": 4.432664394378662,
"learning_rate": 5.645969025908236e-06,
"loss": 0.1666,
"step": 15050
},
{
"epoch": 1.311276106117841,
"grad_norm": 8.12741756439209,
"learning_rate": 5.631495151251991e-06,
"loss": 0.1476,
"step": 15100
},
{
"epoch": 1.3156180799791586,
"grad_norm": 0.27312299609184265,
"learning_rate": 5.61731075408887e-06,
"loss": 0.1639,
"step": 15150
},
{
"epoch": 1.3199600538404759,
"grad_norm": 5.976969242095947,
"learning_rate": 5.602836879432625e-06,
"loss": 0.1481,
"step": 15200
},
{
"epoch": 1.3243020277017932,
"grad_norm": 12.17628288269043,
"learning_rate": 5.588363004776379e-06,
"loss": 0.1714,
"step": 15250
},
{
"epoch": 1.3286440015631107,
"grad_norm": 2.3492331504821777,
"learning_rate": 5.573889130120133e-06,
"loss": 0.1696,
"step": 15300
},
{
"epoch": 1.332985975424428,
"grad_norm": 6.673547744750977,
"learning_rate": 5.559415255463888e-06,
"loss": 0.1497,
"step": 15350
},
{
"epoch": 1.3373279492857453,
"grad_norm": 10.461968421936035,
"learning_rate": 5.544941380807642e-06,
"loss": 0.151,
"step": 15400
},
{
"epoch": 1.3416699231470628,
"grad_norm": 5.2127299308776855,
"learning_rate": 5.5304675061513964e-06,
"loss": 0.1359,
"step": 15450
},
{
"epoch": 1.34601189700838,
"grad_norm": 10.900973320007324,
"learning_rate": 5.515993631495151e-06,
"loss": 0.1645,
"step": 15500
},
{
"epoch": 1.3503538708696974,
"grad_norm": 6.777567386627197,
"learning_rate": 5.501519756838906e-06,
"loss": 0.1412,
"step": 15550
},
{
"epoch": 1.3546958447310147,
"grad_norm": 5.665555477142334,
"learning_rate": 5.487045882182661e-06,
"loss": 0.1334,
"step": 15600
},
{
"epoch": 1.359037818592332,
"grad_norm": 6.394235610961914,
"learning_rate": 5.4725720075264154e-06,
"loss": 0.1389,
"step": 15650
},
{
"epoch": 1.3633797924536495,
"grad_norm": 7.386012077331543,
"learning_rate": 5.4580981328701695e-06,
"loss": 0.1534,
"step": 15700
},
{
"epoch": 1.3677217663149668,
"grad_norm": 14.629314422607422,
"learning_rate": 5.4436242582139245e-06,
"loss": 0.1852,
"step": 15750
},
{
"epoch": 1.372063740176284,
"grad_norm": 0.6826161742210388,
"learning_rate": 5.429150383557679e-06,
"loss": 0.1459,
"step": 15800
},
{
"epoch": 1.3764057140376016,
"grad_norm": 9.265501022338867,
"learning_rate": 5.414676508901433e-06,
"loss": 0.1522,
"step": 15850
},
{
"epoch": 1.3807476878989189,
"grad_norm": 6.414170742034912,
"learning_rate": 5.400202634245188e-06,
"loss": 0.1615,
"step": 15900
},
{
"epoch": 1.3850896617602362,
"grad_norm": 8.480489730834961,
"learning_rate": 5.385728759588943e-06,
"loss": 0.1398,
"step": 15950
},
{
"epoch": 1.3894316356215537,
"grad_norm": 8.438610076904297,
"learning_rate": 5.371254884932698e-06,
"loss": 0.1657,
"step": 16000
},
{
"epoch": 1.393773609482871,
"grad_norm": 9.783675193786621,
"learning_rate": 5.356781010276452e-06,
"loss": 0.1656,
"step": 16050
},
{
"epoch": 1.3981155833441883,
"grad_norm": 10.249602317810059,
"learning_rate": 5.342307135620206e-06,
"loss": 0.141,
"step": 16100
},
{
"epoch": 1.4024575572055056,
"grad_norm": 1.1902185678482056,
"learning_rate": 5.327833260963961e-06,
"loss": 0.0986,
"step": 16150
},
{
"epoch": 1.4067995310668229,
"grad_norm": 18.99185562133789,
"learning_rate": 5.313359386307715e-06,
"loss": 0.1236,
"step": 16200
},
{
"epoch": 1.4111415049281404,
"grad_norm": 7.957430839538574,
"learning_rate": 5.298885511651469e-06,
"loss": 0.1832,
"step": 16250
},
{
"epoch": 1.4154834787894577,
"grad_norm": 8.840742111206055,
"learning_rate": 5.284411636995224e-06,
"loss": 0.1175,
"step": 16300
},
{
"epoch": 1.419825452650775,
"grad_norm": 8.315855979919434,
"learning_rate": 5.269937762338978e-06,
"loss": 0.1502,
"step": 16350
},
{
"epoch": 1.4241674265120925,
"grad_norm": 10.64714527130127,
"learning_rate": 5.255463887682734e-06,
"loss": 0.1416,
"step": 16400
},
{
"epoch": 1.4285094003734098,
"grad_norm": 9.392339706420898,
"learning_rate": 5.240990013026488e-06,
"loss": 0.1501,
"step": 16450
},
{
"epoch": 1.432851374234727,
"grad_norm": 6.508515357971191,
"learning_rate": 5.226516138370242e-06,
"loss": 0.1356,
"step": 16500
},
{
"epoch": 1.4371933480960446,
"grad_norm": 12.240583419799805,
"learning_rate": 5.212042263713997e-06,
"loss": 0.1318,
"step": 16550
},
{
"epoch": 1.441535321957362,
"grad_norm": 9.682429313659668,
"learning_rate": 5.197568389057751e-06,
"loss": 0.1392,
"step": 16600
},
{
"epoch": 1.4458772958186792,
"grad_norm": 6.940093517303467,
"learning_rate": 5.183094514401505e-06,
"loss": 0.1288,
"step": 16650
},
{
"epoch": 1.4502192696799965,
"grad_norm": 7.383029937744141,
"learning_rate": 5.16862063974526e-06,
"loss": 0.1638,
"step": 16700
},
{
"epoch": 1.4545612435413138,
"grad_norm": 8.015626907348633,
"learning_rate": 5.154146765089014e-06,
"loss": 0.1337,
"step": 16750
},
{
"epoch": 1.4589032174026313,
"grad_norm": 4.140705108642578,
"learning_rate": 5.13967289043277e-06,
"loss": 0.1572,
"step": 16800
},
{
"epoch": 1.4632451912639486,
"grad_norm": 4.263915061950684,
"learning_rate": 5.125199015776524e-06,
"loss": 0.1591,
"step": 16850
},
{
"epoch": 1.4675871651252659,
"grad_norm": 7.539843559265137,
"learning_rate": 5.110725141120278e-06,
"loss": 0.1663,
"step": 16900
},
{
"epoch": 1.4719291389865834,
"grad_norm": 6.80403470993042,
"learning_rate": 5.096251266464033e-06,
"loss": 0.1443,
"step": 16950
},
{
"epoch": 1.4762711128479007,
"grad_norm": 7.334268569946289,
"learning_rate": 5.081777391807787e-06,
"loss": 0.1345,
"step": 17000
},
{
"epoch": 1.480613086709218,
"grad_norm": 21.538869857788086,
"learning_rate": 5.0673035171515415e-06,
"loss": 0.1439,
"step": 17050
},
{
"epoch": 1.4849550605705355,
"grad_norm": 9.157646179199219,
"learning_rate": 5.0528296424952965e-06,
"loss": 0.1883,
"step": 17100
},
{
"epoch": 1.4892970344318528,
"grad_norm": 10.004467964172363,
"learning_rate": 5.038355767839051e-06,
"loss": 0.1345,
"step": 17150
},
{
"epoch": 1.49363900829317,
"grad_norm": 8.288803100585938,
"learning_rate": 5.0238818931828055e-06,
"loss": 0.1418,
"step": 17200
},
{
"epoch": 1.4979809821544874,
"grad_norm": 6.881669044494629,
"learning_rate": 5.009697496019685e-06,
"loss": 0.141,
"step": 17250
},
{
"epoch": 1.5023229560158047,
"grad_norm": 8.799572944641113,
"learning_rate": 4.995223621363439e-06,
"loss": 0.1546,
"step": 17300
},
{
"epoch": 1.5066649298771222,
"grad_norm": 6.629570960998535,
"learning_rate": 4.980749746707194e-06,
"loss": 0.1725,
"step": 17350
},
{
"epoch": 1.5110069037384395,
"grad_norm": 10.236489295959473,
"learning_rate": 4.966275872050948e-06,
"loss": 0.1267,
"step": 17400
},
{
"epoch": 1.5153488775997568,
"grad_norm": 4.510442733764648,
"learning_rate": 4.951801997394703e-06,
"loss": 0.1569,
"step": 17450
},
{
"epoch": 1.5196908514610743,
"grad_norm": 11.930350303649902,
"learning_rate": 4.937328122738457e-06,
"loss": 0.1326,
"step": 17500
},
{
"epoch": 1.5240328253223916,
"grad_norm": 11.577521324157715,
"learning_rate": 4.922854248082212e-06,
"loss": 0.1689,
"step": 17550
},
{
"epoch": 1.5283747991837089,
"grad_norm": 7.687832832336426,
"learning_rate": 4.908380373425966e-06,
"loss": 0.1633,
"step": 17600
},
{
"epoch": 1.5327167730450264,
"grad_norm": 10.742979049682617,
"learning_rate": 4.893906498769721e-06,
"loss": 0.1631,
"step": 17650
},
{
"epoch": 1.5370587469063435,
"grad_norm": 12.676881790161133,
"learning_rate": 4.879432624113475e-06,
"loss": 0.1466,
"step": 17700
},
{
"epoch": 1.541400720767661,
"grad_norm": 7.963159084320068,
"learning_rate": 4.86495874945723e-06,
"loss": 0.1602,
"step": 17750
},
{
"epoch": 1.5457426946289783,
"grad_norm": 8.404998779296875,
"learning_rate": 4.850484874800984e-06,
"loss": 0.1426,
"step": 17800
},
{
"epoch": 1.5500846684902956,
"grad_norm": 12.583954811096191,
"learning_rate": 4.836011000144739e-06,
"loss": 0.1424,
"step": 17850
},
{
"epoch": 1.554426642351613,
"grad_norm": 10.515045166015625,
"learning_rate": 4.8215371254884934e-06,
"loss": 0.1388,
"step": 17900
},
{
"epoch": 1.5587686162129304,
"grad_norm": 0.09036704897880554,
"learning_rate": 4.807063250832248e-06,
"loss": 0.1354,
"step": 17950
},
{
"epoch": 1.5631105900742477,
"grad_norm": 3.9979803562164307,
"learning_rate": 4.7925893761760025e-06,
"loss": 0.16,
"step": 18000
},
{
"epoch": 1.5674525639355652,
"grad_norm": 10.055344581604004,
"learning_rate": 4.7781155015197575e-06,
"loss": 0.1765,
"step": 18050
},
{
"epoch": 1.5717945377968825,
"grad_norm": 6.515308856964111,
"learning_rate": 4.763641626863512e-06,
"loss": 0.1497,
"step": 18100
},
{
"epoch": 1.5761365116581998,
"grad_norm": 4.9170966148376465,
"learning_rate": 4.749167752207266e-06,
"loss": 0.1309,
"step": 18150
},
{
"epoch": 1.5804784855195173,
"grad_norm": 9.118842124938965,
"learning_rate": 4.734693877551021e-06,
"loss": 0.1611,
"step": 18200
},
{
"epoch": 1.5848204593808344,
"grad_norm": 4.670436859130859,
"learning_rate": 4.720220002894776e-06,
"loss": 0.1671,
"step": 18250
},
{
"epoch": 1.589162433242152,
"grad_norm": 5.8963165283203125,
"learning_rate": 4.70574612823853e-06,
"loss": 0.155,
"step": 18300
},
{
"epoch": 1.5935044071034692,
"grad_norm": 0.9755913019180298,
"learning_rate": 4.691272253582284e-06,
"loss": 0.1451,
"step": 18350
},
{
"epoch": 1.5978463809647865,
"grad_norm": 2.514676570892334,
"learning_rate": 4.676798378926039e-06,
"loss": 0.1874,
"step": 18400
},
{
"epoch": 1.602188354826104,
"grad_norm": 6.621218681335449,
"learning_rate": 4.662324504269794e-06,
"loss": 0.1527,
"step": 18450
},
{
"epoch": 1.6065303286874213,
"grad_norm": 6.083352088928223,
"learning_rate": 4.647850629613548e-06,
"loss": 0.1522,
"step": 18500
},
{
"epoch": 1.6108723025487386,
"grad_norm": 7.968784332275391,
"learning_rate": 4.633376754957302e-06,
"loss": 0.1157,
"step": 18550
},
{
"epoch": 1.615214276410056,
"grad_norm": 3.370387315750122,
"learning_rate": 4.618902880301057e-06,
"loss": 0.1205,
"step": 18600
},
{
"epoch": 1.6195562502713734,
"grad_norm": 7.180332660675049,
"learning_rate": 4.604429005644812e-06,
"loss": 0.1302,
"step": 18650
},
{
"epoch": 1.6238982241326907,
"grad_norm": 0.08418329805135727,
"learning_rate": 4.589955130988566e-06,
"loss": 0.1485,
"step": 18700
},
{
"epoch": 1.6282401979940082,
"grad_norm": 9.862198829650879,
"learning_rate": 4.57548125633232e-06,
"loss": 0.1407,
"step": 18750
},
{
"epoch": 1.6325821718553253,
"grad_norm": 8.546820640563965,
"learning_rate": 4.561007381676075e-06,
"loss": 0.1593,
"step": 18800
},
{
"epoch": 1.6369241457166428,
"grad_norm": 8.208939552307129,
"learning_rate": 4.54653350701983e-06,
"loss": 0.1009,
"step": 18850
},
{
"epoch": 1.64126611957796,
"grad_norm": 9.600082397460938,
"learning_rate": 4.532059632363584e-06,
"loss": 0.1497,
"step": 18900
},
{
"epoch": 1.6456080934392774,
"grad_norm": 6.318692207336426,
"learning_rate": 4.517585757707338e-06,
"loss": 0.1526,
"step": 18950
},
{
"epoch": 1.649950067300595,
"grad_norm": 8.331518173217773,
"learning_rate": 4.503111883051093e-06,
"loss": 0.1279,
"step": 19000
},
{
"epoch": 1.6542920411619122,
"grad_norm": 8.760932922363281,
"learning_rate": 4.488638008394848e-06,
"loss": 0.1117,
"step": 19050
},
{
"epoch": 1.6586340150232295,
"grad_norm": 5.412656784057617,
"learning_rate": 4.474164133738602e-06,
"loss": 0.1405,
"step": 19100
},
{
"epoch": 1.662975988884547,
"grad_norm": 0.18343329429626465,
"learning_rate": 4.459690259082356e-06,
"loss": 0.1385,
"step": 19150
},
{
"epoch": 1.6673179627458643,
"grad_norm": 1.2332340478897095,
"learning_rate": 4.445216384426111e-06,
"loss": 0.1096,
"step": 19200
},
{
"epoch": 1.6716599366071816,
"grad_norm": 8.579971313476562,
"learning_rate": 4.430742509769866e-06,
"loss": 0.1522,
"step": 19250
},
{
"epoch": 1.6760019104684991,
"grad_norm": 11.554192543029785,
"learning_rate": 4.41626863511362e-06,
"loss": 0.1487,
"step": 19300
},
{
"epoch": 1.6803438843298162,
"grad_norm": 4.856355667114258,
"learning_rate": 4.4020842379505e-06,
"loss": 0.1479,
"step": 19350
},
{
"epoch": 1.6846858581911337,
"grad_norm": 2.942544937133789,
"learning_rate": 4.387610363294254e-06,
"loss": 0.1119,
"step": 19400
},
{
"epoch": 1.689027832052451,
"grad_norm": 2.100930690765381,
"learning_rate": 4.373136488638009e-06,
"loss": 0.1388,
"step": 19450
},
{
"epoch": 1.6933698059137683,
"grad_norm": 0.6170673370361328,
"learning_rate": 4.358662613981764e-06,
"loss": 0.1314,
"step": 19500
},
{
"epoch": 1.6977117797750858,
"grad_norm": 4.570499420166016,
"learning_rate": 4.344188739325518e-06,
"loss": 0.1695,
"step": 19550
},
{
"epoch": 1.702053753636403,
"grad_norm": 9.56963062286377,
"learning_rate": 4.329714864669272e-06,
"loss": 0.1226,
"step": 19600
},
{
"epoch": 1.7063957274977204,
"grad_norm": 5.0290350914001465,
"learning_rate": 4.315240990013027e-06,
"loss": 0.1508,
"step": 19650
},
{
"epoch": 1.710737701359038,
"grad_norm": 11.994057655334473,
"learning_rate": 4.300767115356781e-06,
"loss": 0.1797,
"step": 19700
},
{
"epoch": 1.7150796752203552,
"grad_norm": 4.463140487670898,
"learning_rate": 4.286293240700536e-06,
"loss": 0.1461,
"step": 19750
},
{
"epoch": 1.7194216490816725,
"grad_norm": 4.990943908691406,
"learning_rate": 4.27181936604429e-06,
"loss": 0.1524,
"step": 19800
},
{
"epoch": 1.72376362294299,
"grad_norm": 6.280303001403809,
"learning_rate": 4.257345491388045e-06,
"loss": 0.1422,
"step": 19850
},
{
"epoch": 1.728105596804307,
"grad_norm": 5.439699172973633,
"learning_rate": 4.242871616731799e-06,
"loss": 0.1045,
"step": 19900
},
{
"epoch": 1.7324475706656246,
"grad_norm": 6.375192165374756,
"learning_rate": 4.228397742075554e-06,
"loss": 0.1001,
"step": 19950
},
{
"epoch": 1.736789544526942,
"grad_norm": 12.014942169189453,
"learning_rate": 4.213923867419308e-06,
"loss": 0.1702,
"step": 20000
},
{
"epoch": 1.7411315183882592,
"grad_norm": 7.028641223907471,
"learning_rate": 4.199449992763062e-06,
"loss": 0.1177,
"step": 20050
},
{
"epoch": 1.7454734922495767,
"grad_norm": 18.258798599243164,
"learning_rate": 4.184976118106817e-06,
"loss": 0.1513,
"step": 20100
},
{
"epoch": 1.749815466110894,
"grad_norm": 10.809378623962402,
"learning_rate": 4.170502243450572e-06,
"loss": 0.138,
"step": 20150
},
{
"epoch": 1.7541574399722113,
"grad_norm": 10.029609680175781,
"learning_rate": 4.156028368794326e-06,
"loss": 0.134,
"step": 20200
},
{
"epoch": 1.7584994138335288,
"grad_norm": 4.14710807800293,
"learning_rate": 4.1415544941380805e-06,
"loss": 0.1448,
"step": 20250
},
{
"epoch": 1.762841387694846,
"grad_norm": 7.409153461456299,
"learning_rate": 4.1270806194818355e-06,
"loss": 0.1332,
"step": 20300
},
{
"epoch": 1.7671833615561634,
"grad_norm": 9.64983081817627,
"learning_rate": 4.1126067448255904e-06,
"loss": 0.1416,
"step": 20350
},
{
"epoch": 1.771525335417481,
"grad_norm": 0.016156112775206566,
"learning_rate": 4.0981328701693445e-06,
"loss": 0.1428,
"step": 20400
},
{
"epoch": 1.775867309278798,
"grad_norm": 6.8022050857543945,
"learning_rate": 4.083658995513099e-06,
"loss": 0.1286,
"step": 20450
},
{
"epoch": 1.7802092831401155,
"grad_norm": 2.9759087562561035,
"learning_rate": 4.069185120856854e-06,
"loss": 0.1372,
"step": 20500
},
{
"epoch": 1.7845512570014328,
"grad_norm": 9.457865715026855,
"learning_rate": 4.0547112462006086e-06,
"loss": 0.1482,
"step": 20550
},
{
"epoch": 1.78889323086275,
"grad_norm": 1.2885161638259888,
"learning_rate": 4.040237371544363e-06,
"loss": 0.1411,
"step": 20600
},
{
"epoch": 1.7932352047240676,
"grad_norm": 8.49577808380127,
"learning_rate": 4.025763496888117e-06,
"loss": 0.1596,
"step": 20650
},
{
"epoch": 1.797577178585385,
"grad_norm": 3.6167070865631104,
"learning_rate": 4.011289622231872e-06,
"loss": 0.1424,
"step": 20700
},
{
"epoch": 1.8019191524467022,
"grad_norm": 2.6539206504821777,
"learning_rate": 3.996815747575627e-06,
"loss": 0.1328,
"step": 20750
},
{
"epoch": 1.8062611263080197,
"grad_norm": 5.883168697357178,
"learning_rate": 3.982341872919381e-06,
"loss": 0.1268,
"step": 20800
},
{
"epoch": 1.810603100169337,
"grad_norm": 6.293718338012695,
"learning_rate": 3.967867998263135e-06,
"loss": 0.1356,
"step": 20850
},
{
"epoch": 1.8149450740306543,
"grad_norm": 7.67991828918457,
"learning_rate": 3.95339412360689e-06,
"loss": 0.1119,
"step": 20900
},
{
"epoch": 1.8192870478919718,
"grad_norm": 9.023368835449219,
"learning_rate": 3.938920248950645e-06,
"loss": 0.1614,
"step": 20950
},
{
"epoch": 1.823629021753289,
"grad_norm": 13.551576614379883,
"learning_rate": 3.924446374294399e-06,
"loss": 0.1653,
"step": 21000
},
{
"epoch": 1.8279709956146064,
"grad_norm": 7.216656684875488,
"learning_rate": 3.909972499638153e-06,
"loss": 0.1446,
"step": 21050
},
{
"epoch": 1.8323129694759237,
"grad_norm": 13.925249099731445,
"learning_rate": 3.895498624981908e-06,
"loss": 0.1075,
"step": 21100
},
{
"epoch": 1.836654943337241,
"grad_norm": 7.241964817047119,
"learning_rate": 3.881024750325663e-06,
"loss": 0.1382,
"step": 21150
},
{
"epoch": 1.8409969171985585,
"grad_norm": 2.9219863414764404,
"learning_rate": 3.866550875669417e-06,
"loss": 0.1132,
"step": 21200
},
{
"epoch": 1.8453388910598758,
"grad_norm": 9.134288787841797,
"learning_rate": 3.852077001013171e-06,
"loss": 0.1412,
"step": 21250
},
{
"epoch": 1.849680864921193,
"grad_norm": 4.604516506195068,
"learning_rate": 3.837603126356926e-06,
"loss": 0.1318,
"step": 21300
},
{
"epoch": 1.8540228387825106,
"grad_norm": 0.3576798737049103,
"learning_rate": 3.823129251700681e-06,
"loss": 0.1255,
"step": 21350
},
{
"epoch": 1.858364812643828,
"grad_norm": 0.4319687485694885,
"learning_rate": 3.80894485453756e-06,
"loss": 0.1328,
"step": 21400
},
{
"epoch": 1.8627067865051452,
"grad_norm": 1.7280389070510864,
"learning_rate": 3.7944709798813146e-06,
"loss": 0.1386,
"step": 21450
},
{
"epoch": 1.8670487603664627,
"grad_norm": 1.2364240884780884,
"learning_rate": 3.7799971052250687e-06,
"loss": 0.1617,
"step": 21500
},
{
"epoch": 1.8713907342277798,
"grad_norm": 10.524025917053223,
"learning_rate": 3.7655232305688237e-06,
"loss": 0.1643,
"step": 21550
},
{
"epoch": 1.8757327080890973,
"grad_norm": 8.031953811645508,
"learning_rate": 3.7510493559125782e-06,
"loss": 0.1748,
"step": 21600
},
{
"epoch": 1.8800746819504146,
"grad_norm": 4.875488758087158,
"learning_rate": 3.7365754812563327e-06,
"loss": 0.1507,
"step": 21650
},
{
"epoch": 1.884416655811732,
"grad_norm": 7.984673976898193,
"learning_rate": 3.722101606600087e-06,
"loss": 0.131,
"step": 21700
},
{
"epoch": 1.8887586296730494,
"grad_norm": 0.16609551012516022,
"learning_rate": 3.7076277319438414e-06,
"loss": 0.13,
"step": 21750
},
{
"epoch": 1.8931006035343667,
"grad_norm": 7.161849498748779,
"learning_rate": 3.6931538572875963e-06,
"loss": 0.1393,
"step": 21800
},
{
"epoch": 1.897442577395684,
"grad_norm": 7.236243724822998,
"learning_rate": 3.678679982631351e-06,
"loss": 0.129,
"step": 21850
},
{
"epoch": 1.9017845512570015,
"grad_norm": 10.374253273010254,
"learning_rate": 3.664206107975105e-06,
"loss": 0.1368,
"step": 21900
},
{
"epoch": 1.9061265251183188,
"grad_norm": 2.0947864055633545,
"learning_rate": 3.6497322333188595e-06,
"loss": 0.1172,
"step": 21950
},
{
"epoch": 1.910468498979636,
"grad_norm": 6.918741703033447,
"learning_rate": 3.6352583586626145e-06,
"loss": 0.1496,
"step": 22000
},
{
"epoch": 1.9148104728409536,
"grad_norm": 7.337772846221924,
"learning_rate": 3.620784484006369e-06,
"loss": 0.1176,
"step": 22050
},
{
"epoch": 1.9191524467022707,
"grad_norm": 0.9349635243415833,
"learning_rate": 3.606310609350123e-06,
"loss": 0.1231,
"step": 22100
},
{
"epoch": 1.9234944205635882,
"grad_norm": 9.466097831726074,
"learning_rate": 3.5918367346938777e-06,
"loss": 0.1161,
"step": 22150
},
{
"epoch": 1.9278363944249055,
"grad_norm": 1.3035846948623657,
"learning_rate": 3.5773628600376326e-06,
"loss": 0.1522,
"step": 22200
},
{
"epoch": 1.9321783682862228,
"grad_norm": 8.847750663757324,
"learning_rate": 3.562888985381387e-06,
"loss": 0.1666,
"step": 22250
},
{
"epoch": 1.9365203421475403,
"grad_norm": 9.673260688781738,
"learning_rate": 3.5484151107251413e-06,
"loss": 0.1778,
"step": 22300
},
{
"epoch": 1.9408623160088576,
"grad_norm": 5.941328048706055,
"learning_rate": 3.533941236068896e-06,
"loss": 0.1482,
"step": 22350
},
{
"epoch": 1.945204289870175,
"grad_norm": 4.996196269989014,
"learning_rate": 3.5194673614126507e-06,
"loss": 0.1375,
"step": 22400
},
{
"epoch": 1.9495462637314924,
"grad_norm": 14.44809627532959,
"learning_rate": 3.504993486756405e-06,
"loss": 0.0934,
"step": 22450
},
{
"epoch": 1.9538882375928097,
"grad_norm": 2.986774444580078,
"learning_rate": 3.4905196121001594e-06,
"loss": 0.0977,
"step": 22500
},
{
"epoch": 1.958230211454127,
"grad_norm": 0.16753756999969482,
"learning_rate": 3.476045737443914e-06,
"loss": 0.1383,
"step": 22550
},
{
"epoch": 1.9625721853154445,
"grad_norm": 9.141763687133789,
"learning_rate": 3.461571862787669e-06,
"loss": 0.1123,
"step": 22600
},
{
"epoch": 1.9669141591767616,
"grad_norm": 11.578398704528809,
"learning_rate": 3.447097988131423e-06,
"loss": 0.1295,
"step": 22650
},
{
"epoch": 1.9712561330380791,
"grad_norm": 6.170989036560059,
"learning_rate": 3.4326241134751775e-06,
"loss": 0.129,
"step": 22700
},
{
"epoch": 1.9755981068993964,
"grad_norm": 8.633371353149414,
"learning_rate": 3.418150238818932e-06,
"loss": 0.1252,
"step": 22750
},
{
"epoch": 1.9799400807607137,
"grad_norm": 0.2691422402858734,
"learning_rate": 3.403676364162687e-06,
"loss": 0.1309,
"step": 22800
},
{
"epoch": 1.9842820546220312,
"grad_norm": 1.046929121017456,
"learning_rate": 3.389202489506441e-06,
"loss": 0.128,
"step": 22850
},
{
"epoch": 1.9886240284833485,
"grad_norm": 9.4624662399292,
"learning_rate": 3.3747286148501957e-06,
"loss": 0.1663,
"step": 22900
},
{
"epoch": 1.9929660023446658,
"grad_norm": 3.3540661334991455,
"learning_rate": 3.36025474019395e-06,
"loss": 0.094,
"step": 22950
},
{
"epoch": 1.9973079762059833,
"grad_norm": 7.1674675941467285,
"learning_rate": 3.345780865537705e-06,
"loss": 0.1405,
"step": 23000
},
{
"epoch": 2.0,
"eval_loss": 0.21683721244335175,
"eval_runtime": 158.5988,
"eval_samples_per_second": 64.54,
"eval_steps_per_second": 16.135,
"step": 23031
},
{
"epoch": 2.0016499500673004,
"grad_norm": 2.219898223876953,
"learning_rate": 3.3313069908814593e-06,
"loss": 0.1019,
"step": 23050
},
{
"epoch": 2.005991923928618,
"grad_norm": 4.620298385620117,
"learning_rate": 3.3168331162252138e-06,
"loss": 0.0865,
"step": 23100
},
{
"epoch": 2.0103338977899354,
"grad_norm": 5.638464450836182,
"learning_rate": 3.3023592415689683e-06,
"loss": 0.0756,
"step": 23150
},
{
"epoch": 2.0146758716512525,
"grad_norm": 4.311452388763428,
"learning_rate": 3.2878853669127224e-06,
"loss": 0.0872,
"step": 23200
},
{
"epoch": 2.01901784551257,
"grad_norm": 10.365550994873047,
"learning_rate": 3.2734114922564774e-06,
"loss": 0.0868,
"step": 23250
},
{
"epoch": 2.0233598193738875,
"grad_norm": 11.352954864501953,
"learning_rate": 3.258937617600232e-06,
"loss": 0.0979,
"step": 23300
},
{
"epoch": 2.0277017932352046,
"grad_norm": 1.122918963432312,
"learning_rate": 3.2444637429439865e-06,
"loss": 0.0736,
"step": 23350
},
{
"epoch": 2.032043767096522,
"grad_norm": 5.547560214996338,
"learning_rate": 3.2299898682877406e-06,
"loss": 0.0936,
"step": 23400
},
{
"epoch": 2.0363857409578396,
"grad_norm": 7.175875663757324,
"learning_rate": 3.2155159936314955e-06,
"loss": 0.0939,
"step": 23450
},
{
"epoch": 2.0407277148191567,
"grad_norm": 7.929403305053711,
"learning_rate": 3.20104211897525e-06,
"loss": 0.1086,
"step": 23500
},
{
"epoch": 2.0450696886804742,
"grad_norm": 2.572625160217285,
"learning_rate": 3.186568244319004e-06,
"loss": 0.1032,
"step": 23550
},
{
"epoch": 2.0494116625417913,
"grad_norm": 9.160089492797852,
"learning_rate": 3.1723838471558836e-06,
"loss": 0.0887,
"step": 23600
},
{
"epoch": 2.053753636403109,
"grad_norm": 1.3796570301055908,
"learning_rate": 3.157909972499638e-06,
"loss": 0.0798,
"step": 23650
},
{
"epoch": 2.0580956102644263,
"grad_norm": 2.670893669128418,
"learning_rate": 3.143436097843393e-06,
"loss": 0.0898,
"step": 23700
},
{
"epoch": 2.0624375841257434,
"grad_norm": 3.4575767517089844,
"learning_rate": 3.1289622231871476e-06,
"loss": 0.0821,
"step": 23750
},
{
"epoch": 2.066779557987061,
"grad_norm": 6.139395236968994,
"learning_rate": 3.1144883485309017e-06,
"loss": 0.1038,
"step": 23800
},
{
"epoch": 2.0711215318483784,
"grad_norm": 1.5725222826004028,
"learning_rate": 3.1000144738746562e-06,
"loss": 0.0722,
"step": 23850
},
{
"epoch": 2.0754635057096955,
"grad_norm": 5.873169422149658,
"learning_rate": 3.085540599218411e-06,
"loss": 0.1012,
"step": 23900
},
{
"epoch": 2.079805479571013,
"grad_norm": 5.5453877449035645,
"learning_rate": 3.0710667245621657e-06,
"loss": 0.0812,
"step": 23950
},
{
"epoch": 2.0841474534323305,
"grad_norm": 8.914033889770508,
"learning_rate": 3.05659284990592e-06,
"loss": 0.0798,
"step": 24000
},
{
"epoch": 2.0884894272936476,
"grad_norm": 1.6803654432296753,
"learning_rate": 3.0421189752496744e-06,
"loss": 0.0793,
"step": 24050
},
{
"epoch": 2.092831401154965,
"grad_norm": 6.029511451721191,
"learning_rate": 3.0276451005934293e-06,
"loss": 0.098,
"step": 24100
},
{
"epoch": 2.097173375016282,
"grad_norm": 1.0954087972640991,
"learning_rate": 3.013171225937184e-06,
"loss": 0.0664,
"step": 24150
},
{
"epoch": 2.1015153488775997,
"grad_norm": 5.141661643981934,
"learning_rate": 2.998697351280938e-06,
"loss": 0.0796,
"step": 24200
},
{
"epoch": 2.1058573227389172,
"grad_norm": 7.69569730758667,
"learning_rate": 2.9842234766246925e-06,
"loss": 0.0926,
"step": 24250
},
{
"epoch": 2.1101992966002343,
"grad_norm": 5.892622947692871,
"learning_rate": 2.9697496019684475e-06,
"loss": 0.0775,
"step": 24300
},
{
"epoch": 2.114541270461552,
"grad_norm": 8.389673233032227,
"learning_rate": 2.955275727312202e-06,
"loss": 0.0766,
"step": 24350
},
{
"epoch": 2.1188832443228693,
"grad_norm": 3.9702394008636475,
"learning_rate": 2.940801852655956e-06,
"loss": 0.0917,
"step": 24400
},
{
"epoch": 2.1232252181841864,
"grad_norm": 4.655553817749023,
"learning_rate": 2.9263279779997106e-06,
"loss": 0.096,
"step": 24450
},
{
"epoch": 2.127567192045504,
"grad_norm": 8.192728996276855,
"learning_rate": 2.9118541033434656e-06,
"loss": 0.1074,
"step": 24500
},
{
"epoch": 2.1319091659068214,
"grad_norm": 1.8472944498062134,
"learning_rate": 2.89738022868722e-06,
"loss": 0.073,
"step": 24550
},
{
"epoch": 2.1362511397681385,
"grad_norm": 13.238656044006348,
"learning_rate": 2.8829063540309742e-06,
"loss": 0.0838,
"step": 24600
},
{
"epoch": 2.140593113629456,
"grad_norm": 5.6969709396362305,
"learning_rate": 2.8684324793747288e-06,
"loss": 0.0664,
"step": 24650
},
{
"epoch": 2.144935087490773,
"grad_norm": 8.73705768585205,
"learning_rate": 2.8539586047184837e-06,
"loss": 0.0668,
"step": 24700
},
{
"epoch": 2.1492770613520906,
"grad_norm": 0.2656320631504059,
"learning_rate": 2.839484730062238e-06,
"loss": 0.085,
"step": 24750
},
{
"epoch": 2.153619035213408,
"grad_norm": 7.568329334259033,
"learning_rate": 2.8250108554059924e-06,
"loss": 0.0902,
"step": 24800
},
{
"epoch": 2.157961009074725,
"grad_norm": 7.81199836730957,
"learning_rate": 2.810536980749747e-06,
"loss": 0.0889,
"step": 24850
},
{
"epoch": 2.1623029829360427,
"grad_norm": 6.70134162902832,
"learning_rate": 2.796063106093502e-06,
"loss": 0.0781,
"step": 24900
},
{
"epoch": 2.1666449567973602,
"grad_norm": 8.440710067749023,
"learning_rate": 2.781589231437256e-06,
"loss": 0.0842,
"step": 24950
},
{
"epoch": 2.1709869306586773,
"grad_norm": 4.488495826721191,
"learning_rate": 2.7671153567810105e-06,
"loss": 0.0983,
"step": 25000
},
{
"epoch": 2.175328904519995,
"grad_norm": 12.057733535766602,
"learning_rate": 2.752641482124765e-06,
"loss": 0.0724,
"step": 25050
},
{
"epoch": 2.1796708783813123,
"grad_norm": 12.238190650939941,
"learning_rate": 2.738167607468519e-06,
"loss": 0.0791,
"step": 25100
},
{
"epoch": 2.1840128522426294,
"grad_norm": 8.079179763793945,
"learning_rate": 2.723693732812274e-06,
"loss": 0.0651,
"step": 25150
},
{
"epoch": 2.188354826103947,
"grad_norm": 9.638249397277832,
"learning_rate": 2.7092198581560286e-06,
"loss": 0.1001,
"step": 25200
},
{
"epoch": 2.192696799965264,
"grad_norm": 0.03366376459598541,
"learning_rate": 2.694745983499783e-06,
"loss": 0.0724,
"step": 25250
},
{
"epoch": 2.1970387738265815,
"grad_norm": 0.43450728058815,
"learning_rate": 2.6802721088435373e-06,
"loss": 0.1011,
"step": 25300
},
{
"epoch": 2.201380747687899,
"grad_norm": 3.758702516555786,
"learning_rate": 2.6657982341872922e-06,
"loss": 0.1216,
"step": 25350
},
{
"epoch": 2.205722721549216,
"grad_norm": 2.247030735015869,
"learning_rate": 2.6513243595310468e-06,
"loss": 0.0606,
"step": 25400
},
{
"epoch": 2.2100646954105336,
"grad_norm": 7.871387004852295,
"learning_rate": 2.6368504848748013e-06,
"loss": 0.0791,
"step": 25450
},
{
"epoch": 2.214406669271851,
"grad_norm": 4.234960079193115,
"learning_rate": 2.6223766102185554e-06,
"loss": 0.0968,
"step": 25500
},
{
"epoch": 2.218748643133168,
"grad_norm": 0.44377923011779785,
"learning_rate": 2.6079027355623104e-06,
"loss": 0.0897,
"step": 25550
},
{
"epoch": 2.2230906169944857,
"grad_norm": 7.014172077178955,
"learning_rate": 2.593428860906065e-06,
"loss": 0.1147,
"step": 25600
},
{
"epoch": 2.2274325908558033,
"grad_norm": 0.3093360960483551,
"learning_rate": 2.578954986249819e-06,
"loss": 0.0793,
"step": 25650
},
{
"epoch": 2.2317745647171203,
"grad_norm": 4.603691577911377,
"learning_rate": 2.5644811115935735e-06,
"loss": 0.106,
"step": 25700
},
{
"epoch": 2.236116538578438,
"grad_norm": 0.18263719975948334,
"learning_rate": 2.5500072369373285e-06,
"loss": 0.0685,
"step": 25750
},
{
"epoch": 2.240458512439755,
"grad_norm": 6.405594348907471,
"learning_rate": 2.535822839774208e-06,
"loss": 0.0661,
"step": 25800
},
{
"epoch": 2.2448004863010724,
"grad_norm": 6.221367359161377,
"learning_rate": 2.5213489651179624e-06,
"loss": 0.0773,
"step": 25850
},
{
"epoch": 2.24914246016239,
"grad_norm": 7.3640289306640625,
"learning_rate": 2.5068750904617165e-06,
"loss": 0.1208,
"step": 25900
},
{
"epoch": 2.253484434023707,
"grad_norm": 2.9754855632781982,
"learning_rate": 2.4924012158054715e-06,
"loss": 0.0656,
"step": 25950
},
{
"epoch": 2.2578264078850245,
"grad_norm": 0.4063643515110016,
"learning_rate": 2.4779273411492256e-06,
"loss": 0.0794,
"step": 26000
},
{
"epoch": 2.262168381746342,
"grad_norm": 11.072209358215332,
"learning_rate": 2.4634534664929806e-06,
"loss": 0.0872,
"step": 26050
},
{
"epoch": 2.266510355607659,
"grad_norm": 5.498048782348633,
"learning_rate": 2.4489795918367347e-06,
"loss": 0.0767,
"step": 26100
},
{
"epoch": 2.2708523294689766,
"grad_norm": 8.235716819763184,
"learning_rate": 2.4345057171804896e-06,
"loss": 0.1209,
"step": 26150
},
{
"epoch": 2.275194303330294,
"grad_norm": 8.56951904296875,
"learning_rate": 2.4200318425242437e-06,
"loss": 0.0847,
"step": 26200
},
{
"epoch": 2.2795362771916112,
"grad_norm": 7.200361728668213,
"learning_rate": 2.4055579678679987e-06,
"loss": 0.0757,
"step": 26250
},
{
"epoch": 2.2838782510529287,
"grad_norm": 0.4996426999568939,
"learning_rate": 2.391084093211753e-06,
"loss": 0.076,
"step": 26300
},
{
"epoch": 2.2882202249142463,
"grad_norm": 2.8761146068573,
"learning_rate": 2.3766102185555078e-06,
"loss": 0.0959,
"step": 26350
},
{
"epoch": 2.2925621987755633,
"grad_norm": 5.218446731567383,
"learning_rate": 2.362136343899262e-06,
"loss": 0.0775,
"step": 26400
},
{
"epoch": 2.296904172636881,
"grad_norm": 1.0682421922683716,
"learning_rate": 2.347662469243017e-06,
"loss": 0.0883,
"step": 26450
},
{
"epoch": 2.301246146498198,
"grad_norm": 2.8506529331207275,
"learning_rate": 2.333188594586771e-06,
"loss": 0.1023,
"step": 26500
},
{
"epoch": 2.3055881203595154,
"grad_norm": 8.659663200378418,
"learning_rate": 2.318714719930526e-06,
"loss": 0.0658,
"step": 26550
},
{
"epoch": 2.309930094220833,
"grad_norm": 7.557945728302002,
"learning_rate": 2.30424084527428e-06,
"loss": 0.0954,
"step": 26600
},
{
"epoch": 2.31427206808215,
"grad_norm": 0.2548428475856781,
"learning_rate": 2.289766970618035e-06,
"loss": 0.0826,
"step": 26650
},
{
"epoch": 2.3186140419434675,
"grad_norm": 3.291508436203003,
"learning_rate": 2.275293095961789e-06,
"loss": 0.0879,
"step": 26700
},
{
"epoch": 2.3229560158047846,
"grad_norm": 5.556234836578369,
"learning_rate": 2.2608192213055436e-06,
"loss": 0.0976,
"step": 26750
},
{
"epoch": 2.327297989666102,
"grad_norm": 1.609104871749878,
"learning_rate": 2.246345346649298e-06,
"loss": 0.0786,
"step": 26800
},
{
"epoch": 2.3316399635274196,
"grad_norm": 5.0335235595703125,
"learning_rate": 2.2318714719930527e-06,
"loss": 0.0728,
"step": 26850
},
{
"epoch": 2.3359819373887367,
"grad_norm": 8.968177795410156,
"learning_rate": 2.217397597336807e-06,
"loss": 0.0847,
"step": 26900
},
{
"epoch": 2.3403239112500542,
"grad_norm": 9.932825088500977,
"learning_rate": 2.2029237226805617e-06,
"loss": 0.1038,
"step": 26950
},
{
"epoch": 2.3446658851113718,
"grad_norm": 2.8147237300872803,
"learning_rate": 2.1884498480243163e-06,
"loss": 0.0825,
"step": 27000
},
{
"epoch": 2.349007858972689,
"grad_norm": 0.979704737663269,
"learning_rate": 2.173975973368071e-06,
"loss": 0.1116,
"step": 27050
},
{
"epoch": 2.3533498328340063,
"grad_norm": 0.11242897063493729,
"learning_rate": 2.1595020987118253e-06,
"loss": 0.0789,
"step": 27100
},
{
"epoch": 2.357691806695324,
"grad_norm": 7.687259674072266,
"learning_rate": 2.14502822405558e-06,
"loss": 0.0723,
"step": 27150
},
{
"epoch": 2.362033780556641,
"grad_norm": 1.6177990436553955,
"learning_rate": 2.1305543493993344e-06,
"loss": 0.093,
"step": 27200
},
{
"epoch": 2.3663757544179584,
"grad_norm": 8.21517562866211,
"learning_rate": 2.116080474743089e-06,
"loss": 0.0951,
"step": 27250
},
{
"epoch": 2.370717728279276,
"grad_norm": 6.920109748840332,
"learning_rate": 2.1016066000868435e-06,
"loss": 0.0914,
"step": 27300
},
{
"epoch": 2.375059702140593,
"grad_norm": 4.636850833892822,
"learning_rate": 2.087132725430598e-06,
"loss": 0.089,
"step": 27350
},
{
"epoch": 2.3794016760019105,
"grad_norm": 4.11488676071167,
"learning_rate": 2.0726588507743525e-06,
"loss": 0.0841,
"step": 27400
},
{
"epoch": 2.383743649863228,
"grad_norm": 4.323261260986328,
"learning_rate": 2.058184976118107e-06,
"loss": 0.0593,
"step": 27450
},
{
"epoch": 2.388085623724545,
"grad_norm": 10.200052261352539,
"learning_rate": 2.0437111014618616e-06,
"loss": 0.0966,
"step": 27500
},
{
"epoch": 2.3924275975858627,
"grad_norm": 12.892340660095215,
"learning_rate": 2.029237226805616e-06,
"loss": 0.0839,
"step": 27550
},
{
"epoch": 2.3967695714471797,
"grad_norm": 0.33158212900161743,
"learning_rate": 2.0147633521493707e-06,
"loss": 0.107,
"step": 27600
},
{
"epoch": 2.4011115453084972,
"grad_norm": 4.843432903289795,
"learning_rate": 2.000289477493125e-06,
"loss": 0.0761,
"step": 27650
},
{
"epoch": 2.4054535191698148,
"grad_norm": 8.558151245117188,
"learning_rate": 1.9858156028368797e-06,
"loss": 0.0931,
"step": 27700
},
{
"epoch": 2.409795493031132,
"grad_norm": 4.9575653076171875,
"learning_rate": 1.9713417281806343e-06,
"loss": 0.0916,
"step": 27750
},
{
"epoch": 2.4141374668924493,
"grad_norm": 0.742382287979126,
"learning_rate": 1.956867853524389e-06,
"loss": 0.0661,
"step": 27800
},
{
"epoch": 2.4184794407537664,
"grad_norm": 5.219630718231201,
"learning_rate": 1.9423939788681433e-06,
"loss": 0.085,
"step": 27850
},
{
"epoch": 2.422821414615084,
"grad_norm": 5.431758403778076,
"learning_rate": 1.927920104211898e-06,
"loss": 0.0806,
"step": 27900
},
{
"epoch": 2.4271633884764015,
"grad_norm": 2.560927391052246,
"learning_rate": 1.913446229555652e-06,
"loss": 0.0749,
"step": 27950
},
{
"epoch": 2.4315053623377185,
"grad_norm": 13.62498950958252,
"learning_rate": 1.898972354899407e-06,
"loss": 0.0777,
"step": 28000
},
{
"epoch": 2.435847336199036,
"grad_norm": 7.314693927764893,
"learning_rate": 1.8844984802431613e-06,
"loss": 0.098,
"step": 28050
},
{
"epoch": 2.4401893100603536,
"grad_norm": 7.054198741912842,
"learning_rate": 1.870024605586916e-06,
"loss": 0.0782,
"step": 28100
},
{
"epoch": 2.4445312839216706,
"grad_norm": 7.161545276641846,
"learning_rate": 1.8555507309306703e-06,
"loss": 0.0716,
"step": 28150
},
{
"epoch": 2.448873257782988,
"grad_norm": 12.497211456298828,
"learning_rate": 1.8410768562744249e-06,
"loss": 0.0766,
"step": 28200
},
{
"epoch": 2.4532152316443057,
"grad_norm": 7.773441314697266,
"learning_rate": 1.8266029816181794e-06,
"loss": 0.1298,
"step": 28250
},
{
"epoch": 2.4575572055056227,
"grad_norm": 2.4828858375549316,
"learning_rate": 1.8121291069619337e-06,
"loss": 0.0958,
"step": 28300
},
{
"epoch": 2.4618991793669402,
"grad_norm": 3.412752628326416,
"learning_rate": 1.7976552323056885e-06,
"loss": 0.0909,
"step": 28350
},
{
"epoch": 2.4662411532282578,
"grad_norm": 9.695392608642578,
"learning_rate": 1.7831813576494428e-06,
"loss": 0.0878,
"step": 28400
},
{
"epoch": 2.470583127089575,
"grad_norm": 8.524370193481445,
"learning_rate": 1.7687074829931975e-06,
"loss": 0.0735,
"step": 28450
},
{
"epoch": 2.4749251009508924,
"grad_norm": 5.256426811218262,
"learning_rate": 1.7542336083369518e-06,
"loss": 0.0939,
"step": 28500
},
{
"epoch": 2.47926707481221,
"grad_norm": 2.187129259109497,
"learning_rate": 1.7397597336807066e-06,
"loss": 0.0881,
"step": 28550
},
{
"epoch": 2.483609048673527,
"grad_norm": 1.136833906173706,
"learning_rate": 1.725285859024461e-06,
"loss": 0.0802,
"step": 28600
},
{
"epoch": 2.4879510225348445,
"grad_norm": 6.4525251388549805,
"learning_rate": 1.7108119843682157e-06,
"loss": 0.1154,
"step": 28650
},
{
"epoch": 2.4922929963961615,
"grad_norm": 7.624198913574219,
"learning_rate": 1.69633810971197e-06,
"loss": 0.0824,
"step": 28700
},
{
"epoch": 2.496634970257479,
"grad_norm": 6.494277000427246,
"learning_rate": 1.6818642350557245e-06,
"loss": 0.0781,
"step": 28750
},
{
"epoch": 2.5009769441187966,
"grad_norm": 0.3975910544395447,
"learning_rate": 1.667390360399479e-06,
"loss": 0.0752,
"step": 28800
},
{
"epoch": 2.5053189179801136,
"grad_norm": 7.682430744171143,
"learning_rate": 1.6529164857432336e-06,
"loss": 0.0882,
"step": 28850
},
{
"epoch": 2.509660891841431,
"grad_norm": 2.4172372817993164,
"learning_rate": 1.6384426110869881e-06,
"loss": 0.0828,
"step": 28900
},
{
"epoch": 2.5140028657027482,
"grad_norm": 1.9863077402114868,
"learning_rate": 1.6239687364307426e-06,
"loss": 0.0655,
"step": 28950
},
{
"epoch": 2.5183448395640657,
"grad_norm": 4.845276355743408,
"learning_rate": 1.6094948617744972e-06,
"loss": 0.0756,
"step": 29000
},
{
"epoch": 2.5226868134253833,
"grad_norm": 6.560926914215088,
"learning_rate": 1.5950209871182517e-06,
"loss": 0.0731,
"step": 29050
},
{
"epoch": 2.5270287872867003,
"grad_norm": 7.662853717803955,
"learning_rate": 1.5805471124620062e-06,
"loss": 0.0728,
"step": 29100
},
{
"epoch": 2.531370761148018,
"grad_norm": 0.009299159049987793,
"learning_rate": 1.5660732378057608e-06,
"loss": 0.0744,
"step": 29150
},
{
"epoch": 2.5357127350093354,
"grad_norm": 7.238016605377197,
"learning_rate": 1.5515993631495153e-06,
"loss": 0.0932,
"step": 29200
},
{
"epoch": 2.5400547088706524,
"grad_norm": 1.8960305452346802,
"learning_rate": 1.5371254884932698e-06,
"loss": 0.0688,
"step": 29250
},
{
"epoch": 2.54439668273197,
"grad_norm": 5.69565486907959,
"learning_rate": 1.5226516138370242e-06,
"loss": 0.0766,
"step": 29300
},
{
"epoch": 2.5487386565932875,
"grad_norm": 12.48017692565918,
"learning_rate": 1.508177739180779e-06,
"loss": 0.0881,
"step": 29350
},
{
"epoch": 2.5530806304546045,
"grad_norm": 1.490441918373108,
"learning_rate": 1.4937038645245332e-06,
"loss": 0.0827,
"step": 29400
},
{
"epoch": 2.557422604315922,
"grad_norm": 0.3555105924606323,
"learning_rate": 1.479229989868288e-06,
"loss": 0.0791,
"step": 29450
},
{
"epoch": 2.5617645781772396,
"grad_norm": 6.290306091308594,
"learning_rate": 1.4647561152120423e-06,
"loss": 0.0771,
"step": 29500
},
{
"epoch": 2.5661065520385566,
"grad_norm": 0.16260845959186554,
"learning_rate": 1.450282240555797e-06,
"loss": 0.0934,
"step": 29550
},
{
"epoch": 2.570448525899874,
"grad_norm": 7.777038097381592,
"learning_rate": 1.4360978433926764e-06,
"loss": 0.0803,
"step": 29600
},
{
"epoch": 2.5747904997611917,
"grad_norm": 0.2070547193288803,
"learning_rate": 1.4216239687364308e-06,
"loss": 0.0856,
"step": 29650
},
{
"epoch": 2.5791324736225087,
"grad_norm": 0.10048296302556992,
"learning_rate": 1.4071500940801855e-06,
"loss": 0.0746,
"step": 29700
},
{
"epoch": 2.5834744474838263,
"grad_norm": 6.591347694396973,
"learning_rate": 1.3926762194239398e-06,
"loss": 0.0788,
"step": 29750
},
{
"epoch": 2.5878164213451433,
"grad_norm": 6.4904866218566895,
"learning_rate": 1.3782023447676946e-06,
"loss": 0.0897,
"step": 29800
},
{
"epoch": 2.592158395206461,
"grad_norm": 3.354762315750122,
"learning_rate": 1.363728470111449e-06,
"loss": 0.0927,
"step": 29850
},
{
"epoch": 2.5965003690677784,
"grad_norm": 2.270613670349121,
"learning_rate": 1.3492545954552036e-06,
"loss": 0.1087,
"step": 29900
},
{
"epoch": 2.6008423429290954,
"grad_norm": 0.7417836785316467,
"learning_rate": 1.334780720798958e-06,
"loss": 0.0904,
"step": 29950
},
{
"epoch": 2.605184316790413,
"grad_norm": 4.791884422302246,
"learning_rate": 1.3203068461427127e-06,
"loss": 0.0761,
"step": 30000
},
{
"epoch": 2.60952629065173,
"grad_norm": 2.9381160736083984,
"learning_rate": 1.305832971486467e-06,
"loss": 0.0886,
"step": 30050
},
{
"epoch": 2.6138682645130475,
"grad_norm": 4.2343926429748535,
"learning_rate": 1.2913590968302213e-06,
"loss": 0.0931,
"step": 30100
},
{
"epoch": 2.618210238374365,
"grad_norm": 1.3249711990356445,
"learning_rate": 1.276885222173976e-06,
"loss": 0.0817,
"step": 30150
},
{
"epoch": 2.622552212235682,
"grad_norm": 5.119120121002197,
"learning_rate": 1.2624113475177304e-06,
"loss": 0.0715,
"step": 30200
},
{
"epoch": 2.6268941860969997,
"grad_norm": 2.266857862472534,
"learning_rate": 1.2479374728614852e-06,
"loss": 0.0702,
"step": 30250
},
{
"epoch": 2.631236159958317,
"grad_norm": 7.146674633026123,
"learning_rate": 1.2334635982052397e-06,
"loss": 0.0732,
"step": 30300
},
{
"epoch": 2.6355781338196342,
"grad_norm": 4.819944858551025,
"learning_rate": 1.2189897235489942e-06,
"loss": 0.0778,
"step": 30350
},
{
"epoch": 2.6399201076809518,
"grad_norm": 2.735713243484497,
"learning_rate": 1.2045158488927488e-06,
"loss": 0.0532,
"step": 30400
},
{
"epoch": 2.6442620815422693,
"grad_norm": 8.688481330871582,
"learning_rate": 1.1900419742365033e-06,
"loss": 0.1277,
"step": 30450
},
{
"epoch": 2.6486040554035863,
"grad_norm": 7.23391056060791,
"learning_rate": 1.1755680995802578e-06,
"loss": 0.0984,
"step": 30500
},
{
"epoch": 2.652946029264904,
"grad_norm": 6.083117961883545,
"learning_rate": 1.1610942249240124e-06,
"loss": 0.0938,
"step": 30550
},
{
"epoch": 2.6572880031262214,
"grad_norm": 9.920032501220703,
"learning_rate": 1.1466203502677669e-06,
"loss": 0.0894,
"step": 30600
},
{
"epoch": 2.6616299769875384,
"grad_norm": 6.3701605796813965,
"learning_rate": 1.1321464756115214e-06,
"loss": 0.0821,
"step": 30650
},
{
"epoch": 2.665971950848856,
"grad_norm": 1.6920627355575562,
"learning_rate": 1.117672600955276e-06,
"loss": 0.0709,
"step": 30700
},
{
"epoch": 2.6703139247101735,
"grad_norm": 1.3545681238174438,
"learning_rate": 1.1031987262990305e-06,
"loss": 0.0866,
"step": 30750
},
{
"epoch": 2.6746558985714906,
"grad_norm": 13.171427726745605,
"learning_rate": 1.0887248516427848e-06,
"loss": 0.0669,
"step": 30800
},
{
"epoch": 2.678997872432808,
"grad_norm": 1.8995404243469238,
"learning_rate": 1.0742509769865393e-06,
"loss": 0.0911,
"step": 30850
},
{
"epoch": 2.6833398462941256,
"grad_norm": 3.4048333168029785,
"learning_rate": 1.0597771023302939e-06,
"loss": 0.0642,
"step": 30900
},
{
"epoch": 2.6876818201554427,
"grad_norm": 9.538905143737793,
"learning_rate": 1.0453032276740484e-06,
"loss": 0.0715,
"step": 30950
},
{
"epoch": 2.69202379401676,
"grad_norm": 7.958563327789307,
"learning_rate": 1.030829353017803e-06,
"loss": 0.0726,
"step": 31000
},
{
"epoch": 2.6963657678780772,
"grad_norm": 7.041146755218506,
"learning_rate": 1.0163554783615575e-06,
"loss": 0.0641,
"step": 31050
},
{
"epoch": 2.7007077417393948,
"grad_norm": 6.417462348937988,
"learning_rate": 1.001881603705312e-06,
"loss": 0.0744,
"step": 31100
},
{
"epoch": 2.705049715600712,
"grad_norm": 13.097779273986816,
"learning_rate": 9.874077290490665e-07,
"loss": 0.12,
"step": 31150
},
{
"epoch": 2.7093916894620294,
"grad_norm": 8.754915237426758,
"learning_rate": 9.72933854392821e-07,
"loss": 0.0745,
"step": 31200
},
{
"epoch": 2.713733663323347,
"grad_norm": 1.3674185276031494,
"learning_rate": 9.584599797365756e-07,
"loss": 0.0902,
"step": 31250
},
{
"epoch": 2.718075637184664,
"grad_norm": 6.0695719718933105,
"learning_rate": 9.4398610508033e-07,
"loss": 0.0886,
"step": 31300
},
{
"epoch": 2.7224176110459815,
"grad_norm": 7.46438455581665,
"learning_rate": 9.295122304240846e-07,
"loss": 0.1084,
"step": 31350
},
{
"epoch": 2.726759584907299,
"grad_norm": 0.5424315929412842,
"learning_rate": 9.150383557678391e-07,
"loss": 0.0919,
"step": 31400
},
{
"epoch": 2.731101558768616,
"grad_norm": 1.2682020664215088,
"learning_rate": 9.005644811115936e-07,
"loss": 0.0729,
"step": 31450
},
{
"epoch": 2.7354435326299336,
"grad_norm": 1.612199306488037,
"learning_rate": 8.860906064553482e-07,
"loss": 0.0776,
"step": 31500
},
{
"epoch": 2.739785506491251,
"grad_norm": 6.857169151306152,
"learning_rate": 8.716167317991027e-07,
"loss": 0.0829,
"step": 31550
},
{
"epoch": 2.744127480352568,
"grad_norm": 8.613885879516602,
"learning_rate": 8.571428571428572e-07,
"loss": 0.0839,
"step": 31600
},
{
"epoch": 2.7484694542138857,
"grad_norm": 3.3205294609069824,
"learning_rate": 8.426689824866118e-07,
"loss": 0.0703,
"step": 31650
},
{
"epoch": 2.752811428075203,
"grad_norm": 0.2709499001502991,
"learning_rate": 8.281951078303663e-07,
"loss": 0.091,
"step": 31700
},
{
"epoch": 2.7571534019365203,
"grad_norm": 0.9320999979972839,
"learning_rate": 8.137212331741208e-07,
"loss": 0.0905,
"step": 31750
},
{
"epoch": 2.7614953757978378,
"grad_norm": 7.626304626464844,
"learning_rate": 7.992473585178754e-07,
"loss": 0.072,
"step": 31800
},
{
"epoch": 2.7658373496591553,
"grad_norm": 0.484938383102417,
"learning_rate": 7.847734838616298e-07,
"loss": 0.0893,
"step": 31850
},
{
"epoch": 2.7701793235204724,
"grad_norm": 3.4637513160705566,
"learning_rate": 7.705890866985093e-07,
"loss": 0.106,
"step": 31900
},
{
"epoch": 2.77452129738179,
"grad_norm": 19.361818313598633,
"learning_rate": 7.561152120422638e-07,
"loss": 0.0739,
"step": 31950
},
{
"epoch": 2.7788632712431074,
"grad_norm": 3.0003960132598877,
"learning_rate": 7.416413373860183e-07,
"loss": 0.0855,
"step": 32000
},
{
"epoch": 2.7832052451044245,
"grad_norm": 11.666556358337402,
"learning_rate": 7.271674627297728e-07,
"loss": 0.083,
"step": 32050
},
{
"epoch": 2.787547218965742,
"grad_norm": 1.3639492988586426,
"learning_rate": 7.126935880735273e-07,
"loss": 0.0843,
"step": 32100
},
{
"epoch": 2.791889192827059,
"grad_norm": 0.856368362903595,
"learning_rate": 6.982197134172819e-07,
"loss": 0.0675,
"step": 32150
},
{
"epoch": 2.7962311666883766,
"grad_norm": 4.6131272315979,
"learning_rate": 6.837458387610364e-07,
"loss": 0.0889,
"step": 32200
},
{
"epoch": 2.8005731405496936,
"grad_norm": 5.821038722991943,
"learning_rate": 6.692719641047909e-07,
"loss": 0.0883,
"step": 32250
},
{
"epoch": 2.804915114411011,
"grad_norm": 8.942717552185059,
"learning_rate": 6.547980894485455e-07,
"loss": 0.0767,
"step": 32300
},
{
"epoch": 2.8092570882723287,
"grad_norm": 1.455705165863037,
"learning_rate": 6.403242147923e-07,
"loss": 0.0836,
"step": 32350
},
{
"epoch": 2.8135990621336457,
"grad_norm": 0.6939010620117188,
"learning_rate": 6.258503401360545e-07,
"loss": 0.0696,
"step": 32400
},
{
"epoch": 2.8179410359949633,
"grad_norm": 1.1920336484909058,
"learning_rate": 6.11376465479809e-07,
"loss": 0.0635,
"step": 32450
},
{
"epoch": 2.8222830098562808,
"grad_norm": 7.259981155395508,
"learning_rate": 5.969025908235635e-07,
"loss": 0.0931,
"step": 32500
},
{
"epoch": 2.826624983717598,
"grad_norm": 6.035602569580078,
"learning_rate": 5.82428716167318e-07,
"loss": 0.0826,
"step": 32550
},
{
"epoch": 2.8309669575789154,
"grad_norm": 10.794766426086426,
"learning_rate": 5.679548415110726e-07,
"loss": 0.0552,
"step": 32600
},
{
"epoch": 2.835308931440233,
"grad_norm": 4.196424961090088,
"learning_rate": 5.534809668548271e-07,
"loss": 0.055,
"step": 32650
},
{
"epoch": 2.83965090530155,
"grad_norm": 7.877914905548096,
"learning_rate": 5.390070921985816e-07,
"loss": 0.0699,
"step": 32700
},
{
"epoch": 2.8439928791628675,
"grad_norm": 2.6493046283721924,
"learning_rate": 5.245332175423362e-07,
"loss": 0.0687,
"step": 32750
},
{
"epoch": 2.848334853024185,
"grad_norm": 5.444716453552246,
"learning_rate": 5.100593428860907e-07,
"loss": 0.0589,
"step": 32800
},
{
"epoch": 2.852676826885502,
"grad_norm": 0.2738840878009796,
"learning_rate": 4.955854682298452e-07,
"loss": 0.0624,
"step": 32850
},
{
"epoch": 2.8570188007468196,
"grad_norm": 7.651127338409424,
"learning_rate": 4.811115935735998e-07,
"loss": 0.0856,
"step": 32900
},
{
"epoch": 2.861360774608137,
"grad_norm": 0.7363032698631287,
"learning_rate": 4.666377189173542e-07,
"loss": 0.0584,
"step": 32950
},
{
"epoch": 2.865702748469454,
"grad_norm": 5.483317852020264,
"learning_rate": 4.521638442611087e-07,
"loss": 0.0671,
"step": 33000
},
{
"epoch": 2.8700447223307717,
"grad_norm": 6.474352836608887,
"learning_rate": 4.3768996960486325e-07,
"loss": 0.0892,
"step": 33050
},
{
"epoch": 2.874386696192089,
"grad_norm": 0.2653914988040924,
"learning_rate": 4.232160949486178e-07,
"loss": 0.0761,
"step": 33100
},
{
"epoch": 2.8787286700534063,
"grad_norm": 0.05122917890548706,
"learning_rate": 4.0874222029237227e-07,
"loss": 0.0564,
"step": 33150
},
{
"epoch": 2.883070643914724,
"grad_norm": 9.418333053588867,
"learning_rate": 3.942683456361268e-07,
"loss": 0.0655,
"step": 33200
},
{
"epoch": 2.887412617776041,
"grad_norm": 7.8067307472229,
"learning_rate": 3.7979447097988133e-07,
"loss": 0.0626,
"step": 33250
},
{
"epoch": 2.8917545916373584,
"grad_norm": 3.910670757293701,
"learning_rate": 3.6532059632363587e-07,
"loss": 0.0855,
"step": 33300
},
{
"epoch": 2.8960965654986754,
"grad_norm": 7.972170352935791,
"learning_rate": 3.508467216673904e-07,
"loss": 0.0488,
"step": 33350
},
{
"epoch": 2.900438539359993,
"grad_norm": 10.280926704406738,
"learning_rate": 3.3637284701114493e-07,
"loss": 0.0631,
"step": 33400
},
{
"epoch": 2.9047805132213105,
"grad_norm": 4.887994766235352,
"learning_rate": 3.2189897235489947e-07,
"loss": 0.0731,
"step": 33450
},
{
"epoch": 2.9091224870826276,
"grad_norm": 14.115626335144043,
"learning_rate": 3.0742509769865395e-07,
"loss": 0.0877,
"step": 33500
},
{
"epoch": 2.913464460943945,
"grad_norm": 10.208263397216797,
"learning_rate": 2.929512230424085e-07,
"loss": 0.0975,
"step": 33550
},
{
"epoch": 2.9178064348052626,
"grad_norm": 1.1648341417312622,
"learning_rate": 2.78477348386163e-07,
"loss": 0.0761,
"step": 33600
},
{
"epoch": 2.9221484086665797,
"grad_norm": 6.795315265655518,
"learning_rate": 2.640034737299175e-07,
"loss": 0.0777,
"step": 33650
},
{
"epoch": 2.926490382527897,
"grad_norm": 7.7475762367248535,
"learning_rate": 2.4952959907367203e-07,
"loss": 0.0667,
"step": 33700
},
{
"epoch": 2.9308323563892147,
"grad_norm": 0.7257171273231506,
"learning_rate": 2.3505572441742656e-07,
"loss": 0.0832,
"step": 33750
},
{
"epoch": 2.9351743302505318,
"grad_norm": 7.833991527557373,
"learning_rate": 2.205818497611811e-07,
"loss": 0.0845,
"step": 33800
},
{
"epoch": 2.9395163041118493,
"grad_norm": 5.486457347869873,
"learning_rate": 2.0610797510493563e-07,
"loss": 0.0899,
"step": 33850
},
{
"epoch": 2.943858277973167,
"grad_norm": 7.7840375900268555,
"learning_rate": 1.916341004486901e-07,
"loss": 0.0751,
"step": 33900
},
{
"epoch": 2.948200251834484,
"grad_norm": 6.603894233703613,
"learning_rate": 1.7716022579244464e-07,
"loss": 0.0644,
"step": 33950
},
{
"epoch": 2.9525422256958014,
"grad_norm": 4.2936482429504395,
"learning_rate": 1.6268635113619917e-07,
"loss": 0.0853,
"step": 34000
},
{
"epoch": 2.956884199557119,
"grad_norm": 3.4259464740753174,
"learning_rate": 1.482124764799537e-07,
"loss": 0.059,
"step": 34050
},
{
"epoch": 2.961226173418436,
"grad_norm": 5.449902057647705,
"learning_rate": 1.3402807931683313e-07,
"loss": 0.077,
"step": 34100
},
{
"epoch": 2.9655681472797535,
"grad_norm": 0.609876275062561,
"learning_rate": 1.1955420466058764e-07,
"loss": 0.0661,
"step": 34150
},
{
"epoch": 2.969910121141071,
"grad_norm": 4.248520851135254,
"learning_rate": 1.0508033000434217e-07,
"loss": 0.0977,
"step": 34200
},
{
"epoch": 2.974252095002388,
"grad_norm": 2.3178467750549316,
"learning_rate": 9.060645534809668e-08,
"loss": 0.0972,
"step": 34250
},
{
"epoch": 2.9785940688637056,
"grad_norm": 4.120424747467041,
"learning_rate": 7.613258069185121e-08,
"loss": 0.0727,
"step": 34300
},
{
"epoch": 2.9829360427250227,
"grad_norm": 8.699881553649902,
"learning_rate": 6.165870603560573e-08,
"loss": 0.0642,
"step": 34350
},
{
"epoch": 2.98727801658634,
"grad_norm": 8.902776718139648,
"learning_rate": 4.718483137936026e-08,
"loss": 0.0706,
"step": 34400
},
{
"epoch": 2.9916199904476573,
"grad_norm": 4.606892108917236,
"learning_rate": 3.271095672311478e-08,
"loss": 0.0996,
"step": 34450
},
{
"epoch": 2.9959619643089748,
"grad_norm": 8.253307342529297,
"learning_rate": 1.82370820668693e-08,
"loss": 0.0519,
"step": 34500
},
{
"epoch": 2.9998697407841606,
"eval_loss": 0.21618051826953888,
"eval_runtime": 157.9614,
"eval_samples_per_second": 64.801,
"eval_steps_per_second": 16.2,
"step": 34545
}
],
"logging_steps": 50,
"max_steps": 34545,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.97534612324352e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}