{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.9655172413793105, "eval_steps": 500, "global_step": 125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03940886699507389, "grad_norm": 2.5584669687578936, "learning_rate": 6.153846153846155e-06, "loss": 0.7558, "step": 1 }, { "epoch": 0.07881773399014778, "grad_norm": 2.565466980202395, "learning_rate": 1.230769230769231e-05, "loss": 0.7528, "step": 2 }, { "epoch": 0.11822660098522167, "grad_norm": 1.8753957272537682, "learning_rate": 1.8461538461538465e-05, "loss": 0.7189, "step": 3 }, { "epoch": 0.15763546798029557, "grad_norm": 1.5499772451700504, "learning_rate": 2.461538461538462e-05, "loss": 0.6998, "step": 4 }, { "epoch": 0.19704433497536947, "grad_norm": 1.629703652668965, "learning_rate": 3.0769230769230774e-05, "loss": 0.639, "step": 5 }, { "epoch": 0.23645320197044334, "grad_norm": 1.351435580335104, "learning_rate": 3.692307692307693e-05, "loss": 0.6071, "step": 6 }, { "epoch": 0.27586206896551724, "grad_norm": 2.1491351662060074, "learning_rate": 4.307692307692308e-05, "loss": 0.6034, "step": 7 }, { "epoch": 0.31527093596059114, "grad_norm": 1.1876259973559116, "learning_rate": 4.923076923076924e-05, "loss": 0.5724, "step": 8 }, { "epoch": 0.35467980295566504, "grad_norm": 1.371610184643724, "learning_rate": 5.538461538461539e-05, "loss": 0.5544, "step": 9 }, { "epoch": 0.39408866995073893, "grad_norm": 1.032169030947674, "learning_rate": 6.153846153846155e-05, "loss": 0.5503, "step": 10 }, { "epoch": 0.43349753694581283, "grad_norm": 1.1747270299234402, "learning_rate": 6.76923076923077e-05, "loss": 0.5314, "step": 11 }, { "epoch": 0.4729064039408867, "grad_norm": 0.9851373875214015, "learning_rate": 7.384615384615386e-05, "loss": 0.5293, "step": 12 }, { "epoch": 0.5123152709359606, "grad_norm": 0.9498199365678511, "learning_rate": 8e-05, "loss": 0.5197, "step": 13 }, { "epoch": 0.5517241379310345, "grad_norm": 0.6850321797374413, "learning_rate": 7.998426505532213e-05, "loss": 0.5108, "step": 14 }, { "epoch": 0.5911330049261084, "grad_norm": 0.741883724345909, "learning_rate": 7.993707260071268e-05, "loss": 0.5039, "step": 15 }, { "epoch": 0.6305418719211823, "grad_norm": 0.6201096559420001, "learning_rate": 7.985845976470478e-05, "loss": 0.492, "step": 16 }, { "epoch": 0.6699507389162561, "grad_norm": 0.5810333918069511, "learning_rate": 7.974848839572971e-05, "loss": 0.4894, "step": 17 }, { "epoch": 0.7093596059113301, "grad_norm": 0.538664331289255, "learning_rate": 7.960724501345783e-05, "loss": 0.478, "step": 18 }, { "epoch": 0.7487684729064039, "grad_norm": 0.48351034605589627, "learning_rate": 7.943484074072943e-05, "loss": 0.4758, "step": 19 }, { "epoch": 0.7881773399014779, "grad_norm": 0.5588522296796855, "learning_rate": 7.923141121612922e-05, "loss": 0.4755, "step": 20 }, { "epoch": 0.8275862068965517, "grad_norm": 0.5099414849429755, "learning_rate": 7.899711648727294e-05, "loss": 0.4669, "step": 21 }, { "epoch": 0.8669950738916257, "grad_norm": 0.5111926450042357, "learning_rate": 7.873214088489047e-05, "loss": 0.4634, "step": 22 }, { "epoch": 0.9064039408866995, "grad_norm": 0.42484145211363433, "learning_rate": 7.843669287780399e-05, "loss": 0.4539, "step": 23 }, { "epoch": 0.9458128078817734, "grad_norm": 0.4063070610232536, "learning_rate": 7.811100490891586e-05, "loss": 0.4545, "step": 24 }, { "epoch": 0.9852216748768473, "grad_norm": 0.34890570659951814, "learning_rate": 7.775533321233471e-05, "loss": 0.4472, 
"step": 25 }, { "epoch": 1.0344827586206897, "grad_norm": 0.4447080012153922, "learning_rate": 7.736995761178399e-05, "loss": 0.4415, "step": 26 }, { "epoch": 1.0738916256157636, "grad_norm": 0.302803670590809, "learning_rate": 7.695518130045147e-05, "loss": 0.4339, "step": 27 }, { "epoch": 1.1133004926108374, "grad_norm": 0.35078704995091015, "learning_rate": 7.651133060245276e-05, "loss": 0.4334, "step": 28 }, { "epoch": 1.1527093596059113, "grad_norm": 0.347929638232275, "learning_rate": 7.603875471609677e-05, "loss": 0.4306, "step": 29 }, { "epoch": 1.1921182266009853, "grad_norm": 0.34132416804989185, "learning_rate": 7.55378254391549e-05, "loss": 0.4303, "step": 30 }, { "epoch": 1.2315270935960592, "grad_norm": 0.2780341991318012, "learning_rate": 7.500893687635015e-05, "loss": 0.4187, "step": 31 }, { "epoch": 1.270935960591133, "grad_norm": 0.236156587933769, "learning_rate": 7.445250512929637e-05, "loss": 0.4163, "step": 32 }, { "epoch": 1.3103448275862069, "grad_norm": 0.28381264582341237, "learning_rate": 7.386896796913137e-05, "loss": 0.4112, "step": 33 }, { "epoch": 1.3497536945812807, "grad_norm": 0.222292178815377, "learning_rate": 7.325878449210182e-05, "loss": 0.4167, "step": 34 }, { "epoch": 1.3891625615763548, "grad_norm": 0.35512090170091437, "learning_rate": 7.262243475837041e-05, "loss": 0.4109, "step": 35 }, { "epoch": 1.4285714285714286, "grad_norm": 0.2511586648870301, "learning_rate": 7.196041941432998e-05, "loss": 0.4124, "step": 36 }, { "epoch": 1.4679802955665024, "grad_norm": 0.22170782679952175, "learning_rate": 7.12732592987212e-05, "loss": 0.4056, "step": 37 }, { "epoch": 1.5073891625615765, "grad_norm": 0.18785361140671847, "learning_rate": 7.05614950328643e-05, "loss": 0.4087, "step": 38 }, { "epoch": 1.5467980295566504, "grad_norm": 0.20698082992129718, "learning_rate": 6.982568659532663e-05, "loss": 0.412, "step": 39 }, { "epoch": 1.5862068965517242, "grad_norm": 0.2923360568345124, "learning_rate": 6.906641288136109e-05, "loss": 0.4077, "step": 40 }, { "epoch": 1.625615763546798, "grad_norm": 0.36347305309861494, "learning_rate": 6.828427124746191e-05, "loss": 0.4135, "step": 41 }, { "epoch": 1.6650246305418719, "grad_norm": 0.42985470055529523, "learning_rate": 6.747987704139607e-05, "loss": 0.408, "step": 42 }, { "epoch": 1.7044334975369457, "grad_norm": 0.505450596225966, "learning_rate": 6.665386311808017e-05, "loss": 0.4125, "step": 43 }, { "epoch": 1.7438423645320196, "grad_norm": 0.49870209403284876, "learning_rate": 6.580687934168352e-05, "loss": 0.4028, "step": 44 }, { "epoch": 1.7832512315270936, "grad_norm": 0.3113343805759419, "learning_rate": 6.493959207434934e-05, "loss": 0.403, "step": 45 }, { "epoch": 1.8226600985221675, "grad_norm": 0.2080418139358356, "learning_rate": 6.405268365193624e-05, "loss": 0.4143, "step": 46 }, { "epoch": 1.8620689655172413, "grad_norm": 0.35947872339994125, "learning_rate": 6.314685184719224e-05, "loss": 0.3986, "step": 47 }, { "epoch": 1.9014778325123154, "grad_norm": 0.2889476841026003, "learning_rate": 6.22228093207841e-05, "loss": 0.4034, "step": 48 }, { "epoch": 1.9408866995073892, "grad_norm": 0.16926057758761603, "learning_rate": 6.128128306061347e-05, "loss": 0.398, "step": 49 }, { "epoch": 1.980295566502463, "grad_norm": 0.3015239387441465, "learning_rate": 6.0323013809861185e-05, "loss": 0.3946, "step": 50 }, { "epoch": 2.0295566502463056, "grad_norm": 0.27369640017832153, "learning_rate": 5.9348755484209597e-05, "loss": 0.3927, "step": 51 }, { "epoch": 2.0689655172413794, "grad_norm": 
0.1434834184101561, "learning_rate": 5.835927457870151e-05, "loss": 0.3811, "step": 52 }, { "epoch": 2.1083743842364533, "grad_norm": 0.2854039627965572, "learning_rate": 5.735534956470233e-05, "loss": 0.3757, "step": 53 }, { "epoch": 2.147783251231527, "grad_norm": 0.3009695490584673, "learning_rate": 5.6337770277439854e-05, "loss": 0.3818, "step": 54 }, { "epoch": 2.187192118226601, "grad_norm": 0.154882062870217, "learning_rate": 5.5307337294603595e-05, "loss": 0.3758, "step": 55 }, { "epoch": 2.226600985221675, "grad_norm": 0.23852846311077655, "learning_rate": 5.4264861306492525e-05, "loss": 0.3667, "step": 56 }, { "epoch": 2.2660098522167487, "grad_norm": 0.22608925683357714, "learning_rate": 5.321116247820669e-05, "loss": 0.3711, "step": 57 }, { "epoch": 2.3054187192118225, "grad_norm": 0.1583983272233887, "learning_rate": 5.214706980438459e-05, "loss": 0.367, "step": 58 }, { "epoch": 2.344827586206897, "grad_norm": 0.205572865908627, "learning_rate": 5.107342045699397e-05, "loss": 0.3651, "step": 59 }, { "epoch": 2.3842364532019706, "grad_norm": 0.2250613392996445, "learning_rate": 4.999105912668908e-05, "loss": 0.3723, "step": 60 }, { "epoch": 2.4236453201970445, "grad_norm": 0.12838953670182573, "learning_rate": 4.890083735825258e-05, "loss": 0.3696, "step": 61 }, { "epoch": 2.4630541871921183, "grad_norm": 0.2251988369582669, "learning_rate": 4.780361288064514e-05, "loss": 0.3676, "step": 62 }, { "epoch": 2.502463054187192, "grad_norm": 0.1725969347073438, "learning_rate": 4.670024893218946e-05, "loss": 0.3697, "step": 63 }, { "epoch": 2.541871921182266, "grad_norm": 0.12188885415056715, "learning_rate": 4.5591613581419984e-05, "loss": 0.3576, "step": 64 }, { "epoch": 2.58128078817734, "grad_norm": 0.18927822293311972, "learning_rate": 4.4478579044132314e-05, "loss": 0.3667, "step": 65 }, { "epoch": 2.6206896551724137, "grad_norm": 0.11561614146910962, "learning_rate": 4.336202099716991e-05, "loss": 0.3658, "step": 66 }, { "epoch": 2.6600985221674875, "grad_norm": 0.1516544571012481, "learning_rate": 4.2242817889487676e-05, "loss": 0.3648, "step": 67 }, { "epoch": 2.6995073891625614, "grad_norm": 0.1317279280788641, "learning_rate": 4.112185025103476e-05, "loss": 0.3615, "step": 68 }, { "epoch": 2.7389162561576352, "grad_norm": 0.11760148271456246, "learning_rate": 4e-05, "loss": 0.3663, "step": 69 }, { "epoch": 2.7783251231527095, "grad_norm": 0.12906163898612472, "learning_rate": 3.8878149748965245e-05, "loss": 0.3606, "step": 70 }, { "epoch": 2.8177339901477834, "grad_norm": 0.09949229840426103, "learning_rate": 3.775718211051233e-05, "loss": 0.3667, "step": 71 }, { "epoch": 2.857142857142857, "grad_norm": 0.12817295262296305, "learning_rate": 3.6637979002830106e-05, "loss": 0.3691, "step": 72 }, { "epoch": 2.896551724137931, "grad_norm": 0.09926866475598084, "learning_rate": 3.552142095586769e-05, "loss": 0.3664, "step": 73 }, { "epoch": 2.935960591133005, "grad_norm": 0.1188738983781825, "learning_rate": 3.4408386418580036e-05, "loss": 0.3717, "step": 74 }, { "epoch": 2.9753694581280787, "grad_norm": 0.09857847772154998, "learning_rate": 3.329975106781055e-05, "loss": 0.358, "step": 75 }, { "epoch": 3.0246305418719213, "grad_norm": 0.12437210969829017, "learning_rate": 3.219638711935488e-05, "loss": 0.3487, "step": 76 }, { "epoch": 3.064039408866995, "grad_norm": 0.13092335355733034, "learning_rate": 3.109916264174743e-05, "loss": 0.3378, "step": 77 }, { "epoch": 3.103448275862069, "grad_norm": 0.1380029588610029, "learning_rate": 3.000894087331092e-05, "loss": 0.3471, 
"step": 78 }, { "epoch": 3.142857142857143, "grad_norm": 0.16659609357249366, "learning_rate": 2.892657954300603e-05, "loss": 0.345, "step": 79 }, { "epoch": 3.1822660098522166, "grad_norm": 0.1491210789680348, "learning_rate": 2.7852930195615413e-05, "loss": 0.3423, "step": 80 }, { "epoch": 3.2216748768472905, "grad_norm": 0.13323974852056628, "learning_rate": 2.678883752179333e-05, "loss": 0.3375, "step": 81 }, { "epoch": 3.2610837438423648, "grad_norm": 0.16071703950070282, "learning_rate": 2.573513869350748e-05, "loss": 0.3449, "step": 82 }, { "epoch": 3.3004926108374386, "grad_norm": 0.11154281193190216, "learning_rate": 2.4692662705396412e-05, "loss": 0.3477, "step": 83 }, { "epoch": 3.3399014778325125, "grad_norm": 0.13730160221975535, "learning_rate": 2.366222972256016e-05, "loss": 0.3393, "step": 84 }, { "epoch": 3.3793103448275863, "grad_norm": 0.1001261256156619, "learning_rate": 2.264465043529768e-05, "loss": 0.3386, "step": 85 }, { "epoch": 3.41871921182266, "grad_norm": 0.11871409719140899, "learning_rate": 2.1640725421298487e-05, "loss": 0.3413, "step": 86 }, { "epoch": 3.458128078817734, "grad_norm": 0.10822197872193218, "learning_rate": 2.065124451579041e-05, "loss": 0.3415, "step": 87 }, { "epoch": 3.497536945812808, "grad_norm": 0.09848707693593217, "learning_rate": 1.9676986190138835e-05, "loss": 0.3424, "step": 88 }, { "epoch": 3.5369458128078817, "grad_norm": 0.10650565727211944, "learning_rate": 1.8718716939386543e-05, "loss": 0.3438, "step": 89 }, { "epoch": 3.5763546798029555, "grad_norm": 0.09005212054070572, "learning_rate": 1.7777190679215923e-05, "loss": 0.3413, "step": 90 }, { "epoch": 3.6157635467980294, "grad_norm": 0.09471297169488595, "learning_rate": 1.6853148152807774e-05, "loss": 0.3394, "step": 91 }, { "epoch": 3.655172413793103, "grad_norm": 0.08803273896652859, "learning_rate": 1.5947316348063764e-05, "loss": 0.3452, "step": 92 }, { "epoch": 3.6945812807881775, "grad_norm": 0.08555929815288703, "learning_rate": 1.5060407925650662e-05, "loss": 0.3386, "step": 93 }, { "epoch": 3.7339901477832513, "grad_norm": 0.07676232199734123, "learning_rate": 1.4193120658316506e-05, "loss": 0.3384, "step": 94 }, { "epoch": 3.773399014778325, "grad_norm": 0.07008988332755012, "learning_rate": 1.3346136881919845e-05, "loss": 0.3423, "step": 95 }, { "epoch": 3.812807881773399, "grad_norm": 0.07699181822522967, "learning_rate": 1.2520122958603933e-05, "loss": 0.3394, "step": 96 }, { "epoch": 3.852216748768473, "grad_norm": 0.07037612515608051, "learning_rate": 1.1715728752538103e-05, "loss": 0.3377, "step": 97 }, { "epoch": 3.8916256157635467, "grad_norm": 0.06692075874239917, "learning_rate": 1.0933587118638927e-05, "loss": 0.3389, "step": 98 }, { "epoch": 3.9310344827586206, "grad_norm": 0.07239315796297541, "learning_rate": 1.0174313404673378e-05, "loss": 0.3386, "step": 99 }, { "epoch": 3.970443349753695, "grad_norm": 0.06509354323682291, "learning_rate": 9.438504967135703e-06, "loss": 0.3435, "step": 100 }, { "epoch": 4.019704433497537, "grad_norm": 0.07415226571205577, "learning_rate": 8.72674070127881e-06, "loss": 0.3299, "step": 101 }, { "epoch": 4.059113300492611, "grad_norm": 0.09427673195861289, "learning_rate": 8.039580585670047e-06, "loss": 0.3271, "step": 102 }, { "epoch": 4.098522167487685, "grad_norm": 0.07768702640271646, "learning_rate": 7.3775652416295936e-06, "loss": 0.3298, "step": 103 }, { "epoch": 4.137931034482759, "grad_norm": 0.07261233473354416, "learning_rate": 6.7412155078981865e-06, "loss": 0.3295, "step": 104 }, { "epoch": 
4.177339901477833, "grad_norm": 0.07340342692044237, "learning_rate": 6.1310320308686354e-06, "loss": 0.3274, "step": 105 }, { "epoch": 4.216748768472907, "grad_norm": 0.07153254469501634, "learning_rate": 5.547494870703642e-06, "loss": 0.3307, "step": 106 }, { "epoch": 4.25615763546798, "grad_norm": 0.07858955565979121, "learning_rate": 4.991063123649853e-06, "loss": 0.326, "step": 107 }, { "epoch": 4.295566502463054, "grad_norm": 0.0747794841861464, "learning_rate": 4.462174560845114e-06, "loss": 0.3273, "step": 108 }, { "epoch": 4.334975369458128, "grad_norm": 0.06814386298200781, "learning_rate": 3.961245283903239e-06, "loss": 0.3304, "step": 109 }, { "epoch": 4.374384236453202, "grad_norm": 0.06709307703349002, "learning_rate": 3.4886693975472443e-06, "loss": 0.3285, "step": 110 }, { "epoch": 4.413793103448276, "grad_norm": 0.06195820663002009, "learning_rate": 3.0448186995485307e-06, "loss": 0.328, "step": 111 }, { "epoch": 4.45320197044335, "grad_norm": 0.06590977322439409, "learning_rate": 2.630042388216012e-06, "loss": 0.3314, "step": 112 }, { "epoch": 4.4926108374384235, "grad_norm": 0.06216983243982217, "learning_rate": 2.244666787665297e-06, "loss": 0.3285, "step": 113 }, { "epoch": 4.532019704433497, "grad_norm": 0.0568288191054022, "learning_rate": 1.888995091084147e-06, "loss": 0.3267, "step": 114 }, { "epoch": 4.571428571428571, "grad_norm": 0.05502800210878154, "learning_rate": 1.5633071221960205e-06, "loss": 0.3199, "step": 115 }, { "epoch": 4.610837438423645, "grad_norm": 0.05085194354160944, "learning_rate": 1.2678591151095466e-06, "loss": 0.3291, "step": 116 }, { "epoch": 4.650246305418719, "grad_norm": 0.0542030183423217, "learning_rate": 1.0028835127270553e-06, "loss": 0.328, "step": 117 }, { "epoch": 4.689655172413794, "grad_norm": 0.04943720054109848, "learning_rate": 7.685887838707828e-07, "loss": 0.3282, "step": 118 }, { "epoch": 4.7290640394088665, "grad_norm": 0.05350280931369455, "learning_rate": 5.651592592705646e-07, "loss": 0.3358, "step": 119 }, { "epoch": 4.768472906403941, "grad_norm": 0.051675651653332454, "learning_rate": 3.9275498654217425e-07, "loss": 0.3231, "step": 120 }, { "epoch": 4.807881773399015, "grad_norm": 0.049565582110987425, "learning_rate": 2.5151160427029584e-07, "loss": 0.3273, "step": 121 }, { "epoch": 4.847290640394089, "grad_norm": 0.05086813508645347, "learning_rate": 1.4154023529523663e-07, "loss": 0.3252, "step": 122 }, { "epoch": 4.886699507389163, "grad_norm": 0.04981788886449752, "learning_rate": 6.292739928733582e-08, "loss": 0.3251, "step": 123 }, { "epoch": 4.926108374384237, "grad_norm": 0.051983785964860994, "learning_rate": 1.5734944677885388e-08, "loss": 0.3261, "step": 124 }, { "epoch": 4.9655172413793105, "grad_norm": 0.05048075450561713, "learning_rate": 0.0, "loss": 0.3278, "step": 125 }, { "epoch": 4.9655172413793105, "step": 125, "total_flos": 4.29306832130723e+18, "train_loss": 0.40043015813827515, "train_runtime": 25003.3066, "train_samples_per_second": 2.597, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 125, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.29306832130723e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }