diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,4492 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9996076892899176, + "eval_steps": 500, + "global_step": 637, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001569242840329541, + "grad_norm": 5.652259349822998, + "learning_rate": 2.02e-06, + "loss": 1.8023, + "step": 1 + }, + { + "epoch": 0.003138485680659082, + "grad_norm": 6.283995151519775, + "learning_rate": 4.04e-06, + "loss": 1.8841, + "step": 2 + }, + { + "epoch": 0.004707728520988623, + "grad_norm": 5.4327287673950195, + "learning_rate": 6.06e-06, + "loss": 1.8413, + "step": 3 + }, + { + "epoch": 0.006276971361318164, + "grad_norm": 5.21534538269043, + "learning_rate": 8.08e-06, + "loss": 1.8447, + "step": 4 + }, + { + "epoch": 0.007846214201647704, + "grad_norm": 5.073209285736084, + "learning_rate": 1.0100000000000002e-05, + "loss": 1.8843, + "step": 5 + }, + { + "epoch": 0.009415457041977247, + "grad_norm": 4.880558490753174, + "learning_rate": 1.212e-05, + "loss": 1.8486, + "step": 6 + }, + { + "epoch": 0.010984699882306787, + "grad_norm": 5.82685661315918, + "learning_rate": 1.4140000000000002e-05, + "loss": 1.8129, + "step": 7 + }, + { + "epoch": 0.012553942722636328, + "grad_norm": 5.182684898376465, + "learning_rate": 1.616e-05, + "loss": 1.8826, + "step": 8 + }, + { + "epoch": 0.014123185562965868, + "grad_norm": 4.716125011444092, + "learning_rate": 1.818e-05, + "loss": 1.8347, + "step": 9 + }, + { + "epoch": 0.01569242840329541, + "grad_norm": 4.742851734161377, + "learning_rate": 2.0200000000000003e-05, + "loss": 1.6935, + "step": 10 + }, + { + "epoch": 0.01726167124362495, + "grad_norm": 5.035205841064453, + "learning_rate": 2.222e-05, + "loss": 1.798, + "step": 11 + }, + { + "epoch": 0.018830914083954493, + "grad_norm": 4.749172687530518, + "learning_rate": 2.424e-05, + "loss": 1.7614, + "step": 12 + }, + { + "epoch": 0.020400156924284034, + "grad_norm": 4.660712242126465, + "learning_rate": 2.6260000000000003e-05, + "loss": 1.6361, + "step": 13 + }, + { + "epoch": 0.021969399764613574, + "grad_norm": 4.731696128845215, + "learning_rate": 2.8280000000000004e-05, + "loss": 1.7397, + "step": 14 + }, + { + "epoch": 0.023538642604943115, + "grad_norm": 4.478929042816162, + "learning_rate": 3.0299999999999998e-05, + "loss": 1.7067, + "step": 15 + }, + { + "epoch": 0.025107885445272655, + "grad_norm": 4.38900899887085, + "learning_rate": 3.232e-05, + "loss": 1.7526, + "step": 16 + }, + { + "epoch": 0.026677128285602196, + "grad_norm": 4.258249759674072, + "learning_rate": 3.434e-05, + "loss": 1.7633, + "step": 17 + }, + { + "epoch": 0.028246371125931737, + "grad_norm": 4.2208991050720215, + "learning_rate": 3.636e-05, + "loss": 1.6607, + "step": 18 + }, + { + "epoch": 0.02981561396626128, + "grad_norm": 4.066312789916992, + "learning_rate": 3.838e-05, + "loss": 1.6511, + "step": 19 + }, + { + "epoch": 0.03138485680659082, + "grad_norm": 4.088052749633789, + "learning_rate": 4.0400000000000006e-05, + "loss": 1.6348, + "step": 20 + }, + { + "epoch": 0.03295409964692036, + "grad_norm": 4.0918426513671875, + "learning_rate": 4.242e-05, + "loss": 1.5877, + "step": 21 + }, + { + "epoch": 0.0345233424872499, + "grad_norm": 4.691335201263428, + "learning_rate": 4.444e-05, + "loss": 1.6784, + "step": 22 + }, + { + "epoch": 0.036092585327579446, + "grad_norm": 4.096027374267578, + "learning_rate": 4.6460000000000006e-05, + "loss": 1.6901, + "step": 23 + }, + { + "epoch": 0.03766182816790899, + "grad_norm": 3.9688150882720947, + "learning_rate": 4.848e-05, + "loss": 1.6736, + "step": 24 + }, + { + "epoch": 0.03923107100823853, + "grad_norm": 3.990572214126587, + "learning_rate": 5.05e-05, + "loss": 1.6116, + "step": 25 + }, + { + "epoch": 0.04080031384856807, + "grad_norm": 4.115541458129883, + "learning_rate": 5.2520000000000005e-05, + "loss": 1.6999, + "step": 26 + }, + { + "epoch": 0.04236955668889761, + "grad_norm": 3.769343376159668, + "learning_rate": 5.454e-05, + "loss": 1.586, + "step": 27 + }, + { + "epoch": 0.04393879952922715, + "grad_norm": 3.8377442359924316, + "learning_rate": 5.656000000000001e-05, + "loss": 1.6878, + "step": 28 + }, + { + "epoch": 0.04550804236955669, + "grad_norm": 3.7446000576019287, + "learning_rate": 5.858e-05, + "loss": 1.6349, + "step": 29 + }, + { + "epoch": 0.04707728520988623, + "grad_norm": 3.7913708686828613, + "learning_rate": 6.0599999999999996e-05, + "loss": 1.6483, + "step": 30 + }, + { + "epoch": 0.04864652805021577, + "grad_norm": 3.7013509273529053, + "learning_rate": 6.262000000000001e-05, + "loss": 1.6065, + "step": 31 + }, + { + "epoch": 0.05021577089054531, + "grad_norm": 3.970893621444702, + "learning_rate": 6.464e-05, + "loss": 1.6069, + "step": 32 + }, + { + "epoch": 0.05178501373087485, + "grad_norm": 3.598435163497925, + "learning_rate": 6.666e-05, + "loss": 1.5888, + "step": 33 + }, + { + "epoch": 0.05335425657120439, + "grad_norm": 4.243397235870361, + "learning_rate": 6.868e-05, + "loss": 1.6099, + "step": 34 + }, + { + "epoch": 0.05492349941153393, + "grad_norm": 3.4000072479248047, + "learning_rate": 7.07e-05, + "loss": 1.6237, + "step": 35 + }, + { + "epoch": 0.05649274225186347, + "grad_norm": 3.955603837966919, + "learning_rate": 7.272e-05, + "loss": 1.6021, + "step": 36 + }, + { + "epoch": 0.058061985092193014, + "grad_norm": 3.165722131729126, + "learning_rate": 7.474e-05, + "loss": 1.5945, + "step": 37 + }, + { + "epoch": 0.05963122793252256, + "grad_norm": 3.4018807411193848, + "learning_rate": 7.676e-05, + "loss": 1.581, + "step": 38 + }, + { + "epoch": 0.0612004707728521, + "grad_norm": 3.3241562843322754, + "learning_rate": 7.878e-05, + "loss": 1.5913, + "step": 39 + }, + { + "epoch": 0.06276971361318164, + "grad_norm": 3.4715592861175537, + "learning_rate": 8.080000000000001e-05, + "loss": 1.627, + "step": 40 + }, + { + "epoch": 0.06433895645351118, + "grad_norm": 3.199425220489502, + "learning_rate": 8.282e-05, + "loss": 1.5639, + "step": 41 + }, + { + "epoch": 0.06590819929384072, + "grad_norm": 3.349463701248169, + "learning_rate": 8.484e-05, + "loss": 1.5548, + "step": 42 + }, + { + "epoch": 0.06747744213417026, + "grad_norm": 3.215970993041992, + "learning_rate": 8.686e-05, + "loss": 1.5495, + "step": 43 + }, + { + "epoch": 0.0690466849744998, + "grad_norm": 3.3943710327148438, + "learning_rate": 8.888e-05, + "loss": 1.5862, + "step": 44 + }, + { + "epoch": 0.07061592781482935, + "grad_norm": 3.251762866973877, + "learning_rate": 9.09e-05, + "loss": 1.6068, + "step": 45 + }, + { + "epoch": 0.07218517065515889, + "grad_norm": 3.138446807861328, + "learning_rate": 9.292000000000001e-05, + "loss": 1.6129, + "step": 46 + }, + { + "epoch": 0.07375441349548843, + "grad_norm": 3.320495843887329, + "learning_rate": 9.494e-05, + "loss": 1.6232, + "step": 47 + }, + { + "epoch": 0.07532365633581797, + "grad_norm": 2.888617753982544, + "learning_rate": 9.696e-05, + "loss": 1.5907, + "step": 48 + }, + { + "epoch": 0.07689289917614751, + "grad_norm": 3.0241446495056152, + "learning_rate": 9.898e-05, + "loss": 1.625, + "step": 49 + }, + { + "epoch": 0.07846214201647705, + "grad_norm": 3.2043673992156982, + "learning_rate": 0.000101, + "loss": 1.5938, + "step": 50 + }, + { + "epoch": 0.0800313848568066, + "grad_norm": 2.879977226257324, + "learning_rate": 0.00010302, + "loss": 1.5635, + "step": 51 + }, + { + "epoch": 0.08160062769713614, + "grad_norm": 3.2901294231414795, + "learning_rate": 0.00010504000000000001, + "loss": 1.5764, + "step": 52 + }, + { + "epoch": 0.08316987053746568, + "grad_norm": 2.959332227706909, + "learning_rate": 0.00010706000000000001, + "loss": 1.533, + "step": 53 + }, + { + "epoch": 0.08473911337779522, + "grad_norm": 3.020022392272949, + "learning_rate": 0.00010908, + "loss": 1.5703, + "step": 54 + }, + { + "epoch": 0.08630835621812476, + "grad_norm": 3.1601905822753906, + "learning_rate": 0.00011110000000000002, + "loss": 1.5896, + "step": 55 + }, + { + "epoch": 0.0878775990584543, + "grad_norm": 2.8589470386505127, + "learning_rate": 0.00011312000000000001, + "loss": 1.5704, + "step": 56 + }, + { + "epoch": 0.08944684189878384, + "grad_norm": 2.8465092182159424, + "learning_rate": 0.00011514, + "loss": 1.6108, + "step": 57 + }, + { + "epoch": 0.09101608473911338, + "grad_norm": 2.7483441829681396, + "learning_rate": 0.00011716, + "loss": 1.5243, + "step": 58 + }, + { + "epoch": 0.09258532757944292, + "grad_norm": 2.689194440841675, + "learning_rate": 0.00011918, + "loss": 1.5183, + "step": 59 + }, + { + "epoch": 0.09415457041977246, + "grad_norm": 2.6547024250030518, + "learning_rate": 0.00012119999999999999, + "loss": 1.559, + "step": 60 + }, + { + "epoch": 0.095723813260102, + "grad_norm": 2.4385483264923096, + "learning_rate": 0.00012322, + "loss": 1.5013, + "step": 61 + }, + { + "epoch": 0.09729305610043154, + "grad_norm": 2.730778932571411, + "learning_rate": 0.00012524000000000001, + "loss": 1.5706, + "step": 62 + }, + { + "epoch": 0.09886229894076108, + "grad_norm": 2.6107254028320312, + "learning_rate": 0.00012726, + "loss": 1.5387, + "step": 63 + }, + { + "epoch": 0.10043154178109062, + "grad_norm": 2.87015700340271, + "learning_rate": 0.00012928, + "loss": 1.5404, + "step": 64 + }, + { + "epoch": 0.10200078462142016, + "grad_norm": 2.5845487117767334, + "learning_rate": 0.00013130000000000002, + "loss": 1.5883, + "step": 65 + }, + { + "epoch": 0.1035700274617497, + "grad_norm": 2.398439407348633, + "learning_rate": 0.00013332, + "loss": 1.5157, + "step": 66 + }, + { + "epoch": 0.10513927030207924, + "grad_norm": 2.387171983718872, + "learning_rate": 0.00013534000000000002, + "loss": 1.5169, + "step": 67 + }, + { + "epoch": 0.10670851314240878, + "grad_norm": 2.475609064102173, + "learning_rate": 0.00013736, + "loss": 1.52, + "step": 68 + }, + { + "epoch": 0.10827775598273832, + "grad_norm": 2.3862760066986084, + "learning_rate": 0.00013937999999999998, + "loss": 1.5273, + "step": 69 + }, + { + "epoch": 0.10984699882306787, + "grad_norm": 2.552382230758667, + "learning_rate": 0.0001414, + "loss": 1.453, + "step": 70 + }, + { + "epoch": 0.1114162416633974, + "grad_norm": 2.4443907737731934, + "learning_rate": 0.00014342, + "loss": 1.523, + "step": 71 + }, + { + "epoch": 0.11298548450372695, + "grad_norm": 2.312199115753174, + "learning_rate": 0.00014544, + "loss": 1.5554, + "step": 72 + }, + { + "epoch": 0.11455472734405649, + "grad_norm": 2.396787643432617, + "learning_rate": 0.00014746, + "loss": 1.4881, + "step": 73 + }, + { + "epoch": 0.11612397018438603, + "grad_norm": 2.26291823387146, + "learning_rate": 0.00014948, + "loss": 1.5068, + "step": 74 + }, + { + "epoch": 0.11769321302471558, + "grad_norm": 2.450401782989502, + "learning_rate": 0.0001515, + "loss": 1.6034, + "step": 75 + }, + { + "epoch": 0.11926245586504512, + "grad_norm": 2.3044116497039795, + "learning_rate": 0.00015352, + "loss": 1.5237, + "step": 76 + }, + { + "epoch": 0.12083169870537466, + "grad_norm": 2.2995636463165283, + "learning_rate": 0.00015554000000000002, + "loss": 1.488, + "step": 77 + }, + { + "epoch": 0.1224009415457042, + "grad_norm": 2.2703638076782227, + "learning_rate": 0.00015756, + "loss": 1.4811, + "step": 78 + }, + { + "epoch": 0.12397018438603374, + "grad_norm": 2.2350616455078125, + "learning_rate": 0.00015958000000000001, + "loss": 1.4951, + "step": 79 + }, + { + "epoch": 0.12553942722636327, + "grad_norm": 2.418657064437866, + "learning_rate": 0.00016160000000000002, + "loss": 1.5312, + "step": 80 + }, + { + "epoch": 0.12710867006669282, + "grad_norm": 2.272240161895752, + "learning_rate": 0.00016362, + "loss": 1.5335, + "step": 81 + }, + { + "epoch": 0.12867791290702235, + "grad_norm": 2.313049554824829, + "learning_rate": 0.00016564, + "loss": 1.452, + "step": 82 + }, + { + "epoch": 0.1302471557473519, + "grad_norm": 2.386436939239502, + "learning_rate": 0.00016766, + "loss": 1.5705, + "step": 83 + }, + { + "epoch": 0.13181639858768143, + "grad_norm": 2.2719545364379883, + "learning_rate": 0.00016968, + "loss": 1.475, + "step": 84 + }, + { + "epoch": 0.133385641428011, + "grad_norm": 2.182734727859497, + "learning_rate": 0.0001717, + "loss": 1.4947, + "step": 85 + }, + { + "epoch": 0.1349548842683405, + "grad_norm": 2.2887940406799316, + "learning_rate": 0.00017372, + "loss": 1.4818, + "step": 86 + }, + { + "epoch": 0.13652412710867007, + "grad_norm": 2.3344202041625977, + "learning_rate": 0.00017574, + "loss": 1.4725, + "step": 87 + }, + { + "epoch": 0.1380933699489996, + "grad_norm": 2.2350351810455322, + "learning_rate": 0.00017776, + "loss": 1.4801, + "step": 88 + }, + { + "epoch": 0.13966261278932915, + "grad_norm": 2.3677713871002197, + "learning_rate": 0.00017978000000000002, + "loss": 1.4672, + "step": 89 + }, + { + "epoch": 0.1412318556296587, + "grad_norm": 2.4192380905151367, + "learning_rate": 0.0001818, + "loss": 1.4643, + "step": 90 + }, + { + "epoch": 0.14280109846998823, + "grad_norm": 2.3313381671905518, + "learning_rate": 0.00018382, + "loss": 1.4723, + "step": 91 + }, + { + "epoch": 0.14437034131031778, + "grad_norm": 2.380906343460083, + "learning_rate": 0.00018584000000000002, + "loss": 1.4901, + "step": 92 + }, + { + "epoch": 0.1459395841506473, + "grad_norm": 2.4067628383636475, + "learning_rate": 0.00018786, + "loss": 1.4618, + "step": 93 + }, + { + "epoch": 0.14750882699097687, + "grad_norm": 2.5495219230651855, + "learning_rate": 0.00018988, + "loss": 1.4991, + "step": 94 + }, + { + "epoch": 0.1490780698313064, + "grad_norm": 2.0640366077423096, + "learning_rate": 0.0001919, + "loss": 1.4351, + "step": 95 + }, + { + "epoch": 0.15064731267163595, + "grad_norm": 2.391345500946045, + "learning_rate": 0.00019392, + "loss": 1.5894, + "step": 96 + }, + { + "epoch": 0.15221655551196547, + "grad_norm": 2.2739181518554688, + "learning_rate": 0.00019594, + "loss": 1.445, + "step": 97 + }, + { + "epoch": 0.15378579835229503, + "grad_norm": 2.1318423748016357, + "learning_rate": 0.00019796, + "loss": 1.5214, + "step": 98 + }, + { + "epoch": 0.15535504119262455, + "grad_norm": 2.162447690963745, + "learning_rate": 0.00019998, + "loss": 1.5267, + "step": 99 + }, + { + "epoch": 0.1569242840329541, + "grad_norm": 2.212371587753296, + "learning_rate": 0.000202, + "loss": 1.4442, + "step": 100 + }, + { + "epoch": 0.15849352687328364, + "grad_norm": 2.2778406143188477, + "learning_rate": 0.0002019986155169901, + "loss": 1.5475, + "step": 101 + }, + { + "epoch": 0.1600627697136132, + "grad_norm": 2.3117587566375732, + "learning_rate": 0.00020199446210591673, + "loss": 1.474, + "step": 102 + }, + { + "epoch": 0.16163201255394272, + "grad_norm": 2.310950994491577, + "learning_rate": 0.00020198753988064772, + "loss": 1.3859, + "step": 103 + }, + { + "epoch": 0.16320125539427227, + "grad_norm": 2.2392947673797607, + "learning_rate": 0.0002019778490309594, + "loss": 1.5075, + "step": 104 + }, + { + "epoch": 0.1647704982346018, + "grad_norm": 2.1941001415252686, + "learning_rate": 0.00020196538982253126, + "loss": 1.4807, + "step": 105 + }, + { + "epoch": 0.16633974107493135, + "grad_norm": 2.147934675216675, + "learning_rate": 0.0002019501625969389, + "loss": 1.4594, + "step": 106 + }, + { + "epoch": 0.16790898391526088, + "grad_norm": 2.421607732772827, + "learning_rate": 0.00020193216777164428, + "loss": 1.5031, + "step": 107 + }, + { + "epoch": 0.16947822675559043, + "grad_norm": 2.099569320678711, + "learning_rate": 0.0002019114058399847, + "loss": 1.5007, + "step": 108 + }, + { + "epoch": 0.17104746959591996, + "grad_norm": 2.0871477127075195, + "learning_rate": 0.00020188787737115897, + "loss": 1.476, + "step": 109 + }, + { + "epoch": 0.17261671243624951, + "grad_norm": 2.3890280723571777, + "learning_rate": 0.00020186158301021195, + "loss": 1.5477, + "step": 110 + }, + { + "epoch": 0.17418595527657904, + "grad_norm": 2.295742988586426, + "learning_rate": 0.00020183252347801686, + "loss": 1.4869, + "step": 111 + }, + { + "epoch": 0.1757551981169086, + "grad_norm": 2.3225204944610596, + "learning_rate": 0.00020180069957125544, + "loss": 1.4822, + "step": 112 + }, + { + "epoch": 0.17732444095723812, + "grad_norm": 2.23429012298584, + "learning_rate": 0.00020176611216239613, + "loss": 1.489, + "step": 113 + }, + { + "epoch": 0.17889368379756768, + "grad_norm": 2.137282609939575, + "learning_rate": 0.00020172876219967027, + "loss": 1.5072, + "step": 114 + }, + { + "epoch": 0.1804629266378972, + "grad_norm": 2.3375244140625, + "learning_rate": 0.00020168865070704594, + "loss": 1.4568, + "step": 115 + }, + { + "epoch": 0.18203216947822676, + "grad_norm": 2.0739850997924805, + "learning_rate": 0.00020164577878419994, + "loss": 1.4597, + "step": 116 + }, + { + "epoch": 0.18360141231855628, + "grad_norm": 2.071535587310791, + "learning_rate": 0.00020160014760648774, + "loss": 1.4368, + "step": 117 + }, + { + "epoch": 0.18517065515888584, + "grad_norm": 2.0465242862701416, + "learning_rate": 0.00020155175842491107, + "loss": 1.3992, + "step": 118 + }, + { + "epoch": 0.18673989799921537, + "grad_norm": 2.119605541229248, + "learning_rate": 0.00020150061256608387, + "loss": 1.4823, + "step": 119 + }, + { + "epoch": 0.18830914083954492, + "grad_norm": 2.2085328102111816, + "learning_rate": 0.0002014467114321956, + "loss": 1.4771, + "step": 120 + }, + { + "epoch": 0.18987838367987447, + "grad_norm": 2.3099820613861084, + "learning_rate": 0.00020139005650097317, + "loss": 1.4859, + "step": 121 + }, + { + "epoch": 0.191447626520204, + "grad_norm": 2.1115100383758545, + "learning_rate": 0.00020133064932564002, + "loss": 1.4616, + "step": 122 + }, + { + "epoch": 0.19301686936053356, + "grad_norm": 2.0505757331848145, + "learning_rate": 0.000201268491534874, + "loss": 1.4993, + "step": 123 + }, + { + "epoch": 0.19458611220086308, + "grad_norm": 2.181440591812134, + "learning_rate": 0.00020120358483276227, + "loss": 1.4628, + "step": 124 + }, + { + "epoch": 0.19615535504119264, + "grad_norm": 1.945551872253418, + "learning_rate": 0.00020113593099875486, + "loss": 1.4222, + "step": 125 + }, + { + "epoch": 0.19772459788152216, + "grad_norm": 2.000396490097046, + "learning_rate": 0.00020106553188761582, + "loss": 1.4034, + "step": 126 + }, + { + "epoch": 0.19929384072185172, + "grad_norm": 2.1692821979522705, + "learning_rate": 0.0002009923894293723, + "loss": 1.4816, + "step": 127 + }, + { + "epoch": 0.20086308356218124, + "grad_norm": 2.055147409439087, + "learning_rate": 0.00020091650562926183, + "loss": 1.4283, + "step": 128 + }, + { + "epoch": 0.2024323264025108, + "grad_norm": 2.0244970321655273, + "learning_rate": 0.00020083788256767702, + "loss": 1.4366, + "step": 129 + }, + { + "epoch": 0.20400156924284032, + "grad_norm": 1.9098484516143799, + "learning_rate": 0.00020075652240010892, + "loss": 1.3343, + "step": 130 + }, + { + "epoch": 0.20557081208316988, + "grad_norm": 2.3663299083709717, + "learning_rate": 0.00020067242735708754, + "loss": 1.399, + "step": 131 + }, + { + "epoch": 0.2071400549234994, + "grad_norm": 2.1342029571533203, + "learning_rate": 0.00020058559974412102, + "loss": 1.5022, + "step": 132 + }, + { + "epoch": 0.20870929776382896, + "grad_norm": 2.145221471786499, + "learning_rate": 0.00020049604194163217, + "loss": 1.3742, + "step": 133 + }, + { + "epoch": 0.2102785406041585, + "grad_norm": 2.1555604934692383, + "learning_rate": 0.00020040375640489343, + "loss": 1.4698, + "step": 134 + }, + { + "epoch": 0.21184778344448804, + "grad_norm": 2.1271767616271973, + "learning_rate": 0.00020030874566395943, + "loss": 1.4066, + "step": 135 + }, + { + "epoch": 0.21341702628481757, + "grad_norm": 2.0314996242523193, + "learning_rate": 0.00020021101232359757, + "loss": 1.4793, + "step": 136 + }, + { + "epoch": 0.21498626912514712, + "grad_norm": 2.0091471672058105, + "learning_rate": 0.00020011055906321676, + "loss": 1.4372, + "step": 137 + }, + { + "epoch": 0.21655551196547665, + "grad_norm": 2.1660966873168945, + "learning_rate": 0.0002000073886367939, + "loss": 1.3927, + "step": 138 + }, + { + "epoch": 0.2181247548058062, + "grad_norm": 2.262688159942627, + "learning_rate": 0.00019990150387279835, + "loss": 1.4345, + "step": 139 + }, + { + "epoch": 0.21969399764613573, + "grad_norm": 2.0990803241729736, + "learning_rate": 0.00019979290767411438, + "loss": 1.3917, + "step": 140 + }, + { + "epoch": 0.22126324048646528, + "grad_norm": 2.1939804553985596, + "learning_rate": 0.00019968160301796163, + "loss": 1.5128, + "step": 141 + }, + { + "epoch": 0.2228324833267948, + "grad_norm": 1.8914436101913452, + "learning_rate": 0.0001995675929558135, + "loss": 1.4053, + "step": 142 + }, + { + "epoch": 0.22440172616712437, + "grad_norm": 2.1180291175842285, + "learning_rate": 0.0001994508806133134, + "loss": 1.336, + "step": 143 + }, + { + "epoch": 0.2259709690074539, + "grad_norm": 2.0142769813537598, + "learning_rate": 0.0001993314691901892, + "loss": 1.3699, + "step": 144 + }, + { + "epoch": 0.22754021184778345, + "grad_norm": 2.077742576599121, + "learning_rate": 0.00019920936196016534, + "loss": 1.4395, + "step": 145 + }, + { + "epoch": 0.22910945468811297, + "grad_norm": 2.0023796558380127, + "learning_rate": 0.00019908456227087326, + "loss": 1.3826, + "step": 146 + }, + { + "epoch": 0.23067869752844253, + "grad_norm": 2.257500171661377, + "learning_rate": 0.00019895707354375945, + "loss": 1.4208, + "step": 147 + }, + { + "epoch": 0.23224794036877205, + "grad_norm": 2.02466082572937, + "learning_rate": 0.00019882689927399174, + "loss": 1.3968, + "step": 148 + }, + { + "epoch": 0.2338171832091016, + "grad_norm": 2.146395683288574, + "learning_rate": 0.00019869404303036355, + "loss": 1.4605, + "step": 149 + }, + { + "epoch": 0.23538642604943116, + "grad_norm": 2.076221466064453, + "learning_rate": 0.00019855850845519588, + "loss": 1.4345, + "step": 150 + }, + { + "epoch": 0.2369556688897607, + "grad_norm": 2.0548439025878906, + "learning_rate": 0.00019842029926423762, + "loss": 1.3429, + "step": 151 + }, + { + "epoch": 0.23852491173009024, + "grad_norm": 2.017364025115967, + "learning_rate": 0.00019827941924656348, + "loss": 1.3277, + "step": 152 + }, + { + "epoch": 0.24009415457041977, + "grad_norm": 2.0859904289245605, + "learning_rate": 0.00019813587226447034, + "loss": 1.3884, + "step": 153 + }, + { + "epoch": 0.24166339741074933, + "grad_norm": 2.0451669692993164, + "learning_rate": 0.00019798966225337126, + "loss": 1.354, + "step": 154 + }, + { + "epoch": 0.24323264025107885, + "grad_norm": 1.956303596496582, + "learning_rate": 0.00019784079322168752, + "loss": 1.4503, + "step": 155 + }, + { + "epoch": 0.2448018830914084, + "grad_norm": 1.8808430433273315, + "learning_rate": 0.00019768926925073878, + "loss": 1.4561, + "step": 156 + }, + { + "epoch": 0.24637112593173793, + "grad_norm": 1.830910563468933, + "learning_rate": 0.00019753509449463134, + "loss": 1.3961, + "step": 157 + }, + { + "epoch": 0.2479403687720675, + "grad_norm": 1.928559422492981, + "learning_rate": 0.00019737827318014396, + "loss": 1.4138, + "step": 158 + }, + { + "epoch": 0.24950961161239701, + "grad_norm": 1.8621546030044556, + "learning_rate": 0.00019721880960661223, + "loss": 1.3784, + "step": 159 + }, + { + "epoch": 0.25107885445272654, + "grad_norm": 1.8627210855484009, + "learning_rate": 0.00019705670814581052, + "loss": 1.4002, + "step": 160 + }, + { + "epoch": 0.2526480972930561, + "grad_norm": 1.9636502265930176, + "learning_rate": 0.0001968919732418323, + "loss": 1.3627, + "step": 161 + }, + { + "epoch": 0.25421734013338565, + "grad_norm": 1.929227590560913, + "learning_rate": 0.00019672460941096818, + "loss": 1.4889, + "step": 162 + }, + { + "epoch": 0.2557865829737152, + "grad_norm": 1.8213303089141846, + "learning_rate": 0.0001965546212415821, + "loss": 1.4022, + "step": 163 + }, + { + "epoch": 0.2573558258140447, + "grad_norm": 1.883415937423706, + "learning_rate": 0.0001963820133939856, + "loss": 1.4234, + "step": 164 + }, + { + "epoch": 0.25892506865437426, + "grad_norm": 1.8944504261016846, + "learning_rate": 0.00019620679060031003, + "loss": 1.439, + "step": 165 + }, + { + "epoch": 0.2604943114947038, + "grad_norm": 1.863971471786499, + "learning_rate": 0.00019602895766437678, + "loss": 1.3649, + "step": 166 + }, + { + "epoch": 0.26206355433503337, + "grad_norm": 1.7960493564605713, + "learning_rate": 0.0001958485194615656, + "loss": 1.4145, + "step": 167 + }, + { + "epoch": 0.26363279717536287, + "grad_norm": 1.8760932683944702, + "learning_rate": 0.00019566548093868106, + "loss": 1.4017, + "step": 168 + }, + { + "epoch": 0.2652020400156924, + "grad_norm": 1.9801069498062134, + "learning_rate": 0.00019547984711381662, + "loss": 1.4373, + "step": 169 + }, + { + "epoch": 0.266771282856022, + "grad_norm": 1.8810572624206543, + "learning_rate": 0.00019529162307621738, + "loss": 1.4491, + "step": 170 + }, + { + "epoch": 0.26834052569635153, + "grad_norm": 2.004303216934204, + "learning_rate": 0.00019510081398614045, + "loss": 1.338, + "step": 171 + }, + { + "epoch": 0.269909768536681, + "grad_norm": 1.9581718444824219, + "learning_rate": 0.00019490742507471338, + "loss": 1.3921, + "step": 172 + }, + { + "epoch": 0.2714790113770106, + "grad_norm": 1.9514821767807007, + "learning_rate": 0.00019471146164379093, + "loss": 1.3735, + "step": 173 + }, + { + "epoch": 0.27304825421734014, + "grad_norm": 2.156669855117798, + "learning_rate": 0.00019451292906580948, + "loss": 1.4243, + "step": 174 + }, + { + "epoch": 0.2746174970576697, + "grad_norm": 2.1529016494750977, + "learning_rate": 0.00019431183278363997, + "loss": 1.4146, + "step": 175 + }, + { + "epoch": 0.2761867398979992, + "grad_norm": 1.9543150663375854, + "learning_rate": 0.00019410817831043856, + "loss": 1.4225, + "step": 176 + }, + { + "epoch": 0.27775598273832874, + "grad_norm": 2.0345335006713867, + "learning_rate": 0.00019390197122949552, + "loss": 1.4153, + "step": 177 + }, + { + "epoch": 0.2793252255786583, + "grad_norm": 1.9812119007110596, + "learning_rate": 0.0001936932171940821, + "loss": 1.3389, + "step": 178 + }, + { + "epoch": 0.28089446841898785, + "grad_norm": 1.9412561655044556, + "learning_rate": 0.0001934819219272957, + "loss": 1.4274, + "step": 179 + }, + { + "epoch": 0.2824637112593174, + "grad_norm": 1.8606380224227905, + "learning_rate": 0.0001932680912219027, + "loss": 1.4213, + "step": 180 + }, + { + "epoch": 0.2840329540996469, + "grad_norm": 1.9317265748977661, + "learning_rate": 0.00019305173094017996, + "loss": 1.3488, + "step": 181 + }, + { + "epoch": 0.28560219693997646, + "grad_norm": 1.8266358375549316, + "learning_rate": 0.00019283284701375393, + "loss": 1.3412, + "step": 182 + }, + { + "epoch": 0.287171439780306, + "grad_norm": 1.8459333181381226, + "learning_rate": 0.00019261144544343794, + "loss": 1.3835, + "step": 183 + }, + { + "epoch": 0.28874068262063557, + "grad_norm": 2.1560158729553223, + "learning_rate": 0.00019238753229906797, + "loss": 1.3651, + "step": 184 + }, + { + "epoch": 0.29030992546096507, + "grad_norm": 2.070158004760742, + "learning_rate": 0.00019216111371933594, + "loss": 1.3273, + "step": 185 + }, + { + "epoch": 0.2918791683012946, + "grad_norm": 2.0824270248413086, + "learning_rate": 0.00019193219591162155, + "loss": 1.3123, + "step": 186 + }, + { + "epoch": 0.2934484111416242, + "grad_norm": 1.988049030303955, + "learning_rate": 0.00019170078515182216, + "loss": 1.3751, + "step": 187 + }, + { + "epoch": 0.29501765398195373, + "grad_norm": 1.8832893371582031, + "learning_rate": 0.0001914668877841807, + "loss": 1.3474, + "step": 188 + }, + { + "epoch": 0.29658689682228323, + "grad_norm": 2.0252370834350586, + "learning_rate": 0.0001912305102211116, + "loss": 1.4584, + "step": 189 + }, + { + "epoch": 0.2981561396626128, + "grad_norm": 1.966884970664978, + "learning_rate": 0.00019099165894302515, + "loss": 1.3996, + "step": 190 + }, + { + "epoch": 0.29972538250294234, + "grad_norm": 1.7160698175430298, + "learning_rate": 0.00019075034049814983, + "loss": 1.3406, + "step": 191 + }, + { + "epoch": 0.3012946253432719, + "grad_norm": 1.9381587505340576, + "learning_rate": 0.00019050656150235268, + "loss": 1.3609, + "step": 192 + }, + { + "epoch": 0.3028638681836014, + "grad_norm": 1.8005485534667969, + "learning_rate": 0.00019026032863895805, + "loss": 1.3029, + "step": 193 + }, + { + "epoch": 0.30443311102393095, + "grad_norm": 1.8562763929367065, + "learning_rate": 0.0001900116486585642, + "loss": 1.3968, + "step": 194 + }, + { + "epoch": 0.3060023538642605, + "grad_norm": 1.8850858211517334, + "learning_rate": 0.0001897605283788585, + "loss": 1.4209, + "step": 195 + }, + { + "epoch": 0.30757159670459006, + "grad_norm": 1.8385757207870483, + "learning_rate": 0.0001895069746844302, + "loss": 1.3793, + "step": 196 + }, + { + "epoch": 0.30914083954491955, + "grad_norm": 1.7821141481399536, + "learning_rate": 0.000189250994526582, + "loss": 1.3574, + "step": 197 + }, + { + "epoch": 0.3107100823852491, + "grad_norm": 1.9112757444381714, + "learning_rate": 0.00018899259492313915, + "loss": 1.4031, + "step": 198 + }, + { + "epoch": 0.31227932522557866, + "grad_norm": 1.9298224449157715, + "learning_rate": 0.00018873178295825732, + "loss": 1.3782, + "step": 199 + }, + { + "epoch": 0.3138485680659082, + "grad_norm": 1.8507230281829834, + "learning_rate": 0.00018846856578222832, + "loss": 1.372, + "step": 200 + }, + { + "epoch": 0.3154178109062377, + "grad_norm": 1.8771377801895142, + "learning_rate": 0.00018820295061128394, + "loss": 1.3568, + "step": 201 + }, + { + "epoch": 0.31698705374656727, + "grad_norm": 1.9520775079727173, + "learning_rate": 0.00018793494472739831, + "loss": 1.297, + "step": 202 + }, + { + "epoch": 0.3185562965868968, + "grad_norm": 1.7460178136825562, + "learning_rate": 0.00018766455547808813, + "loss": 1.3382, + "step": 203 + }, + { + "epoch": 0.3201255394272264, + "grad_norm": 1.793826937675476, + "learning_rate": 0.0001873917902762112, + "loss": 1.2849, + "step": 204 + }, + { + "epoch": 0.3216947822675559, + "grad_norm": 1.734061598777771, + "learning_rate": 0.0001871166565997633, + "loss": 1.2942, + "step": 205 + }, + { + "epoch": 0.32326402510788543, + "grad_norm": 1.863922119140625, + "learning_rate": 0.00018683916199167325, + "loss": 1.3778, + "step": 206 + }, + { + "epoch": 0.324833267948215, + "grad_norm": 1.8447890281677246, + "learning_rate": 0.00018655931405959586, + "loss": 1.3759, + "step": 207 + }, + { + "epoch": 0.32640251078854454, + "grad_norm": 1.857258915901184, + "learning_rate": 0.00018627712047570352, + "loss": 1.3924, + "step": 208 + }, + { + "epoch": 0.32797175362887404, + "grad_norm": 1.7293856143951416, + "learning_rate": 0.00018599258897647594, + "loss": 1.3082, + "step": 209 + }, + { + "epoch": 0.3295409964692036, + "grad_norm": 1.8512126207351685, + "learning_rate": 0.00018570572736248782, + "loss": 1.4116, + "step": 210 + }, + { + "epoch": 0.33111023930953315, + "grad_norm": 2.1293859481811523, + "learning_rate": 0.0001854165434981953, + "loss": 1.2651, + "step": 211 + }, + { + "epoch": 0.3326794821498627, + "grad_norm": 2.024932622909546, + "learning_rate": 0.00018512504531172005, + "loss": 1.4358, + "step": 212 + }, + { + "epoch": 0.33424872499019226, + "grad_norm": 1.9704970121383667, + "learning_rate": 0.0001848312407946321, + "loss": 1.3563, + "step": 213 + }, + { + "epoch": 0.33581796783052176, + "grad_norm": 2.0325543880462646, + "learning_rate": 0.00018453513800173072, + "loss": 1.3437, + "step": 214 + }, + { + "epoch": 0.3373872106708513, + "grad_norm": 1.8330961465835571, + "learning_rate": 0.00018423674505082356, + "loss": 1.4267, + "step": 215 + }, + { + "epoch": 0.33895645351118087, + "grad_norm": 1.8310669660568237, + "learning_rate": 0.0001839360701225041, + "loss": 1.3147, + "step": 216 + }, + { + "epoch": 0.3405256963515104, + "grad_norm": 1.851898193359375, + "learning_rate": 0.00018363312145992737, + "loss": 1.2873, + "step": 217 + }, + { + "epoch": 0.3420949391918399, + "grad_norm": 2.0073893070220947, + "learning_rate": 0.00018332790736858397, + "loss": 1.338, + "step": 218 + }, + { + "epoch": 0.3436641820321695, + "grad_norm": 1.9479587078094482, + "learning_rate": 0.00018302043621607245, + "loss": 1.3545, + "step": 219 + }, + { + "epoch": 0.34523342487249903, + "grad_norm": 1.9442317485809326, + "learning_rate": 0.00018271071643186968, + "loss": 1.3116, + "step": 220 + }, + { + "epoch": 0.3468026677128286, + "grad_norm": 1.840929388999939, + "learning_rate": 0.0001823987565071001, + "loss": 1.3155, + "step": 221 + }, + { + "epoch": 0.3483719105531581, + "grad_norm": 1.7986841201782227, + "learning_rate": 0.00018208456499430256, + "loss": 1.272, + "step": 222 + }, + { + "epoch": 0.34994115339348764, + "grad_norm": 1.8865517377853394, + "learning_rate": 0.00018176815050719615, + "loss": 1.2964, + "step": 223 + }, + { + "epoch": 0.3515103962338172, + "grad_norm": 1.9019880294799805, + "learning_rate": 0.00018144952172044381, + "loss": 1.3574, + "step": 224 + }, + { + "epoch": 0.35307963907414674, + "grad_norm": 1.8171775341033936, + "learning_rate": 0.00018112868736941477, + "loss": 1.3124, + "step": 225 + }, + { + "epoch": 0.35464888191447624, + "grad_norm": 1.8235315084457397, + "learning_rate": 0.00018080565624994474, + "loss": 1.3242, + "step": 226 + }, + { + "epoch": 0.3562181247548058, + "grad_norm": 1.7586414813995361, + "learning_rate": 0.00018048043721809507, + "loss": 1.3221, + "step": 227 + }, + { + "epoch": 0.35778736759513535, + "grad_norm": 1.851314663887024, + "learning_rate": 0.00018015303918990982, + "loss": 1.2603, + "step": 228 + }, + { + "epoch": 0.3593566104354649, + "grad_norm": 1.8386470079421997, + "learning_rate": 0.0001798234711411713, + "loss": 1.3862, + "step": 229 + }, + { + "epoch": 0.3609258532757944, + "grad_norm": 1.7785301208496094, + "learning_rate": 0.00017949174210715407, + "loss": 1.292, + "step": 230 + }, + { + "epoch": 0.36249509611612396, + "grad_norm": 1.790682315826416, + "learning_rate": 0.00017915786118237714, + "loss": 1.3234, + "step": 231 + }, + { + "epoch": 0.3640643389564535, + "grad_norm": 1.743922472000122, + "learning_rate": 0.0001788218375203547, + "loss": 1.2607, + "step": 232 + }, + { + "epoch": 0.36563358179678307, + "grad_norm": 1.9812731742858887, + "learning_rate": 0.00017848368033334528, + "loss": 1.2807, + "step": 233 + }, + { + "epoch": 0.36720282463711257, + "grad_norm": 1.8043054342269897, + "learning_rate": 0.00017814339889209887, + "loss": 1.2972, + "step": 234 + }, + { + "epoch": 0.3687720674774421, + "grad_norm": 1.8638029098510742, + "learning_rate": 0.00017780100252560313, + "loss": 1.3205, + "step": 235 + }, + { + "epoch": 0.3703413103177717, + "grad_norm": 1.6905754804611206, + "learning_rate": 0.0001774565006208274, + "loss": 1.3694, + "step": 236 + }, + { + "epoch": 0.37191055315810123, + "grad_norm": 2.024243116378784, + "learning_rate": 0.00017710990262246543, + "loss": 1.3052, + "step": 237 + }, + { + "epoch": 0.37347979599843073, + "grad_norm": 1.7341359853744507, + "learning_rate": 0.0001767612180326764, + "loss": 1.2568, + "step": 238 + }, + { + "epoch": 0.3750490388387603, + "grad_norm": 1.910465955734253, + "learning_rate": 0.00017641045641082453, + "loss": 1.2682, + "step": 239 + }, + { + "epoch": 0.37661828167908984, + "grad_norm": 2.0867741107940674, + "learning_rate": 0.00017605762737321683, + "loss": 1.2977, + "step": 240 + }, + { + "epoch": 0.3781875245194194, + "grad_norm": 1.8427789211273193, + "learning_rate": 0.0001757027405928396, + "loss": 1.3258, + "step": 241 + }, + { + "epoch": 0.37975676735974895, + "grad_norm": 1.8280835151672363, + "learning_rate": 0.0001753458057990932, + "loss": 1.3496, + "step": 242 + }, + { + "epoch": 0.38132601020007845, + "grad_norm": 1.9032690525054932, + "learning_rate": 0.00017498683277752527, + "loss": 1.3686, + "step": 243 + }, + { + "epoch": 0.382895253040408, + "grad_norm": 1.81566321849823, + "learning_rate": 0.00017462583136956258, + "loss": 1.3302, + "step": 244 + }, + { + "epoch": 0.38446449588073756, + "grad_norm": 1.8299857378005981, + "learning_rate": 0.00017426281147224105, + "loss": 1.3012, + "step": 245 + }, + { + "epoch": 0.3860337387210671, + "grad_norm": 1.9972599744796753, + "learning_rate": 0.00017389778303793457, + "loss": 1.314, + "step": 246 + }, + { + "epoch": 0.3876029815613966, + "grad_norm": 1.8850973844528198, + "learning_rate": 0.00017353075607408209, + "loss": 1.2659, + "step": 247 + }, + { + "epoch": 0.38917222440172616, + "grad_norm": 1.8441429138183594, + "learning_rate": 0.00017316174064291315, + "loss": 1.3649, + "step": 248 + }, + { + "epoch": 0.3907414672420557, + "grad_norm": 1.9221850633621216, + "learning_rate": 0.00017279074686117225, + "loss": 1.3119, + "step": 249 + }, + { + "epoch": 0.39231071008238527, + "grad_norm": 1.762587547302246, + "learning_rate": 0.0001724177848998413, + "loss": 1.3002, + "step": 250 + }, + { + "epoch": 0.39387995292271477, + "grad_norm": 1.948606014251709, + "learning_rate": 0.0001720428649838609, + "loss": 1.3097, + "step": 251 + }, + { + "epoch": 0.3954491957630443, + "grad_norm": 1.9769690036773682, + "learning_rate": 0.0001716659973918499, + "loss": 1.2652, + "step": 252 + }, + { + "epoch": 0.3970184386033739, + "grad_norm": 1.8369519710540771, + "learning_rate": 0.00017128719245582374, + "loss": 1.2252, + "step": 253 + }, + { + "epoch": 0.39858768144370343, + "grad_norm": 1.783319115638733, + "learning_rate": 0.0001709064605609111, + "loss": 1.3208, + "step": 254 + }, + { + "epoch": 0.40015692428403293, + "grad_norm": 1.836661696434021, + "learning_rate": 0.00017052381214506914, + "loss": 1.2694, + "step": 255 + }, + { + "epoch": 0.4017261671243625, + "grad_norm": 1.7220667600631714, + "learning_rate": 0.00017013925769879755, + "loss": 1.3311, + "step": 256 + }, + { + "epoch": 0.40329540996469204, + "grad_norm": 1.724839687347412, + "learning_rate": 0.0001697528077648507, + "loss": 1.2153, + "step": 257 + }, + { + "epoch": 0.4048646528050216, + "grad_norm": 1.788538932800293, + "learning_rate": 0.00016936447293794873, + "loss": 1.2579, + "step": 258 + }, + { + "epoch": 0.4064338956453511, + "grad_norm": 1.8611515760421753, + "learning_rate": 0.0001689742638644871, + "loss": 1.3162, + "step": 259 + }, + { + "epoch": 0.40800313848568065, + "grad_norm": 2.0099024772644043, + "learning_rate": 0.0001685821912422447, + "loss": 1.3641, + "step": 260 + }, + { + "epoch": 0.4095723813260102, + "grad_norm": 1.8042457103729248, + "learning_rate": 0.00016818826582009044, + "loss": 1.2704, + "step": 261 + }, + { + "epoch": 0.41114162416633976, + "grad_norm": 1.6778589487075806, + "learning_rate": 0.00016779249839768884, + "loss": 1.2519, + "step": 262 + }, + { + "epoch": 0.41271086700666926, + "grad_norm": 1.8367433547973633, + "learning_rate": 0.00016739489982520368, + "loss": 1.3781, + "step": 263 + }, + { + "epoch": 0.4142801098469988, + "grad_norm": 1.75066339969635, + "learning_rate": 0.00016699548100300066, + "loss": 1.3071, + "step": 264 + }, + { + "epoch": 0.41584935268732837, + "grad_norm": 1.7271960973739624, + "learning_rate": 0.00016659425288134854, + "loss": 1.2459, + "step": 265 + }, + { + "epoch": 0.4174185955276579, + "grad_norm": 1.7992873191833496, + "learning_rate": 0.00016619122646011902, + "loss": 1.3366, + "step": 266 + }, + { + "epoch": 0.4189878383679874, + "grad_norm": 1.7786139249801636, + "learning_rate": 0.00016578641278848497, + "loss": 1.3496, + "step": 267 + }, + { + "epoch": 0.420557081208317, + "grad_norm": 1.6860625743865967, + "learning_rate": 0.00016537982296461768, + "loss": 1.2674, + "step": 268 + }, + { + "epoch": 0.42212632404864653, + "grad_norm": 1.948914885520935, + "learning_rate": 0.00016497146813538257, + "loss": 1.3536, + "step": 269 + }, + { + "epoch": 0.4236955668889761, + "grad_norm": 1.8243354558944702, + "learning_rate": 0.00016456135949603358, + "loss": 1.2287, + "step": 270 + }, + { + "epoch": 0.42526480972930564, + "grad_norm": 1.869276762008667, + "learning_rate": 0.00016414950828990625, + "loss": 1.359, + "step": 271 + }, + { + "epoch": 0.42683405256963514, + "grad_norm": 1.8261938095092773, + "learning_rate": 0.00016373592580810935, + "loss": 1.2632, + "step": 272 + }, + { + "epoch": 0.4284032954099647, + "grad_norm": 1.778885006904602, + "learning_rate": 0.00016332062338921563, + "loss": 1.2505, + "step": 273 + }, + { + "epoch": 0.42997253825029425, + "grad_norm": 1.8894082307815552, + "learning_rate": 0.00016290361241895064, + "loss": 1.268, + "step": 274 + }, + { + "epoch": 0.4315417810906238, + "grad_norm": 1.956537127494812, + "learning_rate": 0.0001624849043298808, + "loss": 1.3514, + "step": 275 + }, + { + "epoch": 0.4331110239309533, + "grad_norm": 1.758349895477295, + "learning_rate": 0.00016206451060109988, + "loss": 1.3651, + "step": 276 + }, + { + "epoch": 0.43468026677128285, + "grad_norm": 1.8040424585342407, + "learning_rate": 0.0001616424427579143, + "loss": 1.2928, + "step": 277 + }, + { + "epoch": 0.4362495096116124, + "grad_norm": 1.750389814376831, + "learning_rate": 0.0001612187123715272, + "loss": 1.3145, + "step": 278 + }, + { + "epoch": 0.43781875245194196, + "grad_norm": 1.7329552173614502, + "learning_rate": 0.00016079333105872118, + "loss": 1.3341, + "step": 279 + }, + { + "epoch": 0.43938799529227146, + "grad_norm": 1.630483865737915, + "learning_rate": 0.00016036631048153979, + "loss": 1.2743, + "step": 280 + }, + { + "epoch": 0.440957238132601, + "grad_norm": 1.7958582639694214, + "learning_rate": 0.00015993766234696785, + "loss": 1.2129, + "step": 281 + }, + { + "epoch": 0.44252648097293057, + "grad_norm": 1.8848406076431274, + "learning_rate": 0.00015950739840661055, + "loss": 1.3214, + "step": 282 + }, + { + "epoch": 0.4440957238132601, + "grad_norm": 1.718554139137268, + "learning_rate": 0.00015907553045637116, + "loss": 1.3056, + "step": 283 + }, + { + "epoch": 0.4456649666535896, + "grad_norm": 1.7483363151550293, + "learning_rate": 0.00015864207033612762, + "loss": 1.2572, + "step": 284 + }, + { + "epoch": 0.4472342094939192, + "grad_norm": 1.7969883680343628, + "learning_rate": 0.00015820702992940813, + "loss": 1.241, + "step": 285 + }, + { + "epoch": 0.44880345233424873, + "grad_norm": 1.8833965063095093, + "learning_rate": 0.0001577704211630652, + "loss": 1.2755, + "step": 286 + }, + { + "epoch": 0.4503726951745783, + "grad_norm": 1.8978298902511597, + "learning_rate": 0.00015733225600694873, + "loss": 1.3369, + "step": 287 + }, + { + "epoch": 0.4519419380149078, + "grad_norm": 1.721032738685608, + "learning_rate": 0.00015689254647357776, + "loss": 1.3007, + "step": 288 + }, + { + "epoch": 0.45351118085523734, + "grad_norm": 1.7071144580841064, + "learning_rate": 0.0001564513046178113, + "loss": 1.2972, + "step": 289 + }, + { + "epoch": 0.4550804236955669, + "grad_norm": 1.8105018138885498, + "learning_rate": 0.00015600854253651776, + "loss": 1.2729, + "step": 290 + }, + { + "epoch": 0.45664966653589645, + "grad_norm": 1.7796598672866821, + "learning_rate": 0.00015556427236824318, + "loss": 1.2485, + "step": 291 + }, + { + "epoch": 0.45821890937622595, + "grad_norm": 1.795495867729187, + "learning_rate": 0.00015511850629287865, + "loss": 1.2461, + "step": 292 + }, + { + "epoch": 0.4597881522165555, + "grad_norm": 1.7340847253799438, + "learning_rate": 0.00015467125653132637, + "loss": 1.1394, + "step": 293 + }, + { + "epoch": 0.46135739505688506, + "grad_norm": 1.7541520595550537, + "learning_rate": 0.00015422253534516444, + "loss": 1.2426, + "step": 294 + }, + { + "epoch": 0.4629266378972146, + "grad_norm": 1.8418830633163452, + "learning_rate": 0.00015377235503631083, + "loss": 1.3093, + "step": 295 + }, + { + "epoch": 0.4644958807375441, + "grad_norm": 1.7632546424865723, + "learning_rate": 0.00015332072794668617, + "loss": 1.2566, + "step": 296 + }, + { + "epoch": 0.46606512357787366, + "grad_norm": 1.733276128768921, + "learning_rate": 0.0001528676664578752, + "loss": 1.2236, + "step": 297 + }, + { + "epoch": 0.4676343664182032, + "grad_norm": 1.6496461629867554, + "learning_rate": 0.00015241318299078751, + "loss": 1.2501, + "step": 298 + }, + { + "epoch": 0.46920360925853277, + "grad_norm": 1.7713314294815063, + "learning_rate": 0.00015195729000531694, + "loss": 1.2539, + "step": 299 + }, + { + "epoch": 0.4707728520988623, + "grad_norm": 1.6796895265579224, + "learning_rate": 0.0001515, + "loss": 1.2113, + "step": 300 + }, + { + "epoch": 0.4723420949391918, + "grad_norm": 1.9239540100097656, + "learning_rate": 0.00015104132551167318, + "loss": 1.2986, + "step": 301 + }, + { + "epoch": 0.4739113377795214, + "grad_norm": 1.78468656539917, + "learning_rate": 0.00015058127911512923, + "loss": 1.256, + "step": 302 + }, + { + "epoch": 0.47548058061985093, + "grad_norm": 1.7345401048660278, + "learning_rate": 0.00015011987342277255, + "loss": 1.3178, + "step": 303 + }, + { + "epoch": 0.4770498234601805, + "grad_norm": 1.6663039922714233, + "learning_rate": 0.00014965712108427323, + "loss": 1.2479, + "step": 304 + }, + { + "epoch": 0.47861906630051, + "grad_norm": 1.7298219203948975, + "learning_rate": 0.00014919303478622045, + "loss": 1.3205, + "step": 305 + }, + { + "epoch": 0.48018830914083954, + "grad_norm": 1.7559915781021118, + "learning_rate": 0.00014872762725177447, + "loss": 1.2312, + "step": 306 + }, + { + "epoch": 0.4817575519811691, + "grad_norm": 1.6063960790634155, + "learning_rate": 0.00014826091124031792, + "loss": 1.2061, + "step": 307 + }, + { + "epoch": 0.48332679482149865, + "grad_norm": 1.6477845907211304, + "learning_rate": 0.00014779289954710604, + "loss": 1.2162, + "step": 308 + }, + { + "epoch": 0.48489603766182815, + "grad_norm": 1.8582090139389038, + "learning_rate": 0.00014732360500291583, + "loss": 1.1779, + "step": 309 + }, + { + "epoch": 0.4864652805021577, + "grad_norm": 1.6793705224990845, + "learning_rate": 0.00014685304047369423, + "loss": 1.2055, + "step": 310 + }, + { + "epoch": 0.48803452334248726, + "grad_norm": 1.754279375076294, + "learning_rate": 0.00014638121886020555, + "loss": 1.2283, + "step": 311 + }, + { + "epoch": 0.4896037661828168, + "grad_norm": 1.7692251205444336, + "learning_rate": 0.00014590815309767767, + "loss": 1.1791, + "step": 312 + }, + { + "epoch": 0.4911730090231463, + "grad_norm": 1.7964473962783813, + "learning_rate": 0.00014543385615544744, + "loss": 1.1909, + "step": 313 + }, + { + "epoch": 0.49274225186347587, + "grad_norm": 1.6660207509994507, + "learning_rate": 0.0001449583410366051, + "loss": 1.2513, + "step": 314 + }, + { + "epoch": 0.4943114947038054, + "grad_norm": 1.7291803359985352, + "learning_rate": 0.00014448162077763783, + "loss": 1.2748, + "step": 315 + }, + { + "epoch": 0.495880737544135, + "grad_norm": 1.760999083518982, + "learning_rate": 0.00014400370844807234, + "loss": 1.2899, + "step": 316 + }, + { + "epoch": 0.4974499803844645, + "grad_norm": 1.8744004964828491, + "learning_rate": 0.0001435246171501166, + "loss": 1.2209, + "step": 317 + }, + { + "epoch": 0.49901922322479403, + "grad_norm": 1.713536024093628, + "learning_rate": 0.00014304436001830054, + "loss": 1.2697, + "step": 318 + }, + { + "epoch": 0.5005884660651235, + "grad_norm": 1.6927945613861084, + "learning_rate": 0.000142562950219116, + "loss": 1.23, + "step": 319 + }, + { + "epoch": 0.5021577089054531, + "grad_norm": 1.8305960893630981, + "learning_rate": 0.00014208040095065584, + "loss": 1.3223, + "step": 320 + }, + { + "epoch": 0.5037269517457826, + "grad_norm": 1.8874188661575317, + "learning_rate": 0.000141596725442252, + "loss": 1.3081, + "step": 321 + }, + { + "epoch": 0.5052961945861122, + "grad_norm": 1.7516905069351196, + "learning_rate": 0.00014111193695411285, + "loss": 1.1938, + "step": 322 + }, + { + "epoch": 0.5068654374264417, + "grad_norm": 1.7049230337142944, + "learning_rate": 0.00014062604877695972, + "loss": 1.2761, + "step": 323 + }, + { + "epoch": 0.5084346802667713, + "grad_norm": 1.8058017492294312, + "learning_rate": 0.0001401390742316624, + "loss": 1.2404, + "step": 324 + }, + { + "epoch": 0.5100039231071009, + "grad_norm": 1.6817086935043335, + "learning_rate": 0.00013965102666887408, + "loss": 1.2652, + "step": 325 + }, + { + "epoch": 0.5115731659474304, + "grad_norm": 1.6659653186798096, + "learning_rate": 0.0001391619194686652, + "loss": 1.2307, + "step": 326 + }, + { + "epoch": 0.5131424087877599, + "grad_norm": 1.768681526184082, + "learning_rate": 0.00013867176604015672, + "loss": 1.1658, + "step": 327 + }, + { + "epoch": 0.5147116516280894, + "grad_norm": 1.7019435167312622, + "learning_rate": 0.0001381805798211525, + "loss": 1.2692, + "step": 328 + }, + { + "epoch": 0.516280894468419, + "grad_norm": 1.7097575664520264, + "learning_rate": 0.00013768837427777082, + "loss": 1.2284, + "step": 329 + }, + { + "epoch": 0.5178501373087485, + "grad_norm": 1.6221075057983398, + "learning_rate": 0.0001371951629040753, + "loss": 1.229, + "step": 330 + }, + { + "epoch": 0.5194193801490781, + "grad_norm": 1.6491563320159912, + "learning_rate": 0.00013670095922170498, + "loss": 1.2365, + "step": 331 + }, + { + "epoch": 0.5209886229894076, + "grad_norm": 1.6598455905914307, + "learning_rate": 0.00013620577677950335, + "loss": 1.1755, + "step": 332 + }, + { + "epoch": 0.5225578658297372, + "grad_norm": 1.634438157081604, + "learning_rate": 0.00013570962915314725, + "loss": 1.2135, + "step": 333 + }, + { + "epoch": 0.5241271086700667, + "grad_norm": 1.6856127977371216, + "learning_rate": 0.00013521252994477446, + "loss": 1.1748, + "step": 334 + }, + { + "epoch": 0.5256963515103963, + "grad_norm": 1.7330875396728516, + "learning_rate": 0.00013471449278261086, + "loss": 1.2766, + "step": 335 + }, + { + "epoch": 0.5272655943507257, + "grad_norm": 1.8133728504180908, + "learning_rate": 0.0001342155313205969, + "loss": 1.1994, + "step": 336 + }, + { + "epoch": 0.5288348371910553, + "grad_norm": 1.7580565214157104, + "learning_rate": 0.0001337156592380131, + "loss": 1.2956, + "step": 337 + }, + { + "epoch": 0.5304040800313848, + "grad_norm": 1.6416274309158325, + "learning_rate": 0.00013321489023910508, + "loss": 1.1444, + "step": 338 + }, + { + "epoch": 0.5319733228717144, + "grad_norm": 1.599387526512146, + "learning_rate": 0.0001327132380527079, + "loss": 1.2985, + "step": 339 + }, + { + "epoch": 0.533542565712044, + "grad_norm": 1.7045061588287354, + "learning_rate": 0.0001322107164318697, + "loss": 1.2182, + "step": 340 + }, + { + "epoch": 0.5351118085523735, + "grad_norm": 1.617336630821228, + "learning_rate": 0.00013170733915347451, + "loss": 1.2239, + "step": 341 + }, + { + "epoch": 0.5366810513927031, + "grad_norm": 1.7053676843643188, + "learning_rate": 0.00013120312001786477, + "loss": 1.2439, + "step": 342 + }, + { + "epoch": 0.5382502942330326, + "grad_norm": 1.664506435394287, + "learning_rate": 0.0001306980728484627, + "loss": 1.1968, + "step": 343 + }, + { + "epoch": 0.539819537073362, + "grad_norm": 1.5783089399337769, + "learning_rate": 0.00013019221149139162, + "loss": 1.152, + "step": 344 + }, + { + "epoch": 0.5413887799136916, + "grad_norm": 1.7623755931854248, + "learning_rate": 0.00012968554981509622, + "loss": 1.235, + "step": 345 + }, + { + "epoch": 0.5429580227540212, + "grad_norm": 1.8229283094406128, + "learning_rate": 0.00012917810170996218, + "loss": 1.2662, + "step": 346 + }, + { + "epoch": 0.5445272655943507, + "grad_norm": 1.754353404045105, + "learning_rate": 0.0001286698810879357, + "loss": 1.2447, + "step": 347 + }, + { + "epoch": 0.5460965084346803, + "grad_norm": 1.679383397102356, + "learning_rate": 0.00012816090188214182, + "loss": 1.134, + "step": 348 + }, + { + "epoch": 0.5476657512750098, + "grad_norm": 1.7973510026931763, + "learning_rate": 0.00012765117804650267, + "loss": 1.289, + "step": 349 + }, + { + "epoch": 0.5492349941153394, + "grad_norm": 1.8203507661819458, + "learning_rate": 0.0001271407235553546, + "loss": 1.2394, + "step": 350 + }, + { + "epoch": 0.5508042369556689, + "grad_norm": 1.7235724925994873, + "learning_rate": 0.00012662955240306538, + "loss": 1.2863, + "step": 351 + }, + { + "epoch": 0.5523734797959984, + "grad_norm": 1.6305742263793945, + "learning_rate": 0.00012611767860365038, + "loss": 1.2085, + "step": 352 + }, + { + "epoch": 0.5539427226363279, + "grad_norm": 1.9298070669174194, + "learning_rate": 0.00012560511619038827, + "loss": 1.1949, + "step": 353 + }, + { + "epoch": 0.5555119654766575, + "grad_norm": 1.7499457597732544, + "learning_rate": 0.00012509187921543667, + "loss": 1.2497, + "step": 354 + }, + { + "epoch": 0.557081208316987, + "grad_norm": 1.7231147289276123, + "learning_rate": 0.00012457798174944645, + "loss": 1.2064, + "step": 355 + }, + { + "epoch": 0.5586504511573166, + "grad_norm": 1.7700331211090088, + "learning_rate": 0.00012406343788117625, + "loss": 1.2664, + "step": 356 + }, + { + "epoch": 0.5602196939976462, + "grad_norm": 1.7591279745101929, + "learning_rate": 0.0001235482617171061, + "loss": 1.2207, + "step": 357 + }, + { + "epoch": 0.5617889368379757, + "grad_norm": 1.633703589439392, + "learning_rate": 0.00012303246738105082, + "loss": 1.2179, + "step": 358 + }, + { + "epoch": 0.5633581796783053, + "grad_norm": 1.8380872011184692, + "learning_rate": 0.00012251606901377265, + "loss": 1.2262, + "step": 359 + }, + { + "epoch": 0.5649274225186348, + "grad_norm": 1.5703797340393066, + "learning_rate": 0.00012199908077259367, + "loss": 1.2049, + "step": 360 + }, + { + "epoch": 0.5664966653589643, + "grad_norm": 1.7306458950042725, + "learning_rate": 0.00012148151683100776, + "loss": 1.1827, + "step": 361 + }, + { + "epoch": 0.5680659081992938, + "grad_norm": 1.823798418045044, + "learning_rate": 0.00012096339137829174, + "loss": 1.2548, + "step": 362 + }, + { + "epoch": 0.5696351510396234, + "grad_norm": 1.8271148204803467, + "learning_rate": 0.00012044471861911666, + "loss": 1.2351, + "step": 363 + }, + { + "epoch": 0.5712043938799529, + "grad_norm": 1.7489742040634155, + "learning_rate": 0.0001199255127731582, + "loss": 1.2063, + "step": 364 + }, + { + "epoch": 0.5727736367202825, + "grad_norm": 1.8556783199310303, + "learning_rate": 0.00011940578807470692, + "loss": 1.2427, + "step": 365 + }, + { + "epoch": 0.574342879560612, + "grad_norm": 1.799114465713501, + "learning_rate": 0.00011888555877227793, + "loss": 1.2112, + "step": 366 + }, + { + "epoch": 0.5759121224009416, + "grad_norm": 1.662523627281189, + "learning_rate": 0.00011836483912822035, + "loss": 1.161, + "step": 367 + }, + { + "epoch": 0.5774813652412711, + "grad_norm": 1.794938564300537, + "learning_rate": 0.00011784364341832634, + "loss": 1.2121, + "step": 368 + }, + { + "epoch": 0.5790506080816006, + "grad_norm": 1.6661959886550903, + "learning_rate": 0.00011732198593143949, + "loss": 1.2353, + "step": 369 + }, + { + "epoch": 0.5806198509219301, + "grad_norm": 1.6102111339569092, + "learning_rate": 0.00011679988096906333, + "loss": 1.1507, + "step": 370 + }, + { + "epoch": 0.5821890937622597, + "grad_norm": 1.7132065296173096, + "learning_rate": 0.00011627734284496917, + "loss": 1.2091, + "step": 371 + }, + { + "epoch": 0.5837583366025892, + "grad_norm": 1.7906545400619507, + "learning_rate": 0.00011575438588480359, + "loss": 1.2118, + "step": 372 + }, + { + "epoch": 0.5853275794429188, + "grad_norm": 1.6036125421524048, + "learning_rate": 0.00011523102442569585, + "loss": 1.1802, + "step": 373 + }, + { + "epoch": 0.5868968222832484, + "grad_norm": 1.668480634689331, + "learning_rate": 0.00011470727281586475, + "loss": 1.1865, + "step": 374 + }, + { + "epoch": 0.5884660651235779, + "grad_norm": 1.6136295795440674, + "learning_rate": 0.00011418314541422523, + "loss": 1.1822, + "step": 375 + }, + { + "epoch": 0.5900353079639075, + "grad_norm": 1.6823208332061768, + "learning_rate": 0.00011365865658999474, + "loss": 1.1952, + "step": 376 + }, + { + "epoch": 0.5916045508042369, + "grad_norm": 1.6804534196853638, + "learning_rate": 0.00011313382072229936, + "loss": 1.1742, + "step": 377 + }, + { + "epoch": 0.5931737936445665, + "grad_norm": 1.5278829336166382, + "learning_rate": 0.00011260865219977954, + "loss": 1.1901, + "step": 378 + }, + { + "epoch": 0.594743036484896, + "grad_norm": 1.7754433155059814, + "learning_rate": 0.00011208316542019556, + "loss": 1.1232, + "step": 379 + }, + { + "epoch": 0.5963122793252256, + "grad_norm": 1.7139606475830078, + "learning_rate": 0.00011155737479003301, + "loss": 1.1466, + "step": 380 + }, + { + "epoch": 0.5978815221655551, + "grad_norm": 1.7254337072372437, + "learning_rate": 0.00011103129472410755, + "loss": 1.1848, + "step": 381 + }, + { + "epoch": 0.5994507650058847, + "grad_norm": 1.7449042797088623, + "learning_rate": 0.00011050493964516997, + "loss": 1.2385, + "step": 382 + }, + { + "epoch": 0.6010200078462142, + "grad_norm": 1.8243709802627563, + "learning_rate": 0.00010997832398351062, + "loss": 1.153, + "step": 383 + }, + { + "epoch": 0.6025892506865438, + "grad_norm": 1.6541403532028198, + "learning_rate": 0.0001094514621765639, + "loss": 1.1378, + "step": 384 + }, + { + "epoch": 0.6041584935268732, + "grad_norm": 1.7091960906982422, + "learning_rate": 0.00010892436866851235, + "loss": 1.2825, + "step": 385 + }, + { + "epoch": 0.6057277363672028, + "grad_norm": 1.546218991279602, + "learning_rate": 0.0001083970579098908, + "loss": 1.1905, + "step": 386 + }, + { + "epoch": 0.6072969792075323, + "grad_norm": 1.6091347932815552, + "learning_rate": 0.00010786954435719008, + "loss": 1.1997, + "step": 387 + }, + { + "epoch": 0.6088662220478619, + "grad_norm": 1.624965786933899, + "learning_rate": 0.00010734184247246066, + "loss": 1.2429, + "step": 388 + }, + { + "epoch": 0.6104354648881914, + "grad_norm": 1.607266902923584, + "learning_rate": 0.00010681396672291631, + "loss": 1.1942, + "step": 389 + }, + { + "epoch": 0.612004707728521, + "grad_norm": 1.5943619012832642, + "learning_rate": 0.00010628593158053734, + "loss": 1.1064, + "step": 390 + }, + { + "epoch": 0.6135739505688506, + "grad_norm": 1.703718900680542, + "learning_rate": 0.00010575775152167391, + "loss": 1.1444, + "step": 391 + }, + { + "epoch": 0.6151431934091801, + "grad_norm": 1.6199911832809448, + "learning_rate": 0.00010522944102664915, + "loss": 1.2093, + "step": 392 + }, + { + "epoch": 0.6167124362495097, + "grad_norm": 1.735672116279602, + "learning_rate": 0.00010470101457936219, + "loss": 1.2039, + "step": 393 + }, + { + "epoch": 0.6182816790898391, + "grad_norm": 1.718082308769226, + "learning_rate": 0.00010417248666689095, + "loss": 1.2933, + "step": 394 + }, + { + "epoch": 0.6198509219301687, + "grad_norm": 1.6482293605804443, + "learning_rate": 0.00010364387177909521, + "loss": 1.1876, + "step": 395 + }, + { + "epoch": 0.6214201647704982, + "grad_norm": 1.7670931816101074, + "learning_rate": 0.00010311518440821906, + "loss": 1.2162, + "step": 396 + }, + { + "epoch": 0.6229894076108278, + "grad_norm": 1.710958480834961, + "learning_rate": 0.0001025864390484939, + "loss": 1.166, + "step": 397 + }, + { + "epoch": 0.6245586504511573, + "grad_norm": 1.5978552103042603, + "learning_rate": 0.00010205765019574084, + "loss": 1.1309, + "step": 398 + }, + { + "epoch": 0.6261278932914869, + "grad_norm": 1.7044875621795654, + "learning_rate": 0.00010152883234697336, + "loss": 1.197, + "step": 399 + }, + { + "epoch": 0.6276971361318164, + "grad_norm": 1.6688252687454224, + "learning_rate": 0.000101, + "loss": 1.1222, + "step": 400 + }, + { + "epoch": 0.629266378972146, + "grad_norm": 1.7264560461044312, + "learning_rate": 0.00010047116765302661, + "loss": 1.1826, + "step": 401 + }, + { + "epoch": 0.6308356218124754, + "grad_norm": 1.6112580299377441, + "learning_rate": 9.994234980425921e-05, + "loss": 1.1535, + "step": 402 + }, + { + "epoch": 0.632404864652805, + "grad_norm": 1.6894357204437256, + "learning_rate": 9.941356095150613e-05, + "loss": 1.2267, + "step": 403 + }, + { + "epoch": 0.6339741074931345, + "grad_norm": 1.8299328088760376, + "learning_rate": 9.888481559178096e-05, + "loss": 1.2352, + "step": 404 + }, + { + "epoch": 0.6355433503334641, + "grad_norm": 1.5724095106124878, + "learning_rate": 9.835612822090483e-05, + "loss": 1.175, + "step": 405 + }, + { + "epoch": 0.6371125931737937, + "grad_norm": 1.590934157371521, + "learning_rate": 9.782751333310905e-05, + "loss": 1.1422, + "step": 406 + }, + { + "epoch": 0.6386818360141232, + "grad_norm": 1.620956540107727, + "learning_rate": 9.72989854206378e-05, + "loss": 1.1898, + "step": 407 + }, + { + "epoch": 0.6402510788544528, + "grad_norm": 1.6138383150100708, + "learning_rate": 9.677055897335087e-05, + "loss": 1.1417, + "step": 408 + }, + { + "epoch": 0.6418203216947823, + "grad_norm": 1.5782678127288818, + "learning_rate": 9.62422484783261e-05, + "loss": 1.1912, + "step": 409 + }, + { + "epoch": 0.6433895645351118, + "grad_norm": 1.5806231498718262, + "learning_rate": 9.571406841946267e-05, + "loss": 1.1833, + "step": 410 + }, + { + "epoch": 0.6449588073754413, + "grad_norm": 1.6173325777053833, + "learning_rate": 9.518603327708372e-05, + "loss": 1.1794, + "step": 411 + }, + { + "epoch": 0.6465280502157709, + "grad_norm": 1.602523684501648, + "learning_rate": 9.465815752753935e-05, + "loss": 1.1119, + "step": 412 + }, + { + "epoch": 0.6480972930561004, + "grad_norm": 1.7795995473861694, + "learning_rate": 9.413045564280998e-05, + "loss": 1.1879, + "step": 413 + }, + { + "epoch": 0.64966653589643, + "grad_norm": 1.7646915912628174, + "learning_rate": 9.360294209010923e-05, + "loss": 1.1926, + "step": 414 + }, + { + "epoch": 0.6512357787367595, + "grad_norm": 1.6798741817474365, + "learning_rate": 9.307563133148767e-05, + "loss": 1.1804, + "step": 415 + }, + { + "epoch": 0.6528050215770891, + "grad_norm": 1.6782431602478027, + "learning_rate": 9.254853782343616e-05, + "loss": 1.2528, + "step": 416 + }, + { + "epoch": 0.6543742644174186, + "grad_norm": 1.6691837310791016, + "learning_rate": 9.202167601648942e-05, + "loss": 1.1242, + "step": 417 + }, + { + "epoch": 0.6559435072577481, + "grad_norm": 1.6898577213287354, + "learning_rate": 9.149506035483005e-05, + "loss": 1.1733, + "step": 418 + }, + { + "epoch": 0.6575127500980776, + "grad_norm": 1.5182565450668335, + "learning_rate": 9.096870527589248e-05, + "loss": 1.1741, + "step": 419 + }, + { + "epoch": 0.6590819929384072, + "grad_norm": 1.7436866760253906, + "learning_rate": 9.044262520996702e-05, + "loss": 1.2368, + "step": 420 + }, + { + "epoch": 0.6606512357787367, + "grad_norm": 1.6554378271102905, + "learning_rate": 8.991683457980443e-05, + "loss": 1.1828, + "step": 421 + }, + { + "epoch": 0.6622204786190663, + "grad_norm": 1.6172407865524292, + "learning_rate": 8.93913478002205e-05, + "loss": 1.2287, + "step": 422 + }, + { + "epoch": 0.6637897214593959, + "grad_norm": 1.614776372909546, + "learning_rate": 8.886617927770065e-05, + "loss": 1.1724, + "step": 423 + }, + { + "epoch": 0.6653589642997254, + "grad_norm": 1.7096689939498901, + "learning_rate": 8.834134341000527e-05, + "loss": 1.1883, + "step": 424 + }, + { + "epoch": 0.666928207140055, + "grad_norm": 1.6051740646362305, + "learning_rate": 8.781685458577481e-05, + "loss": 1.2461, + "step": 425 + }, + { + "epoch": 0.6684974499803845, + "grad_norm": 1.588941216468811, + "learning_rate": 8.729272718413527e-05, + "loss": 1.1225, + "step": 426 + }, + { + "epoch": 0.670066692820714, + "grad_norm": 1.5431382656097412, + "learning_rate": 8.676897557430415e-05, + "loss": 1.1613, + "step": 427 + }, + { + "epoch": 0.6716359356610435, + "grad_norm": 1.6947062015533447, + "learning_rate": 8.624561411519644e-05, + "loss": 1.1438, + "step": 428 + }, + { + "epoch": 0.6732051785013731, + "grad_norm": 1.6536586284637451, + "learning_rate": 8.572265715503086e-05, + "loss": 1.1989, + "step": 429 + }, + { + "epoch": 0.6747744213417026, + "grad_norm": 1.8156611919403076, + "learning_rate": 8.520011903093666e-05, + "loss": 1.1684, + "step": 430 + }, + { + "epoch": 0.6763436641820322, + "grad_norm": 1.5637779235839844, + "learning_rate": 8.467801406856054e-05, + "loss": 1.1771, + "step": 431 + }, + { + "epoch": 0.6779129070223617, + "grad_norm": 1.542081356048584, + "learning_rate": 8.415635658167368e-05, + "loss": 1.1785, + "step": 432 + }, + { + "epoch": 0.6794821498626913, + "grad_norm": 1.6623213291168213, + "learning_rate": 8.363516087177962e-05, + "loss": 1.1521, + "step": 433 + }, + { + "epoch": 0.6810513927030208, + "grad_norm": 1.6023197174072266, + "learning_rate": 8.31144412277221e-05, + "loss": 1.1462, + "step": 434 + }, + { + "epoch": 0.6826206355433503, + "grad_norm": 1.670816421508789, + "learning_rate": 8.25942119252931e-05, + "loss": 1.104, + "step": 435 + }, + { + "epoch": 0.6841898783836798, + "grad_norm": 1.5413976907730103, + "learning_rate": 8.20744872268418e-05, + "loss": 1.07, + "step": 436 + }, + { + "epoch": 0.6857591212240094, + "grad_norm": 1.5363709926605225, + "learning_rate": 8.155528138088337e-05, + "loss": 1.1674, + "step": 437 + }, + { + "epoch": 0.687328364064339, + "grad_norm": 1.670090675354004, + "learning_rate": 8.103660862170826e-05, + "loss": 1.1539, + "step": 438 + }, + { + "epoch": 0.6888976069046685, + "grad_norm": 1.6151149272918701, + "learning_rate": 8.051848316899227e-05, + "loss": 1.1257, + "step": 439 + }, + { + "epoch": 0.6904668497449981, + "grad_norm": 1.7211952209472656, + "learning_rate": 8.000091922740633e-05, + "loss": 1.1186, + "step": 440 + }, + { + "epoch": 0.6920360925853276, + "grad_norm": 1.7568705081939697, + "learning_rate": 7.948393098622737e-05, + "loss": 1.1997, + "step": 441 + }, + { + "epoch": 0.6936053354256572, + "grad_norm": 1.6453711986541748, + "learning_rate": 7.896753261894923e-05, + "loss": 1.1643, + "step": 442 + }, + { + "epoch": 0.6951745782659866, + "grad_norm": 1.5977420806884766, + "learning_rate": 7.845173828289392e-05, + "loss": 1.1062, + "step": 443 + }, + { + "epoch": 0.6967438211063162, + "grad_norm": 1.5398576259613037, + "learning_rate": 7.793656211882377e-05, + "loss": 1.1786, + "step": 444 + }, + { + "epoch": 0.6983130639466457, + "grad_norm": 1.7165251970291138, + "learning_rate": 7.74220182505536e-05, + "loss": 1.1887, + "step": 445 + }, + { + "epoch": 0.6998823067869753, + "grad_norm": 1.6169159412384033, + "learning_rate": 7.690812078456336e-05, + "loss": 1.0989, + "step": 446 + }, + { + "epoch": 0.7014515496273048, + "grad_norm": 1.5985289812088013, + "learning_rate": 7.639488380961173e-05, + "loss": 1.1686, + "step": 447 + }, + { + "epoch": 0.7030207924676344, + "grad_norm": 1.5922420024871826, + "learning_rate": 7.588232139634968e-05, + "loss": 1.1712, + "step": 448 + }, + { + "epoch": 0.7045900353079639, + "grad_norm": 1.8242510557174683, + "learning_rate": 7.537044759693463e-05, + "loss": 1.1134, + "step": 449 + }, + { + "epoch": 0.7061592781482935, + "grad_norm": 1.9206913709640503, + "learning_rate": 7.48592764446454e-05, + "loss": 1.1952, + "step": 450 + }, + { + "epoch": 0.707728520988623, + "grad_norm": 1.7030757665634155, + "learning_rate": 7.434882195349736e-05, + "loss": 1.1359, + "step": 451 + }, + { + "epoch": 0.7092977638289525, + "grad_norm": 1.4911803007125854, + "learning_rate": 7.383909811785817e-05, + "loss": 1.1342, + "step": 452 + }, + { + "epoch": 0.710867006669282, + "grad_norm": 1.5316143035888672, + "learning_rate": 7.333011891206432e-05, + "loss": 1.0975, + "step": 453 + }, + { + "epoch": 0.7124362495096116, + "grad_norm": 1.5703046321868896, + "learning_rate": 7.282189829003785e-05, + "loss": 1.0844, + "step": 454 + }, + { + "epoch": 0.7140054923499412, + "grad_norm": 1.579211950302124, + "learning_rate": 7.231445018490381e-05, + "loss": 1.1384, + "step": 455 + }, + { + "epoch": 0.7155747351902707, + "grad_norm": 1.7093926668167114, + "learning_rate": 7.180778850860835e-05, + "loss": 1.1875, + "step": 456 + }, + { + "epoch": 0.7171439780306003, + "grad_norm": 1.5931205749511719, + "learning_rate": 7.130192715153731e-05, + "loss": 1.2042, + "step": 457 + }, + { + "epoch": 0.7187132208709298, + "grad_norm": 1.6165008544921875, + "learning_rate": 7.079687998213526e-05, + "loss": 1.2046, + "step": 458 + }, + { + "epoch": 0.7202824637112594, + "grad_norm": 1.6996906995773315, + "learning_rate": 7.029266084652548e-05, + "loss": 1.1708, + "step": 459 + }, + { + "epoch": 0.7218517065515888, + "grad_norm": 1.6533430814743042, + "learning_rate": 6.978928356813031e-05, + "loss": 1.1512, + "step": 460 + }, + { + "epoch": 0.7234209493919184, + "grad_norm": 1.619112491607666, + "learning_rate": 6.92867619472921e-05, + "loss": 1.178, + "step": 461 + }, + { + "epoch": 0.7249901922322479, + "grad_norm": 1.6113744974136353, + "learning_rate": 6.878510976089493e-05, + "loss": 1.1969, + "step": 462 + }, + { + "epoch": 0.7265594350725775, + "grad_norm": 1.634495735168457, + "learning_rate": 6.828434076198693e-05, + "loss": 1.1333, + "step": 463 + }, + { + "epoch": 0.728128677912907, + "grad_norm": 1.584319829940796, + "learning_rate": 6.77844686794031e-05, + "loss": 1.1284, + "step": 464 + }, + { + "epoch": 0.7296979207532366, + "grad_norm": 1.6179981231689453, + "learning_rate": 6.728550721738915e-05, + "loss": 1.2024, + "step": 465 + }, + { + "epoch": 0.7312671635935661, + "grad_norm": 1.6516456604003906, + "learning_rate": 6.678747005522557e-05, + "loss": 1.0998, + "step": 466 + }, + { + "epoch": 0.7328364064338957, + "grad_norm": 1.5799529552459717, + "learning_rate": 6.629037084685278e-05, + "loss": 1.168, + "step": 467 + }, + { + "epoch": 0.7344056492742251, + "grad_norm": 1.5804849863052368, + "learning_rate": 6.579422322049668e-05, + "loss": 1.1063, + "step": 468 + }, + { + "epoch": 0.7359748921145547, + "grad_norm": 1.6777349710464478, + "learning_rate": 6.529904077829505e-05, + "loss": 1.1856, + "step": 469 + }, + { + "epoch": 0.7375441349548842, + "grad_norm": 1.6487278938293457, + "learning_rate": 6.480483709592468e-05, + "loss": 1.115, + "step": 470 + }, + { + "epoch": 0.7391133777952138, + "grad_norm": 1.6507923603057861, + "learning_rate": 6.43116257222292e-05, + "loss": 1.1951, + "step": 471 + }, + { + "epoch": 0.7406826206355434, + "grad_norm": 1.4872692823410034, + "learning_rate": 6.381942017884753e-05, + "loss": 1.1603, + "step": 472 + }, + { + "epoch": 0.7422518634758729, + "grad_norm": 1.5706673860549927, + "learning_rate": 6.33282339598433e-05, + "loss": 1.1358, + "step": 473 + }, + { + "epoch": 0.7438211063162025, + "grad_norm": 1.6565934419631958, + "learning_rate": 6.283808053133484e-05, + "loss": 1.1493, + "step": 474 + }, + { + "epoch": 0.745390349156532, + "grad_norm": 1.4948631525039673, + "learning_rate": 6.234897333112594e-05, + "loss": 1.1434, + "step": 475 + }, + { + "epoch": 0.7469595919968615, + "grad_norm": 1.5693128108978271, + "learning_rate": 6.186092576833761e-05, + "loss": 1.2109, + "step": 476 + }, + { + "epoch": 0.748528834837191, + "grad_norm": 1.6032805442810059, + "learning_rate": 6.137395122304033e-05, + "loss": 1.1024, + "step": 477 + }, + { + "epoch": 0.7500980776775206, + "grad_norm": 1.6286931037902832, + "learning_rate": 6.088806304588717e-05, + "loss": 1.0877, + "step": 478 + }, + { + "epoch": 0.7516673205178501, + "grad_norm": 1.4611527919769287, + "learning_rate": 6.0403274557748035e-05, + "loss": 1.1388, + "step": 479 + }, + { + "epoch": 0.7532365633581797, + "grad_norm": 1.681735634803772, + "learning_rate": 5.9919599049344194e-05, + "loss": 1.1269, + "step": 480 + }, + { + "epoch": 0.7548058061985092, + "grad_norm": 1.681593656539917, + "learning_rate": 5.943704978088402e-05, + "loss": 1.1375, + "step": 481 + }, + { + "epoch": 0.7563750490388388, + "grad_norm": 1.514367938041687, + "learning_rate": 5.89556399816995e-05, + "loss": 1.1519, + "step": 482 + }, + { + "epoch": 0.7579442918791683, + "grad_norm": 1.7447987794876099, + "learning_rate": 5.847538284988341e-05, + "loss": 1.1649, + "step": 483 + }, + { + "epoch": 0.7595135347194979, + "grad_norm": 1.460632562637329, + "learning_rate": 5.7996291551927666e-05, + "loss": 1.1383, + "step": 484 + }, + { + "epoch": 0.7610827775598273, + "grad_norm": 1.621333360671997, + "learning_rate": 5.751837922236217e-05, + "loss": 1.108, + "step": 485 + }, + { + "epoch": 0.7626520204001569, + "grad_norm": 1.5239380598068237, + "learning_rate": 5.704165896339494e-05, + "loss": 1.1324, + "step": 486 + }, + { + "epoch": 0.7642212632404864, + "grad_norm": 1.6265413761138916, + "learning_rate": 5.656614384455257e-05, + "loss": 1.1278, + "step": 487 + }, + { + "epoch": 0.765790506080816, + "grad_norm": 1.4997467994689941, + "learning_rate": 5.609184690232235e-05, + "loss": 1.1452, + "step": 488 + }, + { + "epoch": 0.7673597489211456, + "grad_norm": 1.559548020362854, + "learning_rate": 5.5618781139794465e-05, + "loss": 1.1395, + "step": 489 + }, + { + "epoch": 0.7689289917614751, + "grad_norm": 1.6800786256790161, + "learning_rate": 5.514695952630578e-05, + "loss": 1.1498, + "step": 490 + }, + { + "epoch": 0.7704982346018047, + "grad_norm": 1.7421120405197144, + "learning_rate": 5.467639499708423e-05, + "loss": 1.1677, + "step": 491 + }, + { + "epoch": 0.7720674774421342, + "grad_norm": 1.520545482635498, + "learning_rate": 5.420710045289399e-05, + "loss": 1.157, + "step": 492 + }, + { + "epoch": 0.7736367202824637, + "grad_norm": 1.5284466743469238, + "learning_rate": 5.373908875968211e-05, + "loss": 1.1243, + "step": 493 + }, + { + "epoch": 0.7752059631227932, + "grad_norm": 1.6169205904006958, + "learning_rate": 5.3272372748225556e-05, + "loss": 1.0896, + "step": 494 + }, + { + "epoch": 0.7767752059631228, + "grad_norm": 1.6210825443267822, + "learning_rate": 5.2806965213779544e-05, + "loss": 1.0994, + "step": 495 + }, + { + "epoch": 0.7783444488034523, + "grad_norm": 1.621155023574829, + "learning_rate": 5.234287891572674e-05, + "loss": 1.0807, + "step": 496 + }, + { + "epoch": 0.7799136916437819, + "grad_norm": 1.5995038747787476, + "learning_rate": 5.1880126577227464e-05, + "loss": 1.1314, + "step": 497 + }, + { + "epoch": 0.7814829344841114, + "grad_norm": 1.6519408226013184, + "learning_rate": 5.141872088487078e-05, + "loss": 1.1535, + "step": 498 + }, + { + "epoch": 0.783052177324441, + "grad_norm": 1.617515206336975, + "learning_rate": 5.095867448832683e-05, + "loss": 1.0794, + "step": 499 + }, + { + "epoch": 0.7846214201647705, + "grad_norm": 1.6349530220031738, + "learning_rate": 5.050000000000002e-05, + "loss": 1.1414, + "step": 500 + }, + { + "epoch": 0.7861906630051, + "grad_norm": 1.6123729944229126, + "learning_rate": 5.004270999468307e-05, + "loss": 1.1066, + "step": 501 + }, + { + "epoch": 0.7877599058454295, + "grad_norm": 1.6318762302398682, + "learning_rate": 4.95868170092125e-05, + "loss": 1.0861, + "step": 502 + }, + { + "epoch": 0.7893291486857591, + "grad_norm": 1.6744816303253174, + "learning_rate": 4.913233354212485e-05, + "loss": 1.0963, + "step": 503 + }, + { + "epoch": 0.7908983915260887, + "grad_norm": 1.7084587812423706, + "learning_rate": 4.867927205331386e-05, + "loss": 1.1848, + "step": 504 + }, + { + "epoch": 0.7924676343664182, + "grad_norm": 1.587443470954895, + "learning_rate": 4.822764496368917e-05, + "loss": 1.1294, + "step": 505 + }, + { + "epoch": 0.7940368772067478, + "grad_norm": 1.5023859739303589, + "learning_rate": 4.7777464654835564e-05, + "loss": 1.0804, + "step": 506 + }, + { + "epoch": 0.7956061200470773, + "grad_norm": 1.537320852279663, + "learning_rate": 4.732874346867362e-05, + "loss": 1.1379, + "step": 507 + }, + { + "epoch": 0.7971753628874069, + "grad_norm": 1.6183412075042725, + "learning_rate": 4.6881493707121315e-05, + "loss": 1.0744, + "step": 508 + }, + { + "epoch": 0.7987446057277363, + "grad_norm": 1.6079076528549194, + "learning_rate": 4.643572763175684e-05, + "loss": 1.1069, + "step": 509 + }, + { + "epoch": 0.8003138485680659, + "grad_norm": 1.6581976413726807, + "learning_rate": 4.5991457463482264e-05, + "loss": 1.159, + "step": 510 + }, + { + "epoch": 0.8018830914083954, + "grad_norm": 1.6260336637496948, + "learning_rate": 4.554869538218868e-05, + "loss": 1.0766, + "step": 511 + }, + { + "epoch": 0.803452334248725, + "grad_norm": 1.6418932676315308, + "learning_rate": 4.5107453526422255e-05, + "loss": 1.1652, + "step": 512 + }, + { + "epoch": 0.8050215770890545, + "grad_norm": 1.6536438465118408, + "learning_rate": 4.46677439930513e-05, + "loss": 1.1225, + "step": 513 + }, + { + "epoch": 0.8065908199293841, + "grad_norm": 1.6844714879989624, + "learning_rate": 4.422957883693483e-05, + "loss": 1.1073, + "step": 514 + }, + { + "epoch": 0.8081600627697136, + "grad_norm": 1.5378201007843018, + "learning_rate": 4.3792970070591906e-05, + "loss": 1.1282, + "step": 515 + }, + { + "epoch": 0.8097293056100432, + "grad_norm": 1.5851975679397583, + "learning_rate": 4.3357929663872406e-05, + "loss": 1.1365, + "step": 516 + }, + { + "epoch": 0.8112985484503727, + "grad_norm": 1.6574060916900635, + "learning_rate": 4.29244695436289e-05, + "loss": 1.1886, + "step": 517 + }, + { + "epoch": 0.8128677912907022, + "grad_norm": 1.5741300582885742, + "learning_rate": 4.249260159338946e-05, + "loss": 1.1466, + "step": 518 + }, + { + "epoch": 0.8144370341310317, + "grad_norm": 1.5635714530944824, + "learning_rate": 4.2062337653032146e-05, + "loss": 1.0656, + "step": 519 + }, + { + "epoch": 0.8160062769713613, + "grad_norm": 1.4327596426010132, + "learning_rate": 4.1633689518460225e-05, + "loss": 1.0977, + "step": 520 + }, + { + "epoch": 0.8175755198116909, + "grad_norm": 1.4986056089401245, + "learning_rate": 4.1206668941278826e-05, + "loss": 1.1026, + "step": 521 + }, + { + "epoch": 0.8191447626520204, + "grad_norm": 1.557176947593689, + "learning_rate": 4.078128762847279e-05, + "loss": 1.0738, + "step": 522 + }, + { + "epoch": 0.82071400549235, + "grad_norm": 1.5916979312896729, + "learning_rate": 4.035755724208573e-05, + "loss": 1.0649, + "step": 523 + }, + { + "epoch": 0.8222832483326795, + "grad_norm": 1.561154842376709, + "learning_rate": 3.9935489398900145e-05, + "loss": 1.0963, + "step": 524 + }, + { + "epoch": 0.8238524911730091, + "grad_norm": 1.5679534673690796, + "learning_rate": 3.951509567011922e-05, + "loss": 1.1373, + "step": 525 + }, + { + "epoch": 0.8254217340133385, + "grad_norm": 1.464132308959961, + "learning_rate": 3.90963875810494e-05, + "loss": 1.1076, + "step": 526 + }, + { + "epoch": 0.8269909768536681, + "grad_norm": 1.6665191650390625, + "learning_rate": 3.86793766107844e-05, + "loss": 1.2185, + "step": 527 + }, + { + "epoch": 0.8285602196939976, + "grad_norm": 1.5471841096878052, + "learning_rate": 3.826407419189066e-05, + "loss": 1.0831, + "step": 528 + }, + { + "epoch": 0.8301294625343272, + "grad_norm": 1.5329172611236572, + "learning_rate": 3.785049171009381e-05, + "loss": 1.0105, + "step": 529 + }, + { + "epoch": 0.8316987053746567, + "grad_norm": 1.4539717435836792, + "learning_rate": 3.743864050396644e-05, + "loss": 1.0966, + "step": 530 + }, + { + "epoch": 0.8332679482149863, + "grad_norm": 1.624233603477478, + "learning_rate": 3.7028531864617444e-05, + "loss": 1.0657, + "step": 531 + }, + { + "epoch": 0.8348371910553158, + "grad_norm": 1.6335023641586304, + "learning_rate": 3.662017703538234e-05, + "loss": 1.172, + "step": 532 + }, + { + "epoch": 0.8364064338956454, + "grad_norm": 1.6688640117645264, + "learning_rate": 3.621358721151505e-05, + "loss": 1.1412, + "step": 533 + }, + { + "epoch": 0.8379756767359748, + "grad_norm": 1.4961200952529907, + "learning_rate": 3.5808773539880973e-05, + "loss": 1.0503, + "step": 534 + }, + { + "epoch": 0.8395449195763044, + "grad_norm": 1.6660538911819458, + "learning_rate": 3.540574711865146e-05, + "loss": 1.1834, + "step": 535 + }, + { + "epoch": 0.841114162416634, + "grad_norm": 1.464595079421997, + "learning_rate": 3.500451899699935e-05, + "loss": 1.0673, + "step": 536 + }, + { + "epoch": 0.8426834052569635, + "grad_norm": 1.5613619089126587, + "learning_rate": 3.460510017479631e-05, + "loss": 1.0945, + "step": 537 + }, + { + "epoch": 0.8442526480972931, + "grad_norm": 1.5044405460357666, + "learning_rate": 3.420750160231118e-05, + "loss": 1.0833, + "step": 538 + }, + { + "epoch": 0.8458218909376226, + "grad_norm": 1.4220069646835327, + "learning_rate": 3.381173417990957e-05, + "loss": 1.1425, + "step": 539 + }, + { + "epoch": 0.8473911337779522, + "grad_norm": 1.6131671667099, + "learning_rate": 3.3417808757755355e-05, + "loss": 1.1282, + "step": 540 + }, + { + "epoch": 0.8489603766182817, + "grad_norm": 1.5812686681747437, + "learning_rate": 3.302573613551292e-05, + "loss": 1.1966, + "step": 541 + }, + { + "epoch": 0.8505296194586113, + "grad_norm": 1.6086622476577759, + "learning_rate": 3.263552706205128e-05, + "loss": 1.1522, + "step": 542 + }, + { + "epoch": 0.8520988622989407, + "grad_norm": 1.512442946434021, + "learning_rate": 3.22471922351493e-05, + "loss": 1.1183, + "step": 543 + }, + { + "epoch": 0.8536681051392703, + "grad_norm": 1.5406948328018188, + "learning_rate": 3.186074230120244e-05, + "loss": 1.0528, + "step": 544 + }, + { + "epoch": 0.8552373479795998, + "grad_norm": 1.505651593208313, + "learning_rate": 3.147618785493083e-05, + "loss": 1.0228, + "step": 545 + }, + { + "epoch": 0.8568065908199294, + "grad_norm": 2.0095841884613037, + "learning_rate": 3.109353943908893e-05, + "loss": 1.1148, + "step": 546 + }, + { + "epoch": 0.8583758336602589, + "grad_norm": 1.5780339241027832, + "learning_rate": 3.071280754417626e-05, + "loss": 1.0751, + "step": 547 + }, + { + "epoch": 0.8599450765005885, + "grad_norm": 1.5076093673706055, + "learning_rate": 3.033400260815008e-05, + "loss": 1.0675, + "step": 548 + }, + { + "epoch": 0.861514319340918, + "grad_norm": 1.563519835472107, + "learning_rate": 2.9957135016139122e-05, + "loss": 1.1056, + "step": 549 + }, + { + "epoch": 0.8630835621812476, + "grad_norm": 1.558599829673767, + "learning_rate": 2.9582215100158706e-05, + "loss": 1.0487, + "step": 550 + }, + { + "epoch": 0.864652805021577, + "grad_norm": 1.6026594638824463, + "learning_rate": 2.920925313882776e-05, + "loss": 1.1625, + "step": 551 + }, + { + "epoch": 0.8662220478619066, + "grad_norm": 1.637216567993164, + "learning_rate": 2.8838259357086884e-05, + "loss": 1.1964, + "step": 552 + }, + { + "epoch": 0.8677912907022362, + "grad_norm": 1.5943851470947266, + "learning_rate": 2.846924392591794e-05, + "loss": 1.1149, + "step": 553 + }, + { + "epoch": 0.8693605335425657, + "grad_norm": 1.7017689943313599, + "learning_rate": 2.8102216962065423e-05, + "loss": 1.1066, + "step": 554 + }, + { + "epoch": 0.8709297763828953, + "grad_norm": 1.5497442483901978, + "learning_rate": 2.7737188527758972e-05, + "loss": 1.1479, + "step": 555 + }, + { + "epoch": 0.8724990192232248, + "grad_norm": 1.6563622951507568, + "learning_rate": 2.7374168630437456e-05, + "loss": 1.1053, + "step": 556 + }, + { + "epoch": 0.8740682620635544, + "grad_norm": 1.5732630491256714, + "learning_rate": 2.7013167222474756e-05, + "loss": 1.0767, + "step": 557 + }, + { + "epoch": 0.8756375049038839, + "grad_norm": 1.6066272258758545, + "learning_rate": 2.6654194200906833e-05, + "loss": 1.0761, + "step": 558 + }, + { + "epoch": 0.8772067477442134, + "grad_norm": 1.603977918624878, + "learning_rate": 2.629725940716041e-05, + "loss": 1.061, + "step": 559 + }, + { + "epoch": 0.8787759905845429, + "grad_norm": 1.6915643215179443, + "learning_rate": 2.5942372626783172e-05, + "loss": 1.0364, + "step": 560 + }, + { + "epoch": 0.8803452334248725, + "grad_norm": 1.4734680652618408, + "learning_rate": 2.5589543589175485e-05, + "loss": 1.1086, + "step": 561 + }, + { + "epoch": 0.881914476265202, + "grad_norm": 1.5728095769882202, + "learning_rate": 2.523878196732358e-05, + "loss": 1.0431, + "step": 562 + }, + { + "epoch": 0.8834837191055316, + "grad_norm": 1.4190233945846558, + "learning_rate": 2.489009737753459e-05, + "loss": 1.1058, + "step": 563 + }, + { + "epoch": 0.8850529619458611, + "grad_norm": 1.5212831497192383, + "learning_rate": 2.4543499379172615e-05, + "loss": 1.1334, + "step": 564 + }, + { + "epoch": 0.8866222047861907, + "grad_norm": 1.5723265409469604, + "learning_rate": 2.4198997474396877e-05, + "loss": 1.0652, + "step": 565 + }, + { + "epoch": 0.8881914476265202, + "grad_norm": 1.5650742053985596, + "learning_rate": 2.3856601107901166e-05, + "loss": 1.0766, + "step": 566 + }, + { + "epoch": 0.8897606904668497, + "grad_norm": 1.5874619483947754, + "learning_rate": 2.351631966665476e-05, + "loss": 1.0752, + "step": 567 + }, + { + "epoch": 0.8913299333071792, + "grad_norm": 1.6027027368545532, + "learning_rate": 2.31781624796453e-05, + "loss": 1.1453, + "step": 568 + }, + { + "epoch": 0.8928991761475088, + "grad_norm": 1.463350534439087, + "learning_rate": 2.2842138817622883e-05, + "loss": 1.0868, + "step": 569 + }, + { + "epoch": 0.8944684189878384, + "grad_norm": 1.4280847311019897, + "learning_rate": 2.250825789284594e-05, + "loss": 1.0417, + "step": 570 + }, + { + "epoch": 0.8960376618281679, + "grad_norm": 1.6122801303863525, + "learning_rate": 2.217652885882869e-05, + "loss": 1.1508, + "step": 571 + }, + { + "epoch": 0.8976069046684975, + "grad_norm": 1.5138072967529297, + "learning_rate": 2.1846960810090188e-05, + "loss": 1.0953, + "step": 572 + }, + { + "epoch": 0.899176147508827, + "grad_norm": 1.4871585369110107, + "learning_rate": 2.151956278190494e-05, + "loss": 1.0518, + "step": 573 + }, + { + "epoch": 0.9007453903491566, + "grad_norm": 1.552441120147705, + "learning_rate": 2.119434375005527e-05, + "loss": 1.0794, + "step": 574 + }, + { + "epoch": 0.9023146331894861, + "grad_norm": 1.66527259349823, + "learning_rate": 2.087131263058526e-05, + "loss": 1.1307, + "step": 575 + }, + { + "epoch": 0.9038838760298156, + "grad_norm": 1.5651088953018188, + "learning_rate": 2.055047827955618e-05, + "loss": 1.113, + "step": 576 + }, + { + "epoch": 0.9054531188701451, + "grad_norm": 1.4527775049209595, + "learning_rate": 2.0231849492803852e-05, + "loss": 1.0613, + "step": 577 + }, + { + "epoch": 0.9070223617104747, + "grad_norm": 1.5543384552001953, + "learning_rate": 1.991543500569745e-05, + "loss": 1.0867, + "step": 578 + }, + { + "epoch": 0.9085916045508042, + "grad_norm": 1.5950472354888916, + "learning_rate": 1.960124349289992e-05, + "loss": 1.083, + "step": 579 + }, + { + "epoch": 0.9101608473911338, + "grad_norm": 1.492758870124817, + "learning_rate": 1.928928356813032e-05, + "loss": 1.0918, + "step": 580 + }, + { + "epoch": 0.9117300902314633, + "grad_norm": 1.8252304792404175, + "learning_rate": 1.8979563783927565e-05, + "loss": 1.0682, + "step": 581 + }, + { + "epoch": 0.9132993330717929, + "grad_norm": 1.6624518632888794, + "learning_rate": 1.8672092631416013e-05, + "loss": 1.093, + "step": 582 + }, + { + "epoch": 0.9148685759121225, + "grad_norm": 1.4181456565856934, + "learning_rate": 1.8366878540072614e-05, + "loss": 1.1201, + "step": 583 + }, + { + "epoch": 0.9164378187524519, + "grad_norm": 1.3676549196243286, + "learning_rate": 1.8063929877495892e-05, + "loss": 1.0446, + "step": 584 + }, + { + "epoch": 0.9180070615927814, + "grad_norm": 1.5469727516174316, + "learning_rate": 1.7763254949176414e-05, + "loss": 1.0591, + "step": 585 + }, + { + "epoch": 0.919576304433111, + "grad_norm": 1.5047856569290161, + "learning_rate": 1.7464861998269243e-05, + "loss": 1.1106, + "step": 586 + }, + { + "epoch": 0.9211455472734406, + "grad_norm": 1.4426612854003906, + "learning_rate": 1.7168759205367893e-05, + "loss": 1.05, + "step": 587 + }, + { + "epoch": 0.9227147901137701, + "grad_norm": 1.5957785844802856, + "learning_rate": 1.6874954688279956e-05, + "loss": 1.109, + "step": 588 + }, + { + "epoch": 0.9242840329540997, + "grad_norm": 1.4436473846435547, + "learning_rate": 1.6583456501804725e-05, + "loss": 1.0848, + "step": 589 + }, + { + "epoch": 0.9258532757944292, + "grad_norm": 1.6218407154083252, + "learning_rate": 1.6294272637512183e-05, + "loss": 1.0913, + "step": 590 + }, + { + "epoch": 0.9274225186347588, + "grad_norm": 1.5806410312652588, + "learning_rate": 1.600741102352409e-05, + "loss": 1.1311, + "step": 591 + }, + { + "epoch": 0.9289917614750882, + "grad_norm": 1.7603397369384766, + "learning_rate": 1.57228795242965e-05, + "loss": 1.0529, + "step": 592 + }, + { + "epoch": 0.9305610043154178, + "grad_norm": 1.5605883598327637, + "learning_rate": 1.544068594040417e-05, + "loss": 1.1127, + "step": 593 + }, + { + "epoch": 0.9321302471557473, + "grad_norm": 1.5535039901733398, + "learning_rate": 1.516083800832676e-05, + "loss": 1.0298, + "step": 594 + }, + { + "epoch": 0.9336994899960769, + "grad_norm": 1.653369665145874, + "learning_rate": 1.488334340023669e-05, + "loss": 1.1684, + "step": 595 + }, + { + "epoch": 0.9352687328364064, + "grad_norm": 1.5320407152175903, + "learning_rate": 1.4608209723788835e-05, + "loss": 1.0255, + "step": 596 + }, + { + "epoch": 0.936837975676736, + "grad_norm": 1.4592413902282715, + "learning_rate": 1.4335444521911899e-05, + "loss": 1.0427, + "step": 597 + }, + { + "epoch": 0.9384072185170655, + "grad_norm": 1.5888770818710327, + "learning_rate": 1.4065055272601703e-05, + "loss": 1.11, + "step": 598 + }, + { + "epoch": 0.9399764613573951, + "grad_norm": 1.45270836353302, + "learning_rate": 1.3797049388716065e-05, + "loss": 1.1176, + "step": 599 + }, + { + "epoch": 0.9415457041977247, + "grad_norm": 1.5480446815490723, + "learning_rate": 1.3531434217771692e-05, + "loss": 1.0656, + "step": 600 + }, + { + "epoch": 0.9431149470380541, + "grad_norm": 1.661987066268921, + "learning_rate": 1.3268217041742701e-05, + "loss": 1.0949, + "step": 601 + }, + { + "epoch": 0.9446841898783837, + "grad_norm": 1.6081300973892212, + "learning_rate": 1.3007405076860875e-05, + "loss": 1.1668, + "step": 602 + }, + { + "epoch": 0.9462534327187132, + "grad_norm": 1.5872290134429932, + "learning_rate": 1.2749005473418015e-05, + "loss": 1.1205, + "step": 603 + }, + { + "epoch": 0.9478226755590428, + "grad_norm": 1.6399871110916138, + "learning_rate": 1.2493025315569801e-05, + "loss": 1.0915, + "step": 604 + }, + { + "epoch": 0.9493919183993723, + "grad_norm": 1.6280970573425293, + "learning_rate": 1.2239471621141508e-05, + "loss": 1.0793, + "step": 605 + }, + { + "epoch": 0.9509611612397019, + "grad_norm": 1.511046290397644, + "learning_rate": 1.1988351341435792e-05, + "loss": 1.0551, + "step": 606 + }, + { + "epoch": 0.9525304040800314, + "grad_norm": 1.6120575666427612, + "learning_rate": 1.173967136104196e-05, + "loss": 1.068, + "step": 607 + }, + { + "epoch": 0.954099646920361, + "grad_norm": 1.6098144054412842, + "learning_rate": 1.1493438497647313e-05, + "loss": 1.0377, + "step": 608 + }, + { + "epoch": 0.9556688897606904, + "grad_norm": 1.6548283100128174, + "learning_rate": 1.1249659501850155e-05, + "loss": 1.079, + "step": 609 + }, + { + "epoch": 0.95723813260102, + "grad_norm": 1.6977226734161377, + "learning_rate": 1.1008341056974854e-05, + "loss": 1.1694, + "step": 610 + }, + { + "epoch": 0.9588073754413495, + "grad_norm": 1.5469707250595093, + "learning_rate": 1.0769489778888405e-05, + "loss": 1.0471, + "step": 611 + }, + { + "epoch": 0.9603766182816791, + "grad_norm": 1.5638216733932495, + "learning_rate": 1.0533112215819298e-05, + "loss": 1.0531, + "step": 612 + }, + { + "epoch": 0.9619458611220086, + "grad_norm": 1.6216305494308472, + "learning_rate": 1.029921484817783e-05, + "loss": 1.0872, + "step": 613 + }, + { + "epoch": 0.9635151039623382, + "grad_norm": 1.5082978010177612, + "learning_rate": 1.0067804088378455e-05, + "loss": 1.1015, + "step": 614 + }, + { + "epoch": 0.9650843468026677, + "grad_norm": 1.5933103561401367, + "learning_rate": 9.8388862806641e-06, + "loss": 1.1554, + "step": 615 + }, + { + "epoch": 0.9666535896429973, + "grad_norm": 1.5416555404663086, + "learning_rate": 9.612467700932045e-06, + "loss": 1.0544, + "step": 616 + }, + { + "epoch": 0.9682228324833267, + "grad_norm": 1.622605562210083, + "learning_rate": 9.388554556562049e-06, + "loss": 1.055, + "step": 617 + }, + { + "epoch": 0.9697920753236563, + "grad_norm": 1.537731409072876, + "learning_rate": 9.167152986246078e-06, + "loss": 1.0893, + "step": 618 + }, + { + "epoch": 0.9713613181639859, + "grad_norm": 1.5320947170257568, + "learning_rate": 8.948269059820025e-06, + "loss": 1.0938, + "step": 619 + }, + { + "epoch": 0.9729305610043154, + "grad_norm": 1.525396466255188, + "learning_rate": 8.731908778097302e-06, + "loss": 1.0354, + "step": 620 + }, + { + "epoch": 0.974499803844645, + "grad_norm": 1.506839632987976, + "learning_rate": 8.518078072704338e-06, + "loss": 1.0021, + "step": 621 + }, + { + "epoch": 0.9760690466849745, + "grad_norm": 1.5291165113449097, + "learning_rate": 8.306782805917904e-06, + "loss": 1.1189, + "step": 622 + }, + { + "epoch": 0.9776382895253041, + "grad_norm": 1.5870240926742554, + "learning_rate": 8.098028770504494e-06, + "loss": 1.0332, + "step": 623 + }, + { + "epoch": 0.9792075323656336, + "grad_norm": 1.5112744569778442, + "learning_rate": 7.891821689561459e-06, + "loss": 1.0524, + "step": 624 + }, + { + "epoch": 0.9807767752059631, + "grad_norm": 1.6814526319503784, + "learning_rate": 7.68816721636004e-06, + "loss": 1.053, + "step": 625 + }, + { + "epoch": 0.9823460180462926, + "grad_norm": 1.6654129028320312, + "learning_rate": 7.487070934190532e-06, + "loss": 1.1405, + "step": 626 + }, + { + "epoch": 0.9839152608866222, + "grad_norm": 1.5979644060134888, + "learning_rate": 7.288538356209092e-06, + "loss": 1.1101, + "step": 627 + }, + { + "epoch": 0.9854845037269517, + "grad_norm": 1.549574375152588, + "learning_rate": 7.092574925286614e-06, + "loss": 1.0432, + "step": 628 + }, + { + "epoch": 0.9870537465672813, + "grad_norm": 1.592332363128662, + "learning_rate": 6.899186013859561e-06, + "loss": 1.0697, + "step": 629 + }, + { + "epoch": 0.9886229894076108, + "grad_norm": 1.599252700805664, + "learning_rate": 6.708376923782635e-06, + "loss": 1.0677, + "step": 630 + }, + { + "epoch": 0.9901922322479404, + "grad_norm": 1.6618796586990356, + "learning_rate": 6.520152886183406e-06, + "loss": 1.0747, + "step": 631 + }, + { + "epoch": 0.99176147508827, + "grad_norm": 1.548222303390503, + "learning_rate": 6.3345190613189635e-06, + "loss": 1.0863, + "step": 632 + }, + { + "epoch": 0.9933307179285995, + "grad_norm": 1.588174819946289, + "learning_rate": 6.151480538434382e-06, + "loss": 1.0863, + "step": 633 + }, + { + "epoch": 0.994899960768929, + "grad_norm": 1.5684528350830078, + "learning_rate": 5.971042335623229e-06, + "loss": 1.0459, + "step": 634 + }, + { + "epoch": 0.9964692036092585, + "grad_norm": 1.5120803117752075, + "learning_rate": 5.793209399689978e-06, + "loss": 1.0603, + "step": 635 + }, + { + "epoch": 0.9980384464495881, + "grad_norm": 1.537070393562317, + "learning_rate": 5.617986606014419e-06, + "loss": 1.067, + "step": 636 + }, + { + "epoch": 0.9996076892899176, + "grad_norm": 1.5575647354125977, + "learning_rate": 5.445378758417925e-06, + "loss": 1.063, + "step": 637 + } + ], + "logging_steps": 1, + "max_steps": 700, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2983334799867904e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}