{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.989690721649485,
  "eval_steps": 500,
  "global_step": 720,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013745704467353952,
      "grad_norm": 0.03515453264117241,
      "learning_rate": 4.9999940504433794e-05,
      "loss": 0.5822,
      "num_input_tokens_seen": 143824,
      "step": 1
    },
    {
      "epoch": 0.027491408934707903,
      "grad_norm": 0.03614756464958191,
      "learning_rate": 4.999976201801837e-05,
      "loss": 0.6334,
      "num_input_tokens_seen": 304184,
      "step": 2
    },
    {
      "epoch": 0.041237113402061855,
      "grad_norm": 0.03565426915884018,
      "learning_rate": 4.999946454160324e-05,
      "loss": 0.5673,
      "num_input_tokens_seen": 467952,
      "step": 3
    },
    {
      "epoch": 0.054982817869415807,
      "grad_norm": 0.04261317849159241,
      "learning_rate": 4.999904807660428e-05,
      "loss": 0.6365,
      "num_input_tokens_seen": 654520,
      "step": 4
    },
    {
      "epoch": 0.06872852233676977,
      "grad_norm": 0.05010489746928215,
      "learning_rate": 4.999851262500374e-05,
      "loss": 0.6061,
      "num_input_tokens_seen": 811248,
      "step": 5
    },
    {
      "epoch": 0.08247422680412371,
      "grad_norm": 0.046112459152936935,
      "learning_rate": 4.999785818935018e-05,
      "loss": 0.6162,
      "num_input_tokens_seen": 979304,
      "step": 6
    },
    {
      "epoch": 0.09621993127147767,
      "grad_norm": 0.04539132118225098,
      "learning_rate": 4.9997084772758463e-05,
      "loss": 0.6038,
      "num_input_tokens_seen": 1121856,
      "step": 7
    },
    {
      "epoch": 0.10996563573883161,
      "grad_norm": 0.04501952975988388,
      "learning_rate": 4.9996192378909786e-05,
      "loss": 0.6671,
      "num_input_tokens_seen": 1275576,
      "step": 8
    },
    {
      "epoch": 0.12371134020618557,
      "grad_norm": 0.04220053553581238,
      "learning_rate": 4.9995181012051625e-05,
      "loss": 0.6325,
      "num_input_tokens_seen": 1436536,
      "step": 9
    },
    {
      "epoch": 0.13745704467353953,
      "grad_norm": 0.039944130927324295,
      "learning_rate": 4.999405067699773e-05,
      "loss": 0.573,
      "num_input_tokens_seen": 1587288,
      "step": 10
    },
    {
      "epoch": 0.15120274914089346,
      "grad_norm": 0.03584982454776764,
      "learning_rate": 4.9992801379128094e-05,
      "loss": 0.5465,
      "num_input_tokens_seen": 1740640,
      "step": 11
    },
    {
      "epoch": 0.16494845360824742,
      "grad_norm": 0.03859821334481239,
      "learning_rate": 4.999143312438893e-05,
      "loss": 0.5738,
      "num_input_tokens_seen": 1883152,
      "step": 12
    },
    {
      "epoch": 0.17869415807560138,
      "grad_norm": 0.038480937480926514,
      "learning_rate": 4.998994591929266e-05,
      "loss": 0.5919,
      "num_input_tokens_seen": 2053096,
      "step": 13
    },
    {
      "epoch": 0.19243986254295534,
      "grad_norm": 0.041351836174726486,
      "learning_rate": 4.9988339770917825e-05,
      "loss": 0.5544,
      "num_input_tokens_seen": 2210992,
      "step": 14
    },
    {
      "epoch": 0.20618556701030927,
      "grad_norm": 0.05070508271455765,
      "learning_rate": 4.9986614686909146e-05,
      "loss": 0.5928,
      "num_input_tokens_seen": 2352472,
      "step": 15
    },
    {
      "epoch": 0.21993127147766323,
      "grad_norm": 0.04055149853229523,
      "learning_rate": 4.99847706754774e-05,
      "loss": 0.5723,
      "num_input_tokens_seen": 2533920,
      "step": 16
    },
    {
      "epoch": 0.23367697594501718,
      "grad_norm": 0.045761920511722565,
      "learning_rate": 4.998280774539943e-05,
      "loss": 0.567,
      "num_input_tokens_seen": 2682600,
      "step": 17
    },
    {
      "epoch": 0.24742268041237114,
      "grad_norm": 0.040435537695884705,
      "learning_rate": 4.9980725906018074e-05,
      "loss": 0.5493,
      "num_input_tokens_seen": 2874584,
      "step": 18
    },
    {
      "epoch": 0.2611683848797251,
      "grad_norm": 0.04898778721690178,
      "learning_rate": 4.9978525167242174e-05,
      "loss": 0.6005,
      "num_input_tokens_seen": 3029832,
      "step": 19
    },
    {
      "epoch": 0.27491408934707906,
      "grad_norm": 0.0490049347281456,
      "learning_rate": 4.997620553954645e-05,
      "loss": 0.5382,
      "num_input_tokens_seen": 3195424,
      "step": 20
    },
    {
      "epoch": 0.28865979381443296,
      "grad_norm": 0.048852819949388504,
      "learning_rate": 4.9973767033971505e-05,
      "loss": 0.5289,
      "num_input_tokens_seen": 3351008,
      "step": 21
    },
    {
      "epoch": 0.3024054982817869,
      "grad_norm": 0.0393831841647625,
      "learning_rate": 4.997120966212377e-05,
      "loss": 0.5202,
      "num_input_tokens_seen": 3536896,
      "step": 22
    },
    {
      "epoch": 0.3161512027491409,
      "grad_norm": 0.04958576709032059,
      "learning_rate": 4.9968533436175426e-05,
      "loss": 0.5442,
      "num_input_tokens_seen": 3687176,
      "step": 23
    },
    {
      "epoch": 0.32989690721649484,
      "grad_norm": 0.04262508824467659,
      "learning_rate": 4.996573836886435e-05,
      "loss": 0.5308,
      "num_input_tokens_seen": 3855776,
      "step": 24
    },
    {
      "epoch": 0.3436426116838488,
      "grad_norm": 0.047140106558799744,
      "learning_rate": 4.996282447349408e-05,
      "loss": 0.5437,
      "num_input_tokens_seen": 4006768,
      "step": 25
    },
    {
      "epoch": 0.35738831615120276,
      "grad_norm": 0.044271714985370636,
      "learning_rate": 4.995979176393372e-05,
      "loss": 0.5257,
      "num_input_tokens_seen": 4172560,
      "step": 26
    },
    {
      "epoch": 0.3711340206185567,
      "grad_norm": 0.0509733222424984,
      "learning_rate": 4.9956640254617906e-05,
      "loss": 0.5507,
      "num_input_tokens_seen": 4334000,
      "step": 27
    },
    {
      "epoch": 0.3848797250859107,
      "grad_norm": 0.050593916326761246,
      "learning_rate": 4.9953369960546676e-05,
      "loss": 0.5738,
      "num_input_tokens_seen": 4504304,
      "step": 28
    },
    {
      "epoch": 0.39862542955326463,
      "grad_norm": 0.05178133025765419,
      "learning_rate": 4.99499808972855e-05,
      "loss": 0.5715,
      "num_input_tokens_seen": 4673928,
      "step": 29
    },
    {
      "epoch": 0.41237113402061853,
      "grad_norm": 0.05001429468393326,
      "learning_rate": 4.994647308096509e-05,
      "loss": 0.4948,
      "num_input_tokens_seen": 4865008,
      "step": 30
    },
    {
      "epoch": 0.4261168384879725,
      "grad_norm": 0.046113837510347366,
      "learning_rate": 4.994284652828143e-05,
      "loss": 0.4275,
      "num_input_tokens_seen": 5039776,
      "step": 31
    },
    {
      "epoch": 0.43986254295532645,
      "grad_norm": 0.05269442871212959,
      "learning_rate": 4.993910125649561e-05,
      "loss": 0.5144,
      "num_input_tokens_seen": 5207488,
      "step": 32
    },
    {
      "epoch": 0.4536082474226804,
      "grad_norm": 0.05338837951421738,
      "learning_rate": 4.99352372834338e-05,
      "loss": 0.5126,
      "num_input_tokens_seen": 5379536,
      "step": 33
    },
    {
      "epoch": 0.46735395189003437,
      "grad_norm": 0.05707105994224548,
      "learning_rate": 4.9931254627487145e-05,
      "loss": 0.5106,
      "num_input_tokens_seen": 5537592,
      "step": 34
    },
    {
      "epoch": 0.48109965635738833,
      "grad_norm": 0.06054031476378441,
      "learning_rate": 4.992715330761167e-05,
      "loss": 0.4421,
      "num_input_tokens_seen": 5672288,
      "step": 35
    },
    {
      "epoch": 0.4948453608247423,
      "grad_norm": 0.05465317517518997,
      "learning_rate": 4.99229333433282e-05,
      "loss": 0.5042,
      "num_input_tokens_seen": 5838192,
      "step": 36
    },
    {
      "epoch": 0.5085910652920962,
      "grad_norm": 0.06289108097553253,
      "learning_rate": 4.9918594754722284e-05,
      "loss": 0.4345,
      "num_input_tokens_seen": 5980424,
      "step": 37
    },
    {
      "epoch": 0.5223367697594502,
      "grad_norm": 0.049855027347803116,
      "learning_rate": 4.9914137562444044e-05,
      "loss": 0.4658,
      "num_input_tokens_seen": 6150032,
      "step": 38
    },
    {
      "epoch": 0.5360824742268041,
      "grad_norm": 0.06255878508090973,
      "learning_rate": 4.9909561787708136e-05,
      "loss": 0.5347,
      "num_input_tokens_seen": 6301952,
      "step": 39
    },
    {
      "epoch": 0.5498281786941581,
      "grad_norm": 0.05674908682703972,
      "learning_rate": 4.990486745229364e-05,
      "loss": 0.4736,
      "num_input_tokens_seen": 6447816,
      "step": 40
    },
    {
      "epoch": 0.563573883161512,
      "grad_norm": 0.05906549096107483,
      "learning_rate": 4.990005457854392e-05,
      "loss": 0.4904,
      "num_input_tokens_seen": 6606904,
      "step": 41
    },
    {
      "epoch": 0.5773195876288659,
      "grad_norm": 0.05874306336045265,
      "learning_rate": 4.989512318936655e-05,
      "loss": 0.5014,
      "num_input_tokens_seen": 6749320,
      "step": 42
    },
    {
      "epoch": 0.5910652920962199,
      "grad_norm": 0.06385283172130585,
      "learning_rate": 4.989007330823318e-05,
      "loss": 0.4541,
      "num_input_tokens_seen": 6897968,
      "step": 43
    },
    {
      "epoch": 0.6048109965635738,
      "grad_norm": 0.05710778757929802,
      "learning_rate": 4.988490495917947e-05,
      "loss": 0.4485,
      "num_input_tokens_seen": 7053728,
      "step": 44
    },
    {
      "epoch": 0.6185567010309279,
      "grad_norm": 0.05861648917198181,
      "learning_rate": 4.987961816680492e-05,
      "loss": 0.457,
      "num_input_tokens_seen": 7244240,
      "step": 45
    },
    {
      "epoch": 0.6323024054982818,
      "grad_norm": 0.06273844093084335,
      "learning_rate": 4.987421295627279e-05,
      "loss": 0.4752,
      "num_input_tokens_seen": 7427368,
      "step": 46
    },
    {
      "epoch": 0.6460481099656358,
      "grad_norm": 0.06045098602771759,
      "learning_rate": 4.9868689353309974e-05,
      "loss": 0.4636,
      "num_input_tokens_seen": 7571424,
      "step": 47
    },
    {
      "epoch": 0.6597938144329897,
      "grad_norm": 0.06441298127174377,
      "learning_rate": 4.9863047384206835e-05,
      "loss": 0.4293,
      "num_input_tokens_seen": 7739096,
      "step": 48
    },
    {
      "epoch": 0.6735395189003437,
      "grad_norm": 0.05901084840297699,
      "learning_rate": 4.985728707581717e-05,
      "loss": 0.4007,
      "num_input_tokens_seen": 7895936,
      "step": 49
    },
    {
      "epoch": 0.6872852233676976,
      "grad_norm": 0.05707869306206703,
      "learning_rate": 4.985140845555799e-05,
      "loss": 0.4296,
      "num_input_tokens_seen": 8086992,
      "step": 50
    },
    {
      "epoch": 0.7010309278350515,
      "grad_norm": 0.06144925206899643,
      "learning_rate": 4.9845411551409455e-05,
      "loss": 0.4259,
      "num_input_tokens_seen": 8243376,
      "step": 51
    },
    {
      "epoch": 0.7147766323024055,
      "grad_norm": 0.06440005451440811,
      "learning_rate": 4.983929639191469e-05,
      "loss": 0.4426,
      "num_input_tokens_seen": 8409360,
      "step": 52
    },
    {
      "epoch": 0.7285223367697594,
      "grad_norm": 0.05730581283569336,
      "learning_rate": 4.983306300617969e-05,
      "loss": 0.419,
      "num_input_tokens_seen": 8565048,
      "step": 53
    },
    {
      "epoch": 0.7422680412371134,
      "grad_norm": 0.0626528263092041,
      "learning_rate": 4.982671142387316e-05,
      "loss": 0.4357,
      "num_input_tokens_seen": 8735448,
      "step": 54
    },
    {
      "epoch": 0.7560137457044673,
      "grad_norm": 0.053913556039333344,
      "learning_rate": 4.9820241675226375e-05,
      "loss": 0.3823,
      "num_input_tokens_seen": 8929280,
      "step": 55
    },
    {
      "epoch": 0.7697594501718213,
      "grad_norm": 0.06888018548488617,
      "learning_rate": 4.9813653791033057e-05,
      "loss": 0.4652,
      "num_input_tokens_seen": 9096752,
      "step": 56
    },
    {
      "epoch": 0.7835051546391752,
      "grad_norm": 0.07623863220214844,
      "learning_rate": 4.980694780264918e-05,
      "loss": 0.4524,
      "num_input_tokens_seen": 9229768,
      "step": 57
    },
    {
      "epoch": 0.7972508591065293,
      "grad_norm": 0.0763726532459259,
      "learning_rate": 4.980012374199288e-05,
      "loss": 0.4511,
      "num_input_tokens_seen": 9383744,
      "step": 58
    },
    {
      "epoch": 0.8109965635738832,
      "grad_norm": 0.06238986924290657,
      "learning_rate": 4.979318164154426e-05,
      "loss": 0.3777,
      "num_input_tokens_seen": 9559536,
      "step": 59
    },
    {
      "epoch": 0.8247422680412371,
      "grad_norm": 0.07007978111505508,
      "learning_rate": 4.9786121534345265e-05,
      "loss": 0.4705,
      "num_input_tokens_seen": 9701176,
      "step": 60
    },
    {
      "epoch": 0.8384879725085911,
      "grad_norm": 0.06909722089767456,
      "learning_rate": 4.977894345399949e-05,
      "loss": 0.4143,
      "num_input_tokens_seen": 9857184,
      "step": 61
    },
    {
      "epoch": 0.852233676975945,
      "grad_norm": 0.07460346817970276,
      "learning_rate": 4.977164743467206e-05,
      "loss": 0.3937,
      "num_input_tokens_seen": 10030984,
      "step": 62
    },
    {
      "epoch": 0.865979381443299,
      "grad_norm": 0.06020830199122429,
      "learning_rate": 4.976423351108943e-05,
      "loss": 0.4082,
      "num_input_tokens_seen": 10205656,
      "step": 63
    },
    {
      "epoch": 0.8797250859106529,
      "grad_norm": 0.06564310938119888,
      "learning_rate": 4.975670171853926e-05,
      "loss": 0.4115,
      "num_input_tokens_seen": 10373672,
      "step": 64
    },
    {
      "epoch": 0.8934707903780069,
      "grad_norm": 0.07319474220275879,
      "learning_rate": 4.974905209287019e-05,
      "loss": 0.4226,
      "num_input_tokens_seen": 10536672,
      "step": 65
    },
    {
      "epoch": 0.9072164948453608,
      "grad_norm": 0.07514749467372894,
      "learning_rate": 4.974128467049176e-05,
      "loss": 0.4163,
      "num_input_tokens_seen": 10689248,
      "step": 66
    },
    {
      "epoch": 0.9209621993127147,
      "grad_norm": 0.07608670741319656,
      "learning_rate": 4.973339948837411e-05,
      "loss": 0.3826,
      "num_input_tokens_seen": 10834784,
      "step": 67
    },
    {
      "epoch": 0.9347079037800687,
      "grad_norm": 0.07073520123958588,
      "learning_rate": 4.9725396584047925e-05,
      "loss": 0.3957,
      "num_input_tokens_seen": 11007208,
      "step": 68
    },
    {
      "epoch": 0.9484536082474226,
      "grad_norm": 0.07030030339956284,
      "learning_rate": 4.9717275995604184e-05,
      "loss": 0.4145,
      "num_input_tokens_seen": 11157232,
      "step": 69
    },
    {
      "epoch": 0.9621993127147767,
      "grad_norm": 0.0753760114312172,
      "learning_rate": 4.970903776169402e-05,
      "loss": 0.4465,
      "num_input_tokens_seen": 11314624,
      "step": 70
    },
    {
      "epoch": 0.9759450171821306,
      "grad_norm": 0.06567500531673431,
      "learning_rate": 4.970068192152849e-05,
      "loss": 0.438,
      "num_input_tokens_seen": 11527424,
      "step": 71
    },
    {
      "epoch": 0.9896907216494846,
      "grad_norm": 0.0673137977719307,
      "learning_rate": 4.9692208514878444e-05,
      "loss": 0.3952,
      "num_input_tokens_seen": 11701328,
      "step": 72
    },
    {
      "epoch": 1.013745704467354,
      "grad_norm": 0.13521268963813782,
      "learning_rate": 4.9683617582074285e-05,
      "loss": 0.8682,
      "num_input_tokens_seen": 11929832,
      "step": 73
    },
    {
      "epoch": 1.0274914089347078,
      "grad_norm": 0.07115866243839264,
      "learning_rate": 4.96749091640058e-05,
      "loss": 0.4121,
      "num_input_tokens_seen": 12100264,
      "step": 74
    },
    {
      "epoch": 1.041237113402062,
      "grad_norm": 0.06969404965639114,
      "learning_rate": 4.966608330212198e-05,
      "loss": 0.41,
      "num_input_tokens_seen": 12281280,
      "step": 75
    },
    {
      "epoch": 1.0549828178694158,
      "grad_norm": 0.07221312820911407,
      "learning_rate": 4.965714003843079e-05,
      "loss": 0.3464,
      "num_input_tokens_seen": 12452936,
      "step": 76
    },
    {
      "epoch": 1.0687285223367697,
      "grad_norm": 0.08590606600046158,
      "learning_rate": 4.9648079415498994e-05,
      "loss": 0.4266,
      "num_input_tokens_seen": 12609744,
      "step": 77
    },
    {
      "epoch": 1.0824742268041236,
      "grad_norm": 0.08766326308250427,
      "learning_rate": 4.9638901476451946e-05,
      "loss": 0.3901,
      "num_input_tokens_seen": 12762232,
      "step": 78
    },
    {
      "epoch": 1.0962199312714778,
      "grad_norm": 0.07588525116443634,
      "learning_rate": 4.962960626497338e-05,
      "loss": 0.3927,
      "num_input_tokens_seen": 12931840,
      "step": 79
    },
    {
      "epoch": 1.1099656357388317,
      "grad_norm": 0.09582990407943726,
      "learning_rate": 4.962019382530521e-05,
      "loss": 0.3669,
      "num_input_tokens_seen": 13055448,
      "step": 80
    },
    {
      "epoch": 1.1237113402061856,
      "grad_norm": 0.08706986159086227,
      "learning_rate": 4.9610664202247294e-05,
      "loss": 0.3939,
      "num_input_tokens_seen": 13208008,
      "step": 81
    },
    {
      "epoch": 1.1374570446735395,
      "grad_norm": 0.08638782054185867,
      "learning_rate": 4.960101744115727e-05,
      "loss": 0.3941,
      "num_input_tokens_seen": 13368864,
      "step": 82
    },
    {
      "epoch": 1.1512027491408934,
      "grad_norm": 0.08101855963468552,
      "learning_rate": 4.95912535879503e-05,
      "loss": 0.388,
      "num_input_tokens_seen": 13536904,
      "step": 83
    },
    {
      "epoch": 1.1649484536082475,
      "grad_norm": 0.09194502979516983,
      "learning_rate": 4.958137268909887e-05,
      "loss": 0.3678,
      "num_input_tokens_seen": 13680208,
      "step": 84
    },
    {
      "epoch": 1.1786941580756014,
      "grad_norm": 0.09339412301778793,
      "learning_rate": 4.957137479163253e-05,
      "loss": 0.3779,
      "num_input_tokens_seen": 13851872,
      "step": 85
    },
    {
      "epoch": 1.1924398625429553,
      "grad_norm": 0.10707342624664307,
      "learning_rate": 4.956125994313774e-05,
      "loss": 0.4056,
      "num_input_tokens_seen": 13972488,
      "step": 86
    },
    {
      "epoch": 1.2061855670103092,
      "grad_norm": 0.09154622256755829,
      "learning_rate": 4.95510281917576e-05,
      "loss": 0.3903,
      "num_input_tokens_seen": 14146832,
      "step": 87
    },
    {
      "epoch": 1.2199312714776633,
      "grad_norm": 0.08780011534690857,
      "learning_rate": 4.9540679586191605e-05,
      "loss": 0.3824,
      "num_input_tokens_seen": 14315200,
      "step": 88
    },
    {
      "epoch": 1.2336769759450172,
      "grad_norm": 0.08936591446399689,
      "learning_rate": 4.9530214175695444e-05,
      "loss": 0.3989,
      "num_input_tokens_seen": 14502368,
      "step": 89
    },
    {
      "epoch": 1.2474226804123711,
      "grad_norm": 0.09415003657341003,
      "learning_rate": 4.951963201008076e-05,
      "loss": 0.3864,
      "num_input_tokens_seen": 14657848,
      "step": 90
    },
    {
      "epoch": 1.261168384879725,
      "grad_norm": 0.10225144773721695,
      "learning_rate": 4.950893313971492e-05,
      "loss": 0.4117,
      "num_input_tokens_seen": 14811224,
      "step": 91
    },
    {
      "epoch": 1.274914089347079,
      "grad_norm": 0.10015024244785309,
      "learning_rate": 4.949811761552074e-05,
      "loss": 0.371,
      "num_input_tokens_seen": 14956048,
      "step": 92
    },
    {
      "epoch": 1.2886597938144329,
      "grad_norm": 0.08459267020225525,
      "learning_rate": 4.9487185488976286e-05,
      "loss": 0.3264,
      "num_input_tokens_seen": 15144080,
      "step": 93
    },
    {
      "epoch": 1.302405498281787,
      "grad_norm": 0.12224381417036057,
      "learning_rate": 4.94761368121146e-05,
      "loss": 0.355,
      "num_input_tokens_seen": 15315552,
      "step": 94
    },
    {
      "epoch": 1.3161512027491409,
      "grad_norm": 0.09795745462179184,
      "learning_rate": 4.946497163752346e-05,
      "loss": 0.3503,
      "num_input_tokens_seen": 15488288,
      "step": 95
    },
    {
      "epoch": 1.3298969072164948,
      "grad_norm": 0.13506200909614563,
      "learning_rate": 4.9453690018345144e-05,
      "loss": 0.4238,
      "num_input_tokens_seen": 15638464,
      "step": 96
    },
    {
      "epoch": 1.343642611683849,
      "grad_norm": 0.15308719873428345,
      "learning_rate": 4.944229200827615e-05,
      "loss": 0.4078,
      "num_input_tokens_seen": 15824640,
      "step": 97
    },
    {
      "epoch": 1.3573883161512028,
      "grad_norm": 0.11087611317634583,
      "learning_rate": 4.943077766156697e-05,
      "loss": 0.3398,
      "num_input_tokens_seen": 15965328,
      "step": 98
    },
    {
      "epoch": 1.3711340206185567,
      "grad_norm": 0.10212060064077377,
      "learning_rate": 4.9419147033021814e-05,
      "loss": 0.3272,
      "num_input_tokens_seen": 16146080,
      "step": 99
    },
    {
      "epoch": 1.3848797250859106,
      "grad_norm": 0.1134219765663147,
      "learning_rate": 4.940740017799833e-05,
      "loss": 0.3834,
      "num_input_tokens_seen": 16302672,
      "step": 100
    },
    {
      "epoch": 1.3986254295532645,
      "grad_norm": 0.116610586643219,
      "learning_rate": 4.9395537152407403e-05,
      "loss": 0.4073,
      "num_input_tokens_seen": 16479464,
      "step": 101
    },
    {
      "epoch": 1.4123711340206184,
      "grad_norm": 0.12147705256938934,
      "learning_rate": 4.938355801271282e-05,
      "loss": 0.3888,
      "num_input_tokens_seen": 16619088,
      "step": 102
    },
    {
      "epoch": 1.4261168384879725,
      "grad_norm": 0.0925673320889473,
      "learning_rate": 4.937146281593103e-05,
      "loss": 0.3069,
      "num_input_tokens_seen": 16801104,
      "step": 103
    },
    {
      "epoch": 1.4398625429553265,
      "grad_norm": 0.1166766881942749,
      "learning_rate": 4.9359251619630886e-05,
      "loss": 0.3566,
      "num_input_tokens_seen": 16960896,
      "step": 104
    },
    {
      "epoch": 1.4536082474226804,
      "grad_norm": 0.11052611470222473,
      "learning_rate": 4.934692448193334e-05,
      "loss": 0.3207,
      "num_input_tokens_seen": 17092392,
      "step": 105
    },
    {
      "epoch": 1.4673539518900345,
      "grad_norm": 0.12523284554481506,
      "learning_rate": 4.9334481461511215e-05,
      "loss": 0.3755,
      "num_input_tokens_seen": 17259256,
      "step": 106
    },
    {
      "epoch": 1.4810996563573884,
      "grad_norm": 0.11668079346418381,
      "learning_rate": 4.932192261758886e-05,
      "loss": 0.3198,
      "num_input_tokens_seen": 17424752,
      "step": 107
    },
    {
      "epoch": 1.4948453608247423,
      "grad_norm": 0.10550828278064728,
      "learning_rate": 4.9309248009941914e-05,
      "loss": 0.3934,
      "num_input_tokens_seen": 17596792,
      "step": 108
    },
    {
      "epoch": 1.5085910652920962,
      "grad_norm": 0.12377351522445679,
      "learning_rate": 4.929645769889703e-05,
      "loss": 0.3773,
      "num_input_tokens_seen": 17720352,
      "step": 109
    },
    {
      "epoch": 1.52233676975945,
      "grad_norm": 0.11502768844366074,
      "learning_rate": 4.9283551745331534e-05,
      "loss": 0.3661,
      "num_input_tokens_seen": 17879008,
      "step": 110
    },
    {
      "epoch": 1.536082474226804,
      "grad_norm": 0.12237806618213654,
      "learning_rate": 4.9270530210673205e-05,
      "loss": 0.3529,
      "num_input_tokens_seen": 18071456,
      "step": 111
    },
    {
      "epoch": 1.5498281786941581,
      "grad_norm": 0.13444387912750244,
      "learning_rate": 4.925739315689991e-05,
      "loss": 0.3744,
      "num_input_tokens_seen": 18231968,
      "step": 112
    },
    {
      "epoch": 1.563573883161512,
      "grad_norm": 0.12283588945865631,
      "learning_rate": 4.924414064653938e-05,
      "loss": 0.3608,
      "num_input_tokens_seen": 18404248,
      "step": 113
    },
    {
      "epoch": 1.577319587628866,
      "grad_norm": 0.12254119664430618,
      "learning_rate": 4.9230772742668866e-05,
      "loss": 0.3676,
      "num_input_tokens_seen": 18583624,
      "step": 114
    },
    {
      "epoch": 1.59106529209622,
      "grad_norm": 0.11955845355987549,
      "learning_rate": 4.9217289508914834e-05,
      "loss": 0.3686,
      "num_input_tokens_seen": 18765544,
      "step": 115
    },
    {
      "epoch": 1.604810996563574,
      "grad_norm": 0.12808598577976227,
      "learning_rate": 4.92036910094527e-05,
      "loss": 0.3364,
      "num_input_tokens_seen": 18909552,
      "step": 116
    },
    {
      "epoch": 1.6185567010309279,
      "grad_norm": 0.1245010495185852,
      "learning_rate": 4.9189977309006495e-05,
      "loss": 0.3684,
      "num_input_tokens_seen": 19093712,
      "step": 117
    },
    {
      "epoch": 1.6323024054982818,
      "grad_norm": 0.11291433125734329,
      "learning_rate": 4.9176148472848584e-05,
      "loss": 0.3248,
      "num_input_tokens_seen": 19261208,
      "step": 118
    },
    {
      "epoch": 1.6460481099656357,
      "grad_norm": 0.12807808816432953,
      "learning_rate": 4.9162204566799306e-05,
      "loss": 0.3248,
      "num_input_tokens_seen": 19445224,
      "step": 119
    },
    {
      "epoch": 1.6597938144329896,
      "grad_norm": 0.14625142514705658,
      "learning_rate": 4.914814565722671e-05,
      "loss": 0.3515,
      "num_input_tokens_seen": 19587800,
      "step": 120
    },
    {
      "epoch": 1.6735395189003437,
      "grad_norm": 0.12206698209047318,
      "learning_rate": 4.9133971811046225e-05,
      "loss": 0.3015,
      "num_input_tokens_seen": 19746112,
      "step": 121
    },
    {
      "epoch": 1.6872852233676976,
      "grad_norm": 0.13071304559707642,
      "learning_rate": 4.9119683095720324e-05,
      "loss": 0.36,
      "num_input_tokens_seen": 19934832,
      "step": 122
    },
    {
      "epoch": 1.7010309278350515,
      "grad_norm": 0.1364990770816803,
      "learning_rate": 4.9105279579258234e-05,
      "loss": 0.3548,
      "num_input_tokens_seen": 20084440,
      "step": 123
    },
    {
      "epoch": 1.7147766323024056,
      "grad_norm": 0.14099323749542236,
      "learning_rate": 4.909076133021557e-05,
      "loss": 0.2695,
      "num_input_tokens_seen": 20236728,
      "step": 124
    },
    {
      "epoch": 1.7285223367697595,
      "grad_norm": 0.14888378977775574,
      "learning_rate": 4.907612841769407e-05,
      "loss": 0.3171,
      "num_input_tokens_seen": 20418808,
      "step": 125
    },
    {
      "epoch": 1.7422680412371134,
      "grad_norm": 0.20660178363323212,
      "learning_rate": 4.906138091134118e-05,
      "loss": 0.3796,
      "num_input_tokens_seen": 20575144,
      "step": 126
    },
    {
      "epoch": 1.7560137457044673,
      "grad_norm": 0.12663930654525757,
      "learning_rate": 4.9046518881349824e-05,
      "loss": 0.3461,
      "num_input_tokens_seen": 20728032,
      "step": 127
    },
    {
      "epoch": 1.7697594501718212,
      "grad_norm": 0.1320526897907257,
      "learning_rate": 4.9031542398457974e-05,
      "loss": 0.3176,
      "num_input_tokens_seen": 20893568,
      "step": 128
    },
    {
      "epoch": 1.7835051546391751,
      "grad_norm": 0.19217291474342346,
      "learning_rate": 4.901645153394838e-05,
      "loss": 0.3297,
      "num_input_tokens_seen": 21058624,
      "step": 129
    },
    {
      "epoch": 1.7972508591065293,
      "grad_norm": 0.13877883553504944,
      "learning_rate": 4.9001246359648224e-05,
      "loss": 0.3103,
      "num_input_tokens_seen": 21235744,
      "step": 130
    },
    {
      "epoch": 1.8109965635738832,
      "grad_norm": 0.17733679711818695,
      "learning_rate": 4.898592694792871e-05,
      "loss": 0.3234,
      "num_input_tokens_seen": 21416640,
      "step": 131
    },
    {
      "epoch": 1.824742268041237,
      "grad_norm": 0.14660485088825226,
      "learning_rate": 4.8970493371704826e-05,
      "loss": 0.3094,
      "num_input_tokens_seen": 21592088,
      "step": 132
    },
    {
      "epoch": 1.8384879725085912,
      "grad_norm": 0.14453834295272827,
      "learning_rate": 4.895494570443492e-05,
      "loss": 0.3343,
      "num_input_tokens_seen": 21762184,
      "step": 133
    },
    {
      "epoch": 1.852233676975945,
      "grad_norm": 0.12177907675504684,
      "learning_rate": 4.8939284020120363e-05,
      "loss": 0.3026,
      "num_input_tokens_seen": 21942392,
      "step": 134
    },
    {
      "epoch": 1.865979381443299,
      "grad_norm": 0.16815918684005737,
      "learning_rate": 4.892350839330522e-05,
      "loss": 0.3121,
      "num_input_tokens_seen": 22106184,
      "step": 135
    },
    {
      "epoch": 1.879725085910653,
      "grad_norm": 0.15058790147304535,
      "learning_rate": 4.890761889907589e-05,
      "loss": 0.3,
      "num_input_tokens_seen": 22253064,
      "step": 136
    },
    {
      "epoch": 1.8934707903780068,
      "grad_norm": 0.18192680180072784,
      "learning_rate": 4.8891615613060715e-05,
      "loss": 0.3131,
      "num_input_tokens_seen": 22402376,
      "step": 137
    },
    {
      "epoch": 1.9072164948453607,
      "grad_norm": 0.17071257531642914,
      "learning_rate": 4.8875498611429674e-05,
      "loss": 0.3188,
      "num_input_tokens_seen": 22548304,
      "step": 138
    },
    {
      "epoch": 1.9209621993127146,
      "grad_norm": 0.18575875461101532,
      "learning_rate": 4.8859267970893956e-05,
      "loss": 0.2957,
      "num_input_tokens_seen": 22714976,
      "step": 139
    },
    {
      "epoch": 1.9347079037800687,
      "grad_norm": 0.1647319346666336,
      "learning_rate": 4.884292376870567e-05,
      "loss": 0.3087,
      "num_input_tokens_seen": 22877648,
      "step": 140
    },
    {
      "epoch": 1.9484536082474226,
      "grad_norm": 0.14444571733474731,
      "learning_rate": 4.882646608265743e-05,
      "loss": 0.3482,
      "num_input_tokens_seen": 23035584,
      "step": 141
    },
    {
      "epoch": 1.9621993127147768,
      "grad_norm": 0.18155309557914734,
      "learning_rate": 4.8809894991081964e-05,
      "loss": 0.3254,
      "num_input_tokens_seen": 23181168,
      "step": 142
    },
    {
      "epoch": 1.9759450171821307,
      "grad_norm": 0.2374810427427292,
      "learning_rate": 4.87932105728518e-05,
      "loss": 0.2768,
      "num_input_tokens_seen": 23333280,
      "step": 143
    },
    {
      "epoch": 1.9896907216494846,
      "grad_norm": 0.13866551220417023,
      "learning_rate": 4.877641290737884e-05,
      "loss": 0.3319,
      "num_input_tokens_seen": 23505256,
      "step": 144
    },
    {
      "epoch": 2.013745704467354,
      "grad_norm": 0.3380536139011383,
      "learning_rate": 4.8759502074614026e-05,
      "loss": 0.6695,
      "num_input_tokens_seen": 23769728,
      "step": 145
    },
    {
      "epoch": 2.027491408934708,
      "grad_norm": 0.14795149862766266,
      "learning_rate": 4.874247815504693e-05,
      "loss": 0.2962,
      "num_input_tokens_seen": 23904520,
      "step": 146
    },
    {
      "epoch": 2.0412371134020617,
      "grad_norm": 0.2052057981491089,
      "learning_rate": 4.872534122970535e-05,
      "loss": 0.2801,
      "num_input_tokens_seen": 24089424,
      "step": 147
    },
    {
      "epoch": 2.0549828178694156,
      "grad_norm": 0.2112075388431549,
      "learning_rate": 4.8708091380154984e-05,
      "loss": 0.271,
      "num_input_tokens_seen": 24265464,
      "step": 148
    },
    {
      "epoch": 2.06872852233677,
      "grad_norm": 0.14855563640594482,
      "learning_rate": 4.8690728688498996e-05,
      "loss": 0.3148,
      "num_input_tokens_seen": 24444312,
      "step": 149
    },
    {
      "epoch": 2.082474226804124,
      "grad_norm": 0.18762660026550293,
      "learning_rate": 4.867325323737765e-05,
      "loss": 0.2721,
      "num_input_tokens_seen": 24625000,
      "step": 150
    },
    {
      "epoch": 2.0962199312714778,
      "grad_norm": 0.17686530947685242,
      "learning_rate": 4.865566510996787e-05,
      "loss": 0.275,
      "num_input_tokens_seen": 24797080,
      "step": 151
    },
    {
      "epoch": 2.1099656357388317,
      "grad_norm": 0.13860000669956207,
      "learning_rate": 4.8637964389982926e-05,
      "loss": 0.3163,
      "num_input_tokens_seen": 24976024,
      "step": 152
    },
    {
      "epoch": 2.1237113402061856,
      "grad_norm": 0.24213668704032898,
      "learning_rate": 4.862015116167196e-05,
      "loss": 0.2743,
      "num_input_tokens_seen": 25103080,
      "step": 153
    },
    {
      "epoch": 2.1374570446735395,
      "grad_norm": 0.1803780049085617,
      "learning_rate": 4.860222550981961e-05,
      "loss": 0.2585,
      "num_input_tokens_seen": 25270536,
      "step": 154
    },
    {
      "epoch": 2.1512027491408934,
      "grad_norm": 0.14764904975891113,
      "learning_rate": 4.8584187519745636e-05,
      "loss": 0.2982,
      "num_input_tokens_seen": 25444864,
      "step": 155
    },
    {
      "epoch": 2.1649484536082473,
      "grad_norm": 0.1767294853925705,
      "learning_rate": 4.856603727730447e-05,
      "loss": 0.3033,
      "num_input_tokens_seen": 25626504,
      "step": 156
    },
    {
      "epoch": 2.178694158075601,
      "grad_norm": 0.15214288234710693,
      "learning_rate": 4.854777486888481e-05,
      "loss": 0.2769,
      "num_input_tokens_seen": 25798384,
      "step": 157
    },
    {
      "epoch": 2.1924398625429555,
      "grad_norm": 0.16756123304367065,
      "learning_rate": 4.852940038140927e-05,
      "loss": 0.2777,
      "num_input_tokens_seen": 25966864,
      "step": 158
    },
    {
      "epoch": 2.2061855670103094,
      "grad_norm": 0.15027989447116852,
      "learning_rate": 4.851091390233388e-05,
      "loss": 0.34,
      "num_input_tokens_seen": 26148840,
      "step": 159
    },
    {
      "epoch": 2.2199312714776633,
      "grad_norm": 0.18845343589782715,
      "learning_rate": 4.849231551964771e-05,
      "loss": 0.2924,
      "num_input_tokens_seen": 26357320,
      "step": 160
    },
    {
      "epoch": 2.2336769759450172,
      "grad_norm": 0.1529223918914795,
      "learning_rate": 4.847360532187248e-05,
      "loss": 0.2992,
      "num_input_tokens_seen": 26535248,
      "step": 161
    },
    {
      "epoch": 2.247422680412371,
      "grad_norm": 0.16276657581329346,
      "learning_rate": 4.8454783398062106e-05,
      "loss": 0.301,
      "num_input_tokens_seen": 26664448,
      "step": 162
    },
    {
      "epoch": 2.261168384879725,
      "grad_norm": 0.15381042659282684,
      "learning_rate": 4.843584983780225e-05,
      "loss": 0.3153,
      "num_input_tokens_seen": 26837888,
      "step": 163
    },
    {
      "epoch": 2.274914089347079,
      "grad_norm": 0.16993018984794617,
      "learning_rate": 4.8416804731209945e-05,
      "loss": 0.2347,
      "num_input_tokens_seen": 26998208,
      "step": 164
    },
    {
      "epoch": 2.288659793814433,
      "grad_norm": 0.16240018606185913,
      "learning_rate": 4.839764816893315e-05,
      "loss": 0.2904,
      "num_input_tokens_seen": 27151176,
      "step": 165
    },
    {
      "epoch": 2.3024054982817868,
      "grad_norm": 0.17280317842960358,
      "learning_rate": 4.83783802421503e-05,
      "loss": 0.3248,
      "num_input_tokens_seen": 27304128,
      "step": 166
    },
    {
      "epoch": 2.3161512027491407,
      "grad_norm": 0.16115079820156097,
      "learning_rate": 4.835900104256989e-05,
      "loss": 0.2783,
      "num_input_tokens_seen": 27444560,
      "step": 167
    },
    {
      "epoch": 2.329896907216495,
      "grad_norm": 0.1730480194091797,
      "learning_rate": 4.8339510662430046e-05,
      "loss": 0.2644,
      "num_input_tokens_seen": 27610048,
      "step": 168
    },
    {
      "epoch": 2.343642611683849,
      "grad_norm": 0.17980219423770905,
      "learning_rate": 4.831990919449806e-05,
      "loss": 0.2843,
      "num_input_tokens_seen": 27751888,
      "step": 169
    },
    {
      "epoch": 2.357388316151203,
      "grad_norm": 0.17816686630249023,
      "learning_rate": 4.830019673206997e-05,
      "loss": 0.3137,
      "num_input_tokens_seen": 27909264,
      "step": 170
    },
    {
      "epoch": 2.3711340206185567,
      "grad_norm": 0.1630104035139084,
      "learning_rate": 4.828037336897009e-05,
      "loss": 0.2555,
      "num_input_tokens_seen": 28057232,
      "step": 171
    },
    {
      "epoch": 2.3848797250859106,
      "grad_norm": 0.2017955482006073,
      "learning_rate": 4.826043919955062e-05,
      "loss": 0.2623,
      "num_input_tokens_seen": 28187448,
      "step": 172
    },
    {
      "epoch": 2.3986254295532645,
      "grad_norm": 0.19713768362998962,
      "learning_rate": 4.8240394318691115e-05,
      "loss": 0.268,
      "num_input_tokens_seen": 28333480,
      "step": 173
    },
    {
      "epoch": 2.4123711340206184,
      "grad_norm": 0.21180681884288788,
      "learning_rate": 4.822023882179811e-05,
      "loss": 0.2915,
      "num_input_tokens_seen": 28487680,
      "step": 174
    },
    {
      "epoch": 2.4261168384879723,
      "grad_norm": 0.18539898097515106,
      "learning_rate": 4.819997280480462e-05,
      "loss": 0.2591,
      "num_input_tokens_seen": 28663096,
      "step": 175
    },
    {
      "epoch": 2.4398625429553267,
      "grad_norm": 0.18349367380142212,
      "learning_rate": 4.817959636416969e-05,
      "loss": 0.3019,
      "num_input_tokens_seen": 28828128,
      "step": 176
    },
    {
      "epoch": 2.4536082474226806,
      "grad_norm": 0.18607863783836365,
      "learning_rate": 4.815910959687796e-05,
      "loss": 0.2213,
      "num_input_tokens_seen": 29033720,
      "step": 177
    },
    {
      "epoch": 2.4673539518900345,
      "grad_norm": 0.18565218150615692,
      "learning_rate": 4.813851260043916e-05,
      "loss": 0.2858,
      "num_input_tokens_seen": 29180176,
      "step": 178
    },
    {
      "epoch": 2.4810996563573884,
      "grad_norm": 0.20488892495632172,
      "learning_rate": 4.811780547288771e-05,
      "loss": 0.2873,
      "num_input_tokens_seen": 29355408,
      "step": 179
    },
    {
      "epoch": 2.4948453608247423,
      "grad_norm": 0.18418937921524048,
      "learning_rate": 4.8096988312782174e-05,
      "loss": 0.2676,
      "num_input_tokens_seen": 29506384,
      "step": 180
    },
    {
      "epoch": 2.508591065292096,
      "grad_norm": 0.23041057586669922,
      "learning_rate": 4.8076061219204854e-05,
      "loss": 0.2885,
      "num_input_tokens_seen": 29689968,
      "step": 181
    },
    {
      "epoch": 2.52233676975945,
      "grad_norm": 0.18238617479801178,
      "learning_rate": 4.80550242917613e-05,
      "loss": 0.2785,
      "num_input_tokens_seen": 29840984,
      "step": 182
    },
    {
      "epoch": 2.536082474226804,
      "grad_norm": 0.1838451772928238,
      "learning_rate": 4.8033877630579815e-05,
      "loss": 0.2561,
      "num_input_tokens_seen": 30011240,
      "step": 183
    },
    {
      "epoch": 2.549828178694158,
      "grad_norm": 0.19992855191230774,
      "learning_rate": 4.8012621336311016e-05,
      "loss": 0.2511,
      "num_input_tokens_seen": 30161384,
      "step": 184
    },
    {
      "epoch": 2.563573883161512,
      "grad_norm": 0.20350296795368195,
      "learning_rate": 4.7991255510127306e-05,
      "loss": 0.2489,
      "num_input_tokens_seen": 30324832,
      "step": 185
    },
    {
      "epoch": 2.5773195876288657,
      "grad_norm": 0.17712481319904327,
      "learning_rate": 4.796978025372246e-05,
      "loss": 0.2987,
      "num_input_tokens_seen": 30487752,
      "step": 186
    },
    {
      "epoch": 2.59106529209622,
      "grad_norm": 0.1777142882347107,
      "learning_rate": 4.794819566931107e-05,
      "loss": 0.3121,
      "num_input_tokens_seen": 30645568,
      "step": 187
    },
    {
      "epoch": 2.604810996563574,
      "grad_norm": 0.19272726774215698,
      "learning_rate": 4.79265018596281e-05,
      "loss": 0.2977,
      "num_input_tokens_seen": 30792008,
      "step": 188
    },
    {
      "epoch": 2.618556701030928,
      "grad_norm": 0.19260460138320923,
      "learning_rate": 4.7904698927928406e-05,
      "loss": 0.2443,
      "num_input_tokens_seen": 30947936,
      "step": 189
    },
    {
      "epoch": 2.6323024054982818,
      "grad_norm": 0.2009551227092743,
      "learning_rate": 4.788278697798618e-05,
      "loss": 0.2386,
      "num_input_tokens_seen": 31086808,
      "step": 190
    },
    {
      "epoch": 2.6460481099656357,
      "grad_norm": 0.183323472738266,
      "learning_rate": 4.786076611409456e-05,
      "loss": 0.3116,
      "num_input_tokens_seen": 31245416,
      "step": 191
    },
    {
      "epoch": 2.6597938144329896,
      "grad_norm": 0.20595161616802216,
      "learning_rate": 4.783863644106502e-05,
      "loss": 0.2649,
      "num_input_tokens_seen": 31451024,
      "step": 192
    },
    {
      "epoch": 2.673539518900344,
      "grad_norm": 0.1823594719171524,
      "learning_rate": 4.7816398064226984e-05,
      "loss": 0.2617,
      "num_input_tokens_seen": 31609072,
      "step": 193
    },
    {
      "epoch": 2.687285223367698,
      "grad_norm": 0.23399166762828827,
      "learning_rate": 4.7794051089427214e-05,
      "loss": 0.3078,
      "num_input_tokens_seen": 31787728,
      "step": 194
    },
    {
      "epoch": 2.7010309278350517,
      "grad_norm": 0.16351115703582764,
      "learning_rate": 4.7771595623029394e-05,
      "loss": 0.269,
      "num_input_tokens_seen": 31952992,
      "step": 195
    },
    {
      "epoch": 2.7147766323024056,
      "grad_norm": 0.22723722457885742,
      "learning_rate": 4.7749031771913584e-05,
      "loss": 0.2618,
      "num_input_tokens_seen": 32126672,
      "step": 196
    },
    {
      "epoch": 2.7285223367697595,
      "grad_norm": 0.1911366730928421,
      "learning_rate": 4.7726359643475696e-05,
      "loss": 0.27,
      "num_input_tokens_seen": 32296224,
      "step": 197
    },
    {
      "epoch": 2.7422680412371134,
      "grad_norm": 0.20369476079940796,
      "learning_rate": 4.7703579345627035e-05,
      "loss": 0.252,
      "num_input_tokens_seen": 32455936,
      "step": 198
    },
    {
      "epoch": 2.7560137457044673,
      "grad_norm": 0.18237197399139404,
      "learning_rate": 4.768069098679373e-05,
      "loss": 0.2683,
      "num_input_tokens_seen": 32594424,
      "step": 199
    },
    {
      "epoch": 2.7697594501718212,
      "grad_norm": 0.2236253023147583,
      "learning_rate": 4.765769467591625e-05,
      "loss": 0.307,
      "num_input_tokens_seen": 32758272,
      "step": 200
    },
    {
      "epoch": 2.783505154639175,
      "grad_norm": 0.2112901508808136,
      "learning_rate": 4.7634590522448884e-05,
      "loss": 0.3264,
      "num_input_tokens_seen": 32915472,
      "step": 201
    },
    {
      "epoch": 2.797250859106529,
      "grad_norm": 0.20340733230113983,
      "learning_rate": 4.761137863635921e-05,
      "loss": 0.2489,
      "num_input_tokens_seen": 33064480,
      "step": 202
    },
    {
      "epoch": 2.810996563573883,
      "grad_norm": 0.21158529818058014,
      "learning_rate": 4.758805912812755e-05,
      "loss": 0.2424,
      "num_input_tokens_seen": 33221424,
      "step": 203
    },
    {
      "epoch": 2.824742268041237,
      "grad_norm": 0.20873288810253143,
      "learning_rate": 4.756463210874652e-05,
      "loss": 0.2856,
      "num_input_tokens_seen": 33383520,
      "step": 204
    },
    {
      "epoch": 2.838487972508591,
      "grad_norm": 0.19140155613422394,
      "learning_rate": 4.7541097689720406e-05,
      "loss": 0.265,
      "num_input_tokens_seen": 33531664,
      "step": 205
    },
    {
      "epoch": 2.852233676975945,
      "grad_norm": 0.18534792959690094,
      "learning_rate": 4.7517455983064694e-05,
      "loss": 0.287,
      "num_input_tokens_seen": 33698904,
      "step": 206
    },
    {
      "epoch": 2.865979381443299,
      "grad_norm": 0.188729926943779,
      "learning_rate": 4.749370710130554e-05,
      "loss": 0.1996,
      "num_input_tokens_seen": 33855080,
      "step": 207
    },
    {
      "epoch": 2.879725085910653,
      "grad_norm": 0.22576089203357697,
      "learning_rate": 4.7469851157479177e-05,
      "loss": 0.3089,
      "num_input_tokens_seen": 34013872,
      "step": 208
    },
    {
      "epoch": 2.893470790378007,
      "grad_norm": 0.21085543930530548,
      "learning_rate": 4.744588826513144e-05,
      "loss": 0.28,
      "num_input_tokens_seen": 34184568,
      "step": 209
    },
    {
      "epoch": 2.9072164948453607,
      "grad_norm": 0.17552731931209564,
      "learning_rate": 4.742181853831721e-05,
      "loss": 0.2796,
      "num_input_tokens_seen": 34374712,
      "step": 210
    },
    {
      "epoch": 2.9209621993127146,
      "grad_norm": 0.20688538253307343,
      "learning_rate": 4.7397642091599833e-05,
      "loss": 0.2203,
      "num_input_tokens_seen": 34511208,
      "step": 211
    },
    {
      "epoch": 2.934707903780069,
      "grad_norm": 0.19914135336875916,
      "learning_rate": 4.737335904005063e-05,
      "loss": 0.2496,
      "num_input_tokens_seen": 34673936,
      "step": 212
    },
    {
      "epoch": 2.948453608247423,
      "grad_norm": 0.19035300612449646,
      "learning_rate": 4.7348969499248306e-05,
      "loss": 0.28,
      "num_input_tokens_seen": 34850560,
      "step": 213
    },
    {
      "epoch": 2.9621993127147768,
      "grad_norm": 0.17438046634197235,
      "learning_rate": 4.732447358527843e-05,
      "loss": 0.2571,
      "num_input_tokens_seen": 35006936,
      "step": 214
    },
    {
      "epoch": 2.9759450171821307,
      "grad_norm": 0.17543044686317444,
      "learning_rate": 4.7299871414732855e-05,
      "loss": 0.2586,
      "num_input_tokens_seen": 35153264,
      "step": 215
    },
    {
      "epoch": 2.9896907216494846,
      "grad_norm": 0.2029866874217987,
      "learning_rate": 4.72751631047092e-05,
      "loss": 0.2789,
      "num_input_tokens_seen": 35318440,
      "step": 216
    },
    {
      "epoch": 3.013745704467354,
      "grad_norm": 0.35760697722435,
      "learning_rate": 4.725034877281025e-05,
      "loss": 0.6291,
      "num_input_tokens_seen": 35573264,
      "step": 217
    },
    {
      "epoch": 3.027491408934708,
      "grad_norm": 0.16751083731651306,
      "learning_rate": 4.722542853714341e-05,
      "loss": 0.2659,
      "num_input_tokens_seen": 35731128,
      "step": 218
    },
    {
      "epoch": 3.0412371134020617,
      "grad_norm": 0.2026410698890686,
      "learning_rate": 4.720040251632018e-05,
      "loss": 0.2508,
      "num_input_tokens_seen": 35895664,
      "step": 219
    },
    {
      "epoch": 3.0549828178694156,
      "grad_norm": 0.20279920101165771,
      "learning_rate": 4.717527082945554e-05,
      "loss": 0.2306,
      "num_input_tokens_seen": 36033536,
      "step": 220
    },
    {
      "epoch": 3.06872852233677,
      "grad_norm": 0.2317742109298706,
      "learning_rate": 4.715003359616741e-05,
      "loss": 0.2506,
      "num_input_tokens_seen": 36176824,
      "step": 221
    },
    {
      "epoch": 3.082474226804124,
      "grad_norm": 0.24150508642196655,
      "learning_rate": 4.712469093657605e-05,
      "loss": 0.2758,
      "num_input_tokens_seen": 36337584,
      "step": 222
    },
    {
      "epoch": 3.0962199312714778,
      "grad_norm": 0.24855269491672516,
      "learning_rate": 4.709924297130354e-05,
      "loss": 0.2291,
      "num_input_tokens_seen": 36478144,
      "step": 223
    },
    {
      "epoch": 3.1099656357388317,
      "grad_norm": 0.2550382614135742,
      "learning_rate": 4.707368982147318e-05,
      "loss": 0.2428,
      "num_input_tokens_seen": 36655616,
      "step": 224
    },
    {
      "epoch": 3.1237113402061856,
      "grad_norm": 0.26995137333869934,
      "learning_rate": 4.7048031608708876e-05,
      "loss": 0.2487,
      "num_input_tokens_seen": 36807552,
      "step": 225
    },
    {
      "epoch": 3.1374570446735395,
      "grad_norm": 0.22369439899921417,
      "learning_rate": 4.7022268455134646e-05,
      "loss": 0.2368,
      "num_input_tokens_seen": 36978472,
      "step": 226
    },
    {
      "epoch": 3.1512027491408934,
      "grad_norm": 0.21802932024002075,
      "learning_rate": 4.699640048337394e-05,
      "loss": 0.2413,
      "num_input_tokens_seen": 37155768,
      "step": 227
    },
    {
      "epoch": 3.1649484536082473,
      "grad_norm": 0.1978904753923416,
      "learning_rate": 4.697042781654913e-05,
      "loss": 0.2492,
      "num_input_tokens_seen": 37303928,
      "step": 228
    },
    {
      "epoch": 3.178694158075601,
      "grad_norm": 0.21425876021385193,
      "learning_rate": 4.694435057828092e-05,
      "loss": 0.2752,
      "num_input_tokens_seen": 37462800,
      "step": 229
    },
    {
      "epoch": 3.1924398625429555,
      "grad_norm": 0.21123024821281433,
      "learning_rate": 4.69181688926877e-05,
      "loss": 0.2441,
      "num_input_tokens_seen": 37613368,
      "step": 230
    },
    {
      "epoch": 3.2061855670103094,
      "grad_norm": 0.18438878655433655,
      "learning_rate": 4.6891882884384994e-05,
      "loss": 0.1968,
      "num_input_tokens_seen": 37772792,
      "step": 231
    },
    {
      "epoch": 3.2199312714776633,
      "grad_norm": 0.2324478179216385,
      "learning_rate": 4.6865492678484895e-05,
      "loss": 0.2239,
      "num_input_tokens_seen": 37951216,
      "step": 232
    },
    {
      "epoch": 3.2336769759450172,
      "grad_norm": 0.2039729207754135,
      "learning_rate": 4.683899840059542e-05,
      "loss": 0.232,
      "num_input_tokens_seen": 38114560,
      "step": 233
    },
    {
      "epoch": 3.247422680412371,
      "grad_norm": 0.23696556687355042,
      "learning_rate": 4.681240017681993e-05,
      "loss": 0.2694,
      "num_input_tokens_seen": 38259672,
      "step": 234
    },
    {
      "epoch": 3.261168384879725,
      "grad_norm": 0.20615766942501068,
      "learning_rate": 4.678569813375654e-05,
      "loss": 0.2614,
      "num_input_tokens_seen": 38421328,
      "step": 235
    },
    {
      "epoch": 3.274914089347079,
      "grad_norm": 0.21785439550876617,
      "learning_rate": 4.6758892398497494e-05,
      "loss": 0.2392,
      "num_input_tokens_seen": 38571864,
      "step": 236
    },
    {
      "epoch": 3.288659793814433,
      "grad_norm": 0.22136537730693817,
      "learning_rate": 4.67319830986286e-05,
      "loss": 0.2666,
      "num_input_tokens_seen": 38731600,
      "step": 237
    },
    {
      "epoch": 3.3024054982817868,
      "grad_norm": 0.2099449336528778,
      "learning_rate": 4.670497036222856e-05,
      "loss": 0.2282,
      "num_input_tokens_seen": 38892576,
      "step": 238
    },
    {
      "epoch": 3.3161512027491407,
      "grad_norm": 0.23128457367420197,
      "learning_rate": 4.667785431786843e-05,
      "loss": 0.2518,
      "num_input_tokens_seen": 39087752,
      "step": 239
    },
    {
      "epoch": 3.329896907216495,
      "grad_norm": 0.2390356957912445,
      "learning_rate": 4.665063509461097e-05,
      "loss": 0.2261,
      "num_input_tokens_seen": 39230920,
      "step": 240
    },
    {
      "epoch": 3.343642611683849,
      "grad_norm": 0.18076427280902863,
      "learning_rate": 4.662331282201001e-05,
      "loss": 0.224,
      "num_input_tokens_seen": 39432656,
      "step": 241
    },
    {
      "epoch": 3.357388316151203,
      "grad_norm": 0.2572460472583771,
      "learning_rate": 4.659588763010989e-05,
      "loss": 0.2429,
      "num_input_tokens_seen": 39582168,
      "step": 242
    },
    {
      "epoch": 3.3711340206185567,
      "grad_norm": 0.22216255962848663,
      "learning_rate": 4.65683596494448e-05,
      "loss": 0.2251,
      "num_input_tokens_seen": 39740320,
      "step": 243
    },
    {
      "epoch": 3.3848797250859106,
      "grad_norm": 0.20128430426120758,
      "learning_rate": 4.6540729011038146e-05,
      "loss": 0.1635,
      "num_input_tokens_seen": 39910744,
      "step": 244
    },
    {
      "epoch": 3.3986254295532645,
      "grad_norm": 0.2261241376399994,
      "learning_rate": 4.6512995846401975e-05,
      "loss": 0.2707,
      "num_input_tokens_seen": 40054968,
      "step": 245
    },
    {
      "epoch": 3.4123711340206184,
      "grad_norm": 0.22856874763965607,
      "learning_rate": 4.648516028753632e-05,
      "loss": 0.2502,
      "num_input_tokens_seen": 40220256,
      "step": 246
    },
    {
      "epoch": 3.4261168384879723,
      "grad_norm": 0.20142170786857605,
      "learning_rate": 4.645722246692856e-05,
      "loss": 0.2395,
      "num_input_tokens_seen": 40393312,
      "step": 247
    },
    {
      "epoch": 3.4398625429553267,
      "grad_norm": 0.22595378756523132,
      "learning_rate": 4.642918251755281e-05,
      "loss": 0.2762,
      "num_input_tokens_seen": 40549872,
      "step": 248
    },
    {
      "epoch": 3.4536082474226806,
      "grad_norm": 0.23270241916179657,
      "learning_rate": 4.6401040572869295e-05,
      "loss": 0.239,
      "num_input_tokens_seen": 40730192,
      "step": 249
    },
    {
      "epoch": 3.4673539518900345,
      "grad_norm": 0.23554874956607819,
      "learning_rate": 4.637279676682367e-05,
      "loss": 0.1764,
      "num_input_tokens_seen": 40881976,
      "step": 250
    },
    {
      "epoch": 3.4810996563573884,
      "grad_norm": 0.22602517902851105,
      "learning_rate": 4.634445123384644e-05,
      "loss": 0.2125,
      "num_input_tokens_seen": 41057440,
      "step": 251
    },
    {
      "epoch": 3.4948453608247423,
      "grad_norm": 0.22063471376895905,
      "learning_rate": 4.6316004108852305e-05,
      "loss": 0.2192,
      "num_input_tokens_seen": 41206424,
      "step": 252
    },
    {
      "epoch": 3.508591065292096,
      "grad_norm": 0.19604997336864471,
      "learning_rate": 4.628745552723948e-05,
      "loss": 0.2947,
      "num_input_tokens_seen": 41357312,
      "step": 253
    },
    {
      "epoch": 3.52233676975945,
      "grad_norm": 0.2227766364812851,
      "learning_rate": 4.6258805624889075e-05,
      "loss": 0.2868,
      "num_input_tokens_seen": 41517120,
      "step": 254
    },
    {
      "epoch": 3.536082474226804,
      "grad_norm": 0.2262553870677948,
      "learning_rate": 4.6230054538164474e-05,
      "loss": 0.2073,
      "num_input_tokens_seen": 41666440,
      "step": 255
    },
    {
      "epoch": 3.549828178694158,
      "grad_norm": 0.45836442708969116,
      "learning_rate": 4.620120240391065e-05,
      "loss": 0.2633,
      "num_input_tokens_seen": 41848000,
      "step": 256
    },
    {
      "epoch": 3.563573883161512,
      "grad_norm": 0.30991941690444946,
      "learning_rate": 4.6172249359453534e-05,
      "loss": 0.1893,
      "num_input_tokens_seen": 42056616,
      "step": 257
    },
    {
      "epoch": 3.5773195876288657,
      "grad_norm": 0.34851858019828796,
      "learning_rate": 4.614319554259934e-05,
      "loss": 0.2236,
      "num_input_tokens_seen": 42223312,
      "step": 258
    },
    {
      "epoch": 3.59106529209622,
      "grad_norm": 0.2241206020116806,
      "learning_rate": 4.611404109163392e-05,
      "loss": 0.2348,
      "num_input_tokens_seen": 42370536,
      "step": 259
    },
    {
      "epoch": 3.604810996563574,
      "grad_norm": 0.4450778365135193,
      "learning_rate": 4.608478614532215e-05,
      "loss": 0.2439,
      "num_input_tokens_seen": 42510984,
      "step": 260
    },
    {
      "epoch": 3.618556701030928,
      "grad_norm": 0.23098166286945343,
      "learning_rate": 4.6055430842907167e-05,
      "loss": 0.2661,
      "num_input_tokens_seen": 42677376,
      "step": 261
    },
    {
      "epoch": 3.6323024054982818,
      "grad_norm": 0.3660583198070526,
      "learning_rate": 4.602597532410981e-05,
      "loss": 0.2254,
      "num_input_tokens_seen": 42844616,
      "step": 262
    },
    {
      "epoch": 3.6460481099656357,
      "grad_norm": 0.24277503788471222,
      "learning_rate": 4.599641972912791e-05,
      "loss": 0.2405,
      "num_input_tokens_seen": 43022448,
      "step": 263
    },
    {
      "epoch": 3.6597938144329896,
      "grad_norm": 0.23149937391281128,
      "learning_rate": 4.5966764198635606e-05,
      "loss": 0.2822,
      "num_input_tokens_seen": 43194784,
      "step": 264
    },
    {
      "epoch": 3.673539518900344,
      "grad_norm": 0.3232170641422272,
      "learning_rate": 4.59370088737827e-05,
      "loss": 0.2034,
      "num_input_tokens_seen": 43347416,
      "step": 265
    },
    {
      "epoch": 3.687285223367698,
      "grad_norm": 0.37348437309265137,
      "learning_rate": 4.5907153896193985e-05,
      "loss": 0.2621,
      "num_input_tokens_seen": 43526480,
      "step": 266
    },
    {
      "epoch": 3.7010309278350517,
      "grad_norm": 0.24496665596961975,
      "learning_rate": 4.587719940796858e-05,
      "loss": 0.2196,
      "num_input_tokens_seen": 43698192,
      "step": 267
    },
    {
      "epoch": 3.7147766323024056,
      "grad_norm": 0.3418863117694855,
      "learning_rate": 4.5847145551679206e-05,
      "loss": 0.2788,
      "num_input_tokens_seen": 43857288,
      "step": 268
    },
    {
      "epoch": 3.7285223367697595,
      "grad_norm": 0.24283042550086975,
      "learning_rate": 4.581699247037157e-05,
      "loss": 0.2048,
      "num_input_tokens_seen": 44029272,
      "step": 269
    },
    {
      "epoch": 3.7422680412371134,
      "grad_norm": 0.3411911129951477,
      "learning_rate": 4.5786740307563636e-05,
      "loss": 0.1998,
      "num_input_tokens_seen": 44179152,
      "step": 270
    },
    {
      "epoch": 3.7560137457044673,
      "grad_norm": 0.3600534498691559,
      "learning_rate": 4.5756389207244965e-05,
      "loss": 0.2202,
      "num_input_tokens_seen": 44350640,
      "step": 271
    },
    {
      "epoch": 3.7697594501718212,
      "grad_norm": 0.19958753883838654,
      "learning_rate": 4.572593931387604e-05,
      "loss": 0.2,
      "num_input_tokens_seen": 44549824,
      "step": 272
    },
    {
      "epoch": 3.783505154639175,
      "grad_norm": 0.3326575458049774,
      "learning_rate": 4.569539077238756e-05,
      "loss": 0.2196,
      "num_input_tokens_seen": 44715672,
      "step": 273
    },
    {
      "epoch": 3.797250859106529,
      "grad_norm": 0.31120291352272034,
      "learning_rate": 4.566474372817972e-05,
      "loss": 0.1855,
      "num_input_tokens_seen": 44900592,
      "step": 274
    },
    {
      "epoch": 3.810996563573883,
      "grad_norm": 0.24602434039115906,
      "learning_rate": 4.5633998327121595e-05,
      "loss": 0.24,
      "num_input_tokens_seen": 45068960,
      "step": 275
    },
    {
      "epoch": 3.824742268041237,
      "grad_norm": 0.5087242722511292,
      "learning_rate": 4.5603154715550386e-05,
      "loss": 0.2459,
      "num_input_tokens_seen": 45224712,
| "step": 276 | |
| }, | |
| { | |
| "epoch": 3.838487972508591, | |
| "grad_norm": 0.23740601539611816, | |
| "learning_rate": 4.5572213040270765e-05, | |
| "loss": 0.2336, | |
| "num_input_tokens_seen": 45367680, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 3.852233676975945, | |
| "grad_norm": 0.38796210289001465, | |
| "learning_rate": 4.55411734485541e-05, | |
| "loss": 0.2133, | |
| "num_input_tokens_seen": 45534568, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 3.865979381443299, | |
| "grad_norm": 0.30596432089805603, | |
| "learning_rate": 4.551003608813784e-05, | |
| "loss": 0.2771, | |
| "num_input_tokens_seen": 45704688, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 3.879725085910653, | |
| "grad_norm": 0.22843089699745178, | |
| "learning_rate": 4.54788011072248e-05, | |
| "loss": 0.1917, | |
| "num_input_tokens_seen": 45848416, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.893470790378007, | |
| "grad_norm": 0.45724159479141235, | |
| "learning_rate": 4.544746865448239e-05, | |
| "loss": 0.2629, | |
| "num_input_tokens_seen": 45973880, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 3.9072164948453607, | |
| "grad_norm": 0.39729124307632446, | |
| "learning_rate": 4.541603887904198e-05, | |
| "loss": 0.2679, | |
| "num_input_tokens_seen": 46127024, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 3.9209621993127146, | |
| "grad_norm": 0.31810271739959717, | |
| "learning_rate": 4.538451193049814e-05, | |
| "loss": 0.1642, | |
| "num_input_tokens_seen": 46268152, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 3.934707903780069, | |
| "grad_norm": 0.37692317366600037, | |
| "learning_rate": 4.535288795890798e-05, | |
| "loss": 0.2288, | |
| "num_input_tokens_seen": 46439416, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 3.948453608247423, | |
| "grad_norm": 0.3331313729286194, | |
| "learning_rate": 4.5321167114790385e-05, | |
| "loss": 0.2196, | |
| "num_input_tokens_seen": 46610744, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 3.9621993127147768, | |
| "grad_norm": 0.3591002821922302, | |
| "learning_rate": 4.528934954912531e-05, | |
| "loss": 0.1917, | |
| "num_input_tokens_seen": 46778272, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 3.9759450171821307, | |
| "grad_norm": 0.6103318333625793, | |
| "learning_rate": 4.525743541335309e-05, | |
| "loss": 0.2781, | |
| "num_input_tokens_seen": 46941080, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 3.9896907216494846, | |
| "grad_norm": 0.22901976108551025, | |
| "learning_rate": 4.522542485937369e-05, | |
| "loss": 0.2576, | |
| "num_input_tokens_seen": 47112720, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 4.013745704467354, | |
| "grad_norm": 0.8023688197135925, | |
| "learning_rate": 4.519331803954599e-05, | |
| "loss": 0.4452, | |
| "num_input_tokens_seen": 47336144, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 4.027491408934708, | |
| "grad_norm": 0.2645151913166046, | |
| "learning_rate": 4.516111510668707e-05, | |
| "loss": 0.1975, | |
| "num_input_tokens_seen": 47502344, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 4.041237113402062, | |
| "grad_norm": 0.39036113023757935, | |
| "learning_rate": 4.5128816214071454e-05, | |
| "loss": 0.2099, | |
| "num_input_tokens_seen": 47668336, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 4.054982817869416, | |
| "grad_norm": 0.43727049231529236, | |
| "learning_rate": 4.509642151543043e-05, | |
| "loss": 0.2489, | |
| "num_input_tokens_seen": 47814888, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 4.0687285223367695, | |
| "grad_norm": 0.23602111637592316, | |
| "learning_rate": 4.5063931164951276e-05, | |
| "loss": 0.1986, | |
| "num_input_tokens_seen": 47969952, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 4.082474226804123, | |
| "grad_norm": 0.24923105537891388, | |
| "learning_rate": 4.503134531727652e-05, | |
| "loss": 0.2115, | |
| "num_input_tokens_seen": 48130808, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 4.096219931271477, | |
| "grad_norm": 0.3742007911205292, | |
| "learning_rate": 4.499866412750324e-05, | |
| "loss": 0.2242, | |
| "num_input_tokens_seen": 48281472, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 4.109965635738831, | |
| "grad_norm": 0.243782177567482, | |
| "learning_rate": 4.496588775118232e-05, | |
| "loss": 0.2135, | |
| "num_input_tokens_seen": 48487112, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 4.123711340206185, | |
| "grad_norm": 0.27415668964385986, | |
| "learning_rate": 4.493301634431768e-05, | |
| "loss": 0.1989, | |
| "num_input_tokens_seen": 48666792, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 4.13745704467354, | |
| "grad_norm": 0.33320093154907227, | |
| "learning_rate": 4.490005006336555e-05, | |
| "loss": 0.2289, | |
| "num_input_tokens_seen": 48835232, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 4.151202749140894, | |
| "grad_norm": 0.3165149390697479, | |
| "learning_rate": 4.486698906523375e-05, | |
| "loss": 0.1998, | |
| "num_input_tokens_seen": 48995760, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 4.164948453608248, | |
| "grad_norm": 0.24121354520320892, | |
| "learning_rate": 4.4833833507280884e-05, | |
| "loss": 0.1768, | |
| "num_input_tokens_seen": 49150720, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 4.178694158075602, | |
| "grad_norm": 0.3779366612434387, | |
| "learning_rate": 4.480058354731566e-05, | |
| "loss": 0.2186, | |
| "num_input_tokens_seen": 49303176, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 4.1924398625429555, | |
| "grad_norm": 0.2691211402416229, | |
| "learning_rate": 4.476723934359609e-05, | |
| "loss": 0.1617, | |
| "num_input_tokens_seen": 49448312, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 4.206185567010309, | |
| "grad_norm": 0.29209020733833313, | |
| "learning_rate": 4.473380105482875e-05, | |
| "loss": 0.2304, | |
| "num_input_tokens_seen": 49602640, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 4.219931271477663, | |
| "grad_norm": 0.2639620006084442, | |
| "learning_rate": 4.4700268840168045e-05, | |
| "loss": 0.209, | |
| "num_input_tokens_seen": 49761432, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 4.233676975945017, | |
| "grad_norm": 0.2334647923707962, | |
| "learning_rate": 4.466664285921542e-05, | |
| "loss": 0.2189, | |
| "num_input_tokens_seen": 49914784, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 4.247422680412371, | |
| "grad_norm": 0.24197332561016083, | |
| "learning_rate": 4.463292327201862e-05, | |
| "loss": 0.1938, | |
| "num_input_tokens_seen": 50103400, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 4.261168384879725, | |
| "grad_norm": 0.23365367949008942, | |
| "learning_rate": 4.459911023907092e-05, | |
| "loss": 0.1832, | |
| "num_input_tokens_seen": 50266424, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 4.274914089347079, | |
| "grad_norm": 0.26379579305648804, | |
| "learning_rate": 4.456520392131035e-05, | |
| "loss": 0.1713, | |
| "num_input_tokens_seen": 50425736, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 4.288659793814433, | |
| "grad_norm": 0.22317898273468018, | |
| "learning_rate": 4.453120448011897e-05, | |
| "loss": 0.1777, | |
| "num_input_tokens_seen": 50586864, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 4.302405498281787, | |
| "grad_norm": 0.2554914653301239, | |
| "learning_rate": 4.4497112077322044e-05, | |
| "loss": 0.214, | |
| "num_input_tokens_seen": 50771744, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 4.316151202749141, | |
| "grad_norm": 0.26939499378204346, | |
| "learning_rate": 4.446292687518734e-05, | |
| "loss": 0.2374, | |
| "num_input_tokens_seen": 50926016, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 4.329896907216495, | |
| "grad_norm": 0.2551642656326294, | |
| "learning_rate": 4.442864903642428e-05, | |
| "loss": 0.1871, | |
| "num_input_tokens_seen": 51081064, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 4.3436426116838485, | |
| "grad_norm": 0.3198559582233429, | |
| "learning_rate": 4.4394278724183215e-05, | |
| "loss": 0.2009, | |
| "num_input_tokens_seen": 51250976, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 4.357388316151202, | |
| "grad_norm": 0.2643042802810669, | |
| "learning_rate": 4.435981610205464e-05, | |
| "loss": 0.2399, | |
| "num_input_tokens_seen": 51381456, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 4.371134020618557, | |
| "grad_norm": 0.2745197117328644, | |
| "learning_rate": 4.4325261334068426e-05, | |
| "loss": 0.2432, | |
| "num_input_tokens_seen": 51516552, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 4.384879725085911, | |
| "grad_norm": 0.26187458634376526, | |
| "learning_rate": 4.4290614584693004e-05, | |
| "loss": 0.2023, | |
| "num_input_tokens_seen": 51665440, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 4.398625429553265, | |
| "grad_norm": 0.22071537375450134, | |
| "learning_rate": 4.425587601883461e-05, | |
| "loss": 0.2391, | |
| "num_input_tokens_seen": 51813416, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 4.412371134020619, | |
| "grad_norm": 0.2484397292137146, | |
| "learning_rate": 4.4221045801836494e-05, | |
| "loss": 0.2503, | |
| "num_input_tokens_seen": 51976712, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 4.426116838487973, | |
| "grad_norm": 0.26108404994010925, | |
| "learning_rate": 4.418612409947813e-05, | |
| "loss": 0.2052, | |
| "num_input_tokens_seen": 52127472, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 4.439862542955327, | |
| "grad_norm": 0.7923046350479126, | |
| "learning_rate": 4.415111107797445e-05, | |
| "loss": 0.2154, | |
| "num_input_tokens_seen": 52313744, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 4.453608247422681, | |
| "grad_norm": 0.2435230314731598, | |
| "learning_rate": 4.411600690397501e-05, | |
| "loss": 0.2231, | |
| "num_input_tokens_seen": 52465024, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 4.4673539518900345, | |
| "grad_norm": 0.302497535943985, | |
| "learning_rate": 4.408081174456322e-05, | |
| "loss": 0.2316, | |
| "num_input_tokens_seen": 52613816, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 4.481099656357388, | |
| "grad_norm": 0.22382588684558868, | |
| "learning_rate": 4.404552576725557e-05, | |
| "loss": 0.2, | |
| "num_input_tokens_seen": 52780920, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 4.494845360824742, | |
| "grad_norm": 0.28364303708076477, | |
| "learning_rate": 4.401014914000078e-05, | |
| "loss": 0.2471, | |
| "num_input_tokens_seen": 52946400, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 4.508591065292096, | |
| "grad_norm": 0.27115166187286377, | |
| "learning_rate": 4.397468203117905e-05, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 53100768, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 4.52233676975945, | |
| "grad_norm": 0.24532680213451385, | |
| "learning_rate": 4.393912460960124e-05, | |
| "loss": 0.2088, | |
| "num_input_tokens_seen": 53277912, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 4.536082474226804, | |
| "grad_norm": 0.2653828263282776, | |
| "learning_rate": 4.3903477044508066e-05, | |
| "loss": 0.1733, | |
| "num_input_tokens_seen": 53415768, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 4.549828178694158, | |
| "grad_norm": 0.26437294483184814, | |
| "learning_rate": 4.386773950556931e-05, | |
| "loss": 0.2361, | |
| "num_input_tokens_seen": 53570440, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 4.563573883161512, | |
| "grad_norm": 0.2845474183559418, | |
| "learning_rate": 4.383191216288294e-05, | |
| "loss": 0.2554, | |
| "num_input_tokens_seen": 53713344, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 4.577319587628866, | |
| "grad_norm": 0.2253095805644989, | |
| "learning_rate": 4.379599518697444e-05, | |
| "loss": 0.2071, | |
| "num_input_tokens_seen": 53914032, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 4.59106529209622, | |
| "grad_norm": 0.26933300495147705, | |
| "learning_rate": 4.375998874879586e-05, | |
| "loss": 0.2078, | |
| "num_input_tokens_seen": 54082168, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 4.6048109965635735, | |
| "grad_norm": 0.2638954818248749, | |
| "learning_rate": 4.372389301972506e-05, | |
| "loss": 0.1879, | |
| "num_input_tokens_seen": 54270240, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 4.618556701030927, | |
| "grad_norm": 0.2769315540790558, | |
| "learning_rate": 4.3687708171564925e-05, | |
| "loss": 0.1885, | |
| "num_input_tokens_seen": 54464880, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 4.632302405498281, | |
| "grad_norm": 0.2243657261133194, | |
| "learning_rate": 4.3651434376542486e-05, | |
| "loss": 0.1635, | |
| "num_input_tokens_seen": 54606368, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 4.646048109965636, | |
| "grad_norm": 0.231032595038414, | |
| "learning_rate": 4.361507180730816e-05, | |
| "loss": 0.2003, | |
| "num_input_tokens_seen": 54777520, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 4.65979381443299, | |
| "grad_norm": 0.23758399486541748, | |
| "learning_rate": 4.357862063693486e-05, | |
| "loss": 0.1621, | |
| "num_input_tokens_seen": 54948888, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 4.673539518900344, | |
| "grad_norm": 0.24598275125026703, | |
| "learning_rate": 4.354208103891723e-05, | |
| "loss": 0.2042, | |
| "num_input_tokens_seen": 55112264, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 4.687285223367698, | |
| "grad_norm": 0.19837449491024017, | |
| "learning_rate": 4.3505453187170805e-05, | |
| "loss": 0.2108, | |
| "num_input_tokens_seen": 55305344, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 4.701030927835052, | |
| "grad_norm": 0.2365684062242508, | |
| "learning_rate": 4.346873725603116e-05, | |
| "loss": 0.193, | |
| "num_input_tokens_seen": 55488464, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 4.714776632302406, | |
| "grad_norm": 0.30427196621894836, | |
| "learning_rate": 4.34319334202531e-05, | |
| "loss": 0.2117, | |
| "num_input_tokens_seen": 55619288, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 4.7285223367697595, | |
| "grad_norm": 0.2726595401763916, | |
| "learning_rate": 4.339504185500984e-05, | |
| "loss": 0.1312, | |
| "num_input_tokens_seen": 55788920, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 4.742268041237113, | |
| "grad_norm": 0.3252059817314148, | |
| "learning_rate": 4.335806273589214e-05, | |
| "loss": 0.2707, | |
| "num_input_tokens_seen": 55944064, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 4.756013745704467, | |
| "grad_norm": 0.24577906727790833, | |
| "learning_rate": 4.332099623890748e-05, | |
| "loss": 0.2351, | |
| "num_input_tokens_seen": 56105648, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 4.769759450171821, | |
| "grad_norm": 0.3615509867668152, | |
| "learning_rate": 4.3283842540479264e-05, | |
| "loss": 0.2433, | |
| "num_input_tokens_seen": 56250752, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 4.783505154639175, | |
| "grad_norm": 0.30944108963012695, | |
| "learning_rate": 4.324660181744589e-05, | |
| "loss": 0.239, | |
| "num_input_tokens_seen": 56397520, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 4.797250859106529, | |
| "grad_norm": 0.24751578271389008, | |
| "learning_rate": 4.3209274247060004e-05, | |
| "loss": 0.2149, | |
| "num_input_tokens_seen": 56568152, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 4.810996563573883, | |
| "grad_norm": 0.33283287286758423, | |
| "learning_rate": 4.3171860006987605e-05, | |
| "loss": 0.2504, | |
| "num_input_tokens_seen": 56734400, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 4.824742268041237, | |
| "grad_norm": 0.2200183868408203, | |
| "learning_rate": 4.313435927530719e-05, | |
| "loss": 0.1806, | |
| "num_input_tokens_seen": 56914952, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 4.838487972508591, | |
| "grad_norm": 0.22224342823028564, | |
| "learning_rate": 4.309677223050895e-05, | |
| "loss": 0.2282, | |
| "num_input_tokens_seen": 57079920, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 4.852233676975945, | |
| "grad_norm": 0.3516889810562134, | |
| "learning_rate": 4.305909905149389e-05, | |
| "loss": 0.1922, | |
| "num_input_tokens_seen": 57253328, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 4.8659793814432994, | |
| "grad_norm": 0.2355610430240631, | |
| "learning_rate": 4.302133991757297e-05, | |
| "loss": 0.2155, | |
| "num_input_tokens_seen": 57394208, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 4.879725085910653, | |
| "grad_norm": 0.22704970836639404, | |
| "learning_rate": 4.2983495008466276e-05, | |
| "loss": 0.2044, | |
| "num_input_tokens_seen": 57580424, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 4.893470790378007, | |
| "grad_norm": 0.2274133712053299, | |
| "learning_rate": 4.294556450430216e-05, | |
| "loss": 0.1964, | |
| "num_input_tokens_seen": 57754136, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 4.907216494845361, | |
| "grad_norm": 0.23982001841068268, | |
| "learning_rate": 4.290754858561637e-05, | |
| "loss": 0.2139, | |
| "num_input_tokens_seen": 57932952, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 4.920962199312715, | |
| "grad_norm": 0.2361903190612793, | |
| "learning_rate": 4.2869447433351165e-05, | |
| "loss": 0.1962, | |
| "num_input_tokens_seen": 58081896, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 4.934707903780069, | |
| "grad_norm": 0.3595963716506958, | |
| "learning_rate": 4.2831261228854544e-05, | |
| "loss": 0.2309, | |
| "num_input_tokens_seen": 58238736, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 4.948453608247423, | |
| "grad_norm": 0.26844820380210876, | |
| "learning_rate": 4.2792990153879284e-05, | |
| "loss": 0.2175, | |
| "num_input_tokens_seen": 58406488, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 4.962199312714777, | |
| "grad_norm": 0.20506052672863007, | |
| "learning_rate": 4.275463439058214e-05, | |
| "loss": 0.1931, | |
| "num_input_tokens_seen": 58580952, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 4.975945017182131, | |
| "grad_norm": 0.28067535161972046, | |
| "learning_rate": 4.271619412152292e-05, | |
| "loss": 0.2098, | |
| "num_input_tokens_seen": 58742312, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 4.989690721649485, | |
| "grad_norm": 0.2571549415588379, | |
| "learning_rate": 4.267766952966369e-05, | |
| "loss": 0.2243, | |
| "num_input_tokens_seen": 58910632, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 5.013745704467354, | |
| "grad_norm": 0.4295743405818939, | |
| "learning_rate": 4.263906079836783e-05, | |
| "loss": 0.3285, | |
| "num_input_tokens_seen": 59167400, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 5.027491408934708, | |
| "grad_norm": 0.30337727069854736, | |
| "learning_rate": 4.260036811139921e-05, | |
| "loss": 0.1803, | |
| "num_input_tokens_seen": 59319088, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 5.041237113402062, | |
| "grad_norm": 0.31406325101852417, | |
| "learning_rate": 4.25615916529213e-05, | |
| "loss": 0.1576, | |
| "num_input_tokens_seen": 59493568, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 5.054982817869416, | |
| "grad_norm": 0.25515082478523254, | |
| "learning_rate": 4.2522731607496275e-05, | |
| "loss": 0.2384, | |
| "num_input_tokens_seen": 59659032, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 5.0687285223367695, | |
| "grad_norm": 0.28770965337753296, | |
| "learning_rate": 4.248378816008418e-05, | |
| "loss": 0.1967, | |
| "num_input_tokens_seen": 59794736, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 5.082474226804123, | |
| "grad_norm": 0.279924601316452, | |
| "learning_rate": 4.244476149604201e-05, | |
| "loss": 0.1535, | |
| "num_input_tokens_seen": 59948600, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 5.096219931271477, | |
| "grad_norm": 0.30893272161483765, | |
| "learning_rate": 4.240565180112284e-05, | |
| "loss": 0.1737, | |
| "num_input_tokens_seen": 60117080, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 5.109965635738831, | |
| "grad_norm": 0.2537633180618286, | |
| "learning_rate": 4.2366459261474933e-05, | |
| "loss": 0.2354, | |
| "num_input_tokens_seen": 60273656, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 5.123711340206185, | |
| "grad_norm": 0.40629541873931885, | |
| "learning_rate": 4.23271840636409e-05, | |
| "loss": 0.1922, | |
| "num_input_tokens_seen": 60431472, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 5.13745704467354, | |
| "grad_norm": 0.23534585535526276, | |
| "learning_rate": 4.228782639455674e-05, | |
| "loss": 0.183, | |
| "num_input_tokens_seen": 60550176, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 5.151202749140894, | |
| "grad_norm": 0.32684120535850525, | |
| "learning_rate": 4.224838644155099e-05, | |
| "loss": 0.2056, | |
| "num_input_tokens_seen": 60685264, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 5.164948453608248, | |
| "grad_norm": 0.29426658153533936, | |
| "learning_rate": 4.220886439234385e-05, | |
| "loss": 0.1795, | |
| "num_input_tokens_seen": 60851704, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 5.178694158075602, | |
| "grad_norm": 0.26738616824150085, | |
| "learning_rate": 4.216926043504625e-05, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 60997448, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 5.1924398625429555, | |
| "grad_norm": 0.27694034576416016, | |
| "learning_rate": 4.212957475815898e-05, | |
| "loss": 0.174, | |
| "num_input_tokens_seen": 61147192, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 5.206185567010309, | |
| "grad_norm": 0.2717275321483612, | |
| "learning_rate": 4.208980755057178e-05, | |
| "loss": 0.2052, | |
| "num_input_tokens_seen": 61314976, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 5.219931271477663, | |
| "grad_norm": 0.23557984828948975, | |
| "learning_rate": 4.2049959001562464e-05, | |
| "loss": 0.1877, | |
| "num_input_tokens_seen": 61486656, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 5.233676975945017, | |
| "grad_norm": 0.28602030873298645, | |
| "learning_rate": 4.201002930079598e-05, | |
| "loss": 0.1989, | |
| "num_input_tokens_seen": 61658608, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 5.247422680412371, | |
| "grad_norm": 0.2592761516571045, | |
| "learning_rate": 4.197001863832355e-05, | |
| "loss": 0.1957, | |
| "num_input_tokens_seen": 61838112, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 5.261168384879725, | |
| "grad_norm": 0.2270824909210205, | |
| "learning_rate": 4.192992720458172e-05, | |
| "loss": 0.1825, | |
| "num_input_tokens_seen": 62002664, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 5.274914089347079, | |
| "grad_norm": 0.25427162647247314, | |
| "learning_rate": 4.188975519039151e-05, | |
| "loss": 0.2111, | |
| "num_input_tokens_seen": 62173688, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 5.288659793814433, | |
| "grad_norm": 0.27077263593673706, | |
| "learning_rate": 4.184950278695745e-05, | |
| "loss": 0.1971, | |
| "num_input_tokens_seen": 62340104, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 5.302405498281787, | |
| "grad_norm": 0.25208866596221924, | |
| "learning_rate": 4.18091701858667e-05, | |
| "loss": 0.1854, | |
| "num_input_tokens_seen": 62495400, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 5.316151202749141, | |
| "grad_norm": 0.2774513065814972, | |
| "learning_rate": 4.176875757908815e-05, | |
| "loss": 0.2067, | |
| "num_input_tokens_seen": 62649864, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 5.329896907216495, | |
| "grad_norm": 0.2898416519165039, | |
| "learning_rate": 4.172826515897146e-05, | |
| "loss": 0.2015, | |
| "num_input_tokens_seen": 62823464, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 5.3436426116838485, | |
| "grad_norm": 0.2488073855638504, | |
| "learning_rate": 4.1687693118246184e-05, | |
| "loss": 0.23, | |
| "num_input_tokens_seen": 62994656, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 5.357388316151202, | |
| "grad_norm": 0.2701277732849121, | |
| "learning_rate": 4.164704165002086e-05, | |
| "loss": 0.2079, | |
| "num_input_tokens_seen": 63150072, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 5.371134020618557, | |
| "grad_norm": 0.2599446475505829, | |
| "learning_rate": 4.1606310947782044e-05, | |
| "loss": 0.1808, | |
| "num_input_tokens_seen": 63289128, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 5.384879725085911, | |
| "grad_norm": 0.28086861968040466, | |
| "learning_rate": 4.1565501205393445e-05, | |
| "loss": 0.1783, | |
| "num_input_tokens_seen": 63455720, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 5.398625429553265, | |
| "grad_norm": 0.2717934250831604, | |
| "learning_rate": 4.1524612617094935e-05, | |
| "loss": 0.1604, | |
| "num_input_tokens_seen": 63631048, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 5.412371134020619, | |
| "grad_norm": 0.2995658218860626, | |
| "learning_rate": 4.148364537750172e-05, | |
| "loss": 0.1964, | |
| "num_input_tokens_seen": 63777400, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 5.426116838487973, | |
| "grad_norm": 0.29845938086509705, | |
| "learning_rate": 4.1442599681603326e-05, | |
| "loss": 0.1796, | |
| "num_input_tokens_seen": 63906328, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 5.439862542955327, | |
| "grad_norm": 0.2695382535457611, | |
| "learning_rate": 4.140147572476268e-05, | |
| "loss": 0.1722, | |
| "num_input_tokens_seen": 64080920, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 5.453608247422681, | |
| "grad_norm": 0.2749643623828888, | |
| "learning_rate": 4.136027370271526e-05, | |
| "loss": 0.1432, | |
| "num_input_tokens_seen": 64283728, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 5.4673539518900345, | |
| "grad_norm": 0.32363370060920715, | |
| "learning_rate": 4.131899381156806e-05, | |
| "loss": 0.2004, | |
| "num_input_tokens_seen": 64442744, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 5.481099656357388, | |
| "grad_norm": 0.23776426911354065, | |
| "learning_rate": 4.127763624779873e-05, | |
| "loss": 0.1651, | |
| "num_input_tokens_seen": 64616136, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 5.494845360824742, | |
| "grad_norm": 0.27142253518104553, | |
| "learning_rate": 4.123620120825459e-05, | |
| "loss": 0.1595, | |
| "num_input_tokens_seen": 64777608, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 5.508591065292096, | |
| "grad_norm": 0.2718152701854706, | |
| "learning_rate": 4.119468889015174e-05, | |
| "loss": 0.1846, | |
| "num_input_tokens_seen": 64944824, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 5.52233676975945, | |
| "grad_norm": 0.24402709305286407, | |
| "learning_rate": 4.11530994910741e-05, | |
| "loss": 0.1957, | |
| "num_input_tokens_seen": 65092144, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 5.536082474226804, | |
| "grad_norm": 0.300231397151947, | |
| "learning_rate": 4.111143320897244e-05, | |
| "loss": 0.2216, | |
| "num_input_tokens_seen": 65252144, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 5.549828178694158, | |
| "grad_norm": 0.2824464738368988, | |
| "learning_rate": 4.1069690242163484e-05, | |
| "loss": 0.1648, | |
| "num_input_tokens_seen": 65420968, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 5.563573883161512, | |
| "grad_norm": 0.27086901664733887, | |
| "learning_rate": 4.102787078932896e-05, | |
| "loss": 0.1768, | |
| "num_input_tokens_seen": 65601320, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 5.577319587628866, | |
| "grad_norm": 0.2992557883262634, | |
| "learning_rate": 4.098597504951462e-05, | |
| "loss": 0.1906, | |
| "num_input_tokens_seen": 65755464, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 5.59106529209622, | |
| "grad_norm": 0.25411731004714966, | |
| "learning_rate": 4.0944003222129337e-05, | |
| "loss": 0.203, | |
| "num_input_tokens_seen": 65953216, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 5.6048109965635735, | |
| "grad_norm": 0.2789401412010193, | |
| "learning_rate": 4.09019555069441e-05, | |
| "loss": 0.192, | |
| "num_input_tokens_seen": 66110944, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 5.618556701030927, | |
| "grad_norm": 0.2699018120765686, | |
| "learning_rate": 4.085983210409114e-05, | |
| "loss": 0.1984, | |
| "num_input_tokens_seen": 66244232, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 5.632302405498281, | |
| "grad_norm": 0.2474309504032135, | |
| "learning_rate": 4.081763321406291e-05, | |
| "loss": 0.1613, | |
| "num_input_tokens_seen": 66409424, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 5.646048109965636, | |
| "grad_norm": 0.2745888829231262, | |
| "learning_rate": 4.0775359037711144e-05, | |
| "loss": 0.2047, | |
| "num_input_tokens_seen": 66586888, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 5.65979381443299, | |
| "grad_norm": 0.24484439194202423, | |
| "learning_rate": 4.073300977624594e-05, | |
| "loss": 0.1837, | |
| "num_input_tokens_seen": 66742096, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 5.673539518900344, | |
| "grad_norm": 0.23629799485206604, | |
| "learning_rate": 4.0690585631234755e-05, | |
| "loss": 0.2377, | |
| "num_input_tokens_seen": 66922120, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 5.687285223367698, | |
| "grad_norm": 0.2751019597053528, | |
| "learning_rate": 4.064808680460148e-05, | |
| "loss": 0.2208, | |
| "num_input_tokens_seen": 67090192, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 5.701030927835052, | |
| "grad_norm": 0.25368085503578186, | |
| "learning_rate": 4.0605513498625445e-05, | |
| "loss": 0.1819, | |
| "num_input_tokens_seen": 67243968, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 5.714776632302406, | |
| "grad_norm": 0.2721220552921295, | |
| "learning_rate": 4.0562865915940496e-05, | |
| "loss": 0.1899, | |
| "num_input_tokens_seen": 67393800, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 5.7285223367697595, | |
| "grad_norm": 0.30302292108535767, | |
| "learning_rate": 4.052014425953399e-05, | |
| "loss": 0.2022, | |
| "num_input_tokens_seen": 67536912, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 5.742268041237113, | |
| "grad_norm": 0.27584776282310486, | |
| "learning_rate": 4.047734873274586e-05, | |
| "loss": 0.2198, | |
| "num_input_tokens_seen": 67693936, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 5.756013745704467, | |
| "grad_norm": 0.2189093977212906, | |
| "learning_rate": 4.043447953926763e-05, | |
| "loss": 0.1778, | |
| "num_input_tokens_seen": 67905904, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 5.769759450171821, | |
| "grad_norm": 0.2949201762676239, | |
| "learning_rate": 4.039153688314145e-05, | |
| "loss": 0.1925, | |
| "num_input_tokens_seen": 68056632, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 5.783505154639175, | |
| "grad_norm": 0.2826312184333801, | |
| "learning_rate": 4.034852096875916e-05, | |
| "loss": 0.2104, | |
| "num_input_tokens_seen": 68203720, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 5.797250859106529, | |
| "grad_norm": 0.27696746587753296, | |
| "learning_rate": 4.030543200086123e-05, | |
| "loss": 0.1766, | |
| "num_input_tokens_seen": 68355256, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 5.810996563573883, | |
| "grad_norm": 0.2535838484764099, | |
| "learning_rate": 4.026227018453587e-05, | |
| "loss": 0.1662, | |
| "num_input_tokens_seen": 68528344, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 5.824742268041237, | |
| "grad_norm": 0.28252360224723816, | |
| "learning_rate": 4.021903572521802e-05, | |
| "loss": 0.1763, | |
| "num_input_tokens_seen": 68697976, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 5.838487972508591, | |
| "grad_norm": 0.31302717328071594, | |
| "learning_rate": 4.017572882868836e-05, | |
| "loss": 0.1875, | |
| "num_input_tokens_seen": 68844432, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 5.852233676975945, | |
| "grad_norm": 0.2511797547340393, | |
| "learning_rate": 4.013234970107236e-05, | |
| "loss": 0.1512, | |
| "num_input_tokens_seen": 68986032, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 5.8659793814432994, | |
| "grad_norm": 0.2649623155593872, | |
| "learning_rate": 4.008889854883929e-05, | |
| "loss": 0.1916, | |
| "num_input_tokens_seen": 69149600, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 5.879725085910653, | |
| "grad_norm": 0.22738507390022278, | |
| "learning_rate": 4.0045375578801214e-05, | |
| "loss": 0.1658, | |
| "num_input_tokens_seen": 69339920, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 5.893470790378007, | |
| "grad_norm": 0.25965777039527893, | |
| "learning_rate": 4.0001780998112026e-05, | |
| "loss": 0.1697, | |
| "num_input_tokens_seen": 69532016, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 5.907216494845361, | |
| "grad_norm": 0.2891072630882263, | |
| "learning_rate": 3.995811501426648e-05, | |
| "loss": 0.1578, | |
| "num_input_tokens_seen": 69686216, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 5.920962199312715, | |
| "grad_norm": 0.3100375533103943, | |
| "learning_rate": 3.991437783509916e-05, | |
| "loss": 0.1982, | |
| "num_input_tokens_seen": 69846408, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 5.934707903780069, | |
| "grad_norm": 0.2592156231403351, | |
| "learning_rate": 3.9870569668783536e-05, | |
| "loss": 0.1946, | |
| "num_input_tokens_seen": 70017088, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 5.948453608247423, | |
| "grad_norm": 0.2801019549369812, | |
| "learning_rate": 3.982669072383093e-05, | |
| "loss": 0.1701, | |
| "num_input_tokens_seen": 70199248, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 5.962199312714777, | |
| "grad_norm": 0.3101609945297241, | |
| "learning_rate": 3.978274120908956e-05, | |
| "loss": 0.1996, | |
| "num_input_tokens_seen": 70354704, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 5.975945017182131, | |
| "grad_norm": 0.3238377571105957, | |
| "learning_rate": 3.9738721333743535e-05, | |
| "loss": 0.1834, | |
| "num_input_tokens_seen": 70507968, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 5.989690721649485, | |
| "grad_norm": 0.3042110204696655, | |
| "learning_rate": 3.969463130731183e-05, | |
| "loss": 0.1874, | |
| "num_input_tokens_seen": 70680688, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 6.013745704467354, | |
| "grad_norm": 0.4883939325809479, | |
| "learning_rate": 3.965047133964735e-05, | |
| "loss": 0.3421, | |
| "num_input_tokens_seen": 70942480, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 6.027491408934708, | |
| "grad_norm": 0.3593299984931946, | |
| "learning_rate": 3.9606241640935864e-05, | |
| "loss": 0.1395, | |
| "num_input_tokens_seen": 71083800, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 6.041237113402062, | |
| "grad_norm": 0.3317866027355194, | |
| "learning_rate": 3.956194242169506e-05, | |
| "loss": 0.1748, | |
| "num_input_tokens_seen": 71223784, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 6.054982817869416, | |
| "grad_norm": 0.29188627004623413, | |
| "learning_rate": 3.9517573892773494e-05, | |
| "loss": 0.1327, | |
| "num_input_tokens_seen": 71375552, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 6.0687285223367695, | |
| "grad_norm": 0.4742688536643982, | |
| "learning_rate": 3.947313626534965e-05, | |
| "loss": 0.1632, | |
| "num_input_tokens_seen": 71561288, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 6.082474226804123, | |
| "grad_norm": 0.2812863886356354, | |
| "learning_rate": 3.942862975093085e-05, | |
| "loss": 0.1591, | |
| "num_input_tokens_seen": 71776512, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 6.096219931271477, | |
| "grad_norm": 0.31736019253730774, | |
| "learning_rate": 3.938405456135231e-05, | |
| "loss": 0.1567, | |
| "num_input_tokens_seen": 71933152, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 6.109965635738831, | |
| "grad_norm": 0.34313711524009705, | |
| "learning_rate": 3.933941090877615e-05, | |
| "loss": 0.2032, | |
| "num_input_tokens_seen": 72085088, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 6.123711340206185, | |
| "grad_norm": 0.2815231382846832, | |
| "learning_rate": 3.9294699005690305e-05, | |
| "loss": 0.1856, | |
| "num_input_tokens_seen": 72233992, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 6.13745704467354, | |
| "grad_norm": 0.3491072952747345, | |
| "learning_rate": 3.924991906490758e-05, | |
| "loss": 0.1875, | |
| "num_input_tokens_seen": 72357440, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 6.151202749140894, | |
| "grad_norm": 0.3489060401916504, | |
| "learning_rate": 3.92050712995646e-05, | |
| "loss": 0.1961, | |
| "num_input_tokens_seen": 72532528, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 6.164948453608248, | |
| "grad_norm": 0.2562100887298584, | |
| "learning_rate": 3.916015592312082e-05, | |
| "loss": 0.1716, | |
| "num_input_tokens_seen": 72675088, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 6.178694158075602, | |
| "grad_norm": 0.3180806040763855, | |
| "learning_rate": 3.911517314935752e-05, | |
| "loss": 0.1869, | |
| "num_input_tokens_seen": 72821048, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 6.1924398625429555, | |
| "grad_norm": 0.27191007137298584, | |
| "learning_rate": 3.907012319237672e-05, | |
| "loss": 0.1922, | |
| "num_input_tokens_seen": 72974112, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 6.206185567010309, | |
| "grad_norm": 0.32862401008605957, | |
| "learning_rate": 3.902500626660025e-05, | |
| "loss": 0.1545, | |
| "num_input_tokens_seen": 73163200, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 6.219931271477663, | |
| "grad_norm": 0.2990303635597229, | |
| "learning_rate": 3.897982258676867e-05, | |
| "loss": 0.1857, | |
| "num_input_tokens_seen": 73332968, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 6.233676975945017, | |
| "grad_norm": 0.24679060280323029, | |
| "learning_rate": 3.893457236794028e-05, | |
| "loss": 0.1717, | |
| "num_input_tokens_seen": 73509064, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 6.247422680412371, | |
| "grad_norm": 0.2459973841905594, | |
| "learning_rate": 3.888925582549006e-05, | |
| "loss": 0.1312, | |
| "num_input_tokens_seen": 73672608, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 6.261168384879725, | |
| "grad_norm": 0.2952354848384857, | |
| "learning_rate": 3.884387317510868e-05, | |
| "loss": 0.163, | |
| "num_input_tokens_seen": 73865352, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 6.274914089347079, | |
| "grad_norm": 0.25799819827079773, | |
| "learning_rate": 3.879842463280145e-05, | |
| "loss": 0.1924, | |
| "num_input_tokens_seen": 74042128, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 6.288659793814433, | |
| "grad_norm": 0.35253551602363586, | |
| "learning_rate": 3.8752910414887336e-05, | |
| "loss": 0.1817, | |
| "num_input_tokens_seen": 74169720, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 6.302405498281787, | |
| "grad_norm": 0.38375216722488403, | |
| "learning_rate": 3.870733073799785e-05, | |
| "loss": 0.1935, | |
| "num_input_tokens_seen": 74333240, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 6.316151202749141, | |
| "grad_norm": 0.2821910083293915, | |
| "learning_rate": 3.8661685819076085e-05, | |
| "loss": 0.1649, | |
| "num_input_tokens_seen": 74485936, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 6.329896907216495, | |
| "grad_norm": 0.3070038855075836, | |
| "learning_rate": 3.861597587537568e-05, | |
| "loss": 0.1628, | |
| "num_input_tokens_seen": 74645576, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 6.3436426116838485, | |
| "grad_norm": 0.29285964369773865, | |
| "learning_rate": 3.857020112445974e-05, | |
| "loss": 0.1581, | |
| "num_input_tokens_seen": 74833272, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 6.357388316151202, | |
| "grad_norm": 0.3231853246688843, | |
| "learning_rate": 3.8524361784199853e-05, | |
| "loss": 0.2051, | |
| "num_input_tokens_seen": 74972240, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 6.371134020618557, | |
| "grad_norm": 0.2808345854282379, | |
| "learning_rate": 3.847845807277502e-05, | |
| "loss": 0.148, | |
| "num_input_tokens_seen": 75145472, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 6.384879725085911, | |
| "grad_norm": 0.31740885972976685, | |
| "learning_rate": 3.84324902086706e-05, | |
| "loss": 0.1526, | |
| "num_input_tokens_seen": 75318096, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 6.398625429553265, | |
| "grad_norm": 0.25884810090065, | |
| "learning_rate": 3.838645841067735e-05, | |
| "loss": 0.1593, | |
| "num_input_tokens_seen": 75510240, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 6.412371134020619, | |
| "grad_norm": 0.26926836371421814, | |
| "learning_rate": 3.834036289789029e-05, | |
| "loss": 0.2143, | |
| "num_input_tokens_seen": 75685352, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 6.426116838487973, | |
| "grad_norm": 0.27780061960220337, | |
| "learning_rate": 3.829420388970771e-05, | |
| "loss": 0.2202, | |
| "num_input_tokens_seen": 75844448, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 6.439862542955327, | |
| "grad_norm": 0.26217731833457947, | |
| "learning_rate": 3.824798160583012e-05, | |
| "loss": 0.1563, | |
| "num_input_tokens_seen": 76025064, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 6.453608247422681, | |
| "grad_norm": 0.27686452865600586, | |
| "learning_rate": 3.82016962662592e-05, | |
| "loss": 0.1627, | |
| "num_input_tokens_seen": 76174480, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 6.4673539518900345, | |
| "grad_norm": 0.30067116022109985, | |
| "learning_rate": 3.8155348091296736e-05, | |
| "loss": 0.1863, | |
| "num_input_tokens_seen": 76316920, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 6.481099656357388, | |
| "grad_norm": 0.257575660943985, | |
| "learning_rate": 3.810893730154361e-05, | |
| "loss": 0.1425, | |
| "num_input_tokens_seen": 76492072, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 6.494845360824742, | |
| "grad_norm": 0.27667343616485596, | |
| "learning_rate": 3.8062464117898724e-05, | |
| "loss": 0.1893, | |
| "num_input_tokens_seen": 76649512, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 6.508591065292096, | |
| "grad_norm": 0.3068046271800995, | |
| "learning_rate": 3.801592876155794e-05, | |
| "loss": 0.1616, | |
| "num_input_tokens_seen": 76815584, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 6.52233676975945, | |
| "grad_norm": 0.24059297144412994, | |
| "learning_rate": 3.796933145401304e-05, | |
| "loss": 0.1737, | |
| "num_input_tokens_seen": 76985112, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 6.536082474226804, | |
| "grad_norm": 0.31432652473449707, | |
| "learning_rate": 3.7922672417050684e-05, | |
| "loss": 0.1657, | |
| "num_input_tokens_seen": 77150624, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 6.549828178694158, | |
| "grad_norm": 0.2746025323867798, | |
| "learning_rate": 3.787595187275136e-05, | |
| "loss": 0.1626, | |
| "num_input_tokens_seen": 77343024, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 6.563573883161512, | |
| "grad_norm": 0.2621474862098694, | |
| "learning_rate": 3.782917004348826e-05, | |
| "loss": 0.2096, | |
| "num_input_tokens_seen": 77526048, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 6.577319587628866, | |
| "grad_norm": 0.32739999890327454, | |
| "learning_rate": 3.77823271519263e-05, | |
| "loss": 0.1992, | |
| "num_input_tokens_seen": 77683952, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 6.59106529209622, | |
| "grad_norm": 0.3026455044746399, | |
| "learning_rate": 3.773542342102105e-05, | |
| "loss": 0.21, | |
| "num_input_tokens_seen": 77863848, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 6.6048109965635735, | |
| "grad_norm": 0.3309582471847534, | |
| "learning_rate": 3.7688459074017606e-05, | |
| "loss": 0.1886, | |
| "num_input_tokens_seen": 78024928, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 6.618556701030927, | |
| "grad_norm": 0.30102699995040894, | |
| "learning_rate": 3.764143433444962e-05, | |
| "loss": 0.1505, | |
| "num_input_tokens_seen": 78181704, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 6.632302405498281, | |
| "grad_norm": 0.2929239869117737, | |
| "learning_rate": 3.759434942613816e-05, | |
| "loss": 0.1626, | |
| "num_input_tokens_seen": 78354112, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 6.646048109965636, | |
| "grad_norm": 0.3340705931186676, | |
| "learning_rate": 3.7547204573190695e-05, | |
| "loss": 0.1604, | |
| "num_input_tokens_seen": 78513064, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 6.65979381443299, | |
| "grad_norm": 0.30690452456474304, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 0.1747, | |
| "num_input_tokens_seen": 78682360, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 6.673539518900344, | |
| "grad_norm": 0.2929702699184418, | |
| "learning_rate": 3.74527359312431e-05, | |
| "loss": 0.1601, | |
| "num_input_tokens_seen": 78834480, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 6.687285223367698, | |
| "grad_norm": 0.3245471119880676, | |
| "learning_rate": 3.7405412591880215e-05, | |
| "loss": 0.1475, | |
| "num_input_tokens_seen": 78979232, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 6.701030927835052, | |
| "grad_norm": 0.27008935809135437, | |
| "learning_rate": 3.7358030207153616e-05, | |
| "loss": 0.1412, | |
| "num_input_tokens_seen": 79134888, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 6.714776632302406, | |
| "grad_norm": 0.31673839688301086, | |
| "learning_rate": 3.731058900258668e-05, | |
| "loss": 0.1292, | |
| "num_input_tokens_seen": 79321520, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 6.7285223367697595, | |
| "grad_norm": 0.35502973198890686, | |
| "learning_rate": 3.72630892039827e-05, | |
| "loss": 0.2106, | |
| "num_input_tokens_seen": 79461152, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 6.742268041237113, | |
| "grad_norm": 0.26527366042137146, | |
| "learning_rate": 3.721553103742388e-05, | |
| "loss": 0.1557, | |
| "num_input_tokens_seen": 79631552, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 6.756013745704467, | |
| "grad_norm": 0.3058886229991913, | |
| "learning_rate": 3.7167914729270206e-05, | |
| "loss": 0.1434, | |
| "num_input_tokens_seen": 79809184, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 6.769759450171821, | |
| "grad_norm": 0.34565672278404236, | |
| "learning_rate": 3.712024050615843e-05, | |
| "loss": 0.1756, | |
| "num_input_tokens_seen": 79963752, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 6.783505154639175, | |
| "grad_norm": 0.29767486453056335, | |
| "learning_rate": 3.707250859500093e-05, | |
| "loss": 0.1672, | |
| "num_input_tokens_seen": 80109432, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 6.797250859106529, | |
| "grad_norm": 0.3332200050354004, | |
| "learning_rate": 3.702471922298469e-05, | |
| "loss": 0.1916, | |
| "num_input_tokens_seen": 80274160, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 6.810996563573883, | |
| "grad_norm": 0.3192325532436371, | |
| "learning_rate": 3.697687261757016e-05, | |
| "loss": 0.1889, | |
| "num_input_tokens_seen": 80427184, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 6.824742268041237, | |
| "grad_norm": 0.29341429471969604, | |
| "learning_rate": 3.692896900649021e-05, | |
| "loss": 0.1533, | |
| "num_input_tokens_seen": 80561448, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 6.838487972508591, | |
| "grad_norm": 0.26884809136390686, | |
| "learning_rate": 3.688100861774904e-05, | |
| "loss": 0.1957, | |
| "num_input_tokens_seen": 80729216, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 6.852233676975945, | |
| "grad_norm": 0.28385940194129944, | |
| "learning_rate": 3.6832991679621086e-05, | |
| "loss": 0.1642, | |
| "num_input_tokens_seen": 80898392, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 6.8659793814432994, | |
| "grad_norm": 0.25916504859924316, | |
| "learning_rate": 3.678491842064995e-05, | |
| "loss": 0.153, | |
| "num_input_tokens_seen": 81044576, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 6.879725085910653, | |
| "grad_norm": 0.28135421872138977, | |
| "learning_rate": 3.673678906964727e-05, | |
| "loss": 0.1848, | |
| "num_input_tokens_seen": 81188504, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 6.893470790378007, | |
| "grad_norm": 0.3360958695411682, | |
| "learning_rate": 3.668860385569171e-05, | |
| "loss": 0.1695, | |
| "num_input_tokens_seen": 81329304, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 6.907216494845361, | |
| "grad_norm": 0.2755764126777649, | |
| "learning_rate": 3.6640363008127784e-05, | |
| "loss": 0.1539, | |
| "num_input_tokens_seen": 81486096, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 6.920962199312715, | |
| "grad_norm": 0.2774079442024231, | |
| "learning_rate": 3.6592066756564826e-05, | |
| "loss": 0.1666, | |
| "num_input_tokens_seen": 81665840, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 6.934707903780069, | |
| "grad_norm": 0.25408604741096497, | |
| "learning_rate": 3.654371533087586e-05, | |
| "loss": 0.1491, | |
| "num_input_tokens_seen": 81845936, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 6.948453608247423, | |
| "grad_norm": 0.295030802488327, | |
| "learning_rate": 3.64953089611965e-05, | |
| "loss": 0.1584, | |
| "num_input_tokens_seen": 81979056, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 6.962199312714777, | |
| "grad_norm": 0.30526483058929443, | |
| "learning_rate": 3.644684787792392e-05, | |
| "loss": 0.1408, | |
| "num_input_tokens_seen": 82148144, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 6.975945017182131, | |
| "grad_norm": 0.25207042694091797, | |
| "learning_rate": 3.639833231171569e-05, | |
| "loss": 0.1709, | |
| "num_input_tokens_seen": 82332864, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 6.989690721649485, | |
| "grad_norm": 0.29031825065612793, | |
| "learning_rate": 3.634976249348867e-05, | |
| "loss": 0.16, | |
| "num_input_tokens_seen": 82494000, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 7.013745704467354, | |
| "grad_norm": 0.4886781871318817, | |
| "learning_rate": 3.6301138654418e-05, | |
| "loss": 0.3974, | |
| "num_input_tokens_seen": 82755776, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 7.027491408934708, | |
| "grad_norm": 0.32394877076148987, | |
| "learning_rate": 3.625246102593588e-05, | |
| "loss": 0.1591, | |
| "num_input_tokens_seen": 82917800, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 7.041237113402062, | |
| "grad_norm": 0.28803592920303345, | |
| "learning_rate": 3.620372983973057e-05, | |
| "loss": 0.1491, | |
| "num_input_tokens_seen": 83077392, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 7.054982817869416, | |
| "grad_norm": 0.3031829595565796, | |
| "learning_rate": 3.615494532774522e-05, | |
| "loss": 0.1898, | |
| "num_input_tokens_seen": 83202240, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 7.0687285223367695, | |
| "grad_norm": 0.2681405246257782, | |
| "learning_rate": 3.610610772217682e-05, | |
| "loss": 0.1468, | |
| "num_input_tokens_seen": 83373472, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 7.082474226804123, | |
| "grad_norm": 0.2581174373626709, | |
| "learning_rate": 3.6057217255475034e-05, | |
| "loss": 0.1547, | |
| "num_input_tokens_seen": 83529264, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 7.096219931271477, | |
| "grad_norm": 0.2969578206539154, | |
| "learning_rate": 3.600827416034115e-05, | |
| "loss": 0.1599, | |
| "num_input_tokens_seen": 83687784, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 7.109965635738831, | |
| "grad_norm": 0.2711568772792816, | |
| "learning_rate": 3.5959278669726935e-05, | |
| "loss": 0.1507, | |
| "num_input_tokens_seen": 83829112, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 7.123711340206185, | |
| "grad_norm": 0.29683706164360046, | |
| "learning_rate": 3.591023101683355e-05, | |
| "loss": 0.1838, | |
| "num_input_tokens_seen": 83989064, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 7.13745704467354, | |
| "grad_norm": 0.26808294653892517, | |
| "learning_rate": 3.586113143511043e-05, | |
| "loss": 0.1575, | |
| "num_input_tokens_seen": 84192320, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 7.151202749140894, | |
| "grad_norm": 0.2626827657222748, | |
| "learning_rate": 3.5811980158254155e-05, | |
| "loss": 0.1738, | |
| "num_input_tokens_seen": 84349080, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 7.164948453608248, | |
| "grad_norm": 0.2749732732772827, | |
| "learning_rate": 3.576277742020738e-05, | |
| "loss": 0.1311, | |
| "num_input_tokens_seen": 84497672, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 7.178694158075602, | |
| "grad_norm": 0.2715819776058197, | |
| "learning_rate": 3.571352345515768e-05, | |
| "loss": 0.1428, | |
| "num_input_tokens_seen": 84670168, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 7.1924398625429555, | |
| "grad_norm": 0.2685604393482208, | |
| "learning_rate": 3.566421849753646e-05, | |
| "loss": 0.1332, | |
| "num_input_tokens_seen": 84844968, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 7.206185567010309, | |
| "grad_norm": 0.28202638030052185, | |
| "learning_rate": 3.5614862782017835e-05, | |
| "loss": 0.1291, | |
| "num_input_tokens_seen": 84979224, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 7.219931271477663, | |
| "grad_norm": 0.2904205620288849, | |
| "learning_rate": 3.556545654351749e-05, | |
| "loss": 0.1347, | |
| "num_input_tokens_seen": 85145648, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 7.233676975945017, | |
| "grad_norm": 0.28707465529441833, | |
| "learning_rate": 3.551600001719161e-05, | |
| "loss": 0.1566, | |
| "num_input_tokens_seen": 85292816, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 7.247422680412371, | |
| "grad_norm": 0.23132464289665222, | |
| "learning_rate": 3.54664934384357e-05, | |
| "loss": 0.1377, | |
| "num_input_tokens_seen": 85472472, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 7.261168384879725, | |
| "grad_norm": 0.25163745880126953, | |
| "learning_rate": 3.541693704288355e-05, | |
| "loss": 0.1601, | |
| "num_input_tokens_seen": 85652544, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 7.274914089347079, | |
| "grad_norm": 0.294009268283844, | |
| "learning_rate": 3.536733106640598e-05, | |
| "loss": 0.1667, | |
| "num_input_tokens_seen": 85798064, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 7.288659793814433, | |
| "grad_norm": 0.3121618926525116, | |
| "learning_rate": 3.5317675745109866e-05, | |
| "loss": 0.1957, | |
| "num_input_tokens_seen": 85965472, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 7.302405498281787, | |
| "grad_norm": 0.4132276475429535, | |
| "learning_rate": 3.526797131533693e-05, | |
| "loss": 0.1658, | |
| "num_input_tokens_seen": 86120328, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 7.316151202749141, | |
| "grad_norm": 0.2591868042945862, | |
| "learning_rate": 3.5218218013662625e-05, | |
| "loss": 0.146, | |
| "num_input_tokens_seen": 86287208, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 7.329896907216495, | |
| "grad_norm": 0.26948240399360657, | |
| "learning_rate": 3.516841607689501e-05, | |
| "loss": 0.138, | |
| "num_input_tokens_seen": 86433152, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 7.3436426116838485, | |
| "grad_norm": 0.2934092879295349, | |
| "learning_rate": 3.5118565742073636e-05, | |
| "loss": 0.1638, | |
| "num_input_tokens_seen": 86593680, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 7.357388316151202, | |
| "grad_norm": 0.27545493841171265, | |
| "learning_rate": 3.5068667246468436e-05, | |
| "loss": 0.1654, | |
| "num_input_tokens_seen": 86762536, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 7.371134020618557, | |
| "grad_norm": 0.33026641607284546, | |
| "learning_rate": 3.5018720827578524e-05, | |
| "loss": 0.1655, | |
| "num_input_tokens_seen": 86895232, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 7.384879725085911, | |
| "grad_norm": 0.3183770477771759, | |
| "learning_rate": 3.496872672313116e-05, | |
| "loss": 0.1681, | |
| "num_input_tokens_seen": 87043088, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 7.398625429553265, | |
| "grad_norm": 0.29758843779563904, | |
| "learning_rate": 3.491868517108053e-05, | |
| "loss": 0.1289, | |
| "num_input_tokens_seen": 87214024, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 7.412371134020619, | |
| "grad_norm": 0.2753027379512787, | |
| "learning_rate": 3.486859640960668e-05, | |
| "loss": 0.1414, | |
| "num_input_tokens_seen": 87395120, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 7.426116838487973, | |
| "grad_norm": 0.27710992097854614, | |
| "learning_rate": 3.481846067711435e-05, | |
| "loss": 0.1428, | |
| "num_input_tokens_seen": 87557888, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 7.439862542955327, | |
| "grad_norm": 0.27963539958000183, | |
| "learning_rate": 3.476827821223184e-05, | |
| "loss": 0.1702, | |
| "num_input_tokens_seen": 87730488, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 7.453608247422681, | |
| "grad_norm": 0.32884612679481506, | |
| "learning_rate": 3.4718049253809895e-05, | |
| "loss": 0.185, | |
| "num_input_tokens_seen": 87885576, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 7.4673539518900345, | |
| "grad_norm": 0.3076169192790985, | |
| "learning_rate": 3.466777404092052e-05, | |
| "loss": 0.1527, | |
| "num_input_tokens_seen": 88036152, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 7.481099656357388, | |
| "grad_norm": 0.34016454219818115, | |
| "learning_rate": 3.461745281285591e-05, | |
| "loss": 0.1824, | |
| "num_input_tokens_seen": 88215088, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 7.494845360824742, | |
| "grad_norm": 0.3335823118686676, | |
| "learning_rate": 3.456708580912725e-05, | |
| "loss": 0.1848, | |
| "num_input_tokens_seen": 88345112, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 7.508591065292096, | |
| "grad_norm": 0.3280513882637024, | |
| "learning_rate": 3.451667326946362e-05, | |
| "loss": 0.147, | |
| "num_input_tokens_seen": 88507760, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 7.52233676975945, | |
| "grad_norm": 0.3153906762599945, | |
| "learning_rate": 3.446621543381083e-05, | |
| "loss": 0.1516, | |
| "num_input_tokens_seen": 88649312, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 7.536082474226804, | |
| "grad_norm": 0.2794785499572754, | |
| "learning_rate": 3.441571254233027e-05, | |
| "loss": 0.1441, | |
| "num_input_tokens_seen": 88830944, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 7.549828178694158, | |
| "grad_norm": 0.2983139455318451, | |
| "learning_rate": 3.436516483539781e-05, | |
| "loss": 0.1356, | |
| "num_input_tokens_seen": 88997624, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 7.563573883161512, | |
| "grad_norm": 0.28691616654396057, | |
| "learning_rate": 3.4314572553602576e-05, | |
| "loss": 0.1473, | |
| "num_input_tokens_seen": 89169784, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 7.577319587628866, | |
| "grad_norm": 0.36932337284088135, | |
| "learning_rate": 3.426393593774591e-05, | |
| "loss": 0.1867, | |
| "num_input_tokens_seen": 89326344, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 7.59106529209622, | |
| "grad_norm": 0.2710317075252533, | |
| "learning_rate": 3.421325522884013e-05, | |
| "loss": 0.1479, | |
| "num_input_tokens_seen": 89513384, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 7.6048109965635735, | |
| "grad_norm": 0.28699591755867004, | |
| "learning_rate": 3.4162530668107434e-05, | |
| "loss": 0.1909, | |
| "num_input_tokens_seen": 89693912, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 7.618556701030927, | |
| "grad_norm": 0.3139185309410095, | |
| "learning_rate": 3.411176249697875e-05, | |
| "loss": 0.1692, | |
| "num_input_tokens_seen": 89842392, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 7.632302405498281, | |
| "grad_norm": 0.2990357279777527, | |
| "learning_rate": 3.406095095709254e-05, | |
| "loss": 0.1696, | |
| "num_input_tokens_seen": 89986960, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 7.646048109965636, | |
| "grad_norm": 0.2928309142589569, | |
| "learning_rate": 3.4010096290293747e-05, | |
| "loss": 0.1357, | |
| "num_input_tokens_seen": 90141432, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 7.65979381443299, | |
| "grad_norm": 0.306242972612381, | |
| "learning_rate": 3.39591987386325e-05, | |
| "loss": 0.1659, | |
| "num_input_tokens_seen": 90291056, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 7.673539518900344, | |
| "grad_norm": 0.30208972096443176, | |
| "learning_rate": 3.390825854436314e-05, | |
| "loss": 0.1647, | |
| "num_input_tokens_seen": 90474680, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 7.687285223367698, | |
| "grad_norm": 0.30466949939727783, | |
| "learning_rate": 3.3857275949942893e-05, | |
| "loss": 0.1936, | |
| "num_input_tokens_seen": 90627248, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 7.701030927835052, | |
| "grad_norm": 0.2880082428455353, | |
| "learning_rate": 3.380625119803084e-05, | |
| "loss": 0.1168, | |
| "num_input_tokens_seen": 90788624, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 7.714776632302406, | |
| "grad_norm": 0.29013437032699585, | |
| "learning_rate": 3.375518453148669e-05, | |
| "loss": 0.165, | |
| "num_input_tokens_seen": 90956016, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 7.7285223367697595, | |
| "grad_norm": 0.23719368875026703, | |
| "learning_rate": 3.370407619336966e-05, | |
| "loss": 0.1261, | |
| "num_input_tokens_seen": 91134024, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 7.742268041237113, | |
| "grad_norm": 0.3531164526939392, | |
| "learning_rate": 3.365292642693732e-05, | |
| "loss": 0.1674, | |
| "num_input_tokens_seen": 91288704, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 7.756013745704467, | |
| "grad_norm": 0.297727108001709, | |
| "learning_rate": 3.360173547564442e-05, | |
| "loss": 0.1508, | |
| "num_input_tokens_seen": 91447040, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 7.769759450171821, | |
| "grad_norm": 0.35359999537467957, | |
| "learning_rate": 3.355050358314172e-05, | |
| "loss": 0.1491, | |
| "num_input_tokens_seen": 91599136, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 7.783505154639175, | |
| "grad_norm": 0.33164218068122864, | |
| "learning_rate": 3.3499230993274854e-05, | |
| "loss": 0.1475, | |
| "num_input_tokens_seen": 91762000, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 7.797250859106529, | |
| "grad_norm": 0.30343785881996155, | |
| "learning_rate": 3.344791795008318e-05, | |
| "loss": 0.183, | |
| "num_input_tokens_seen": 91925384, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 7.810996563573883, | |
| "grad_norm": 0.37099689245224, | |
| "learning_rate": 3.339656469779856e-05, | |
| "loss": 0.1197, | |
| "num_input_tokens_seen": 92110504, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 7.824742268041237, | |
| "grad_norm": 0.27034127712249756, | |
| "learning_rate": 3.3345171480844275e-05, | |
| "loss": 0.1648, | |
| "num_input_tokens_seen": 92282280, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 7.838487972508591, | |
| "grad_norm": 0.45039281249046326, | |
| "learning_rate": 3.329373854383381e-05, | |
| "loss": 0.1504, | |
| "num_input_tokens_seen": 92437376, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 7.852233676975945, | |
| "grad_norm": 0.2738780379295349, | |
| "learning_rate": 3.324226613156968e-05, | |
| "loss": 0.1386, | |
| "num_input_tokens_seen": 92624952, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 7.8659793814432994, | |
| "grad_norm": 0.30165544152259827, | |
| "learning_rate": 3.319075448904234e-05, | |
| "loss": 0.2132, | |
| "num_input_tokens_seen": 92792072, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 7.879725085910653, | |
| "grad_norm": 0.28674596548080444, | |
| "learning_rate": 3.313920386142892e-05, | |
| "loss": 0.167, | |
| "num_input_tokens_seen": 92946888, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 7.893470790378007, | |
| "grad_norm": 0.3369404971599579, | |
| "learning_rate": 3.308761449409213e-05, | |
| "loss": 0.1624, | |
| "num_input_tokens_seen": 93079184, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 7.907216494845361, | |
| "grad_norm": 0.2932247817516327, | |
| "learning_rate": 3.303598663257904e-05, | |
| "loss": 0.1834, | |
| "num_input_tokens_seen": 93247904, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 7.920962199312715, | |
| "grad_norm": 0.33443498611450195, | |
| "learning_rate": 3.298432052261998e-05, | |
| "loss": 0.1494, | |
| "num_input_tokens_seen": 93382360, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 7.934707903780069, | |
| "grad_norm": 0.33472296595573425, | |
| "learning_rate": 3.293261641012731e-05, | |
| "loss": 0.119, | |
| "num_input_tokens_seen": 93589320, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 7.948453608247423, | |
| "grad_norm": 0.3345872759819031, | |
| "learning_rate": 3.288087454119425e-05, | |
| "loss": 0.1651, | |
| "num_input_tokens_seen": 93752096, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 7.962199312714777, | |
| "grad_norm": 0.31766408681869507, | |
| "learning_rate": 3.2829095162093734e-05, | |
| "loss": 0.1375, | |
| "num_input_tokens_seen": 93927672, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 7.975945017182131, | |
| "grad_norm": 0.31698375940322876, | |
| "learning_rate": 3.277727851927727e-05, | |
| "loss": 0.1431, | |
| "num_input_tokens_seen": 94102968, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 7.989690721649485, | |
| "grad_norm": 0.30808258056640625, | |
| "learning_rate": 3.272542485937369e-05, | |
| "loss": 0.1119, | |
| "num_input_tokens_seen": 94263656, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 8.013745704467354, | |
| "grad_norm": 0.43731462955474854, | |
| "learning_rate": 3.2673534429188e-05, | |
| "loss": 0.295, | |
| "num_input_tokens_seen": 94517968, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 8.027491408934708, | |
| "grad_norm": 0.29076042771339417, | |
| "learning_rate": 3.2621607475700275e-05, | |
| "loss": 0.1113, | |
| "num_input_tokens_seen": 94683288, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 8.041237113402062, | |
| "grad_norm": 0.3487173914909363, | |
| "learning_rate": 3.2569644246064366e-05, | |
| "loss": 0.1625, | |
| "num_input_tokens_seen": 94816392, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 8.054982817869416, | |
| "grad_norm": 0.26258090138435364, | |
| "learning_rate": 3.251764498760683e-05, | |
| "loss": 0.1342, | |
| "num_input_tokens_seen": 94956176, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 8.06872852233677, | |
| "grad_norm": 0.285749614238739, | |
| "learning_rate": 3.2465609947825695e-05, | |
| "loss": 0.1419, | |
| "num_input_tokens_seen": 95122088, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 8.082474226804123, | |
| "grad_norm": 0.27495115995407104, | |
| "learning_rate": 3.241353937438927e-05, | |
| "loss": 0.1265, | |
| "num_input_tokens_seen": 95268872, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 8.096219931271477, | |
| "grad_norm": 0.2916254997253418, | |
| "learning_rate": 3.2361433515135056e-05, | |
| "loss": 0.1501, | |
| "num_input_tokens_seen": 95429312, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 8.109965635738831, | |
| "grad_norm": 0.34390848875045776, | |
| "learning_rate": 3.230929261806842e-05, | |
| "loss": 0.1823, | |
| "num_input_tokens_seen": 95552336, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 8.123711340206185, | |
| "grad_norm": 0.2918390929698944, | |
| "learning_rate": 3.225711693136156e-05, | |
| "loss": 0.154, | |
| "num_input_tokens_seen": 95707360, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 8.137457044673539, | |
| "grad_norm": 0.2951171398162842, | |
| "learning_rate": 3.2204906703352236e-05, | |
| "loss": 0.1367, | |
| "num_input_tokens_seen": 95865528, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 8.151202749140893, | |
| "grad_norm": 0.23622985184192657, | |
| "learning_rate": 3.215266218254261e-05, | |
| "loss": 0.1284, | |
| "num_input_tokens_seen": 96044208, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 8.164948453608247, | |
| "grad_norm": 0.263412743806839, | |
| "learning_rate": 3.210038361759807e-05, | |
| "loss": 0.1243, | |
| "num_input_tokens_seen": 96234232, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 8.1786941580756, | |
| "grad_norm": 0.3040781319141388, | |
| "learning_rate": 3.204807125734604e-05, | |
| "loss": 0.129, | |
| "num_input_tokens_seen": 96376432, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 8.192439862542955, | |
| "grad_norm": 0.2723235785961151, | |
| "learning_rate": 3.1995725350774806e-05, | |
| "loss": 0.1776, | |
| "num_input_tokens_seen": 96555968, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 8.206185567010309, | |
| "grad_norm": 0.26820462942123413, | |
| "learning_rate": 3.194334614703231e-05, | |
| "loss": 0.1303, | |
| "num_input_tokens_seen": 96699592, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 8.219931271477662, | |
| "grad_norm": 0.3210727870464325, | |
| "learning_rate": 3.1890933895424976e-05, | |
| "loss": 0.1739, | |
| "num_input_tokens_seen": 96836688, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 8.233676975945016, | |
| "grad_norm": 0.32428908348083496, | |
| "learning_rate": 3.183848884541656e-05, | |
| "loss": 0.1542, | |
| "num_input_tokens_seen": 96985808, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 8.24742268041237, | |
| "grad_norm": 0.27002865076065063, | |
| "learning_rate": 3.178601124662686e-05, | |
| "loss": 0.1293, | |
| "num_input_tokens_seen": 97162440, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 8.261168384879726, | |
| "grad_norm": 0.27363839745521545, | |
| "learning_rate": 3.173350134883066e-05, | |
| "loss": 0.1263, | |
| "num_input_tokens_seen": 97346856, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 8.27491408934708, | |
| "grad_norm": 0.28412649035453796, | |
| "learning_rate": 3.168095940195642e-05, | |
| "loss": 0.1729, | |
| "num_input_tokens_seen": 97527728, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 8.288659793814434, | |
| "grad_norm": 0.3273032009601593, | |
| "learning_rate": 3.16283856560852e-05, | |
| "loss": 0.1413, | |
| "num_input_tokens_seen": 97705472, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 8.302405498281788, | |
| "grad_norm": 0.34350278973579407, | |
| "learning_rate": 3.157578036144937e-05, | |
| "loss": 0.1861, | |
| "num_input_tokens_seen": 97836504, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 8.316151202749142, | |
| "grad_norm": 0.3030097484588623, | |
| "learning_rate": 3.1523143768431475e-05, | |
| "loss": 0.1125, | |
| "num_input_tokens_seen": 98004624, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 8.329896907216495, | |
| "grad_norm": 0.31564685702323914, | |
| "learning_rate": 3.147047612756302e-05, | |
| "loss": 0.1224, | |
| "num_input_tokens_seen": 98171840, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 8.34364261168385, | |
| "grad_norm": 0.269195556640625, | |
| "learning_rate": 3.14177776895233e-05, | |
| "loss": 0.1192, | |
| "num_input_tokens_seen": 98331152, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 8.357388316151203, | |
| "grad_norm": 0.31241536140441895, | |
| "learning_rate": 3.136504870513819e-05, | |
| "loss": 0.134, | |
| "num_input_tokens_seen": 98493480, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 8.371134020618557, | |
| "grad_norm": 0.25421205163002014, | |
| "learning_rate": 3.131228942537895e-05, | |
| "loss": 0.1446, | |
| "num_input_tokens_seen": 98683296, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 8.384879725085911, | |
| "grad_norm": 0.31273579597473145, | |
| "learning_rate": 3.125950010136104e-05, | |
| "loss": 0.155, | |
| "num_input_tokens_seen": 98858248, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 8.398625429553265, | |
| "grad_norm": 0.3347190022468567, | |
| "learning_rate": 3.120668098434291e-05, | |
| "loss": 0.1432, | |
| "num_input_tokens_seen": 99012984, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 8.412371134020619, | |
| "grad_norm": 0.29279547929763794, | |
| "learning_rate": 3.115383232572483e-05, | |
| "loss": 0.1697, | |
| "num_input_tokens_seen": 99157024, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 8.426116838487973, | |
| "grad_norm": 0.33964666724205017, | |
| "learning_rate": 3.1100954377047663e-05, | |
| "loss": 0.1479, | |
| "num_input_tokens_seen": 99331528, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 8.439862542955327, | |
| "grad_norm": 0.24091492593288422, | |
| "learning_rate": 3.104804738999169e-05, | |
| "loss": 0.1295, | |
| "num_input_tokens_seen": 99522344, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 8.45360824742268, | |
| "grad_norm": 0.2860211730003357, | |
| "learning_rate": 3.0995111616375414e-05, | |
| "loss": 0.1638, | |
| "num_input_tokens_seen": 99677176, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 8.467353951890034, | |
| "grad_norm": 0.298226535320282, | |
| "learning_rate": 3.094214730815433e-05, | |
| "loss": 0.1468, | |
| "num_input_tokens_seen": 99844360, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 8.481099656357388, | |
| "grad_norm": 0.326921671628952, | |
| "learning_rate": 3.088915471741976e-05, | |
| "loss": 0.1504, | |
| "num_input_tokens_seen": 99990800, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 8.494845360824742, | |
| "grad_norm": 0.2758296728134155, | |
| "learning_rate": 3.083613409639764e-05, | |
| "loss": 0.1273, | |
| "num_input_tokens_seen": 100166976, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 8.508591065292096, | |
| "grad_norm": 0.350284218788147, | |
| "learning_rate": 3.078308569744732e-05, | |
| "loss": 0.1508, | |
| "num_input_tokens_seen": 100335016, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 8.52233676975945, | |
| "grad_norm": 0.2879723012447357, | |
| "learning_rate": 3.073000977306036e-05, | |
| "loss": 0.1559, | |
| "num_input_tokens_seen": 100468768, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 8.536082474226804, | |
| "grad_norm": 0.324969619512558, | |
| "learning_rate": 3.0676906575859334e-05, | |
| "loss": 0.1529, | |
| "num_input_tokens_seen": 100652144, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 8.549828178694158, | |
| "grad_norm": 0.395110547542572, | |
| "learning_rate": 3.062377635859663e-05, | |
| "loss": 0.1718, | |
| "num_input_tokens_seen": 100825568, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 8.563573883161512, | |
| "grad_norm": 0.2902062237262726, | |
| "learning_rate": 3.057061937415323e-05, | |
| "loss": 0.141, | |
| "num_input_tokens_seen": 100985432, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 8.577319587628866, | |
| "grad_norm": 0.39724087715148926, | |
| "learning_rate": 3.0517435875537536e-05, | |
| "loss": 0.142, | |
| "num_input_tokens_seen": 101118056, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 8.59106529209622, | |
| "grad_norm": 0.32123881578445435, | |
| "learning_rate": 3.0464226115884116e-05, | |
| "loss": 0.1223, | |
| "num_input_tokens_seen": 101274920, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 8.604810996563574, | |
| "grad_norm": 0.35859179496765137, | |
| "learning_rate": 3.0410990348452573e-05, | |
| "loss": 0.151, | |
| "num_input_tokens_seen": 101460552, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 8.618556701030927, | |
| "grad_norm": 0.4564644694328308, | |
| "learning_rate": 3.035772882662627e-05, | |
| "loss": 0.1251, | |
| "num_input_tokens_seen": 101627008, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 8.632302405498281, | |
| "grad_norm": 0.2856820821762085, | |
| "learning_rate": 3.030444180391116e-05, | |
| "loss": 0.1486, | |
| "num_input_tokens_seen": 101804424, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 8.646048109965635, | |
| "grad_norm": 0.38874202966690063, | |
| "learning_rate": 3.0251129533934562e-05, | |
| "loss": 0.1348, | |
| "num_input_tokens_seen": 101953616, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 8.65979381443299, | |
| "grad_norm": 0.45752501487731934, | |
| "learning_rate": 3.0197792270443982e-05, | |
| "loss": 0.1418, | |
| "num_input_tokens_seen": 102122720, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 8.673539518900343, | |
| "grad_norm": 0.2998276650905609, | |
| "learning_rate": 3.0144430267305872e-05, | |
| "loss": 0.1574, | |
| "num_input_tokens_seen": 102285920, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 8.687285223367697, | |
| "grad_norm": 0.46059149503707886, | |
| "learning_rate": 3.0091043778504436e-05, | |
| "loss": 0.1781, | |
| "num_input_tokens_seen": 102453576, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 8.70103092783505, | |
| "grad_norm": 0.49796387553215027, | |
| "learning_rate": 3.003763305814043e-05, | |
| "loss": 0.1566, | |
| "num_input_tokens_seen": 102644480, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 8.714776632302405, | |
| "grad_norm": 0.3332822918891907, | |
| "learning_rate": 2.9984198360429932e-05, | |
| "loss": 0.1537, | |
| "num_input_tokens_seen": 102808944, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 8.728522336769759, | |
| "grad_norm": 0.4872013032436371, | |
| "learning_rate": 2.9930739939703157e-05, | |
| "loss": 0.1666, | |
| "num_input_tokens_seen": 102964360, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 8.742268041237114, | |
| "grad_norm": 0.3459216356277466, | |
| "learning_rate": 2.9877258050403212e-05, | |
| "loss": 0.1261, | |
| "num_input_tokens_seen": 103127464, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 8.756013745704468, | |
| "grad_norm": 0.27105268836021423, | |
| "learning_rate": 2.9823752947084926e-05, | |
| "loss": 0.1254, | |
| "num_input_tokens_seen": 103308264, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 8.769759450171822, | |
| "grad_norm": 0.5594639182090759, | |
| "learning_rate": 2.9770224884413623e-05, | |
| "loss": 0.1294, | |
| "num_input_tokens_seen": 103487016, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 8.783505154639176, | |
| "grad_norm": 0.4965916872024536, | |
| "learning_rate": 2.9716674117163883e-05, | |
| "loss": 0.153, | |
| "num_input_tokens_seen": 103635352, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 8.79725085910653, | |
| "grad_norm": 0.3914627134799957, | |
| "learning_rate": 2.966310090021837e-05, | |
| "loss": 0.1701, | |
| "num_input_tokens_seen": 103808112, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 8.810996563573884, | |
| "grad_norm": 0.49857333302497864, | |
| "learning_rate": 2.9609505488566587e-05, | |
| "loss": 0.1358, | |
| "num_input_tokens_seen": 103971304, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 8.824742268041238, | |
| "grad_norm": 0.31615906953811646, | |
| "learning_rate": 2.9555888137303695e-05, | |
| "loss": 0.1076, | |
| "num_input_tokens_seen": 104140320, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 8.838487972508592, | |
| "grad_norm": 0.34977391362190247, | |
| "learning_rate": 2.9502249101629247e-05, | |
| "loss": 0.1465, | |
| "num_input_tokens_seen": 104280240, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 8.852233676975946, | |
| "grad_norm": 0.3259906470775604, | |
| "learning_rate": 2.9448588636846046e-05, | |
| "loss": 0.1307, | |
| "num_input_tokens_seen": 104441616, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 8.8659793814433, | |
| "grad_norm": 0.5946896076202393, | |
| "learning_rate": 2.9394906998358868e-05, | |
| "loss": 0.1575, | |
| "num_input_tokens_seen": 104600816, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 8.879725085910653, | |
| "grad_norm": 0.36117005348205566, | |
| "learning_rate": 2.9341204441673266e-05, | |
| "loss": 0.1396, | |
| "num_input_tokens_seen": 104760360, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 8.893470790378007, | |
| "grad_norm": 0.4462480843067169, | |
| "learning_rate": 2.9287481222394357e-05, | |
| "loss": 0.1483, | |
| "num_input_tokens_seen": 104900912, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 8.907216494845361, | |
| "grad_norm": 0.3371461033821106, | |
| "learning_rate": 2.9233737596225613e-05, | |
| "loss": 0.1126, | |
| "num_input_tokens_seen": 105066936, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 8.920962199312715, | |
| "grad_norm": 0.3044605851173401, | |
| "learning_rate": 2.9179973818967643e-05, | |
| "loss": 0.1481, | |
| "num_input_tokens_seen": 105228632, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 8.934707903780069, | |
| "grad_norm": 0.4637247323989868, | |
| "learning_rate": 2.9126190146516942e-05, | |
| "loss": 0.1706, | |
| "num_input_tokens_seen": 105395464, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 8.948453608247423, | |
| "grad_norm": 0.3099256753921509, | |
| "learning_rate": 2.9072386834864724e-05, | |
| "loss": 0.1581, | |
| "num_input_tokens_seen": 105541752, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 8.962199312714777, | |
| "grad_norm": 0.30710747838020325, | |
| "learning_rate": 2.9018564140095657e-05, | |
| "loss": 0.1584, | |
| "num_input_tokens_seen": 105709360, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 8.97594501718213, | |
| "grad_norm": 0.35643064975738525, | |
| "learning_rate": 2.896472231838668e-05, | |
| "loss": 0.1299, | |
| "num_input_tokens_seen": 105901488, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 8.989690721649485, | |
| "grad_norm": 0.33289802074432373, | |
| "learning_rate": 2.8910861626005776e-05, | |
| "loss": 0.1117, | |
| "num_input_tokens_seen": 106061136, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 9.013745704467354, | |
| "grad_norm": 0.6210712194442749, | |
| "learning_rate": 2.8856982319310722e-05, | |
| "loss": 0.2548, | |
| "num_input_tokens_seen": 106320680, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 9.027491408934708, | |
| "grad_norm": 0.25133639574050903, | |
| "learning_rate": 2.8803084654747918e-05, | |
| "loss": 0.1091, | |
| "num_input_tokens_seen": 106479200, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 9.041237113402062, | |
| "grad_norm": 0.2998102605342865, | |
| "learning_rate": 2.8749168888851125e-05, | |
| "loss": 0.1505, | |
| "num_input_tokens_seen": 106639192, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 9.054982817869416, | |
| "grad_norm": 0.29671013355255127, | |
| "learning_rate": 2.8695235278240272e-05, | |
| "loss": 0.1114, | |
| "num_input_tokens_seen": 106812384, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 9.06872852233677, | |
| "grad_norm": 0.2994160056114197, | |
| "learning_rate": 2.8641284079620202e-05, | |
| "loss": 0.1416, | |
| "num_input_tokens_seen": 106981864, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 9.082474226804123, | |
| "grad_norm": 0.3068687319755554, | |
| "learning_rate": 2.858731554977948e-05, | |
| "loss": 0.1286, | |
| "num_input_tokens_seen": 107118984, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 9.096219931271477, | |
| "grad_norm": 0.3013548254966736, | |
| "learning_rate": 2.8533329945589194e-05, | |
| "loss": 0.1273, | |
| "num_input_tokens_seen": 107286656, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 9.109965635738831, | |
| "grad_norm": 0.2906287610530853, | |
| "learning_rate": 2.8479327524001636e-05, | |
| "loss": 0.1342, | |
| "num_input_tokens_seen": 107454328, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 9.123711340206185, | |
| "grad_norm": 0.30069291591644287, | |
| "learning_rate": 2.8425308542049206e-05, | |
| "loss": 0.1438, | |
| "num_input_tokens_seen": 107613824, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 9.137457044673539, | |
| "grad_norm": 0.3309033513069153, | |
| "learning_rate": 2.837127325684308e-05, | |
| "loss": 0.1416, | |
| "num_input_tokens_seen": 107767224, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 9.151202749140893, | |
| "grad_norm": 0.2773381769657135, | |
| "learning_rate": 2.8317221925572056e-05, | |
| "loss": 0.0978, | |
| "num_input_tokens_seen": 107927584, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 9.164948453608247, | |
| "grad_norm": 0.3253799378871918, | |
| "learning_rate": 2.8263154805501297e-05, | |
| "loss": 0.1367, | |
| "num_input_tokens_seen": 108080472, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 9.1786941580756, | |
| "grad_norm": 0.29655560851097107, | |
| "learning_rate": 2.8209072153971112e-05, | |
| "loss": 0.1169, | |
| "num_input_tokens_seen": 108238632, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 9.192439862542955, | |
| "grad_norm": 0.3956676423549652, | |
| "learning_rate": 2.815497422839575e-05, | |
| "loss": 0.1697, | |
| "num_input_tokens_seen": 108369768, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 9.206185567010309, | |
| "grad_norm": 0.3614160716533661, | |
| "learning_rate": 2.8100861286262137e-05, | |
| "loss": 0.1677, | |
| "num_input_tokens_seen": 108524112, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 9.219931271477662, | |
| "grad_norm": 0.33044761419296265, | |
| "learning_rate": 2.8046733585128687e-05, | |
| "loss": 0.121, | |
| "num_input_tokens_seen": 108688264, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 9.233676975945016, | |
| "grad_norm": 0.2850668132305145, | |
| "learning_rate": 2.7992591382624063e-05, | |
| "loss": 0.1231, | |
| "num_input_tokens_seen": 108849776, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 9.24742268041237, | |
| "grad_norm": 0.3155098557472229, | |
| "learning_rate": 2.7938434936445945e-05, | |
| "loss": 0.1177, | |
| "num_input_tokens_seen": 109004464, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 9.261168384879726, | |
| "grad_norm": 0.31230416893959045, | |
| "learning_rate": 2.7884264504359797e-05, | |
| "loss": 0.1459, | |
| "num_input_tokens_seen": 109164240, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 9.27491408934708, | |
| "grad_norm": 0.27618101239204407, | |
| "learning_rate": 2.7830080344197674e-05, | |
| "loss": 0.1125, | |
| "num_input_tokens_seen": 109317432, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 9.288659793814434, | |
| "grad_norm": 0.2933465540409088, | |
| "learning_rate": 2.7775882713856942e-05, | |
| "loss": 0.1395, | |
| "num_input_tokens_seen": 109499824, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 9.302405498281788, | |
| "grad_norm": 0.3569648265838623, | |
| "learning_rate": 2.7721671871299116e-05, | |
| "loss": 0.1227, | |
| "num_input_tokens_seen": 109652288, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 9.316151202749142, | |
| "grad_norm": 0.28516238927841187, | |
| "learning_rate": 2.766744807454857e-05, | |
| "loss": 0.1247, | |
| "num_input_tokens_seen": 109824880, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 9.329896907216495, | |
| "grad_norm": 0.28275778889656067, | |
| "learning_rate": 2.761321158169134e-05, | |
| "loss": 0.1238, | |
| "num_input_tokens_seen": 110003264, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 9.34364261168385, | |
| "grad_norm": 0.3060102164745331, | |
| "learning_rate": 2.7558962650873897e-05, | |
| "loss": 0.1455, | |
| "num_input_tokens_seen": 110157816, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 9.357388316151203, | |
| "grad_norm": 0.3358934819698334, | |
| "learning_rate": 2.7504701540301907e-05, | |
| "loss": 0.1369, | |
| "num_input_tokens_seen": 110318272, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 9.371134020618557, | |
| "grad_norm": 0.26785629987716675, | |
| "learning_rate": 2.7450428508239024e-05, | |
| "loss": 0.1496, | |
| "num_input_tokens_seen": 110488048, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 9.384879725085911, | |
| "grad_norm": 0.28915175795555115, | |
| "learning_rate": 2.7396143813005602e-05, | |
| "loss": 0.125, | |
| "num_input_tokens_seen": 110639240, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 9.398625429553265, | |
| "grad_norm": 0.22255295515060425, | |
| "learning_rate": 2.7341847712977557e-05, | |
| "loss": 0.0917, | |
| "num_input_tokens_seen": 110826488, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 9.412371134020619, | |
| "grad_norm": 0.2752317786216736, | |
| "learning_rate": 2.7287540466585065e-05, | |
| "loss": 0.0929, | |
| "num_input_tokens_seen": 110979344, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 9.426116838487973, | |
| "grad_norm": 0.323539674282074, | |
| "learning_rate": 2.7233222332311343e-05, | |
| "loss": 0.1594, | |
| "num_input_tokens_seen": 111164680, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 9.439862542955327, | |
| "grad_norm": 0.2564748525619507, | |
| "learning_rate": 2.717889356869146e-05, | |
| "loss": 0.1183, | |
| "num_input_tokens_seen": 111350120, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 9.45360824742268, | |
| "grad_norm": 0.3039367198944092, | |
| "learning_rate": 2.7124554434311043e-05, | |
| "loss": 0.1434, | |
| "num_input_tokens_seen": 111523936, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 9.467353951890034, | |
| "grad_norm": 0.28405293822288513, | |
| "learning_rate": 2.7070205187805108e-05, | |
| "loss": 0.1446, | |
| "num_input_tokens_seen": 111699072, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 9.481099656357388, | |
| "grad_norm": 0.28414756059646606, | |
| "learning_rate": 2.7015846087856794e-05, | |
| "loss": 0.1287, | |
| "num_input_tokens_seen": 111885360, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 9.494845360824742, | |
| "grad_norm": 0.2785322666168213, | |
| "learning_rate": 2.6961477393196126e-05, | |
| "loss": 0.1075, | |
| "num_input_tokens_seen": 112043856, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 9.508591065292096, | |
| "grad_norm": 0.30990102887153625, | |
| "learning_rate": 2.6907099362598815e-05, | |
| "loss": 0.1494, | |
| "num_input_tokens_seen": 112212640, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 9.52233676975945, | |
| "grad_norm": 0.31125110387802124, | |
| "learning_rate": 2.6852712254884988e-05, | |
| "loss": 0.131, | |
| "num_input_tokens_seen": 112393528, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 9.536082474226804, | |
| "grad_norm": 0.2579840421676636, | |
| "learning_rate": 2.679831632891799e-05, | |
| "loss": 0.1042, | |
| "num_input_tokens_seen": 112583368, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 9.549828178694158, | |
| "grad_norm": 0.31920382380485535, | |
| "learning_rate": 2.674391184360313e-05, | |
| "loss": 0.1387, | |
| "num_input_tokens_seen": 112718016, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 9.563573883161512, | |
| "grad_norm": 0.2835806906223297, | |
| "learning_rate": 2.668949905788648e-05, | |
| "loss": 0.1108, | |
| "num_input_tokens_seen": 112872696, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 9.577319587628866, | |
| "grad_norm": 0.2723243236541748, | |
| "learning_rate": 2.663507823075358e-05, | |
| "loss": 0.1384, | |
| "num_input_tokens_seen": 113047248, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 9.59106529209622, | |
| "grad_norm": 0.3115779459476471, | |
| "learning_rate": 2.6580649621228266e-05, | |
| "loss": 0.1624, | |
| "num_input_tokens_seen": 113199480, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 9.604810996563574, | |
| "grad_norm": 0.2740122973918915, | |
| "learning_rate": 2.6526213488371427e-05, | |
| "loss": 0.1178, | |
| "num_input_tokens_seen": 113346712, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 9.618556701030927, | |
| "grad_norm": 0.30309829115867615, | |
| "learning_rate": 2.6471770091279724e-05, | |
| "loss": 0.1067, | |
| "num_input_tokens_seen": 113501832, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 9.632302405498281, | |
| "grad_norm": 0.29904693365097046, | |
| "learning_rate": 2.641731968908444e-05, | |
| "loss": 0.1502, | |
| "num_input_tokens_seen": 113676016, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 9.646048109965635, | |
| "grad_norm": 0.3149542808532715, | |
| "learning_rate": 2.6362862540950162e-05, | |
| "loss": 0.1498, | |
| "num_input_tokens_seen": 113828272, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 9.65979381443299, | |
| "grad_norm": 0.32262223958969116, | |
| "learning_rate": 2.63083989060736e-05, | |
| "loss": 0.1244, | |
| "num_input_tokens_seen": 113961872, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 9.673539518900343, | |
| "grad_norm": 0.2663904130458832, | |
| "learning_rate": 2.6253929043682335e-05, | |
| "loss": 0.1258, | |
| "num_input_tokens_seen": 114141600, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 9.687285223367697, | |
| "grad_norm": 0.33275941014289856, | |
| "learning_rate": 2.6199453213033598e-05, | |
| "loss": 0.1635, | |
| "num_input_tokens_seen": 114290272, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 9.70103092783505, | |
| "grad_norm": 0.31957611441612244, | |
| "learning_rate": 2.614497167341302e-05, | |
| "loss": 0.1282, | |
| "num_input_tokens_seen": 114441696, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 9.714776632302405, | |
| "grad_norm": 0.2801245450973511, | |
| "learning_rate": 2.6090484684133404e-05, | |
| "loss": 0.1267, | |
| "num_input_tokens_seen": 114596104, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 9.728522336769759, | |
| "grad_norm": 0.3149997889995575, | |
| "learning_rate": 2.6035992504533492e-05, | |
| "loss": 0.1335, | |
| "num_input_tokens_seen": 114758768, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 9.742268041237114, | |
| "grad_norm": 0.3023546636104584, | |
| "learning_rate": 2.598149539397672e-05, | |
| "loss": 0.1403, | |
| "num_input_tokens_seen": 114930352, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 9.756013745704468, | |
| "grad_norm": 0.26859769225120544, | |
| "learning_rate": 2.592699361185002e-05, | |
| "loss": 0.1191, | |
| "num_input_tokens_seen": 115091072, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 9.769759450171822, | |
| "grad_norm": 0.31525665521621704, | |
| "learning_rate": 2.587248741756253e-05, | |
| "loss": 0.1397, | |
| "num_input_tokens_seen": 115262088, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 9.783505154639176, | |
| "grad_norm": 0.3614005744457245, | |
| "learning_rate": 2.5817977070544407e-05, | |
| "loss": 0.1457, | |
| "num_input_tokens_seen": 115397128, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 9.79725085910653, | |
| "grad_norm": 0.2974054217338562, | |
| "learning_rate": 2.5763462830245572e-05, | |
| "loss": 0.1078, | |
| "num_input_tokens_seen": 115564328, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 9.810996563573884, | |
| "grad_norm": 0.4052600860595703, | |
| "learning_rate": 2.570894495613446e-05, | |
| "loss": 0.1138, | |
| "num_input_tokens_seen": 115730688, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 9.824742268041238, | |
| "grad_norm": 0.26603081822395325, | |
| "learning_rate": 2.5654423707696833e-05, | |
| "loss": 0.1192, | |
| "num_input_tokens_seen": 115922888, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 9.838487972508592, | |
| "grad_norm": 0.28035417199134827, | |
| "learning_rate": 2.5599899344434475e-05, | |
| "loss": 0.1299, | |
| "num_input_tokens_seen": 116087224, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 9.852233676975946, | |
| "grad_norm": 0.3163240849971771, | |
| "learning_rate": 2.5545372125864032e-05, | |
| "loss": 0.1754, | |
| "num_input_tokens_seen": 116247912, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 9.8659793814433, | |
| "grad_norm": 0.24861401319503784, | |
| "learning_rate": 2.5490842311515707e-05, | |
| "loss": 0.1309, | |
| "num_input_tokens_seen": 116437392, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 9.879725085910653, | |
| "grad_norm": 0.2968957722187042, | |
| "learning_rate": 2.5436310160932092e-05, | |
| "loss": 0.1449, | |
| "num_input_tokens_seen": 116600008, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 9.893470790378007, | |
| "grad_norm": 0.328169047832489, | |
| "learning_rate": 2.5381775933666864e-05, | |
| "loss": 0.1347, | |
| "num_input_tokens_seen": 116759440, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 9.907216494845361, | |
| "grad_norm": 0.3450523018836975, | |
| "learning_rate": 2.5327239889283612e-05, | |
| "loss": 0.1744, | |
| "num_input_tokens_seen": 116905760, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 9.920962199312715, | |
| "grad_norm": 0.27799859642982483, | |
| "learning_rate": 2.527270228735456e-05, | |
| "loss": 0.1403, | |
| "num_input_tokens_seen": 117051600, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 9.934707903780069, | |
| "grad_norm": 0.30592623353004456, | |
| "learning_rate": 2.521816338745935e-05, | |
| "loss": 0.1296, | |
| "num_input_tokens_seen": 117207304, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 9.948453608247423, | |
| "grad_norm": 0.33801671862602234, | |
| "learning_rate": 2.5163623449183798e-05, | |
| "loss": 0.1797, | |
| "num_input_tokens_seen": 117370672, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 9.962199312714777, | |
| "grad_norm": 0.2604334056377411, | |
| "learning_rate": 2.5109082732118665e-05, | |
| "loss": 0.1337, | |
| "num_input_tokens_seen": 117532712, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 9.97594501718213, | |
| "grad_norm": 0.2962709367275238, | |
| "learning_rate": 2.5054541495858425e-05, | |
| "loss": 0.1202, | |
| "num_input_tokens_seen": 117680488, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 9.989690721649485, | |
| "grad_norm": 0.39495667815208435, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.14, | |
| "num_input_tokens_seen": 117859344, | |
| "step": 720 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 1440, | |
| "num_input_tokens_seen": 117859344, | |
| "num_train_epochs": 20, | |
| "save_steps": 72, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.942811394669806e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |