Qwen2.5-1.5B-Open-R1-Distill / trainer_state.json
hdong0's picture
Model save
2f3bbec verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4395,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0034129692832764505,
"grad_norm": 2.9234242474780587,
"learning_rate": 9.09090909090909e-07,
"loss": 0.8721,
"num_tokens": 1933925.0,
"step": 5
},
{
"epoch": 0.006825938566552901,
"grad_norm": 1.8660906811263716,
"learning_rate": 2.0454545454545457e-06,
"loss": 0.8356,
"num_tokens": 3759146.0,
"step": 10
},
{
"epoch": 0.010238907849829351,
"grad_norm": 1.6011343042226884,
"learning_rate": 3.1818181818181817e-06,
"loss": 0.8325,
"num_tokens": 5644524.0,
"step": 15
},
{
"epoch": 0.013651877133105802,
"grad_norm": 1.2454863399184872,
"learning_rate": 4.3181818181818185e-06,
"loss": 0.7876,
"num_tokens": 7668808.0,
"step": 20
},
{
"epoch": 0.017064846416382253,
"grad_norm": 0.7868446109328924,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.7459,
"num_tokens": 9513010.0,
"step": 25
},
{
"epoch": 0.020477815699658702,
"grad_norm": 0.5924484737157731,
"learning_rate": 6.59090909090909e-06,
"loss": 0.6947,
"num_tokens": 11368873.0,
"step": 30
},
{
"epoch": 0.023890784982935155,
"grad_norm": 0.5604182612081587,
"learning_rate": 7.727272727272727e-06,
"loss": 0.6899,
"num_tokens": 13219424.0,
"step": 35
},
{
"epoch": 0.027303754266211604,
"grad_norm": 0.4895549616398094,
"learning_rate": 8.863636363636365e-06,
"loss": 0.661,
"num_tokens": 15118063.0,
"step": 40
},
{
"epoch": 0.030716723549488054,
"grad_norm": 0.42034621767587177,
"learning_rate": 1e-05,
"loss": 0.6604,
"num_tokens": 17042190.0,
"step": 45
},
{
"epoch": 0.034129692832764506,
"grad_norm": 0.4180115353544924,
"learning_rate": 1.1136363636363637e-05,
"loss": 0.6408,
"num_tokens": 18906839.0,
"step": 50
},
{
"epoch": 0.03754266211604096,
"grad_norm": 0.38973426726946303,
"learning_rate": 1.2272727272727273e-05,
"loss": 0.626,
"num_tokens": 20791105.0,
"step": 55
},
{
"epoch": 0.040955631399317405,
"grad_norm": 0.3729163992496813,
"learning_rate": 1.340909090909091e-05,
"loss": 0.6319,
"num_tokens": 22641755.0,
"step": 60
},
{
"epoch": 0.04436860068259386,
"grad_norm": 0.43970034712774936,
"learning_rate": 1.4545454545454545e-05,
"loss": 0.6308,
"num_tokens": 24604914.0,
"step": 65
},
{
"epoch": 0.04778156996587031,
"grad_norm": 0.391042984403886,
"learning_rate": 1.5681818181818182e-05,
"loss": 0.6221,
"num_tokens": 26636629.0,
"step": 70
},
{
"epoch": 0.051194539249146756,
"grad_norm": 0.36147517581553934,
"learning_rate": 1.6818181818181818e-05,
"loss": 0.5868,
"num_tokens": 28441260.0,
"step": 75
},
{
"epoch": 0.05460750853242321,
"grad_norm": 0.4193365439715771,
"learning_rate": 1.7954545454545454e-05,
"loss": 0.6088,
"num_tokens": 30417967.0,
"step": 80
},
{
"epoch": 0.05802047781569966,
"grad_norm": 0.4171218491287503,
"learning_rate": 1.9090909090909094e-05,
"loss": 0.6029,
"num_tokens": 32351323.0,
"step": 85
},
{
"epoch": 0.06143344709897611,
"grad_norm": 0.39471843177954063,
"learning_rate": 2.022727272727273e-05,
"loss": 0.6014,
"num_tokens": 34231333.0,
"step": 90
},
{
"epoch": 0.06484641638225255,
"grad_norm": 0.43024779590710627,
"learning_rate": 2.1363636363636362e-05,
"loss": 0.5977,
"num_tokens": 36068371.0,
"step": 95
},
{
"epoch": 0.06825938566552901,
"grad_norm": 0.46974196929259243,
"learning_rate": 2.25e-05,
"loss": 0.609,
"num_tokens": 37961424.0,
"step": 100
},
{
"epoch": 0.07167235494880546,
"grad_norm": 0.49424283775259037,
"learning_rate": 2.3636363636363637e-05,
"loss": 0.6079,
"num_tokens": 39846738.0,
"step": 105
},
{
"epoch": 0.07508532423208192,
"grad_norm": 0.44091065089968234,
"learning_rate": 2.4772727272727277e-05,
"loss": 0.6013,
"num_tokens": 41826860.0,
"step": 110
},
{
"epoch": 0.07849829351535836,
"grad_norm": 0.4784434654985161,
"learning_rate": 2.590909090909091e-05,
"loss": 0.596,
"num_tokens": 43675049.0,
"step": 115
},
{
"epoch": 0.08191126279863481,
"grad_norm": 0.5357275694123541,
"learning_rate": 2.7045454545454545e-05,
"loss": 0.5887,
"num_tokens": 45543403.0,
"step": 120
},
{
"epoch": 0.08532423208191127,
"grad_norm": 0.49911922995763447,
"learning_rate": 2.818181818181818e-05,
"loss": 0.6015,
"num_tokens": 47466657.0,
"step": 125
},
{
"epoch": 0.08873720136518772,
"grad_norm": 0.5036954772376355,
"learning_rate": 2.9318181818181817e-05,
"loss": 0.5923,
"num_tokens": 49369486.0,
"step": 130
},
{
"epoch": 0.09215017064846416,
"grad_norm": 0.4507843677307472,
"learning_rate": 3.0454545454545456e-05,
"loss": 0.5874,
"num_tokens": 51193029.0,
"step": 135
},
{
"epoch": 0.09556313993174062,
"grad_norm": 0.4181777811200571,
"learning_rate": 3.159090909090909e-05,
"loss": 0.5859,
"num_tokens": 53010874.0,
"step": 140
},
{
"epoch": 0.09897610921501707,
"grad_norm": 0.4591933150676979,
"learning_rate": 3.272727272727273e-05,
"loss": 0.5793,
"num_tokens": 54919625.0,
"step": 145
},
{
"epoch": 0.10238907849829351,
"grad_norm": 0.41064630834685967,
"learning_rate": 3.3863636363636364e-05,
"loss": 0.5984,
"num_tokens": 56909889.0,
"step": 150
},
{
"epoch": 0.10580204778156997,
"grad_norm": 0.4595803439510107,
"learning_rate": 3.5e-05,
"loss": 0.5686,
"num_tokens": 58759721.0,
"step": 155
},
{
"epoch": 0.10921501706484642,
"grad_norm": 0.47432883262205655,
"learning_rate": 3.613636363636364e-05,
"loss": 0.576,
"num_tokens": 60650570.0,
"step": 160
},
{
"epoch": 0.11262798634812286,
"grad_norm": 0.4476702414488316,
"learning_rate": 3.7272727272727276e-05,
"loss": 0.56,
"num_tokens": 62662896.0,
"step": 165
},
{
"epoch": 0.11604095563139932,
"grad_norm": 0.4882186117934918,
"learning_rate": 3.840909090909091e-05,
"loss": 0.5721,
"num_tokens": 64564660.0,
"step": 170
},
{
"epoch": 0.11945392491467577,
"grad_norm": 0.4419362623733834,
"learning_rate": 3.954545454545455e-05,
"loss": 0.5772,
"num_tokens": 66485190.0,
"step": 175
},
{
"epoch": 0.12286689419795221,
"grad_norm": 0.5206425917443764,
"learning_rate": 4.068181818181818e-05,
"loss": 0.5914,
"num_tokens": 68426882.0,
"step": 180
},
{
"epoch": 0.12627986348122866,
"grad_norm": 0.4594589383240807,
"learning_rate": 4.181818181818182e-05,
"loss": 0.5807,
"num_tokens": 70332714.0,
"step": 185
},
{
"epoch": 0.1296928327645051,
"grad_norm": 0.6185767143477556,
"learning_rate": 4.295454545454546e-05,
"loss": 0.5801,
"num_tokens": 72252819.0,
"step": 190
},
{
"epoch": 0.13310580204778158,
"grad_norm": 0.5441186147701085,
"learning_rate": 4.409090909090909e-05,
"loss": 0.5543,
"num_tokens": 74221358.0,
"step": 195
},
{
"epoch": 0.13651877133105803,
"grad_norm": 0.5964199268078774,
"learning_rate": 4.522727272727273e-05,
"loss": 0.5801,
"num_tokens": 76160914.0,
"step": 200
},
{
"epoch": 0.13993174061433447,
"grad_norm": 0.5161872417575659,
"learning_rate": 4.636363636363636e-05,
"loss": 0.568,
"num_tokens": 78170639.0,
"step": 205
},
{
"epoch": 0.14334470989761092,
"grad_norm": 0.539803223391811,
"learning_rate": 4.75e-05,
"loss": 0.5717,
"num_tokens": 80152955.0,
"step": 210
},
{
"epoch": 0.14675767918088736,
"grad_norm": 0.49826016397215506,
"learning_rate": 4.863636363636364e-05,
"loss": 0.5699,
"num_tokens": 81973764.0,
"step": 215
},
{
"epoch": 0.15017064846416384,
"grad_norm": 0.4478810201640095,
"learning_rate": 4.9772727272727275e-05,
"loss": 0.5778,
"num_tokens": 83901702.0,
"step": 220
},
{
"epoch": 0.15358361774744028,
"grad_norm": 0.47960005318311366,
"learning_rate": 4.999989808010608e-05,
"loss": 0.5527,
"num_tokens": 85675897.0,
"step": 225
},
{
"epoch": 0.15699658703071673,
"grad_norm": 0.4154208311619921,
"learning_rate": 4.999948403211952e-05,
"loss": 0.5934,
"num_tokens": 87691018.0,
"step": 230
},
{
"epoch": 0.16040955631399317,
"grad_norm": 0.5045441094237715,
"learning_rate": 4.9998751491903514e-05,
"loss": 0.5624,
"num_tokens": 89579835.0,
"step": 235
},
{
"epoch": 0.16382252559726962,
"grad_norm": 0.4828789457579032,
"learning_rate": 4.999770046982755e-05,
"loss": 0.561,
"num_tokens": 91542428.0,
"step": 240
},
{
"epoch": 0.16723549488054607,
"grad_norm": 0.4694103846101932,
"learning_rate": 4.999633098076944e-05,
"loss": 0.5619,
"num_tokens": 93492833.0,
"step": 245
},
{
"epoch": 0.17064846416382254,
"grad_norm": 0.45281850476362134,
"learning_rate": 4.999464304411499e-05,
"loss": 0.5762,
"num_tokens": 95438170.0,
"step": 250
},
{
"epoch": 0.17406143344709898,
"grad_norm": 0.45563179331695774,
"learning_rate": 4.999263668375788e-05,
"loss": 0.5677,
"num_tokens": 97303144.0,
"step": 255
},
{
"epoch": 0.17747440273037543,
"grad_norm": 0.594320538310398,
"learning_rate": 4.999031192809919e-05,
"loss": 0.5609,
"num_tokens": 99383692.0,
"step": 260
},
{
"epoch": 0.18088737201365188,
"grad_norm": 0.4877418400224137,
"learning_rate": 4.998766881004709e-05,
"loss": 0.5661,
"num_tokens": 101357212.0,
"step": 265
},
{
"epoch": 0.18430034129692832,
"grad_norm": 0.5047110175224094,
"learning_rate": 4.998470736701634e-05,
"loss": 0.5607,
"num_tokens": 103223537.0,
"step": 270
},
{
"epoch": 0.18771331058020477,
"grad_norm": 0.42926499615516883,
"learning_rate": 4.9981427640927737e-05,
"loss": 0.5712,
"num_tokens": 105044617.0,
"step": 275
},
{
"epoch": 0.19112627986348124,
"grad_norm": 0.47060617173242353,
"learning_rate": 4.9977829678207565e-05,
"loss": 0.5799,
"num_tokens": 106901687.0,
"step": 280
},
{
"epoch": 0.1945392491467577,
"grad_norm": 0.3918775245841588,
"learning_rate": 4.99739135297869e-05,
"loss": 0.5348,
"num_tokens": 108801293.0,
"step": 285
},
{
"epoch": 0.19795221843003413,
"grad_norm": 0.4423073930617194,
"learning_rate": 4.996967925110093e-05,
"loss": 0.5586,
"num_tokens": 110840758.0,
"step": 290
},
{
"epoch": 0.20136518771331058,
"grad_norm": 0.40496989215921997,
"learning_rate": 4.996512690208813e-05,
"loss": 0.5611,
"num_tokens": 112713541.0,
"step": 295
},
{
"epoch": 0.20477815699658702,
"grad_norm": 0.4624172582770661,
"learning_rate": 4.996025654718942e-05,
"loss": 0.5616,
"num_tokens": 114521504.0,
"step": 300
},
{
"epoch": 0.20819112627986347,
"grad_norm": 0.4157666838153778,
"learning_rate": 4.99550682553473e-05,
"loss": 0.5464,
"num_tokens": 116445027.0,
"step": 305
},
{
"epoch": 0.21160409556313994,
"grad_norm": 0.41867883871171135,
"learning_rate": 4.994956210000481e-05,
"loss": 0.546,
"num_tokens": 118445759.0,
"step": 310
},
{
"epoch": 0.2150170648464164,
"grad_norm": 0.44173080298578826,
"learning_rate": 4.994373815910452e-05,
"loss": 0.5529,
"num_tokens": 120303683.0,
"step": 315
},
{
"epoch": 0.21843003412969283,
"grad_norm": 0.4355421923299714,
"learning_rate": 4.9937596515087434e-05,
"loss": 0.5571,
"num_tokens": 122311693.0,
"step": 320
},
{
"epoch": 0.22184300341296928,
"grad_norm": 0.36598314246434627,
"learning_rate": 4.993113725489179e-05,
"loss": 0.559,
"num_tokens": 124240526.0,
"step": 325
},
{
"epoch": 0.22525597269624573,
"grad_norm": 0.35614304303048105,
"learning_rate": 4.9924360469951894e-05,
"loss": 0.5545,
"num_tokens": 126094524.0,
"step": 330
},
{
"epoch": 0.22866894197952217,
"grad_norm": 0.3812958674545772,
"learning_rate": 4.991726625619675e-05,
"loss": 0.5377,
"num_tokens": 128075174.0,
"step": 335
},
{
"epoch": 0.23208191126279865,
"grad_norm": 0.39183276636151587,
"learning_rate": 4.990985471404874e-05,
"loss": 0.5626,
"num_tokens": 129971056.0,
"step": 340
},
{
"epoch": 0.2354948805460751,
"grad_norm": 0.4268019296022907,
"learning_rate": 4.990212594842222e-05,
"loss": 0.5613,
"num_tokens": 131847305.0,
"step": 345
},
{
"epoch": 0.23890784982935154,
"grad_norm": 0.4201099497432233,
"learning_rate": 4.989408006872199e-05,
"loss": 0.5692,
"num_tokens": 133765982.0,
"step": 350
},
{
"epoch": 0.24232081911262798,
"grad_norm": 0.3970069501554659,
"learning_rate": 4.98857171888418e-05,
"loss": 0.541,
"num_tokens": 135541389.0,
"step": 355
},
{
"epoch": 0.24573378839590443,
"grad_norm": 0.43166762110766976,
"learning_rate": 4.9877037427162664e-05,
"loss": 0.5498,
"num_tokens": 137522281.0,
"step": 360
},
{
"epoch": 0.24914675767918087,
"grad_norm": 0.391722053079388,
"learning_rate": 4.986804090655126e-05,
"loss": 0.5384,
"num_tokens": 139412841.0,
"step": 365
},
{
"epoch": 0.2525597269624573,
"grad_norm": 0.4313722584733286,
"learning_rate": 4.9858727754358156e-05,
"loss": 0.5415,
"num_tokens": 141196084.0,
"step": 370
},
{
"epoch": 0.25597269624573377,
"grad_norm": 0.3759279885998296,
"learning_rate": 4.984909810241598e-05,
"loss": 0.5622,
"num_tokens": 143277757.0,
"step": 375
},
{
"epoch": 0.2593856655290102,
"grad_norm": 0.4247856463406985,
"learning_rate": 4.9839152087037634e-05,
"loss": 0.54,
"num_tokens": 145136704.0,
"step": 380
},
{
"epoch": 0.2627986348122867,
"grad_norm": 0.40601821570497126,
"learning_rate": 4.982888984901427e-05,
"loss": 0.5305,
"num_tokens": 147007396.0,
"step": 385
},
{
"epoch": 0.26621160409556316,
"grad_norm": 0.4310619885316953,
"learning_rate": 4.9818311533613356e-05,
"loss": 0.5472,
"num_tokens": 148940122.0,
"step": 390
},
{
"epoch": 0.2696245733788396,
"grad_norm": 0.45012436714415344,
"learning_rate": 4.9807417290576604e-05,
"loss": 0.5548,
"num_tokens": 150912454.0,
"step": 395
},
{
"epoch": 0.27303754266211605,
"grad_norm": 0.44666759037455367,
"learning_rate": 4.979620727411785e-05,
"loss": 0.5419,
"num_tokens": 152809678.0,
"step": 400
},
{
"epoch": 0.2764505119453925,
"grad_norm": 0.46491094622823037,
"learning_rate": 4.978468164292087e-05,
"loss": 0.5264,
"num_tokens": 154713852.0,
"step": 405
},
{
"epoch": 0.27986348122866894,
"grad_norm": 0.4241223884347823,
"learning_rate": 4.977284056013714e-05,
"loss": 0.5567,
"num_tokens": 156683573.0,
"step": 410
},
{
"epoch": 0.2832764505119454,
"grad_norm": 0.38952457164726123,
"learning_rate": 4.976068419338352e-05,
"loss": 0.5322,
"num_tokens": 158515202.0,
"step": 415
},
{
"epoch": 0.28668941979522183,
"grad_norm": 0.41276839941571136,
"learning_rate": 4.974821271473989e-05,
"loss": 0.554,
"num_tokens": 160433326.0,
"step": 420
},
{
"epoch": 0.2901023890784983,
"grad_norm": 0.46776340260588906,
"learning_rate": 4.973542630074669e-05,
"loss": 0.5379,
"num_tokens": 162331530.0,
"step": 425
},
{
"epoch": 0.2935153583617747,
"grad_norm": 0.3942126322792489,
"learning_rate": 4.9722325132402456e-05,
"loss": 0.5544,
"num_tokens": 164374316.0,
"step": 430
},
{
"epoch": 0.29692832764505117,
"grad_norm": 0.42485340574568903,
"learning_rate": 4.970890939516122e-05,
"loss": 0.5212,
"num_tokens": 166249544.0,
"step": 435
},
{
"epoch": 0.3003412969283277,
"grad_norm": 0.36931812888506377,
"learning_rate": 4.969517927892993e-05,
"loss": 0.5307,
"num_tokens": 168223299.0,
"step": 440
},
{
"epoch": 0.3037542662116041,
"grad_norm": 0.37893051071556805,
"learning_rate": 4.968113497806571e-05,
"loss": 0.5495,
"num_tokens": 170189422.0,
"step": 445
},
{
"epoch": 0.30716723549488056,
"grad_norm": 0.391417342854194,
"learning_rate": 4.966677669137318e-05,
"loss": 0.5273,
"num_tokens": 172132000.0,
"step": 450
},
{
"epoch": 0.310580204778157,
"grad_norm": 0.43359270381089615,
"learning_rate": 4.965210462210153e-05,
"loss": 0.5391,
"num_tokens": 174083393.0,
"step": 455
},
{
"epoch": 0.31399317406143346,
"grad_norm": 0.3814563282884313,
"learning_rate": 4.963711897794177e-05,
"loss": 0.5283,
"num_tokens": 176086331.0,
"step": 460
},
{
"epoch": 0.3174061433447099,
"grad_norm": 0.3646920542467526,
"learning_rate": 4.962181997102371e-05,
"loss": 0.5127,
"num_tokens": 178019887.0,
"step": 465
},
{
"epoch": 0.32081911262798635,
"grad_norm": 0.4136924968720306,
"learning_rate": 4.9606207817912956e-05,
"loss": 0.5394,
"num_tokens": 179917640.0,
"step": 470
},
{
"epoch": 0.3242320819112628,
"grad_norm": 0.3583143442753951,
"learning_rate": 4.95902827396079e-05,
"loss": 0.5432,
"num_tokens": 181861425.0,
"step": 475
},
{
"epoch": 0.32764505119453924,
"grad_norm": 0.36301898613611383,
"learning_rate": 4.957404496153654e-05,
"loss": 0.5331,
"num_tokens": 183746129.0,
"step": 480
},
{
"epoch": 0.3310580204778157,
"grad_norm": 0.4058100440696107,
"learning_rate": 4.95574947135533e-05,
"loss": 0.55,
"num_tokens": 185690559.0,
"step": 485
},
{
"epoch": 0.33447098976109213,
"grad_norm": 0.3963433287989491,
"learning_rate": 4.95406322299358e-05,
"loss": 0.5431,
"num_tokens": 187699370.0,
"step": 490
},
{
"epoch": 0.3378839590443686,
"grad_norm": 0.3358571679241484,
"learning_rate": 4.952345774938151e-05,
"loss": 0.5282,
"num_tokens": 189580008.0,
"step": 495
},
{
"epoch": 0.3412969283276451,
"grad_norm": 0.3876973392915836,
"learning_rate": 4.95059715150044e-05,
"loss": 0.5304,
"num_tokens": 191512142.0,
"step": 500
},
{
"epoch": 0.3447098976109215,
"grad_norm": 0.39675038817247194,
"learning_rate": 4.948817377433145e-05,
"loss": 0.5264,
"num_tokens": 193419599.0,
"step": 505
},
{
"epoch": 0.34812286689419797,
"grad_norm": 0.352402123633437,
"learning_rate": 4.947006477929921e-05,
"loss": 0.5347,
"num_tokens": 195367749.0,
"step": 510
},
{
"epoch": 0.3515358361774744,
"grad_norm": 0.3811400127650845,
"learning_rate": 4.945164478625018e-05,
"loss": 0.5146,
"num_tokens": 197256984.0,
"step": 515
},
{
"epoch": 0.35494880546075086,
"grad_norm": 0.3515851376722721,
"learning_rate": 4.943291405592923e-05,
"loss": 0.524,
"num_tokens": 199215371.0,
"step": 520
},
{
"epoch": 0.3583617747440273,
"grad_norm": 0.36260566573089453,
"learning_rate": 4.9413872853479845e-05,
"loss": 0.5503,
"num_tokens": 201102127.0,
"step": 525
},
{
"epoch": 0.36177474402730375,
"grad_norm": 0.3952389227344783,
"learning_rate": 4.9394521448440445e-05,
"loss": 0.5273,
"num_tokens": 202903048.0,
"step": 530
},
{
"epoch": 0.3651877133105802,
"grad_norm": 0.40864664488422103,
"learning_rate": 4.9374860114740504e-05,
"loss": 0.5286,
"num_tokens": 204769389.0,
"step": 535
},
{
"epoch": 0.36860068259385664,
"grad_norm": 0.33805338721530187,
"learning_rate": 4.9354889130696724e-05,
"loss": 0.5277,
"num_tokens": 206761213.0,
"step": 540
},
{
"epoch": 0.3720136518771331,
"grad_norm": 0.3627209335431525,
"learning_rate": 4.933460877900907e-05,
"loss": 0.524,
"num_tokens": 208772401.0,
"step": 545
},
{
"epoch": 0.37542662116040953,
"grad_norm": 0.3611582489390528,
"learning_rate": 4.931401934675675e-05,
"loss": 0.5239,
"num_tokens": 210585632.0,
"step": 550
},
{
"epoch": 0.378839590443686,
"grad_norm": 0.347893449988355,
"learning_rate": 4.9293121125394203e-05,
"loss": 0.5334,
"num_tokens": 212446145.0,
"step": 555
},
{
"epoch": 0.3822525597269625,
"grad_norm": 0.3819699892037328,
"learning_rate": 4.927191441074692e-05,
"loss": 0.5256,
"num_tokens": 214261738.0,
"step": 560
},
{
"epoch": 0.3856655290102389,
"grad_norm": 0.36539693889703045,
"learning_rate": 4.92503995030073e-05,
"loss": 0.54,
"num_tokens": 216231144.0,
"step": 565
},
{
"epoch": 0.3890784982935154,
"grad_norm": 0.4184658120058824,
"learning_rate": 4.9228576706730355e-05,
"loss": 0.5292,
"num_tokens": 218144024.0,
"step": 570
},
{
"epoch": 0.3924914675767918,
"grad_norm": 0.35963118361907315,
"learning_rate": 4.9206446330829456e-05,
"loss": 0.5286,
"num_tokens": 220108684.0,
"step": 575
},
{
"epoch": 0.39590443686006827,
"grad_norm": 0.3683166909297301,
"learning_rate": 4.9184008688571884e-05,
"loss": 0.5311,
"num_tokens": 222057295.0,
"step": 580
},
{
"epoch": 0.3993174061433447,
"grad_norm": 0.3340559497592815,
"learning_rate": 4.9161264097574483e-05,
"loss": 0.5167,
"num_tokens": 223892351.0,
"step": 585
},
{
"epoch": 0.40273037542662116,
"grad_norm": 0.33966650506262697,
"learning_rate": 4.913821287979909e-05,
"loss": 0.5253,
"num_tokens": 225828573.0,
"step": 590
},
{
"epoch": 0.4061433447098976,
"grad_norm": 0.31277344378918026,
"learning_rate": 4.911485536154804e-05,
"loss": 0.5205,
"num_tokens": 227748575.0,
"step": 595
},
{
"epoch": 0.40955631399317405,
"grad_norm": 0.36134254991055687,
"learning_rate": 4.909119187345948e-05,
"loss": 0.5193,
"num_tokens": 229710487.0,
"step": 600
},
{
"epoch": 0.4129692832764505,
"grad_norm": 0.3344543802661992,
"learning_rate": 4.906722275050274e-05,
"loss": 0.5262,
"num_tokens": 231678126.0,
"step": 605
},
{
"epoch": 0.41638225255972694,
"grad_norm": 0.3554138286052159,
"learning_rate": 4.904294833197358e-05,
"loss": 0.5437,
"num_tokens": 233617525.0,
"step": 610
},
{
"epoch": 0.4197952218430034,
"grad_norm": 0.3486512810565428,
"learning_rate": 4.901836896148938e-05,
"loss": 0.5302,
"num_tokens": 235464027.0,
"step": 615
},
{
"epoch": 0.4232081911262799,
"grad_norm": 0.3466494965430672,
"learning_rate": 4.8993484986984265e-05,
"loss": 0.5353,
"num_tokens": 237394471.0,
"step": 620
},
{
"epoch": 0.42662116040955633,
"grad_norm": 0.3503225313716609,
"learning_rate": 4.896829676070421e-05,
"loss": 0.5251,
"num_tokens": 239271145.0,
"step": 625
},
{
"epoch": 0.4300341296928328,
"grad_norm": 0.35994178207363314,
"learning_rate": 4.894280463920201e-05,
"loss": 0.5296,
"num_tokens": 241114261.0,
"step": 630
},
{
"epoch": 0.4334470989761092,
"grad_norm": 0.33650308530603384,
"learning_rate": 4.8917008983332307e-05,
"loss": 0.5402,
"num_tokens": 242992742.0,
"step": 635
},
{
"epoch": 0.43686006825938567,
"grad_norm": 0.321344393861645,
"learning_rate": 4.889091015824639e-05,
"loss": 0.5151,
"num_tokens": 244967987.0,
"step": 640
},
{
"epoch": 0.4402730375426621,
"grad_norm": 0.3663954539776663,
"learning_rate": 4.886450853338709e-05,
"loss": 0.5502,
"num_tokens": 246976387.0,
"step": 645
},
{
"epoch": 0.44368600682593856,
"grad_norm": 0.45975821746996437,
"learning_rate": 4.883780448248353e-05,
"loss": 0.5183,
"num_tokens": 248751095.0,
"step": 650
},
{
"epoch": 0.447098976109215,
"grad_norm": 0.3555087234083296,
"learning_rate": 4.881079838354582e-05,
"loss": 0.5277,
"num_tokens": 250671856.0,
"step": 655
},
{
"epoch": 0.45051194539249145,
"grad_norm": 0.37726144835555214,
"learning_rate": 4.878349061885974e-05,
"loss": 0.5157,
"num_tokens": 252607265.0,
"step": 660
},
{
"epoch": 0.4539249146757679,
"grad_norm": 0.3616385768897013,
"learning_rate": 4.875588157498131e-05,
"loss": 0.5288,
"num_tokens": 254497259.0,
"step": 665
},
{
"epoch": 0.45733788395904434,
"grad_norm": 0.3450279758346968,
"learning_rate": 4.87279716427313e-05,
"loss": 0.5279,
"num_tokens": 256417909.0,
"step": 670
},
{
"epoch": 0.46075085324232085,
"grad_norm": 0.33204144900991867,
"learning_rate": 4.8699761217189735e-05,
"loss": 0.5102,
"num_tokens": 258396985.0,
"step": 675
},
{
"epoch": 0.4641638225255973,
"grad_norm": 0.3431329045509487,
"learning_rate": 4.867125069769027e-05,
"loss": 0.5322,
"num_tokens": 260456429.0,
"step": 680
},
{
"epoch": 0.46757679180887374,
"grad_norm": 0.3809746183245989,
"learning_rate": 4.864244048781458e-05,
"loss": 0.5119,
"num_tokens": 262384995.0,
"step": 685
},
{
"epoch": 0.4709897610921502,
"grad_norm": 0.3712668805710275,
"learning_rate": 4.861333099538656e-05,
"loss": 0.5154,
"num_tokens": 264287905.0,
"step": 690
},
{
"epoch": 0.47440273037542663,
"grad_norm": 0.36608602643678284,
"learning_rate": 4.858392263246666e-05,
"loss": 0.5226,
"num_tokens": 266198448.0,
"step": 695
},
{
"epoch": 0.4778156996587031,
"grad_norm": 0.40431967372937866,
"learning_rate": 4.8554215815345985e-05,
"loss": 0.5223,
"num_tokens": 268098790.0,
"step": 700
},
{
"epoch": 0.4812286689419795,
"grad_norm": 0.3571318972995455,
"learning_rate": 4.852421096454041e-05,
"loss": 0.5339,
"num_tokens": 270033989.0,
"step": 705
},
{
"epoch": 0.48464163822525597,
"grad_norm": 0.3432800376569745,
"learning_rate": 4.849390850478465e-05,
"loss": 0.5301,
"num_tokens": 271974065.0,
"step": 710
},
{
"epoch": 0.4880546075085324,
"grad_norm": 0.40999627806439953,
"learning_rate": 4.846330886502625e-05,
"loss": 0.5401,
"num_tokens": 273982161.0,
"step": 715
},
{
"epoch": 0.49146757679180886,
"grad_norm": 0.3829538235957624,
"learning_rate": 4.843241247841944e-05,
"loss": 0.5317,
"num_tokens": 275852045.0,
"step": 720
},
{
"epoch": 0.4948805460750853,
"grad_norm": 0.32021121908893335,
"learning_rate": 4.8401219782319114e-05,
"loss": 0.502,
"num_tokens": 277687831.0,
"step": 725
},
{
"epoch": 0.49829351535836175,
"grad_norm": 0.30949205453341727,
"learning_rate": 4.8369731218274567e-05,
"loss": 0.5252,
"num_tokens": 279504484.0,
"step": 730
},
{
"epoch": 0.5017064846416383,
"grad_norm": 0.31840825885431656,
"learning_rate": 4.833794723202327e-05,
"loss": 0.5098,
"num_tokens": 281456293.0,
"step": 735
},
{
"epoch": 0.5051194539249146,
"grad_norm": 0.3167489912000843,
"learning_rate": 4.8305868273484537e-05,
"loss": 0.521,
"num_tokens": 283461546.0,
"step": 740
},
{
"epoch": 0.5085324232081911,
"grad_norm": 0.30595968475470414,
"learning_rate": 4.8273494796753175e-05,
"loss": 0.5165,
"num_tokens": 285433304.0,
"step": 745
},
{
"epoch": 0.5119453924914675,
"grad_norm": 0.3746798875916052,
"learning_rate": 4.824082726009308e-05,
"loss": 0.53,
"num_tokens": 287371918.0,
"step": 750
},
{
"epoch": 0.515358361774744,
"grad_norm": 0.345045842082928,
"learning_rate": 4.8207866125930696e-05,
"loss": 0.502,
"num_tokens": 289259239.0,
"step": 755
},
{
"epoch": 0.5187713310580204,
"grad_norm": 0.38441521290672914,
"learning_rate": 4.81746118608485e-05,
"loss": 0.5184,
"num_tokens": 291057738.0,
"step": 760
},
{
"epoch": 0.5221843003412969,
"grad_norm": 0.3459495605757955,
"learning_rate": 4.8141064935578404e-05,
"loss": 0.5238,
"num_tokens": 293034663.0,
"step": 765
},
{
"epoch": 0.5255972696245734,
"grad_norm": 0.3465676375928197,
"learning_rate": 4.810722582499508e-05,
"loss": 0.5142,
"num_tokens": 294881963.0,
"step": 770
},
{
"epoch": 0.5290102389078498,
"grad_norm": 0.3431332856561044,
"learning_rate": 4.8073095008109234e-05,
"loss": 0.5229,
"num_tokens": 296794020.0,
"step": 775
},
{
"epoch": 0.5324232081911263,
"grad_norm": 0.34723558691112266,
"learning_rate": 4.8038672968060824e-05,
"loss": 0.5205,
"num_tokens": 298677895.0,
"step": 780
},
{
"epoch": 0.5358361774744027,
"grad_norm": 0.3661333544764211,
"learning_rate": 4.800396019211224e-05,
"loss": 0.5115,
"num_tokens": 300610709.0,
"step": 785
},
{
"epoch": 0.5392491467576792,
"grad_norm": 0.3261317551653855,
"learning_rate": 4.79689571716414e-05,
"loss": 0.5005,
"num_tokens": 302387985.0,
"step": 790
},
{
"epoch": 0.5426621160409556,
"grad_norm": 0.44334818660500824,
"learning_rate": 4.793366440213477e-05,
"loss": 0.5294,
"num_tokens": 304293467.0,
"step": 795
},
{
"epoch": 0.5460750853242321,
"grad_norm": 0.3081814955106798,
"learning_rate": 4.789808238318037e-05,
"loss": 0.5258,
"num_tokens": 306130593.0,
"step": 800
},
{
"epoch": 0.5494880546075085,
"grad_norm": 0.3576761308612535,
"learning_rate": 4.786221161846072e-05,
"loss": 0.5309,
"num_tokens": 308025370.0,
"step": 805
},
{
"epoch": 0.552901023890785,
"grad_norm": 0.3270050276920037,
"learning_rate": 4.782605261574568e-05,
"loss": 0.5209,
"num_tokens": 309952008.0,
"step": 810
},
{
"epoch": 0.5563139931740614,
"grad_norm": 0.31786503490293877,
"learning_rate": 4.778960588688527e-05,
"loss": 0.5082,
"num_tokens": 311856787.0,
"step": 815
},
{
"epoch": 0.5597269624573379,
"grad_norm": 0.33017454672312996,
"learning_rate": 4.775287194780241e-05,
"loss": 0.5147,
"num_tokens": 313725489.0,
"step": 820
},
{
"epoch": 0.5631399317406144,
"grad_norm": 0.3604312004052758,
"learning_rate": 4.771585131848569e-05,
"loss": 0.5133,
"num_tokens": 315690477.0,
"step": 825
},
{
"epoch": 0.5665529010238908,
"grad_norm": 0.333952918780597,
"learning_rate": 4.767854452298188e-05,
"loss": 0.5204,
"num_tokens": 317525148.0,
"step": 830
},
{
"epoch": 0.5699658703071673,
"grad_norm": 0.3267362876583091,
"learning_rate": 4.764095208938865e-05,
"loss": 0.5149,
"num_tokens": 319505322.0,
"step": 835
},
{
"epoch": 0.5733788395904437,
"grad_norm": 0.337839892043544,
"learning_rate": 4.7603074549846986e-05,
"loss": 0.5182,
"num_tokens": 321410101.0,
"step": 840
},
{
"epoch": 0.5767918088737202,
"grad_norm": 0.37147409137067106,
"learning_rate": 4.7564912440533734e-05,
"loss": 0.5097,
"num_tokens": 323302156.0,
"step": 845
},
{
"epoch": 0.5802047781569966,
"grad_norm": 0.3385281078202508,
"learning_rate": 4.752646630165393e-05,
"loss": 0.5126,
"num_tokens": 325102278.0,
"step": 850
},
{
"epoch": 0.5836177474402731,
"grad_norm": 0.4040596596048006,
"learning_rate": 4.7487736677433256e-05,
"loss": 0.5172,
"num_tokens": 326903105.0,
"step": 855
},
{
"epoch": 0.5870307167235495,
"grad_norm": 0.39849331627110846,
"learning_rate": 4.7448724116110264e-05,
"loss": 0.5146,
"num_tokens": 328831071.0,
"step": 860
},
{
"epoch": 0.590443686006826,
"grad_norm": 0.31821166064088463,
"learning_rate": 4.74094291699286e-05,
"loss": 0.5133,
"num_tokens": 330823718.0,
"step": 865
},
{
"epoch": 0.5938566552901023,
"grad_norm": 0.32627133466823544,
"learning_rate": 4.736985239512925e-05,
"loss": 0.4976,
"num_tokens": 332767044.0,
"step": 870
},
{
"epoch": 0.5972696245733788,
"grad_norm": 0.3177734291813305,
"learning_rate": 4.732999435194265e-05,
"loss": 0.5165,
"num_tokens": 334743310.0,
"step": 875
},
{
"epoch": 0.6006825938566553,
"grad_norm": 0.30605344292447234,
"learning_rate": 4.728985560458068e-05,
"loss": 0.4884,
"num_tokens": 336595638.0,
"step": 880
},
{
"epoch": 0.6040955631399317,
"grad_norm": 0.32401153361842944,
"learning_rate": 4.7249436721228795e-05,
"loss": 0.5108,
"num_tokens": 338382273.0,
"step": 885
},
{
"epoch": 0.6075085324232082,
"grad_norm": 0.31304402477648574,
"learning_rate": 4.720873827403791e-05,
"loss": 0.5198,
"num_tokens": 340358925.0,
"step": 890
},
{
"epoch": 0.6109215017064846,
"grad_norm": 0.3218084504074191,
"learning_rate": 4.716776083911631e-05,
"loss": 0.5189,
"num_tokens": 342188299.0,
"step": 895
},
{
"epoch": 0.6143344709897611,
"grad_norm": 0.3510785346642464,
"learning_rate": 4.7126504996521484e-05,
"loss": 0.5096,
"num_tokens": 344239058.0,
"step": 900
},
{
"epoch": 0.6177474402730375,
"grad_norm": 0.36297363440496044,
"learning_rate": 4.7084971330251974e-05,
"loss": 0.5204,
"num_tokens": 346206452.0,
"step": 905
},
{
"epoch": 0.621160409556314,
"grad_norm": 0.3068962752186269,
"learning_rate": 4.704316042823902e-05,
"loss": 0.5152,
"num_tokens": 348080585.0,
"step": 910
},
{
"epoch": 0.6245733788395904,
"grad_norm": 0.31633950180676207,
"learning_rate": 4.700107288233831e-05,
"loss": 0.5224,
"num_tokens": 349930176.0,
"step": 915
},
{
"epoch": 0.6279863481228669,
"grad_norm": 0.31119199537147196,
"learning_rate": 4.695870928832156e-05,
"loss": 0.4999,
"num_tokens": 351817695.0,
"step": 920
},
{
"epoch": 0.6313993174061433,
"grad_norm": 0.30951025365629076,
"learning_rate": 4.6916070245868085e-05,
"loss": 0.5052,
"num_tokens": 353737820.0,
"step": 925
},
{
"epoch": 0.6348122866894198,
"grad_norm": 0.34341366500684045,
"learning_rate": 4.6873156358556375e-05,
"loss": 0.5059,
"num_tokens": 355639183.0,
"step": 930
},
{
"epoch": 0.6382252559726962,
"grad_norm": 0.33780786043507366,
"learning_rate": 4.682996823385541e-05,
"loss": 0.5263,
"num_tokens": 357548627.0,
"step": 935
},
{
"epoch": 0.6416382252559727,
"grad_norm": 0.3602258225812933,
"learning_rate": 4.678650648311624e-05,
"loss": 0.5235,
"num_tokens": 359437581.0,
"step": 940
},
{
"epoch": 0.6450511945392492,
"grad_norm": 0.3623769577177757,
"learning_rate": 4.6742771721563146e-05,
"loss": 0.4995,
"num_tokens": 361329513.0,
"step": 945
},
{
"epoch": 0.6484641638225256,
"grad_norm": 0.36738029330256716,
"learning_rate": 4.66987645682851e-05,
"loss": 0.5039,
"num_tokens": 363189983.0,
"step": 950
},
{
"epoch": 0.6518771331058021,
"grad_norm": 0.2994153691319221,
"learning_rate": 4.665448564622687e-05,
"loss": 0.5085,
"num_tokens": 365024705.0,
"step": 955
},
{
"epoch": 0.6552901023890785,
"grad_norm": 0.30495566025924037,
"learning_rate": 4.660993558218028e-05,
"loss": 0.499,
"num_tokens": 366863209.0,
"step": 960
},
{
"epoch": 0.658703071672355,
"grad_norm": 0.3396370071116669,
"learning_rate": 4.6565115006775335e-05,
"loss": 0.5251,
"num_tokens": 368837047.0,
"step": 965
},
{
"epoch": 0.6621160409556314,
"grad_norm": 0.3174353904822871,
"learning_rate": 4.6520024554471224e-05,
"loss": 0.5087,
"num_tokens": 370725860.0,
"step": 970
},
{
"epoch": 0.6655290102389079,
"grad_norm": 0.28442067468175136,
"learning_rate": 4.647466486354743e-05,
"loss": 0.4907,
"num_tokens": 372569301.0,
"step": 975
},
{
"epoch": 0.6689419795221843,
"grad_norm": 0.2919040706685906,
"learning_rate": 4.642903657609463e-05,
"loss": 0.5137,
"num_tokens": 374566515.0,
"step": 980
},
{
"epoch": 0.6723549488054608,
"grad_norm": 0.29173671828185904,
"learning_rate": 4.638314033800564e-05,
"loss": 0.4944,
"num_tokens": 376431969.0,
"step": 985
},
{
"epoch": 0.6757679180887372,
"grad_norm": 0.31461039206966784,
"learning_rate": 4.633697679896626e-05,
"loss": 0.495,
"num_tokens": 378330489.0,
"step": 990
},
{
"epoch": 0.6791808873720137,
"grad_norm": 0.2915321504491105,
"learning_rate": 4.629054661244606e-05,
"loss": 0.4871,
"num_tokens": 380245892.0,
"step": 995
},
{
"epoch": 0.6825938566552902,
"grad_norm": 0.286886601156865,
"learning_rate": 4.624385043568917e-05,
"loss": 0.5114,
"num_tokens": 382103468.0,
"step": 1000
},
{
"epoch": 0.6860068259385665,
"grad_norm": 0.31706955102919415,
"learning_rate": 4.6196888929704954e-05,
"loss": 0.5196,
"num_tokens": 384011945.0,
"step": 1005
},
{
"epoch": 0.689419795221843,
"grad_norm": 0.33284742322766536,
"learning_rate": 4.614966275925863e-05,
"loss": 0.4963,
"num_tokens": 385837297.0,
"step": 1010
},
{
"epoch": 0.6928327645051194,
"grad_norm": 0.4000880667080671,
"learning_rate": 4.61021725928619e-05,
"loss": 0.5107,
"num_tokens": 387718762.0,
"step": 1015
},
{
"epoch": 0.6962457337883959,
"grad_norm": 0.318145983785095,
"learning_rate": 4.6054419102763476e-05,
"loss": 0.5,
"num_tokens": 389586680.0,
"step": 1020
},
{
"epoch": 0.6996587030716723,
"grad_norm": 0.29446028177317474,
"learning_rate": 4.600640296493953e-05,
"loss": 0.5173,
"num_tokens": 391567792.0,
"step": 1025
},
{
"epoch": 0.7030716723549488,
"grad_norm": 0.33638404359083496,
"learning_rate": 4.59581248590842e-05,
"loss": 0.5021,
"num_tokens": 393428760.0,
"step": 1030
},
{
"epoch": 0.7064846416382252,
"grad_norm": 0.3178901269025221,
"learning_rate": 4.590958546859988e-05,
"loss": 0.5184,
"num_tokens": 395264025.0,
"step": 1035
},
{
"epoch": 0.7098976109215017,
"grad_norm": 0.3175572857551193,
"learning_rate": 4.58607854805876e-05,
"loss": 0.5251,
"num_tokens": 397158700.0,
"step": 1040
},
{
"epoch": 0.7133105802047781,
"grad_norm": 0.3108349326509548,
"learning_rate": 4.581172558583729e-05,
"loss": 0.497,
"num_tokens": 399092348.0,
"step": 1045
},
{
"epoch": 0.7167235494880546,
"grad_norm": 0.3097661968317486,
"learning_rate": 4.576240647881801e-05,
"loss": 0.5148,
"num_tokens": 400923938.0,
"step": 1050
},
{
"epoch": 0.7201365187713311,
"grad_norm": 0.29168496463784344,
"learning_rate": 4.571282885766808e-05,
"loss": 0.4945,
"num_tokens": 402794748.0,
"step": 1055
},
{
"epoch": 0.7235494880546075,
"grad_norm": 0.30716631597516236,
"learning_rate": 4.5662993424185244e-05,
"loss": 0.5052,
"num_tokens": 404685655.0,
"step": 1060
},
{
"epoch": 0.726962457337884,
"grad_norm": 0.33227072895463494,
"learning_rate": 4.561290088381672e-05,
"loss": 0.5061,
"num_tokens": 406613744.0,
"step": 1065
},
{
"epoch": 0.7303754266211604,
"grad_norm": 0.3293212560670863,
"learning_rate": 4.5562551945649215e-05,
"loss": 0.5049,
"num_tokens": 408582799.0,
"step": 1070
},
{
"epoch": 0.7337883959044369,
"grad_norm": 0.3356632435675788,
"learning_rate": 4.5511947322398855e-05,
"loss": 0.513,
"num_tokens": 410442926.0,
"step": 1075
},
{
"epoch": 0.7372013651877133,
"grad_norm": 0.3107589924934008,
"learning_rate": 4.546108773040116e-05,
"loss": 0.5089,
"num_tokens": 412386009.0,
"step": 1080
},
{
"epoch": 0.7406143344709898,
"grad_norm": 0.30270204787047783,
"learning_rate": 4.540997388960085e-05,
"loss": 0.5197,
"num_tokens": 414357432.0,
"step": 1085
},
{
"epoch": 0.7440273037542662,
"grad_norm": 0.2818167774666258,
"learning_rate": 4.53586065235417e-05,
"loss": 0.5077,
"num_tokens": 416372039.0,
"step": 1090
},
{
"epoch": 0.7474402730375427,
"grad_norm": 0.3102971120341378,
"learning_rate": 4.530698635935622e-05,
"loss": 0.5046,
"num_tokens": 418265973.0,
"step": 1095
},
{
"epoch": 0.7508532423208191,
"grad_norm": 0.3807798685595783,
"learning_rate": 4.525511412775545e-05,
"loss": 0.4921,
"num_tokens": 420051975.0,
"step": 1100
},
{
"epoch": 0.7542662116040956,
"grad_norm": 0.31661653050496685,
"learning_rate": 4.5202990563018565e-05,
"loss": 0.4934,
"num_tokens": 421905923.0,
"step": 1105
},
{
"epoch": 0.757679180887372,
"grad_norm": 0.35276872591605113,
"learning_rate": 4.515061640298251e-05,
"loss": 0.5042,
"num_tokens": 423685709.0,
"step": 1110
},
{
"epoch": 0.7610921501706485,
"grad_norm": 0.3844552713062626,
"learning_rate": 4.509799238903153e-05,
"loss": 0.4947,
"num_tokens": 425515443.0,
"step": 1115
},
{
"epoch": 0.764505119453925,
"grad_norm": 0.31759030359483253,
"learning_rate": 4.504511926608667e-05,
"loss": 0.4949,
"num_tokens": 427405904.0,
"step": 1120
},
{
"epoch": 0.7679180887372014,
"grad_norm": 0.3276618424804238,
"learning_rate": 4.4991997782595286e-05,
"loss": 0.5,
"num_tokens": 429329969.0,
"step": 1125
},
{
"epoch": 0.7713310580204779,
"grad_norm": 0.3504691972099564,
"learning_rate": 4.493862869052038e-05,
"loss": 0.4938,
"num_tokens": 431288378.0,
"step": 1130
},
{
"epoch": 0.7747440273037542,
"grad_norm": 0.31911074838089193,
"learning_rate": 4.488501274532998e-05,
"loss": 0.5025,
"num_tokens": 433213317.0,
"step": 1135
},
{
"epoch": 0.7781569965870307,
"grad_norm": 0.346412105131523,
"learning_rate": 4.483115070598647e-05,
"loss": 0.4941,
"num_tokens": 435077995.0,
"step": 1140
},
{
"epoch": 0.7815699658703071,
"grad_norm": 0.3211768432110787,
"learning_rate": 4.4777043334935834e-05,
"loss": 0.499,
"num_tokens": 437063353.0,
"step": 1145
},
{
"epoch": 0.7849829351535836,
"grad_norm": 0.2934813782432162,
"learning_rate": 4.4722691398096845e-05,
"loss": 0.4963,
"num_tokens": 439006864.0,
"step": 1150
},
{
"epoch": 0.78839590443686,
"grad_norm": 0.2838196869214083,
"learning_rate": 4.466809566485022e-05,
"loss": 0.5126,
"num_tokens": 441123044.0,
"step": 1155
},
{
"epoch": 0.7918088737201365,
"grad_norm": 0.30674961764611963,
"learning_rate": 4.461325690802777e-05,
"loss": 0.5124,
"num_tokens": 443045688.0,
"step": 1160
},
{
"epoch": 0.7952218430034129,
"grad_norm": 0.31400026415051735,
"learning_rate": 4.455817590390144e-05,
"loss": 0.5011,
"num_tokens": 444874851.0,
"step": 1165
},
{
"epoch": 0.7986348122866894,
"grad_norm": 0.32725515813051603,
"learning_rate": 4.450285343217228e-05,
"loss": 0.4986,
"num_tokens": 446852018.0,
"step": 1170
},
{
"epoch": 0.8020477815699659,
"grad_norm": 0.32318551620334435,
"learning_rate": 4.444729027595948e-05,
"loss": 0.5048,
"num_tokens": 448665204.0,
"step": 1175
},
{
"epoch": 0.8054607508532423,
"grad_norm": 0.33687487158543156,
"learning_rate": 4.4391487221789216e-05,
"loss": 0.5224,
"num_tokens": 450687287.0,
"step": 1180
},
{
"epoch": 0.8088737201365188,
"grad_norm": 0.35347464715186616,
"learning_rate": 4.433544505958358e-05,
"loss": 0.4992,
"num_tokens": 452550699.0,
"step": 1185
},
{
"epoch": 0.8122866894197952,
"grad_norm": 0.3445779383146421,
"learning_rate": 4.427916458264935e-05,
"loss": 0.4991,
"num_tokens": 454483896.0,
"step": 1190
},
{
"epoch": 0.8156996587030717,
"grad_norm": 0.29879424431987656,
"learning_rate": 4.422264658766677e-05,
"loss": 0.4841,
"num_tokens": 456418788.0,
"step": 1195
},
{
"epoch": 0.8191126279863481,
"grad_norm": 0.2991095523769755,
"learning_rate": 4.416589187467828e-05,
"loss": 0.51,
"num_tokens": 458325861.0,
"step": 1200
},
{
"epoch": 0.8225255972696246,
"grad_norm": 0.32477512440054085,
"learning_rate": 4.41089012470772e-05,
"loss": 0.5153,
"num_tokens": 460227692.0,
"step": 1205
},
{
"epoch": 0.825938566552901,
"grad_norm": 0.3547912313792265,
"learning_rate": 4.405167551159635e-05,
"loss": 0.4893,
"num_tokens": 462055868.0,
"step": 1210
},
{
"epoch": 0.8293515358361775,
"grad_norm": 0.3016276005058045,
"learning_rate": 4.399421547829661e-05,
"loss": 0.4892,
"num_tokens": 463988579.0,
"step": 1215
},
{
"epoch": 0.8327645051194539,
"grad_norm": 0.309849759273995,
"learning_rate": 4.393652196055548e-05,
"loss": 0.504,
"num_tokens": 465973876.0,
"step": 1220
},
{
"epoch": 0.8361774744027304,
"grad_norm": 0.30189015799475877,
"learning_rate": 4.3878595775055574e-05,
"loss": 0.4969,
"num_tokens": 467962214.0,
"step": 1225
},
{
"epoch": 0.8395904436860068,
"grad_norm": 0.3047425782223173,
"learning_rate": 4.382043774177302e-05,
"loss": 0.4911,
"num_tokens": 469791498.0,
"step": 1230
},
{
"epoch": 0.8430034129692833,
"grad_norm": 0.29428684450637044,
"learning_rate": 4.376204868396588e-05,
"loss": 0.4926,
"num_tokens": 471616902.0,
"step": 1235
},
{
"epoch": 0.8464163822525598,
"grad_norm": 0.3158111338692259,
"learning_rate": 4.370342942816249e-05,
"loss": 0.49,
"num_tokens": 473570581.0,
"step": 1240
},
{
"epoch": 0.8498293515358362,
"grad_norm": 0.3408082001646201,
"learning_rate": 4.3644580804149774e-05,
"loss": 0.5041,
"num_tokens": 475518680.0,
"step": 1245
},
{
"epoch": 0.8532423208191127,
"grad_norm": 0.38616580597842337,
"learning_rate": 4.358550364496148e-05,
"loss": 0.5104,
"num_tokens": 477401353.0,
"step": 1250
},
{
"epoch": 0.856655290102389,
"grad_norm": 0.32247025368304155,
"learning_rate": 4.3526198786866386e-05,
"loss": 0.5048,
"num_tokens": 479371988.0,
"step": 1255
},
{
"epoch": 0.8600682593856656,
"grad_norm": 0.35588546668005355,
"learning_rate": 4.3466667069356465e-05,
"loss": 0.5074,
"num_tokens": 481348892.0,
"step": 1260
},
{
"epoch": 0.863481228668942,
"grad_norm": 0.3410788549802598,
"learning_rate": 4.340690933513504e-05,
"loss": 0.4931,
"num_tokens": 483220037.0,
"step": 1265
},
{
"epoch": 0.8668941979522184,
"grad_norm": 0.28996758799627625,
"learning_rate": 4.334692643010478e-05,
"loss": 0.5144,
"num_tokens": 485171910.0,
"step": 1270
},
{
"epoch": 0.8703071672354948,
"grad_norm": 0.3450006717750541,
"learning_rate": 4.328671920335579e-05,
"loss": 0.4909,
"num_tokens": 487118187.0,
"step": 1275
},
{
"epoch": 0.8737201365187713,
"grad_norm": 0.3168603261439533,
"learning_rate": 4.322628850715357e-05,
"loss": 0.487,
"num_tokens": 489059548.0,
"step": 1280
},
{
"epoch": 0.8771331058020477,
"grad_norm": 0.2934471279813396,
"learning_rate": 4.3165635196926935e-05,
"loss": 0.4902,
"num_tokens": 490973976.0,
"step": 1285
},
{
"epoch": 0.8805460750853242,
"grad_norm": 0.30116352534369073,
"learning_rate": 4.310476013125593e-05,
"loss": 0.4939,
"num_tokens": 492893029.0,
"step": 1290
},
{
"epoch": 0.8839590443686007,
"grad_norm": 0.27653712452340234,
"learning_rate": 4.3043664171859676e-05,
"loss": 0.5133,
"num_tokens": 494980675.0,
"step": 1295
},
{
"epoch": 0.8873720136518771,
"grad_norm": 0.3008423365150182,
"learning_rate": 4.298234818358414e-05,
"loss": 0.485,
"num_tokens": 496905674.0,
"step": 1300
},
{
"epoch": 0.8907849829351536,
"grad_norm": 0.2986356900020865,
"learning_rate": 4.2920813034389944e-05,
"loss": 0.4984,
"num_tokens": 498889766.0,
"step": 1305
},
{
"epoch": 0.89419795221843,
"grad_norm": 0.274831418663253,
"learning_rate": 4.285905959534002e-05,
"loss": 0.4897,
"num_tokens": 500864542.0,
"step": 1310
},
{
"epoch": 0.8976109215017065,
"grad_norm": 0.3126326726365892,
"learning_rate": 4.2797088740587324e-05,
"loss": 0.5062,
"num_tokens": 502876203.0,
"step": 1315
},
{
"epoch": 0.9010238907849829,
"grad_norm": 0.3234689945785143,
"learning_rate": 4.273490134736246e-05,
"loss": 0.4892,
"num_tokens": 504810366.0,
"step": 1320
},
{
"epoch": 0.9044368600682594,
"grad_norm": 0.2728068836733229,
"learning_rate": 4.267249829596123e-05,
"loss": 0.4927,
"num_tokens": 506777983.0,
"step": 1325
},
{
"epoch": 0.9078498293515358,
"grad_norm": 0.3168203159146273,
"learning_rate": 4.2609880469732196e-05,
"loss": 0.4949,
"num_tokens": 508607232.0,
"step": 1330
},
{
"epoch": 0.9112627986348123,
"grad_norm": 0.29165446674166984,
"learning_rate": 4.254704875506419e-05,
"loss": 0.4949,
"num_tokens": 510543906.0,
"step": 1335
},
{
"epoch": 0.9146757679180887,
"grad_norm": 0.2873297869381404,
"learning_rate": 4.2484004041373724e-05,
"loss": 0.5078,
"num_tokens": 512474084.0,
"step": 1340
},
{
"epoch": 0.9180887372013652,
"grad_norm": 0.3073994896051493,
"learning_rate": 4.242074722109244e-05,
"loss": 0.4827,
"num_tokens": 514322576.0,
"step": 1345
},
{
"epoch": 0.9215017064846417,
"grad_norm": 0.33305778620591386,
"learning_rate": 4.235727918965446e-05,
"loss": 0.5005,
"num_tokens": 516216104.0,
"step": 1350
},
{
"epoch": 0.9249146757679181,
"grad_norm": 0.3369378626468833,
"learning_rate": 4.2293600845483715e-05,
"loss": 0.5062,
"num_tokens": 518137999.0,
"step": 1355
},
{
"epoch": 0.9283276450511946,
"grad_norm": 0.3298207620717988,
"learning_rate": 4.222971308998123e-05,
"loss": 0.4961,
"num_tokens": 519966345.0,
"step": 1360
},
{
"epoch": 0.931740614334471,
"grad_norm": 0.30388199133879135,
"learning_rate": 4.216561682751234e-05,
"loss": 0.4818,
"num_tokens": 521824255.0,
"step": 1365
},
{
"epoch": 0.9351535836177475,
"grad_norm": 0.32752381042516066,
"learning_rate": 4.210131296539391e-05,
"loss": 0.4921,
"num_tokens": 523793837.0,
"step": 1370
},
{
"epoch": 0.9385665529010239,
"grad_norm": 0.28445695076252103,
"learning_rate": 4.2036802413881524e-05,
"loss": 0.4982,
"num_tokens": 525746360.0,
"step": 1375
},
{
"epoch": 0.9419795221843004,
"grad_norm": 0.28899230017319383,
"learning_rate": 4.1972086086156525e-05,
"loss": 0.5019,
"num_tokens": 527666864.0,
"step": 1380
},
{
"epoch": 0.9453924914675768,
"grad_norm": 0.32011937903596116,
"learning_rate": 4.190716489831315e-05,
"loss": 0.5005,
"num_tokens": 529624010.0,
"step": 1385
},
{
"epoch": 0.9488054607508533,
"grad_norm": 0.29697819340477716,
"learning_rate": 4.184203976934552e-05,
"loss": 0.4939,
"num_tokens": 531466359.0,
"step": 1390
},
{
"epoch": 0.9522184300341296,
"grad_norm": 0.3027431585655998,
"learning_rate": 4.177671162113468e-05,
"loss": 0.5084,
"num_tokens": 533360398.0,
"step": 1395
},
{
"epoch": 0.9556313993174061,
"grad_norm": 0.32991449610546014,
"learning_rate": 4.17111813784355e-05,
"loss": 0.4906,
"num_tokens": 535253217.0,
"step": 1400
},
{
"epoch": 0.9590443686006825,
"grad_norm": 0.33638706750117725,
"learning_rate": 4.16454499688636e-05,
"loss": 0.4806,
"num_tokens": 537247787.0,
"step": 1405
},
{
"epoch": 0.962457337883959,
"grad_norm": 0.2978526679067072,
"learning_rate": 4.1579518322882264e-05,
"loss": 0.4863,
"num_tokens": 539137436.0,
"step": 1410
},
{
"epoch": 0.9658703071672355,
"grad_norm": 0.28420517349753716,
"learning_rate": 4.1513387373789174e-05,
"loss": 0.491,
"num_tokens": 540954808.0,
"step": 1415
},
{
"epoch": 0.9692832764505119,
"grad_norm": 0.2914181751043302,
"learning_rate": 4.1447058057703296e-05,
"loss": 0.4974,
"num_tokens": 542824098.0,
"step": 1420
},
{
"epoch": 0.9726962457337884,
"grad_norm": 0.30896035937102595,
"learning_rate": 4.138053131355158e-05,
"loss": 0.51,
"num_tokens": 544761712.0,
"step": 1425
},
{
"epoch": 0.9761092150170648,
"grad_norm": 0.2996642933295098,
"learning_rate": 4.131380808305565e-05,
"loss": 0.4992,
"num_tokens": 546578047.0,
"step": 1430
},
{
"epoch": 0.9795221843003413,
"grad_norm": 0.32391571755782744,
"learning_rate": 4.1246889310718536e-05,
"loss": 0.4907,
"num_tokens": 548490659.0,
"step": 1435
},
{
"epoch": 0.9829351535836177,
"grad_norm": 0.2805330274894754,
"learning_rate": 4.117977594381123e-05,
"loss": 0.4904,
"num_tokens": 550377811.0,
"step": 1440
},
{
"epoch": 0.9863481228668942,
"grad_norm": 0.36985148320187977,
"learning_rate": 4.111246893235935e-05,
"loss": 0.4949,
"num_tokens": 552306122.0,
"step": 1445
},
{
"epoch": 0.9897610921501706,
"grad_norm": 0.2744153820711885,
"learning_rate": 4.104496922912963e-05,
"loss": 0.4791,
"num_tokens": 554224024.0,
"step": 1450
},
{
"epoch": 0.9931740614334471,
"grad_norm": 0.3466323284595071,
"learning_rate": 4.097727778961648e-05,
"loss": 0.4995,
"num_tokens": 556055771.0,
"step": 1455
},
{
"epoch": 0.9965870307167235,
"grad_norm": 0.2796639263924327,
"learning_rate": 4.090939557202841e-05,
"loss": 0.4876,
"num_tokens": 558001366.0,
"step": 1460
},
{
"epoch": 1.0,
"grad_norm": 0.2779393474451887,
"learning_rate": 4.0841323537274544e-05,
"loss": 0.4929,
"num_tokens": 559917574.0,
"step": 1465
},
{
"epoch": 1.0034129692832765,
"grad_norm": 0.3138567800728616,
"learning_rate": 4.0773062648950905e-05,
"loss": 0.4536,
"num_tokens": 561795304.0,
"step": 1470
},
{
"epoch": 1.006825938566553,
"grad_norm": 0.3114289367733724,
"learning_rate": 4.0704613873326895e-05,
"loss": 0.4544,
"num_tokens": 563661756.0,
"step": 1475
},
{
"epoch": 1.0102389078498293,
"grad_norm": 0.32100212575438497,
"learning_rate": 4.0635978179331534e-05,
"loss": 0.4557,
"num_tokens": 565576615.0,
"step": 1480
},
{
"epoch": 1.0136518771331058,
"grad_norm": 0.33784033052194967,
"learning_rate": 4.056715653853977e-05,
"loss": 0.4658,
"num_tokens": 567552980.0,
"step": 1485
},
{
"epoch": 1.0170648464163823,
"grad_norm": 0.2846562551793901,
"learning_rate": 4.0498149925158765e-05,
"loss": 0.4486,
"num_tokens": 569441970.0,
"step": 1490
},
{
"epoch": 1.0204778156996588,
"grad_norm": 0.27181341067200976,
"learning_rate": 4.0428959316013996e-05,
"loss": 0.4571,
"num_tokens": 571408138.0,
"step": 1495
},
{
"epoch": 1.023890784982935,
"grad_norm": 0.29080838948154386,
"learning_rate": 4.0359585690535565e-05,
"loss": 0.4679,
"num_tokens": 573385584.0,
"step": 1500
},
{
"epoch": 1.0273037542662116,
"grad_norm": 0.2862021615209201,
"learning_rate": 4.0290030030744244e-05,
"loss": 0.438,
"num_tokens": 575190202.0,
"step": 1505
},
{
"epoch": 1.030716723549488,
"grad_norm": 0.2972505881497117,
"learning_rate": 4.0220293321237577e-05,
"loss": 0.4564,
"num_tokens": 577207221.0,
"step": 1510
},
{
"epoch": 1.0341296928327646,
"grad_norm": 0.3039253750758028,
"learning_rate": 4.0150376549176e-05,
"loss": 0.4605,
"num_tokens": 579169216.0,
"step": 1515
},
{
"epoch": 1.0375426621160408,
"grad_norm": 0.29255452310430513,
"learning_rate": 4.0080280704268805e-05,
"loss": 0.4518,
"num_tokens": 581176488.0,
"step": 1520
},
{
"epoch": 1.0409556313993173,
"grad_norm": 0.2883720019415548,
"learning_rate": 4.0010006778760175e-05,
"loss": 0.4616,
"num_tokens": 583060493.0,
"step": 1525
},
{
"epoch": 1.0443686006825939,
"grad_norm": 0.2980789185115071,
"learning_rate": 3.993955576741509e-05,
"loss": 0.4491,
"num_tokens": 584967164.0,
"step": 1530
},
{
"epoch": 1.0477815699658704,
"grad_norm": 0.2962427827544438,
"learning_rate": 3.986892866750532e-05,
"loss": 0.469,
"num_tokens": 586896302.0,
"step": 1535
},
{
"epoch": 1.0511945392491469,
"grad_norm": 0.30639817855272605,
"learning_rate": 3.979812647879522e-05,
"loss": 0.4544,
"num_tokens": 588745165.0,
"step": 1540
},
{
"epoch": 1.0546075085324231,
"grad_norm": 0.2773317562699212,
"learning_rate": 3.972715020352763e-05,
"loss": 0.4486,
"num_tokens": 590639322.0,
"step": 1545
},
{
"epoch": 1.0580204778156996,
"grad_norm": 0.29236172696032264,
"learning_rate": 3.9656000846409695e-05,
"loss": 0.4424,
"num_tokens": 592549757.0,
"step": 1550
},
{
"epoch": 1.0614334470989761,
"grad_norm": 0.2571903473392426,
"learning_rate": 3.9584679414598616e-05,
"loss": 0.4595,
"num_tokens": 594545910.0,
"step": 1555
},
{
"epoch": 1.0648464163822526,
"grad_norm": 0.2961845232409812,
"learning_rate": 3.951318691768741e-05,
"loss": 0.4525,
"num_tokens": 596464710.0,
"step": 1560
},
{
"epoch": 1.068259385665529,
"grad_norm": 0.25912887664891626,
"learning_rate": 3.944152436769059e-05,
"loss": 0.4563,
"num_tokens": 598527528.0,
"step": 1565
},
{
"epoch": 1.0716723549488054,
"grad_norm": 0.28353901932289044,
"learning_rate": 3.93696927790299e-05,
"loss": 0.4512,
"num_tokens": 600405627.0,
"step": 1570
},
{
"epoch": 1.075085324232082,
"grad_norm": 0.32788777138253855,
"learning_rate": 3.929769316851987e-05,
"loss": 0.4519,
"num_tokens": 602215607.0,
"step": 1575
},
{
"epoch": 1.0784982935153584,
"grad_norm": 0.3517481812619457,
"learning_rate": 3.92255265553535e-05,
"loss": 0.4673,
"num_tokens": 604135477.0,
"step": 1580
},
{
"epoch": 1.0819112627986347,
"grad_norm": 0.32122104215438485,
"learning_rate": 3.915319396108781e-05,
"loss": 0.4534,
"num_tokens": 606019690.0,
"step": 1585
},
{
"epoch": 1.0853242320819112,
"grad_norm": 0.29953390066728885,
"learning_rate": 3.9080696409629344e-05,
"loss": 0.4532,
"num_tokens": 607891544.0,
"step": 1590
},
{
"epoch": 1.0887372013651877,
"grad_norm": 0.312931983732918,
"learning_rate": 3.900803492721971e-05,
"loss": 0.4546,
"num_tokens": 609757887.0,
"step": 1595
},
{
"epoch": 1.0921501706484642,
"grad_norm": 0.3128391047692134,
"learning_rate": 3.8935210542421055e-05,
"loss": 0.4487,
"num_tokens": 611634451.0,
"step": 1600
},
{
"epoch": 1.0955631399317407,
"grad_norm": 0.32796952939352125,
"learning_rate": 3.886222428610149e-05,
"loss": 0.447,
"num_tokens": 613533180.0,
"step": 1605
},
{
"epoch": 1.098976109215017,
"grad_norm": 0.29026332056250814,
"learning_rate": 3.878907719142052e-05,
"loss": 0.4616,
"num_tokens": 615506453.0,
"step": 1610
},
{
"epoch": 1.1023890784982935,
"grad_norm": 0.2821146739678027,
"learning_rate": 3.871577029381439e-05,
"loss": 0.4414,
"num_tokens": 617472869.0,
"step": 1615
},
{
"epoch": 1.10580204778157,
"grad_norm": 0.33356289539712947,
"learning_rate": 3.864230463098142e-05,
"loss": 0.4556,
"num_tokens": 619275868.0,
"step": 1620
},
{
"epoch": 1.1092150170648465,
"grad_norm": 0.2989940473508491,
"learning_rate": 3.8568681242867375e-05,
"loss": 0.4526,
"num_tokens": 621299094.0,
"step": 1625
},
{
"epoch": 1.1126279863481228,
"grad_norm": 0.2855743012501097,
"learning_rate": 3.849490117165069e-05,
"loss": 0.4434,
"num_tokens": 623118795.0,
"step": 1630
},
{
"epoch": 1.1160409556313993,
"grad_norm": 0.29837585943422495,
"learning_rate": 3.842096546172772e-05,
"loss": 0.4651,
"num_tokens": 625108673.0,
"step": 1635
},
{
"epoch": 1.1194539249146758,
"grad_norm": 0.30667205927828434,
"learning_rate": 3.834687515969798e-05,
"loss": 0.4607,
"num_tokens": 627058447.0,
"step": 1640
},
{
"epoch": 1.1228668941979523,
"grad_norm": 0.28121218504839973,
"learning_rate": 3.827263131434934e-05,
"loss": 0.4542,
"num_tokens": 628986208.0,
"step": 1645
},
{
"epoch": 1.1262798634812285,
"grad_norm": 0.3003909890336984,
"learning_rate": 3.819823497664311e-05,
"loss": 0.4484,
"num_tokens": 630867075.0,
"step": 1650
},
{
"epoch": 1.129692832764505,
"grad_norm": 0.2672543488419772,
"learning_rate": 3.8123687199699266e-05,
"loss": 0.4582,
"num_tokens": 632732993.0,
"step": 1655
},
{
"epoch": 1.1331058020477816,
"grad_norm": 0.2828052831967036,
"learning_rate": 3.8048989038781435e-05,
"loss": 0.4285,
"num_tokens": 634492738.0,
"step": 1660
},
{
"epoch": 1.136518771331058,
"grad_norm": 0.2960342355211648,
"learning_rate": 3.797414155128205e-05,
"loss": 0.4462,
"num_tokens": 636253607.0,
"step": 1665
},
{
"epoch": 1.1399317406143346,
"grad_norm": 0.27495101737982175,
"learning_rate": 3.789914579670732e-05,
"loss": 0.4617,
"num_tokens": 638081192.0,
"step": 1670
},
{
"epoch": 1.1433447098976108,
"grad_norm": 0.30596461849028866,
"learning_rate": 3.7824002836662257e-05,
"loss": 0.4491,
"num_tokens": 640042390.0,
"step": 1675
},
{
"epoch": 1.1467576791808873,
"grad_norm": 0.3312421873485475,
"learning_rate": 3.774871373483565e-05,
"loss": 0.4575,
"num_tokens": 641973230.0,
"step": 1680
},
{
"epoch": 1.1501706484641638,
"grad_norm": 0.3104278063985918,
"learning_rate": 3.7673279556985e-05,
"loss": 0.4525,
"num_tokens": 643968400.0,
"step": 1685
},
{
"epoch": 1.1535836177474403,
"grad_norm": 0.28162301519749217,
"learning_rate": 3.7597701370921444e-05,
"loss": 0.4621,
"num_tokens": 645975251.0,
"step": 1690
},
{
"epoch": 1.1569965870307168,
"grad_norm": 0.2861836614250746,
"learning_rate": 3.7521980246494614e-05,
"loss": 0.4521,
"num_tokens": 647924628.0,
"step": 1695
},
{
"epoch": 1.1604095563139931,
"grad_norm": 0.28785239642753035,
"learning_rate": 3.744611725557753e-05,
"loss": 0.4593,
"num_tokens": 649791951.0,
"step": 1700
},
{
"epoch": 1.1638225255972696,
"grad_norm": 0.27626617912281226,
"learning_rate": 3.7370113472051406e-05,
"loss": 0.4584,
"num_tokens": 651657667.0,
"step": 1705
},
{
"epoch": 1.1672354948805461,
"grad_norm": 0.3386332300169892,
"learning_rate": 3.729396997179044e-05,
"loss": 0.4629,
"num_tokens": 653497824.0,
"step": 1710
},
{
"epoch": 1.1706484641638226,
"grad_norm": 0.27240329301276556,
"learning_rate": 3.72176878326466e-05,
"loss": 0.4576,
"num_tokens": 655459782.0,
"step": 1715
},
{
"epoch": 1.174061433447099,
"grad_norm": 0.32392454797351683,
"learning_rate": 3.714126813443435e-05,
"loss": 0.4644,
"num_tokens": 657369737.0,
"step": 1720
},
{
"epoch": 1.1774744027303754,
"grad_norm": 0.29853720401881,
"learning_rate": 3.706471195891541e-05,
"loss": 0.4545,
"num_tokens": 659396733.0,
"step": 1725
},
{
"epoch": 1.180887372013652,
"grad_norm": 0.2773946756633204,
"learning_rate": 3.698802038978337e-05,
"loss": 0.4418,
"num_tokens": 661291458.0,
"step": 1730
},
{
"epoch": 1.1843003412969284,
"grad_norm": 0.2740460845448324,
"learning_rate": 3.691119451264843e-05,
"loss": 0.4698,
"num_tokens": 663241121.0,
"step": 1735
},
{
"epoch": 1.1877133105802047,
"grad_norm": 0.276547173388332,
"learning_rate": 3.683423541502194e-05,
"loss": 0.4442,
"num_tokens": 665124115.0,
"step": 1740
},
{
"epoch": 1.1911262798634812,
"grad_norm": 0.2903229118786473,
"learning_rate": 3.675714418630111e-05,
"loss": 0.4662,
"num_tokens": 667040244.0,
"step": 1745
},
{
"epoch": 1.1945392491467577,
"grad_norm": 0.27064420446562926,
"learning_rate": 3.667992191775349e-05,
"loss": 0.452,
"num_tokens": 668907555.0,
"step": 1750
},
{
"epoch": 1.1979522184300342,
"grad_norm": 0.28593868782908866,
"learning_rate": 3.6602569702501604e-05,
"loss": 0.4531,
"num_tokens": 670890387.0,
"step": 1755
},
{
"epoch": 1.2013651877133107,
"grad_norm": 0.2772025249561031,
"learning_rate": 3.652508863550742e-05,
"loss": 0.4463,
"num_tokens": 672825173.0,
"step": 1760
},
{
"epoch": 1.204778156996587,
"grad_norm": 0.2904532887461672,
"learning_rate": 3.644747981355689e-05,
"loss": 0.4559,
"num_tokens": 674704748.0,
"step": 1765
},
{
"epoch": 1.2081911262798635,
"grad_norm": 0.33272679002620287,
"learning_rate": 3.636974433524439e-05,
"loss": 0.442,
"num_tokens": 676622447.0,
"step": 1770
},
{
"epoch": 1.21160409556314,
"grad_norm": 0.3001225018723121,
"learning_rate": 3.629188330095718e-05,
"loss": 0.463,
"num_tokens": 678387581.0,
"step": 1775
},
{
"epoch": 1.2150170648464165,
"grad_norm": 0.2918057402571508,
"learning_rate": 3.621389781285985e-05,
"loss": 0.4652,
"num_tokens": 680273514.0,
"step": 1780
},
{
"epoch": 1.2184300341296928,
"grad_norm": 0.2715063996631153,
"learning_rate": 3.61357889748787e-05,
"loss": 0.4577,
"num_tokens": 682257311.0,
"step": 1785
},
{
"epoch": 1.2218430034129693,
"grad_norm": 0.29943640199690547,
"learning_rate": 3.6057557892686086e-05,
"loss": 0.4476,
"num_tokens": 684116507.0,
"step": 1790
},
{
"epoch": 1.2252559726962458,
"grad_norm": 0.30213895944964064,
"learning_rate": 3.597920567368483e-05,
"loss": 0.4545,
"num_tokens": 685965611.0,
"step": 1795
},
{
"epoch": 1.2286689419795223,
"grad_norm": 0.2850709767569452,
"learning_rate": 3.590073342699248e-05,
"loss": 0.4508,
"num_tokens": 687988417.0,
"step": 1800
},
{
"epoch": 1.2320819112627985,
"grad_norm": 0.28211304977263574,
"learning_rate": 3.582214226342567e-05,
"loss": 0.4528,
"num_tokens": 689788689.0,
"step": 1805
},
{
"epoch": 1.235494880546075,
"grad_norm": 0.30451801514288107,
"learning_rate": 3.574343329548435e-05,
"loss": 0.4684,
"num_tokens": 691706998.0,
"step": 1810
},
{
"epoch": 1.2389078498293515,
"grad_norm": 0.28161312286707335,
"learning_rate": 3.566460763733606e-05,
"loss": 0.4539,
"num_tokens": 693650650.0,
"step": 1815
},
{
"epoch": 1.242320819112628,
"grad_norm": 0.2855398246780951,
"learning_rate": 3.5585666404800136e-05,
"loss": 0.4481,
"num_tokens": 695583845.0,
"step": 1820
},
{
"epoch": 1.2457337883959045,
"grad_norm": 0.2685294618091198,
"learning_rate": 3.5506610715331945e-05,
"loss": 0.4454,
"num_tokens": 697598560.0,
"step": 1825
},
{
"epoch": 1.2491467576791808,
"grad_norm": 0.30253366081342353,
"learning_rate": 3.5427441688007056e-05,
"loss": 0.4248,
"num_tokens": 699412264.0,
"step": 1830
},
{
"epoch": 1.2525597269624573,
"grad_norm": 0.26768192457715556,
"learning_rate": 3.534816044350539e-05,
"loss": 0.4483,
"num_tokens": 701334667.0,
"step": 1835
},
{
"epoch": 1.2559726962457338,
"grad_norm": 0.27314950895844187,
"learning_rate": 3.5268768104095365e-05,
"loss": 0.4314,
"num_tokens": 703234238.0,
"step": 1840
},
{
"epoch": 1.25938566552901,
"grad_norm": 0.2796629207217079,
"learning_rate": 3.5189265793618e-05,
"loss": 0.4484,
"num_tokens": 705158041.0,
"step": 1845
},
{
"epoch": 1.2627986348122868,
"grad_norm": 0.29048618003370963,
"learning_rate": 3.510965463747103e-05,
"loss": 0.4574,
"num_tokens": 707047974.0,
"step": 1850
},
{
"epoch": 1.266211604095563,
"grad_norm": 0.2824168313312133,
"learning_rate": 3.5029935762592935e-05,
"loss": 0.4531,
"num_tokens": 708901930.0,
"step": 1855
},
{
"epoch": 1.2696245733788396,
"grad_norm": 0.2950742991938956,
"learning_rate": 3.495011029744703e-05,
"loss": 0.4501,
"num_tokens": 710871136.0,
"step": 1860
},
{
"epoch": 1.273037542662116,
"grad_norm": 0.2637357244615578,
"learning_rate": 3.4870179372005466e-05,
"loss": 0.4409,
"num_tokens": 712727010.0,
"step": 1865
},
{
"epoch": 1.2764505119453924,
"grad_norm": 0.27499001477652085,
"learning_rate": 3.4790144117733234e-05,
"loss": 0.4573,
"num_tokens": 714757788.0,
"step": 1870
},
{
"epoch": 1.2798634812286689,
"grad_norm": 0.2786231655621766,
"learning_rate": 3.471000566757216e-05,
"loss": 0.4451,
"num_tokens": 716648944.0,
"step": 1875
},
{
"epoch": 1.2832764505119454,
"grad_norm": 0.28497779850769467,
"learning_rate": 3.462976515592487e-05,
"loss": 0.4458,
"num_tokens": 718518526.0,
"step": 1880
},
{
"epoch": 1.286689419795222,
"grad_norm": 0.27321208726567636,
"learning_rate": 3.454942371863873e-05,
"loss": 0.4513,
"num_tokens": 720498397.0,
"step": 1885
},
{
"epoch": 1.2901023890784984,
"grad_norm": 0.2576256316628026,
"learning_rate": 3.4468982492989746e-05,
"loss": 0.4442,
"num_tokens": 722394245.0,
"step": 1890
},
{
"epoch": 1.2935153583617747,
"grad_norm": 0.26515887609716976,
"learning_rate": 3.438844261766648e-05,
"loss": 0.4603,
"num_tokens": 724278291.0,
"step": 1895
},
{
"epoch": 1.2969283276450512,
"grad_norm": 0.3040373005342316,
"learning_rate": 3.4307805232753945e-05,
"loss": 0.4613,
"num_tokens": 726250325.0,
"step": 1900
},
{
"epoch": 1.3003412969283277,
"grad_norm": 0.31517718265213174,
"learning_rate": 3.4227071479717445e-05,
"loss": 0.4654,
"num_tokens": 728217806.0,
"step": 1905
},
{
"epoch": 1.3037542662116042,
"grad_norm": 0.2605190400337086,
"learning_rate": 3.414624250138645e-05,
"loss": 0.4509,
"num_tokens": 730069746.0,
"step": 1910
},
{
"epoch": 1.3071672354948807,
"grad_norm": 0.29826014855147326,
"learning_rate": 3.4065319441938355e-05,
"loss": 0.452,
"num_tokens": 731992759.0,
"step": 1915
},
{
"epoch": 1.310580204778157,
"grad_norm": 0.28635184919521356,
"learning_rate": 3.398430344688235e-05,
"loss": 0.4537,
"num_tokens": 733866439.0,
"step": 1920
},
{
"epoch": 1.3139931740614335,
"grad_norm": 0.29234660944718105,
"learning_rate": 3.390319566304319e-05,
"loss": 0.4589,
"num_tokens": 735858853.0,
"step": 1925
},
{
"epoch": 1.31740614334471,
"grad_norm": 0.2962339298082799,
"learning_rate": 3.3821997238544916e-05,
"loss": 0.4558,
"num_tokens": 737751290.0,
"step": 1930
},
{
"epoch": 1.3208191126279862,
"grad_norm": 0.3070070098211661,
"learning_rate": 3.374070932279465e-05,
"loss": 0.4543,
"num_tokens": 739700530.0,
"step": 1935
},
{
"epoch": 1.3242320819112627,
"grad_norm": 0.4850349267925565,
"learning_rate": 3.365933306646633e-05,
"loss": 0.4639,
"num_tokens": 741654310.0,
"step": 1940
},
{
"epoch": 1.3276450511945392,
"grad_norm": 0.27351892148927176,
"learning_rate": 3.357786962148437e-05,
"loss": 0.4313,
"num_tokens": 743544844.0,
"step": 1945
},
{
"epoch": 1.3310580204778157,
"grad_norm": 0.2694056885578761,
"learning_rate": 3.3496320141007406e-05,
"loss": 0.4571,
"num_tokens": 745423520.0,
"step": 1950
},
{
"epoch": 1.3344709897610922,
"grad_norm": 0.2728262843326819,
"learning_rate": 3.3414685779411945e-05,
"loss": 0.4556,
"num_tokens": 747401769.0,
"step": 1955
},
{
"epoch": 1.3378839590443685,
"grad_norm": 0.30994439761647724,
"learning_rate": 3.333296769227604e-05,
"loss": 0.4587,
"num_tokens": 749294404.0,
"step": 1960
},
{
"epoch": 1.341296928327645,
"grad_norm": 0.29334073529273996,
"learning_rate": 3.3251167036362915e-05,
"loss": 0.4471,
"num_tokens": 751251487.0,
"step": 1965
},
{
"epoch": 1.3447098976109215,
"grad_norm": 0.33180165168065484,
"learning_rate": 3.31692849696046e-05,
"loss": 0.4487,
"num_tokens": 753055155.0,
"step": 1970
},
{
"epoch": 1.348122866894198,
"grad_norm": 0.2993222027921042,
"learning_rate": 3.3087322651085554e-05,
"loss": 0.4672,
"num_tokens": 755096420.0,
"step": 1975
},
{
"epoch": 1.3515358361774745,
"grad_norm": 0.2801645429736596,
"learning_rate": 3.3005281241026215e-05,
"loss": 0.4415,
"num_tokens": 756939622.0,
"step": 1980
},
{
"epoch": 1.3549488054607508,
"grad_norm": 0.2745528396247091,
"learning_rate": 3.2923161900766614e-05,
"loss": 0.4399,
"num_tokens": 758793302.0,
"step": 1985
},
{
"epoch": 1.3583617747440273,
"grad_norm": 0.2612966118749133,
"learning_rate": 3.284096579274995e-05,
"loss": 0.4447,
"num_tokens": 760675723.0,
"step": 1990
},
{
"epoch": 1.3617747440273038,
"grad_norm": 0.270210002371754,
"learning_rate": 3.275869408050608e-05,
"loss": 0.4503,
"num_tokens": 762483433.0,
"step": 1995
},
{
"epoch": 1.36518771331058,
"grad_norm": 0.2680635125777971,
"learning_rate": 3.267634792863509e-05,
"loss": 0.4574,
"num_tokens": 764466963.0,
"step": 2000
},
{
"epoch": 1.3686006825938566,
"grad_norm": 0.286309419549809,
"learning_rate": 3.259392850279082e-05,
"loss": 0.4449,
"num_tokens": 766382996.0,
"step": 2005
},
{
"epoch": 1.372013651877133,
"grad_norm": 0.2740692290659991,
"learning_rate": 3.2511436969664284e-05,
"loss": 0.4541,
"num_tokens": 768293330.0,
"step": 2010
},
{
"epoch": 1.3754266211604096,
"grad_norm": 0.2690578067704949,
"learning_rate": 3.2428874496967274e-05,
"loss": 0.455,
"num_tokens": 770247322.0,
"step": 2015
},
{
"epoch": 1.378839590443686,
"grad_norm": 0.2563408299269977,
"learning_rate": 3.234624225341575e-05,
"loss": 0.4459,
"num_tokens": 772205337.0,
"step": 2020
},
{
"epoch": 1.3822525597269624,
"grad_norm": 0.25655513271782726,
"learning_rate": 3.22635414087133e-05,
"loss": 0.4658,
"num_tokens": 774217733.0,
"step": 2025
},
{
"epoch": 1.3856655290102389,
"grad_norm": 0.29060509201295237,
"learning_rate": 3.218077313353462e-05,
"loss": 0.4488,
"num_tokens": 776079090.0,
"step": 2030
},
{
"epoch": 1.3890784982935154,
"grad_norm": 0.2720186770597433,
"learning_rate": 3.20979385995089e-05,
"loss": 0.442,
"num_tokens": 777982393.0,
"step": 2035
},
{
"epoch": 1.3924914675767919,
"grad_norm": 0.2861264692108029,
"learning_rate": 3.201503897920327e-05,
"loss": 0.4496,
"num_tokens": 779989886.0,
"step": 2040
},
{
"epoch": 1.3959044368600684,
"grad_norm": 0.29707981121890303,
"learning_rate": 3.193207544610621e-05,
"loss": 0.4432,
"num_tokens": 781895136.0,
"step": 2045
},
{
"epoch": 1.3993174061433447,
"grad_norm": 0.27309938834165404,
"learning_rate": 3.184904917461088e-05,
"loss": 0.4563,
"num_tokens": 783942187.0,
"step": 2050
},
{
"epoch": 1.4027303754266212,
"grad_norm": 0.2893396465962206,
"learning_rate": 3.1765961339998565e-05,
"loss": 0.4468,
"num_tokens": 785853334.0,
"step": 2055
},
{
"epoch": 1.4061433447098977,
"grad_norm": 0.2896324408589564,
"learning_rate": 3.1682813118422e-05,
"loss": 0.4484,
"num_tokens": 787909060.0,
"step": 2060
},
{
"epoch": 1.409556313993174,
"grad_norm": 0.26774630794325843,
"learning_rate": 3.159960568688872e-05,
"loss": 0.4507,
"num_tokens": 789874438.0,
"step": 2065
},
{
"epoch": 1.4129692832764504,
"grad_norm": 0.3167509710093353,
"learning_rate": 3.151634022324444e-05,
"loss": 0.4599,
"num_tokens": 791709412.0,
"step": 2070
},
{
"epoch": 1.416382252559727,
"grad_norm": 0.27632081551359566,
"learning_rate": 3.1433017906156316e-05,
"loss": 0.4429,
"num_tokens": 793521730.0,
"step": 2075
},
{
"epoch": 1.4197952218430034,
"grad_norm": 0.2766678534542702,
"learning_rate": 3.134963991509631e-05,
"loss": 0.4613,
"num_tokens": 795344918.0,
"step": 2080
},
{
"epoch": 1.42320819112628,
"grad_norm": 0.28361755145893136,
"learning_rate": 3.126620743032447e-05,
"loss": 0.4467,
"num_tokens": 797189457.0,
"step": 2085
},
{
"epoch": 1.4266211604095562,
"grad_norm": 0.27697812269141797,
"learning_rate": 3.1182721632872254e-05,
"loss": 0.436,
"num_tokens": 799164609.0,
"step": 2090
},
{
"epoch": 1.4300341296928327,
"grad_norm": 0.2754786232171963,
"learning_rate": 3.109918370452575e-05,
"loss": 0.445,
"num_tokens": 801159658.0,
"step": 2095
},
{
"epoch": 1.4334470989761092,
"grad_norm": 0.2604825606354588,
"learning_rate": 3.101559482780903e-05,
"loss": 0.4408,
"num_tokens": 803102119.0,
"step": 2100
},
{
"epoch": 1.4368600682593857,
"grad_norm": 0.27648906929611866,
"learning_rate": 3.093195618596735e-05,
"loss": 0.4466,
"num_tokens": 805178655.0,
"step": 2105
},
{
"epoch": 1.4402730375426622,
"grad_norm": 0.265092750892,
"learning_rate": 3.084826896295041e-05,
"loss": 0.4473,
"num_tokens": 807040305.0,
"step": 2110
},
{
"epoch": 1.4436860068259385,
"grad_norm": 0.2722185832102206,
"learning_rate": 3.07645343433956e-05,
"loss": 0.4669,
"num_tokens": 808944713.0,
"step": 2115
},
{
"epoch": 1.447098976109215,
"grad_norm": 0.28414946820217796,
"learning_rate": 3.068075351261126e-05,
"loss": 0.4374,
"num_tokens": 810867962.0,
"step": 2120
},
{
"epoch": 1.4505119453924915,
"grad_norm": 0.2650239217756763,
"learning_rate": 3.0596927656559834e-05,
"loss": 0.4422,
"num_tokens": 812796180.0,
"step": 2125
},
{
"epoch": 1.4539249146757678,
"grad_norm": 0.28516975016618823,
"learning_rate": 3.0513057961841175e-05,
"loss": 0.4569,
"num_tokens": 814693135.0,
"step": 2130
},
{
"epoch": 1.4573378839590443,
"grad_norm": 0.2531012712568243,
"learning_rate": 3.042914561567563e-05,
"loss": 0.4414,
"num_tokens": 816615984.0,
"step": 2135
},
{
"epoch": 1.4607508532423208,
"grad_norm": 0.3105010778606175,
"learning_rate": 3.0345191805887367e-05,
"loss": 0.4502,
"num_tokens": 818449910.0,
"step": 2140
},
{
"epoch": 1.4641638225255973,
"grad_norm": 0.3002226245659818,
"learning_rate": 3.0261197720887457e-05,
"loss": 0.4257,
"num_tokens": 820318972.0,
"step": 2145
},
{
"epoch": 1.4675767918088738,
"grad_norm": 0.2890933430333858,
"learning_rate": 3.017716454965708e-05,
"loss": 0.4578,
"num_tokens": 822330354.0,
"step": 2150
},
{
"epoch": 1.47098976109215,
"grad_norm": 0.2740990659865864,
"learning_rate": 3.0093093481730723e-05,
"loss": 0.4485,
"num_tokens": 824181338.0,
"step": 2155
},
{
"epoch": 1.4744027303754266,
"grad_norm": 0.2649593030398301,
"learning_rate": 3.0008985707179326e-05,
"loss": 0.4417,
"num_tokens": 826004744.0,
"step": 2160
},
{
"epoch": 1.477815699658703,
"grad_norm": 0.28281137311210414,
"learning_rate": 2.9924842416593406e-05,
"loss": 0.4515,
"num_tokens": 827894923.0,
"step": 2165
},
{
"epoch": 1.4812286689419796,
"grad_norm": 0.2759663786994046,
"learning_rate": 2.9840664801066247e-05,
"loss": 0.4499,
"num_tokens": 829837206.0,
"step": 2170
},
{
"epoch": 1.484641638225256,
"grad_norm": 0.2580667059210538,
"learning_rate": 2.9756454052177012e-05,
"loss": 0.4573,
"num_tokens": 831686870.0,
"step": 2175
},
{
"epoch": 1.4880546075085324,
"grad_norm": 0.2575391507719894,
"learning_rate": 2.96722113619739e-05,
"loss": 0.4392,
"num_tokens": 833569477.0,
"step": 2180
},
{
"epoch": 1.4914675767918089,
"grad_norm": 0.27408399127884947,
"learning_rate": 2.9587937922957233e-05,
"loss": 0.4452,
"num_tokens": 835500719.0,
"step": 2185
},
{
"epoch": 1.4948805460750854,
"grad_norm": 0.2615339855136178,
"learning_rate": 2.950363492806262e-05,
"loss": 0.4708,
"num_tokens": 837484919.0,
"step": 2190
},
{
"epoch": 1.4982935153583616,
"grad_norm": 0.25621570699947277,
"learning_rate": 2.941930357064402e-05,
"loss": 0.4501,
"num_tokens": 839486730.0,
"step": 2195
},
{
"epoch": 1.5017064846416384,
"grad_norm": 0.2756773591629605,
"learning_rate": 2.9334945044456923e-05,
"loss": 0.452,
"num_tokens": 841489715.0,
"step": 2200
},
{
"epoch": 1.5051194539249146,
"grad_norm": 0.27466282153404803,
"learning_rate": 2.925056054364137e-05,
"loss": 0.4485,
"num_tokens": 843365250.0,
"step": 2205
},
{
"epoch": 1.5085324232081911,
"grad_norm": 0.2804813346164974,
"learning_rate": 2.9166151262705105e-05,
"loss": 0.4408,
"num_tokens": 845275912.0,
"step": 2210
},
{
"epoch": 1.5119453924914676,
"grad_norm": 0.2595259256898971,
"learning_rate": 2.9081718396506635e-05,
"loss": 0.4429,
"num_tokens": 847232214.0,
"step": 2215
},
{
"epoch": 1.515358361774744,
"grad_norm": 0.2663724850178613,
"learning_rate": 2.8997263140238346e-05,
"loss": 0.4495,
"num_tokens": 848987167.0,
"step": 2220
},
{
"epoch": 1.5187713310580204,
"grad_norm": 0.2589624660607149,
"learning_rate": 2.8912786689409556e-05,
"loss": 0.4464,
"num_tokens": 850922570.0,
"step": 2225
},
{
"epoch": 1.522184300341297,
"grad_norm": 0.2656741346606789,
"learning_rate": 2.88282902398296e-05,
"loss": 0.4517,
"num_tokens": 852822624.0,
"step": 2230
},
{
"epoch": 1.5255972696245734,
"grad_norm": 0.293689264474386,
"learning_rate": 2.8743774987590916e-05,
"loss": 0.4529,
"num_tokens": 854740425.0,
"step": 2235
},
{
"epoch": 1.52901023890785,
"grad_norm": 0.27142432002975353,
"learning_rate": 2.8659242129052093e-05,
"loss": 0.4414,
"num_tokens": 856603546.0,
"step": 2240
},
{
"epoch": 1.5324232081911262,
"grad_norm": 0.267870113436132,
"learning_rate": 2.8574692860820974e-05,
"loss": 0.4596,
"num_tokens": 858627881.0,
"step": 2245
},
{
"epoch": 1.5358361774744027,
"grad_norm": 0.26103422109658747,
"learning_rate": 2.849012837973764e-05,
"loss": 0.4475,
"num_tokens": 860521721.0,
"step": 2250
},
{
"epoch": 1.5392491467576792,
"grad_norm": 0.2601926950553868,
"learning_rate": 2.840554988285755e-05,
"loss": 0.4484,
"num_tokens": 862459060.0,
"step": 2255
},
{
"epoch": 1.5426621160409555,
"grad_norm": 0.27246673405743077,
"learning_rate": 2.8320958567434585e-05,
"loss": 0.447,
"num_tokens": 864416686.0,
"step": 2260
},
{
"epoch": 1.5460750853242322,
"grad_norm": 0.2881598506293941,
"learning_rate": 2.8236355630904037e-05,
"loss": 0.45,
"num_tokens": 866372213.0,
"step": 2265
},
{
"epoch": 1.5494880546075085,
"grad_norm": 0.28066610110205614,
"learning_rate": 2.8151742270865722e-05,
"loss": 0.4593,
"num_tokens": 868286031.0,
"step": 2270
},
{
"epoch": 1.552901023890785,
"grad_norm": 0.2862420813113712,
"learning_rate": 2.8067119685067e-05,
"loss": 0.4614,
"num_tokens": 870215475.0,
"step": 2275
},
{
"epoch": 1.5563139931740615,
"grad_norm": 0.2554540585046952,
"learning_rate": 2.798248907138584e-05,
"loss": 0.4484,
"num_tokens": 872204026.0,
"step": 2280
},
{
"epoch": 1.5597269624573378,
"grad_norm": 0.26746236600835344,
"learning_rate": 2.7897851627813836e-05,
"loss": 0.4508,
"num_tokens": 874200553.0,
"step": 2285
},
{
"epoch": 1.5631399317406145,
"grad_norm": 0.2591829699984779,
"learning_rate": 2.7813208552439257e-05,
"loss": 0.4486,
"num_tokens": 876046353.0,
"step": 2290
},
{
"epoch": 1.5665529010238908,
"grad_norm": 0.2792969836284745,
"learning_rate": 2.7728561043430118e-05,
"loss": 0.456,
"num_tokens": 877975265.0,
"step": 2295
},
{
"epoch": 1.5699658703071673,
"grad_norm": 0.2861082160674022,
"learning_rate": 2.7643910299017168e-05,
"loss": 0.4399,
"num_tokens": 879857040.0,
"step": 2300
},
{
"epoch": 1.5733788395904438,
"grad_norm": 0.26577617584751645,
"learning_rate": 2.7559257517476972e-05,
"loss": 0.4584,
"num_tokens": 881775297.0,
"step": 2305
},
{
"epoch": 1.57679180887372,
"grad_norm": 0.304890853130377,
"learning_rate": 2.747460389711492e-05,
"loss": 0.46,
"num_tokens": 883694768.0,
"step": 2310
},
{
"epoch": 1.5802047781569966,
"grad_norm": 0.27264196953878284,
"learning_rate": 2.7389950636248284e-05,
"loss": 0.4627,
"num_tokens": 885606244.0,
"step": 2315
},
{
"epoch": 1.583617747440273,
"grad_norm": 0.26079561956352626,
"learning_rate": 2.7305298933189255e-05,
"loss": 0.4583,
"num_tokens": 887457723.0,
"step": 2320
},
{
"epoch": 1.5870307167235493,
"grad_norm": 0.24807936020422583,
"learning_rate": 2.7220649986227964e-05,
"loss": 0.4417,
"num_tokens": 889409099.0,
"step": 2325
},
{
"epoch": 1.590443686006826,
"grad_norm": 0.2651021999880332,
"learning_rate": 2.7136004993615505e-05,
"loss": 0.4465,
"num_tokens": 891339953.0,
"step": 2330
},
{
"epoch": 1.5938566552901023,
"grad_norm": 0.26971209715318745,
"learning_rate": 2.7051365153547027e-05,
"loss": 0.4424,
"num_tokens": 893325476.0,
"step": 2335
},
{
"epoch": 1.5972696245733788,
"grad_norm": 0.28339562896194187,
"learning_rate": 2.6966731664144733e-05,
"loss": 0.4451,
"num_tokens": 895188323.0,
"step": 2340
},
{
"epoch": 1.6006825938566553,
"grad_norm": 0.27171867664083665,
"learning_rate": 2.688210572344095e-05,
"loss": 0.4437,
"num_tokens": 897121692.0,
"step": 2345
},
{
"epoch": 1.6040955631399316,
"grad_norm": 0.2813986180164497,
"learning_rate": 2.6797488529361093e-05,
"loss": 0.4447,
"num_tokens": 899043370.0,
"step": 2350
},
{
"epoch": 1.6075085324232083,
"grad_norm": 0.271610888053345,
"learning_rate": 2.6712881279706814e-05,
"loss": 0.4523,
"num_tokens": 900952717.0,
"step": 2355
},
{
"epoch": 1.6109215017064846,
"grad_norm": 0.28993349676639346,
"learning_rate": 2.662828517213899e-05,
"loss": 0.4467,
"num_tokens": 902743085.0,
"step": 2360
},
{
"epoch": 1.6143344709897611,
"grad_norm": 0.27680179119800896,
"learning_rate": 2.6543701404160748e-05,
"loss": 0.4453,
"num_tokens": 904703890.0,
"step": 2365
},
{
"epoch": 1.6177474402730376,
"grad_norm": 0.28092901036185763,
"learning_rate": 2.645913117310057e-05,
"loss": 0.4538,
"num_tokens": 906627984.0,
"step": 2370
},
{
"epoch": 1.621160409556314,
"grad_norm": 0.26517844884302066,
"learning_rate": 2.637457567609531e-05,
"loss": 0.4469,
"num_tokens": 908570138.0,
"step": 2375
},
{
"epoch": 1.6245733788395904,
"grad_norm": 0.26409207116238576,
"learning_rate": 2.6290036110073242e-05,
"loss": 0.4469,
"num_tokens": 910543342.0,
"step": 2380
},
{
"epoch": 1.627986348122867,
"grad_norm": 0.2583484326247722,
"learning_rate": 2.6205513671737135e-05,
"loss": 0.4468,
"num_tokens": 912533770.0,
"step": 2385
},
{
"epoch": 1.6313993174061432,
"grad_norm": 0.24446056976219688,
"learning_rate": 2.612100955754731e-05,
"loss": 0.4469,
"num_tokens": 914532283.0,
"step": 2390
},
{
"epoch": 1.63481228668942,
"grad_norm": 0.2693898345306616,
"learning_rate": 2.6036524963704705e-05,
"loss": 0.4428,
"num_tokens": 916447270.0,
"step": 2395
},
{
"epoch": 1.6382252559726962,
"grad_norm": 0.2603116651063788,
"learning_rate": 2.5952061086133915e-05,
"loss": 0.4372,
"num_tokens": 918323773.0,
"step": 2400
},
{
"epoch": 1.6416382252559727,
"grad_norm": 0.28162088278006275,
"learning_rate": 2.58676191204663e-05,
"loss": 0.4453,
"num_tokens": 920259747.0,
"step": 2405
},
{
"epoch": 1.6450511945392492,
"grad_norm": 0.2710337541109652,
"learning_rate": 2.578320026202306e-05,
"loss": 0.4364,
"num_tokens": 922162828.0,
"step": 2410
},
{
"epoch": 1.6484641638225255,
"grad_norm": 0.27020261434441295,
"learning_rate": 2.5698805705798273e-05,
"loss": 0.4507,
"num_tokens": 924056073.0,
"step": 2415
},
{
"epoch": 1.6518771331058022,
"grad_norm": 0.2634733100798534,
"learning_rate": 2.5614436646442015e-05,
"loss": 0.4519,
"num_tokens": 925974493.0,
"step": 2420
},
{
"epoch": 1.6552901023890785,
"grad_norm": 0.255269239584084,
"learning_rate": 2.553009427824345e-05,
"loss": 0.4385,
"num_tokens": 927926823.0,
"step": 2425
},
{
"epoch": 1.658703071672355,
"grad_norm": 0.2853510934721726,
"learning_rate": 2.54457797951139e-05,
"loss": 0.436,
"num_tokens": 929863736.0,
"step": 2430
},
{
"epoch": 1.6621160409556315,
"grad_norm": 0.27590699133483987,
"learning_rate": 2.5361494390569973e-05,
"loss": 0.4383,
"num_tokens": 931699965.0,
"step": 2435
},
{
"epoch": 1.6655290102389078,
"grad_norm": 0.2672913591423428,
"learning_rate": 2.527723925771664e-05,
"loss": 0.4315,
"num_tokens": 933525777.0,
"step": 2440
},
{
"epoch": 1.6689419795221843,
"grad_norm": 0.26586385140122987,
"learning_rate": 2.5193015589230374e-05,
"loss": 0.4492,
"num_tokens": 935357508.0,
"step": 2445
},
{
"epoch": 1.6723549488054608,
"grad_norm": 0.26155221487403196,
"learning_rate": 2.5108824577342243e-05,
"loss": 0.4324,
"num_tokens": 937184859.0,
"step": 2450
},
{
"epoch": 1.675767918088737,
"grad_norm": 0.2604660508130615,
"learning_rate": 2.502466741382105e-05,
"loss": 0.4467,
"num_tokens": 939164719.0,
"step": 2455
},
{
"epoch": 1.6791808873720138,
"grad_norm": 0.2672255765255275,
"learning_rate": 2.494054528995644e-05,
"loss": 0.4403,
"num_tokens": 941017477.0,
"step": 2460
},
{
"epoch": 1.68259385665529,
"grad_norm": 0.2584274462980594,
"learning_rate": 2.4856459396542092e-05,
"loss": 0.4421,
"num_tokens": 942907050.0,
"step": 2465
},
{
"epoch": 1.6860068259385665,
"grad_norm": 0.24720970294975506,
"learning_rate": 2.477241092385877e-05,
"loss": 0.4418,
"num_tokens": 944808236.0,
"step": 2470
},
{
"epoch": 1.689419795221843,
"grad_norm": 0.26472526826866016,
"learning_rate": 2.4688401061657563e-05,
"loss": 0.4381,
"num_tokens": 946840703.0,
"step": 2475
},
{
"epoch": 1.6928327645051193,
"grad_norm": 5.0030305349934885,
"learning_rate": 2.4604430999143002e-05,
"loss": 0.4506,
"num_tokens": 948769790.0,
"step": 2480
},
{
"epoch": 1.696245733788396,
"grad_norm": 0.2680033634853091,
"learning_rate": 2.452050192495624e-05,
"loss": 0.4335,
"num_tokens": 950570738.0,
"step": 2485
},
{
"epoch": 1.6996587030716723,
"grad_norm": 0.2839588999603886,
"learning_rate": 2.4436615027158194e-05,
"loss": 0.4463,
"num_tokens": 952490894.0,
"step": 2490
},
{
"epoch": 1.7030716723549488,
"grad_norm": 0.26446203164353277,
"learning_rate": 2.4352771493212763e-05,
"loss": 0.4532,
"num_tokens": 954445294.0,
"step": 2495
},
{
"epoch": 1.7064846416382253,
"grad_norm": 0.2706578896879531,
"learning_rate": 2.4268972509970027e-05,
"loss": 0.438,
"num_tokens": 956373551.0,
"step": 2500
},
{
"epoch": 1.7098976109215016,
"grad_norm": 0.2510763849552845,
"learning_rate": 2.4185219263649402e-05,
"loss": 0.4369,
"num_tokens": 958195010.0,
"step": 2505
},
{
"epoch": 1.713310580204778,
"grad_norm": 0.2833747779105397,
"learning_rate": 2.4101512939822875e-05,
"loss": 0.4416,
"num_tokens": 960041529.0,
"step": 2510
},
{
"epoch": 1.7167235494880546,
"grad_norm": 0.26481804710093265,
"learning_rate": 2.401785472339823e-05,
"loss": 0.4389,
"num_tokens": 961954801.0,
"step": 2515
},
{
"epoch": 1.7201365187713311,
"grad_norm": 0.2647677324608605,
"learning_rate": 2.393424579860228e-05,
"loss": 0.4381,
"num_tokens": 963894044.0,
"step": 2520
},
{
"epoch": 1.7235494880546076,
"grad_norm": 0.26943728181282595,
"learning_rate": 2.385068734896404e-05,
"loss": 0.4419,
"num_tokens": 965758747.0,
"step": 2525
},
{
"epoch": 1.726962457337884,
"grad_norm": 0.2594300124417837,
"learning_rate": 2.3767180557298074e-05,
"loss": 0.4383,
"num_tokens": 967735319.0,
"step": 2530
},
{
"epoch": 1.7303754266211604,
"grad_norm": 0.2515723573014202,
"learning_rate": 2.368372660568768e-05,
"loss": 0.421,
"num_tokens": 969551079.0,
"step": 2535
},
{
"epoch": 1.733788395904437,
"grad_norm": 0.2593132229076659,
"learning_rate": 2.3600326675468158e-05,
"loss": 0.4351,
"num_tokens": 971469515.0,
"step": 2540
},
{
"epoch": 1.7372013651877132,
"grad_norm": 0.24852875972861324,
"learning_rate": 2.351698194721013e-05,
"loss": 0.4508,
"num_tokens": 973400224.0,
"step": 2545
},
{
"epoch": 1.74061433447099,
"grad_norm": 0.2894838158575043,
"learning_rate": 2.343369360070281e-05,
"loss": 0.4632,
"num_tokens": 975250566.0,
"step": 2550
},
{
"epoch": 1.7440273037542662,
"grad_norm": 0.25922772428222707,
"learning_rate": 2.335046281493728e-05,
"loss": 0.4327,
"num_tokens": 977140229.0,
"step": 2555
},
{
"epoch": 1.7474402730375427,
"grad_norm": 0.26168920953623176,
"learning_rate": 2.326729076808981e-05,
"loss": 0.4477,
"num_tokens": 979159476.0,
"step": 2560
},
{
"epoch": 1.7508532423208192,
"grad_norm": 0.26615040708871945,
"learning_rate": 2.3184178637505227e-05,
"loss": 0.4375,
"num_tokens": 981111783.0,
"step": 2565
},
{
"epoch": 1.7542662116040955,
"grad_norm": 0.25291556202610044,
"learning_rate": 2.310112759968018e-05,
"loss": 0.4464,
"num_tokens": 983196600.0,
"step": 2570
},
{
"epoch": 1.757679180887372,
"grad_norm": 0.2695038308858752,
"learning_rate": 2.3018138830246516e-05,
"loss": 0.4482,
"num_tokens": 985104722.0,
"step": 2575
},
{
"epoch": 1.7610921501706485,
"grad_norm": 0.26848491423115955,
"learning_rate": 2.2935213503954662e-05,
"loss": 0.4522,
"num_tokens": 986930750.0,
"step": 2580
},
{
"epoch": 1.764505119453925,
"grad_norm": 0.25839245304492914,
"learning_rate": 2.285235279465696e-05,
"loss": 0.4427,
"num_tokens": 988686155.0,
"step": 2585
},
{
"epoch": 1.7679180887372015,
"grad_norm": 0.2399595630712539,
"learning_rate": 2.2769557875291063e-05,
"loss": 0.4424,
"num_tokens": 990711608.0,
"step": 2590
},
{
"epoch": 1.7713310580204777,
"grad_norm": 0.2544879611742542,
"learning_rate": 2.2686829917863333e-05,
"loss": 0.4441,
"num_tokens": 992694774.0,
"step": 2595
},
{
"epoch": 1.7747440273037542,
"grad_norm": 0.2775597763576913,
"learning_rate": 2.2604170093432255e-05,
"loss": 0.437,
"num_tokens": 994568102.0,
"step": 2600
},
{
"epoch": 1.7781569965870307,
"grad_norm": 0.26757996732416056,
"learning_rate": 2.252157957209185e-05,
"loss": 0.4339,
"num_tokens": 996378479.0,
"step": 2605
},
{
"epoch": 1.781569965870307,
"grad_norm": 0.2636448817949683,
"learning_rate": 2.2439059522955107e-05,
"loss": 0.4325,
"num_tokens": 998156090.0,
"step": 2610
},
{
"epoch": 1.7849829351535837,
"grad_norm": 0.275460716899482,
"learning_rate": 2.2356611114137465e-05,
"loss": 0.4354,
"num_tokens": 999976607.0,
"step": 2615
},
{
"epoch": 1.78839590443686,
"grad_norm": 0.2625063656499445,
"learning_rate": 2.2274235512740248e-05,
"loss": 0.4327,
"num_tokens": 1001825873.0,
"step": 2620
},
{
"epoch": 1.7918088737201365,
"grad_norm": 0.28402951172695334,
"learning_rate": 2.2191933884834148e-05,
"loss": 0.4597,
"num_tokens": 1003678305.0,
"step": 2625
},
{
"epoch": 1.795221843003413,
"grad_norm": 0.2602751829566075,
"learning_rate": 2.2109707395442714e-05,
"loss": 0.4368,
"num_tokens": 1005516007.0,
"step": 2630
},
{
"epoch": 1.7986348122866893,
"grad_norm": 0.26647070338777407,
"learning_rate": 2.2027557208525883e-05,
"loss": 0.4428,
"num_tokens": 1007344786.0,
"step": 2635
},
{
"epoch": 1.802047781569966,
"grad_norm": 0.26794683193288027,
"learning_rate": 2.194548448696349e-05,
"loss": 0.4362,
"num_tokens": 1009075212.0,
"step": 2640
},
{
"epoch": 1.8054607508532423,
"grad_norm": 0.27816074997567525,
"learning_rate": 2.1863490392538816e-05,
"loss": 0.4438,
"num_tokens": 1010808416.0,
"step": 2645
},
{
"epoch": 1.8088737201365188,
"grad_norm": 0.27463994949053044,
"learning_rate": 2.1781576085922083e-05,
"loss": 0.4512,
"num_tokens": 1012683694.0,
"step": 2650
},
{
"epoch": 1.8122866894197953,
"grad_norm": 0.2707994505036392,
"learning_rate": 2.1699742726654132e-05,
"loss": 0.4376,
"num_tokens": 1014601234.0,
"step": 2655
},
{
"epoch": 1.8156996587030716,
"grad_norm": 0.2627965386697017,
"learning_rate": 2.161799147312994e-05,
"loss": 0.439,
"num_tokens": 1016448826.0,
"step": 2660
},
{
"epoch": 1.819112627986348,
"grad_norm": 0.2522129136280826,
"learning_rate": 2.15363234825822e-05,
"loss": 0.4358,
"num_tokens": 1018365951.0,
"step": 2665
},
{
"epoch": 1.8225255972696246,
"grad_norm": 0.252384471077172,
"learning_rate": 2.1454739911065002e-05,
"loss": 0.441,
"num_tokens": 1020273196.0,
"step": 2670
},
{
"epoch": 1.8259385665529009,
"grad_norm": 0.24823623801093808,
"learning_rate": 2.137324191343743e-05,
"loss": 0.4287,
"num_tokens": 1022225624.0,
"step": 2675
},
{
"epoch": 1.8293515358361776,
"grad_norm": 0.27365445309933684,
"learning_rate": 2.129183064334725e-05,
"loss": 0.4374,
"num_tokens": 1024124126.0,
"step": 2680
},
{
"epoch": 1.8327645051194539,
"grad_norm": 0.265169099623179,
"learning_rate": 2.1210507253214495e-05,
"loss": 0.4512,
"num_tokens": 1026116910.0,
"step": 2685
},
{
"epoch": 1.8361774744027304,
"grad_norm": 0.2687843686775235,
"learning_rate": 2.1129272894215262e-05,
"loss": 0.4411,
"num_tokens": 1027954118.0,
"step": 2690
},
{
"epoch": 1.8395904436860069,
"grad_norm": 0.2609011147767087,
"learning_rate": 2.1048128716265357e-05,
"loss": 0.4379,
"num_tokens": 1029884450.0,
"step": 2695
},
{
"epoch": 1.8430034129692832,
"grad_norm": 0.27599227786573743,
"learning_rate": 2.0967075868003995e-05,
"loss": 0.4527,
"num_tokens": 1031794352.0,
"step": 2700
},
{
"epoch": 1.8464163822525599,
"grad_norm": 0.2343269626270317,
"learning_rate": 2.0886115496777598e-05,
"loss": 0.4405,
"num_tokens": 1033740753.0,
"step": 2705
},
{
"epoch": 1.8498293515358362,
"grad_norm": 0.26453037363335735,
"learning_rate": 2.0805248748623528e-05,
"loss": 0.4443,
"num_tokens": 1035620201.0,
"step": 2710
},
{
"epoch": 1.8532423208191127,
"grad_norm": 0.2536404978918816,
"learning_rate": 2.0724476768253854e-05,
"loss": 0.453,
"num_tokens": 1037586913.0,
"step": 2715
},
{
"epoch": 1.8566552901023892,
"grad_norm": 0.2544903502546627,
"learning_rate": 2.064380069903914e-05,
"loss": 0.4447,
"num_tokens": 1039498293.0,
"step": 2720
},
{
"epoch": 1.8600682593856654,
"grad_norm": 0.2702028649002257,
"learning_rate": 2.0563221682992305e-05,
"loss": 0.4411,
"num_tokens": 1041470501.0,
"step": 2725
},
{
"epoch": 1.863481228668942,
"grad_norm": 0.26562312719614667,
"learning_rate": 2.048274086075242e-05,
"loss": 0.4429,
"num_tokens": 1043288032.0,
"step": 2730
},
{
"epoch": 1.8668941979522184,
"grad_norm": 0.24219783619847288,
"learning_rate": 2.0402359371568554e-05,
"loss": 0.4485,
"num_tokens": 1045205381.0,
"step": 2735
},
{
"epoch": 1.8703071672354947,
"grad_norm": 0.2506743431312461,
"learning_rate": 2.0322078353283676e-05,
"loss": 0.4404,
"num_tokens": 1047143820.0,
"step": 2740
},
{
"epoch": 1.8737201365187715,
"grad_norm": 0.2586737586940106,
"learning_rate": 2.0241898942318538e-05,
"loss": 0.4387,
"num_tokens": 1049056564.0,
"step": 2745
},
{
"epoch": 1.8771331058020477,
"grad_norm": 0.26391954870509793,
"learning_rate": 2.016182227365559e-05,
"loss": 0.4453,
"num_tokens": 1051032103.0,
"step": 2750
},
{
"epoch": 1.8805460750853242,
"grad_norm": 0.2564856135718643,
"learning_rate": 2.0081849480822896e-05,
"loss": 0.4338,
"num_tokens": 1052811471.0,
"step": 2755
},
{
"epoch": 1.8839590443686007,
"grad_norm": 0.2470202752335773,
"learning_rate": 2.000198169587811e-05,
"loss": 0.4431,
"num_tokens": 1054738378.0,
"step": 2760
},
{
"epoch": 1.887372013651877,
"grad_norm": 0.2542101749179161,
"learning_rate": 1.9922220049392438e-05,
"loss": 0.4319,
"num_tokens": 1056676264.0,
"step": 2765
},
{
"epoch": 1.8907849829351537,
"grad_norm": 0.2435430685322008,
"learning_rate": 1.9842565670434648e-05,
"loss": 0.4226,
"num_tokens": 1058467325.0,
"step": 2770
},
{
"epoch": 1.89419795221843,
"grad_norm": 0.25562350059297084,
"learning_rate": 1.9763019686555073e-05,
"loss": 0.4376,
"num_tokens": 1060452339.0,
"step": 2775
},
{
"epoch": 1.8976109215017065,
"grad_norm": 0.29345078376520056,
"learning_rate": 1.9683583223769658e-05,
"loss": 0.428,
"num_tokens": 1062292098.0,
"step": 2780
},
{
"epoch": 1.901023890784983,
"grad_norm": 0.2584621207973962,
"learning_rate": 1.9604257406544024e-05,
"loss": 0.4511,
"num_tokens": 1064263837.0,
"step": 2785
},
{
"epoch": 1.9044368600682593,
"grad_norm": 0.27128805508680764,
"learning_rate": 1.9525043357777516e-05,
"loss": 0.4462,
"num_tokens": 1066251533.0,
"step": 2790
},
{
"epoch": 1.9078498293515358,
"grad_norm": 0.2727233705082907,
"learning_rate": 1.9445942198787382e-05,
"loss": 0.4361,
"num_tokens": 1068173478.0,
"step": 2795
},
{
"epoch": 1.9112627986348123,
"grad_norm": 0.2625441116715545,
"learning_rate": 1.9366955049292828e-05,
"loss": 0.4338,
"num_tokens": 1070084280.0,
"step": 2800
},
{
"epoch": 1.9146757679180886,
"grad_norm": 0.2739406236004986,
"learning_rate": 1.9288083027399184e-05,
"loss": 0.4318,
"num_tokens": 1072030507.0,
"step": 2805
},
{
"epoch": 1.9180887372013653,
"grad_norm": 0.2763042179831527,
"learning_rate": 1.920932724958211e-05,
"loss": 0.4503,
"num_tokens": 1073943343.0,
"step": 2810
},
{
"epoch": 1.9215017064846416,
"grad_norm": 0.26164021127778475,
"learning_rate": 1.9130688830671767e-05,
"loss": 0.4441,
"num_tokens": 1075835547.0,
"step": 2815
},
{
"epoch": 1.924914675767918,
"grad_norm": 0.27098665794683024,
"learning_rate": 1.9052168883837036e-05,
"loss": 0.4381,
"num_tokens": 1077765498.0,
"step": 2820
},
{
"epoch": 1.9283276450511946,
"grad_norm": 0.23896063773534829,
"learning_rate": 1.8973768520569736e-05,
"loss": 0.4287,
"num_tokens": 1079685882.0,
"step": 2825
},
{
"epoch": 1.9317406143344709,
"grad_norm": 0.25936956300235064,
"learning_rate": 1.889548885066894e-05,
"loss": 0.4391,
"num_tokens": 1081506827.0,
"step": 2830
},
{
"epoch": 1.9351535836177476,
"grad_norm": 0.2535472219270516,
"learning_rate": 1.8817330982225266e-05,
"loss": 0.4513,
"num_tokens": 1083401425.0,
"step": 2835
},
{
"epoch": 1.9385665529010239,
"grad_norm": 0.26568889655316086,
"learning_rate": 1.8739296021605118e-05,
"loss": 0.4427,
"num_tokens": 1085262365.0,
"step": 2840
},
{
"epoch": 1.9419795221843004,
"grad_norm": 0.266890642029229,
"learning_rate": 1.8661385073435107e-05,
"loss": 0.4353,
"num_tokens": 1087251428.0,
"step": 2845
},
{
"epoch": 1.9453924914675769,
"grad_norm": 0.26724914343445294,
"learning_rate": 1.858359924058637e-05,
"loss": 0.4465,
"num_tokens": 1089194392.0,
"step": 2850
},
{
"epoch": 1.9488054607508531,
"grad_norm": 0.2564504243189192,
"learning_rate": 1.8505939624158974e-05,
"loss": 0.4376,
"num_tokens": 1091066424.0,
"step": 2855
},
{
"epoch": 1.9522184300341296,
"grad_norm": 0.25705029863794304,
"learning_rate": 1.8428407323466325e-05,
"loss": 0.4382,
"num_tokens": 1093050518.0,
"step": 2860
},
{
"epoch": 1.9556313993174061,
"grad_norm": 0.2431513884931927,
"learning_rate": 1.8351003436019594e-05,
"loss": 0.455,
"num_tokens": 1095046925.0,
"step": 2865
},
{
"epoch": 1.9590443686006824,
"grad_norm": 0.2664425379728042,
"learning_rate": 1.8273729057512213e-05,
"loss": 0.4329,
"num_tokens": 1096832378.0,
"step": 2870
},
{
"epoch": 1.9624573378839592,
"grad_norm": 0.2604943224342159,
"learning_rate": 1.8196585281804328e-05,
"loss": 0.4331,
"num_tokens": 1098831283.0,
"step": 2875
},
{
"epoch": 1.9658703071672354,
"grad_norm": 0.25398450356166435,
"learning_rate": 1.8119573200907346e-05,
"loss": 0.435,
"num_tokens": 1100801547.0,
"step": 2880
},
{
"epoch": 1.969283276450512,
"grad_norm": 0.27269389447622255,
"learning_rate": 1.8042693904968466e-05,
"loss": 0.4346,
"num_tokens": 1102602774.0,
"step": 2885
},
{
"epoch": 1.9726962457337884,
"grad_norm": 0.2680478392449107,
"learning_rate": 1.7965948482255245e-05,
"loss": 0.4434,
"num_tokens": 1104612875.0,
"step": 2890
},
{
"epoch": 1.9761092150170647,
"grad_norm": 0.25071872779112125,
"learning_rate": 1.7889338019140155e-05,
"loss": 0.4252,
"num_tokens": 1106487794.0,
"step": 2895
},
{
"epoch": 1.9795221843003414,
"grad_norm": 0.24655928594788282,
"learning_rate": 1.7812863600085295e-05,
"loss": 0.4294,
"num_tokens": 1108397131.0,
"step": 2900
},
{
"epoch": 1.9829351535836177,
"grad_norm": 0.27068290921667937,
"learning_rate": 1.7736526307626984e-05,
"loss": 0.4465,
"num_tokens": 1110328750.0,
"step": 2905
},
{
"epoch": 1.9863481228668942,
"grad_norm": 0.26321066071381893,
"learning_rate": 1.766032722236038e-05,
"loss": 0.4421,
"num_tokens": 1112269290.0,
"step": 2910
},
{
"epoch": 1.9897610921501707,
"grad_norm": 0.24753433514478085,
"learning_rate": 1.7584267422924316e-05,
"loss": 0.4444,
"num_tokens": 1114173750.0,
"step": 2915
},
{
"epoch": 1.993174061433447,
"grad_norm": 0.24923481634416494,
"learning_rate": 1.750834798598592e-05,
"loss": 0.4284,
"num_tokens": 1116064691.0,
"step": 2920
},
{
"epoch": 1.9965870307167235,
"grad_norm": 0.2494698895081856,
"learning_rate": 1.743256998622543e-05,
"loss": 0.4347,
"num_tokens": 1118027431.0,
"step": 2925
},
{
"epoch": 2.0,
"grad_norm": 0.2710124353887983,
"learning_rate": 1.7356934496320964e-05,
"loss": 0.4343,
"num_tokens": 1119829452.0,
"step": 2930
},
{
"epoch": 2.0034129692832763,
"grad_norm": 0.3039709938451478,
"learning_rate": 1.7281442586933312e-05,
"loss": 0.3991,
"num_tokens": 1121774995.0,
"step": 2935
},
{
"epoch": 2.006825938566553,
"grad_norm": 0.26543870061365027,
"learning_rate": 1.720609532669085e-05,
"loss": 0.3938,
"num_tokens": 1123664582.0,
"step": 2940
},
{
"epoch": 2.0102389078498293,
"grad_norm": 0.27016391920786065,
"learning_rate": 1.7130893782174333e-05,
"loss": 0.3919,
"num_tokens": 1125654432.0,
"step": 2945
},
{
"epoch": 2.013651877133106,
"grad_norm": 0.27052638008268537,
"learning_rate": 1.7055839017901835e-05,
"loss": 0.4028,
"num_tokens": 1127599030.0,
"step": 2950
},
{
"epoch": 2.0170648464163823,
"grad_norm": 0.2562182608373477,
"learning_rate": 1.6980932096313697e-05,
"loss": 0.3973,
"num_tokens": 1129551904.0,
"step": 2955
},
{
"epoch": 2.0204778156996586,
"grad_norm": 0.28525693433456295,
"learning_rate": 1.6906174077757448e-05,
"loss": 0.3929,
"num_tokens": 1131280173.0,
"step": 2960
},
{
"epoch": 2.0238907849829353,
"grad_norm": 0.2743618011639598,
"learning_rate": 1.6831566020472817e-05,
"loss": 0.406,
"num_tokens": 1133218605.0,
"step": 2965
},
{
"epoch": 2.0273037542662116,
"grad_norm": 0.253725112327626,
"learning_rate": 1.675710898057677e-05,
"loss": 0.3924,
"num_tokens": 1135165512.0,
"step": 2970
},
{
"epoch": 2.030716723549488,
"grad_norm": 0.2848097672160611,
"learning_rate": 1.668280401204852e-05,
"loss": 0.4141,
"num_tokens": 1137123206.0,
"step": 2975
},
{
"epoch": 2.0341296928327646,
"grad_norm": 0.28740321034842653,
"learning_rate": 1.6608652166714625e-05,
"loss": 0.4012,
"num_tokens": 1139148605.0,
"step": 2980
},
{
"epoch": 2.037542662116041,
"grad_norm": 0.2499470155743692,
"learning_rate": 1.6534654494234137e-05,
"loss": 0.4023,
"num_tokens": 1141138126.0,
"step": 2985
},
{
"epoch": 2.0409556313993176,
"grad_norm": 0.2471292947781148,
"learning_rate": 1.6460812042083656e-05,
"loss": 0.3882,
"num_tokens": 1142933925.0,
"step": 2990
},
{
"epoch": 2.044368600682594,
"grad_norm": 0.2507392899848245,
"learning_rate": 1.6387125855542612e-05,
"loss": 0.3836,
"num_tokens": 1144759330.0,
"step": 2995
},
{
"epoch": 2.04778156996587,
"grad_norm": 0.2398128538110352,
"learning_rate": 1.6313596977678365e-05,
"loss": 0.4016,
"num_tokens": 1146764722.0,
"step": 3000
},
{
"epoch": 2.051194539249147,
"grad_norm": 0.26352910950792985,
"learning_rate": 1.624022644933151e-05,
"loss": 0.4162,
"num_tokens": 1148732344.0,
"step": 3005
},
{
"epoch": 2.054607508532423,
"grad_norm": 0.25363271732366105,
"learning_rate": 1.6167015309101124e-05,
"loss": 0.3852,
"num_tokens": 1150602940.0,
"step": 3010
},
{
"epoch": 2.0580204778157,
"grad_norm": 0.2507903258771862,
"learning_rate": 1.6093964593330032e-05,
"loss": 0.3964,
"num_tokens": 1152471105.0,
"step": 3015
},
{
"epoch": 2.061433447098976,
"grad_norm": 0.2582500570099867,
"learning_rate": 1.6021075336090195e-05,
"loss": 0.4066,
"num_tokens": 1154361749.0,
"step": 3020
},
{
"epoch": 2.0648464163822524,
"grad_norm": 0.25407514009499443,
"learning_rate": 1.5948348569168037e-05,
"loss": 0.3958,
"num_tokens": 1156475739.0,
"step": 3025
},
{
"epoch": 2.068259385665529,
"grad_norm": 0.26754352821223326,
"learning_rate": 1.587578532204983e-05,
"loss": 0.3986,
"num_tokens": 1158363541.0,
"step": 3030
},
{
"epoch": 2.0716723549488054,
"grad_norm": 0.263192137906748,
"learning_rate": 1.5803386621907145e-05,
"loss": 0.4023,
"num_tokens": 1160236837.0,
"step": 3035
},
{
"epoch": 2.0750853242320817,
"grad_norm": 0.25505691061126157,
"learning_rate": 1.573115349358231e-05,
"loss": 0.4002,
"num_tokens": 1162129918.0,
"step": 3040
},
{
"epoch": 2.0784982935153584,
"grad_norm": 0.2711167689164809,
"learning_rate": 1.5659086959573887e-05,
"loss": 0.4018,
"num_tokens": 1164024665.0,
"step": 3045
},
{
"epoch": 2.0819112627986347,
"grad_norm": 0.26572076468891587,
"learning_rate": 1.5587188040022198e-05,
"loss": 0.3969,
"num_tokens": 1165915543.0,
"step": 3050
},
{
"epoch": 2.0853242320819114,
"grad_norm": 0.28019838675386793,
"learning_rate": 1.5515457752694897e-05,
"loss": 0.3945,
"num_tokens": 1167829802.0,
"step": 3055
},
{
"epoch": 2.0887372013651877,
"grad_norm": 0.26850496481924274,
"learning_rate": 1.544389711297257e-05,
"loss": 0.395,
"num_tokens": 1169715890.0,
"step": 3060
},
{
"epoch": 2.092150170648464,
"grad_norm": 0.2791716668257277,
"learning_rate": 1.5372507133834368e-05,
"loss": 0.4012,
"num_tokens": 1171614830.0,
"step": 3065
},
{
"epoch": 2.0955631399317407,
"grad_norm": 0.26105997446681856,
"learning_rate": 1.5301288825843584e-05,
"loss": 0.4042,
"num_tokens": 1173464835.0,
"step": 3070
},
{
"epoch": 2.098976109215017,
"grad_norm": 0.2670165726151138,
"learning_rate": 1.523024319713348e-05,
"loss": 0.4011,
"num_tokens": 1175409243.0,
"step": 3075
},
{
"epoch": 2.1023890784982937,
"grad_norm": 0.2876373869088731,
"learning_rate": 1.5159371253392928e-05,
"loss": 0.4077,
"num_tokens": 1177337387.0,
"step": 3080
},
{
"epoch": 2.10580204778157,
"grad_norm": 0.26433480908646045,
"learning_rate": 1.5088673997852183e-05,
"loss": 0.3956,
"num_tokens": 1179233879.0,
"step": 3085
},
{
"epoch": 2.1092150170648463,
"grad_norm": 0.25917326213961195,
"learning_rate": 1.5018152431268712e-05,
"loss": 0.4027,
"num_tokens": 1181151842.0,
"step": 3090
},
{
"epoch": 2.112627986348123,
"grad_norm": 0.28427260914311997,
"learning_rate": 1.4947807551913001e-05,
"loss": 0.4043,
"num_tokens": 1183082878.0,
"step": 3095
},
{
"epoch": 2.1160409556313993,
"grad_norm": 0.30436740585047983,
"learning_rate": 1.4877640355554454e-05,
"loss": 0.4027,
"num_tokens": 1185016399.0,
"step": 3100
},
{
"epoch": 2.1194539249146755,
"grad_norm": 0.2834148607288468,
"learning_rate": 1.480765183544725e-05,
"loss": 0.4126,
"num_tokens": 1186879823.0,
"step": 3105
},
{
"epoch": 2.1228668941979523,
"grad_norm": 0.27439489593858457,
"learning_rate": 1.4737842982316313e-05,
"loss": 0.4134,
"num_tokens": 1188773106.0,
"step": 3110
},
{
"epoch": 2.1262798634812285,
"grad_norm": 0.26473117800887336,
"learning_rate": 1.4668214784343315e-05,
"loss": 0.3792,
"num_tokens": 1190558029.0,
"step": 3115
},
{
"epoch": 2.1296928327645053,
"grad_norm": 0.25563541947968327,
"learning_rate": 1.4598768227152621e-05,
"loss": 0.4037,
"num_tokens": 1192501980.0,
"step": 3120
},
{
"epoch": 2.1331058020477816,
"grad_norm": 0.2503903645932623,
"learning_rate": 1.4529504293797389e-05,
"loss": 0.3877,
"num_tokens": 1194378327.0,
"step": 3125
},
{
"epoch": 2.136518771331058,
"grad_norm": 0.24140775538856318,
"learning_rate": 1.4460423964745649e-05,
"loss": 0.3934,
"num_tokens": 1196267966.0,
"step": 3130
},
{
"epoch": 2.1399317406143346,
"grad_norm": 0.2600080789731432,
"learning_rate": 1.4391528217866396e-05,
"loss": 0.3973,
"num_tokens": 1198190295.0,
"step": 3135
},
{
"epoch": 2.143344709897611,
"grad_norm": 0.27181663294997715,
"learning_rate": 1.4322818028415765e-05,
"loss": 0.3988,
"num_tokens": 1200151285.0,
"step": 3140
},
{
"epoch": 2.1467576791808876,
"grad_norm": 0.2537637030557871,
"learning_rate": 1.4254294369023258e-05,
"loss": 0.3963,
"num_tokens": 1202046752.0,
"step": 3145
},
{
"epoch": 2.150170648464164,
"grad_norm": 0.2494321959131866,
"learning_rate": 1.4185958209677901e-05,
"loss": 0.3956,
"num_tokens": 1203977515.0,
"step": 3150
},
{
"epoch": 2.15358361774744,
"grad_norm": 0.26108243025286193,
"learning_rate": 1.4117810517714575e-05,
"loss": 0.4032,
"num_tokens": 1205900022.0,
"step": 3155
},
{
"epoch": 2.156996587030717,
"grad_norm": 0.26473158504792393,
"learning_rate": 1.4049852257800325e-05,
"loss": 0.4023,
"num_tokens": 1207887070.0,
"step": 3160
},
{
"epoch": 2.160409556313993,
"grad_norm": 0.24762994609328232,
"learning_rate": 1.3982084391920641e-05,
"loss": 0.4029,
"num_tokens": 1209821581.0,
"step": 3165
},
{
"epoch": 2.1638225255972694,
"grad_norm": 0.26299537685268803,
"learning_rate": 1.391450787936594e-05,
"loss": 0.3893,
"num_tokens": 1211692505.0,
"step": 3170
},
{
"epoch": 2.167235494880546,
"grad_norm": 0.25375467426023246,
"learning_rate": 1.3847123676717857e-05,
"loss": 0.41,
"num_tokens": 1213608110.0,
"step": 3175
},
{
"epoch": 2.1706484641638224,
"grad_norm": 0.2635067571833855,
"learning_rate": 1.3779932737835844e-05,
"loss": 0.4067,
"num_tokens": 1215501666.0,
"step": 3180
},
{
"epoch": 2.174061433447099,
"grad_norm": 0.2590695796559807,
"learning_rate": 1.371293601384358e-05,
"loss": 0.4057,
"num_tokens": 1217434979.0,
"step": 3185
},
{
"epoch": 2.1774744027303754,
"grad_norm": 0.2743187153407509,
"learning_rate": 1.36461344531155e-05,
"loss": 0.3976,
"num_tokens": 1219286362.0,
"step": 3190
},
{
"epoch": 2.1808873720136517,
"grad_norm": 0.2553965380053624,
"learning_rate": 1.3579529001263441e-05,
"loss": 0.4143,
"num_tokens": 1221237480.0,
"step": 3195
},
{
"epoch": 2.1843003412969284,
"grad_norm": 0.2530670854295067,
"learning_rate": 1.3513120601123195e-05,
"loss": 0.3923,
"num_tokens": 1223165783.0,
"step": 3200
},
{
"epoch": 2.1877133105802047,
"grad_norm": 0.24797200268758846,
"learning_rate": 1.3446910192741174e-05,
"loss": 0.3729,
"num_tokens": 1224980893.0,
"step": 3205
},
{
"epoch": 2.1911262798634814,
"grad_norm": 0.2620732555128451,
"learning_rate": 1.3380898713361128e-05,
"loss": 0.3927,
"num_tokens": 1226871414.0,
"step": 3210
},
{
"epoch": 2.1945392491467577,
"grad_norm": 0.2609322854993342,
"learning_rate": 1.3315087097410835e-05,
"loss": 0.4083,
"num_tokens": 1228764510.0,
"step": 3215
},
{
"epoch": 2.197952218430034,
"grad_norm": 0.24742442098811646,
"learning_rate": 1.3249476276488937e-05,
"loss": 0.4007,
"num_tokens": 1230729714.0,
"step": 3220
},
{
"epoch": 2.2013651877133107,
"grad_norm": 0.2781961602579005,
"learning_rate": 1.3184067179351677e-05,
"loss": 0.3857,
"num_tokens": 1232478639.0,
"step": 3225
},
{
"epoch": 2.204778156996587,
"grad_norm": 0.2562282084423495,
"learning_rate": 1.3118860731899807e-05,
"loss": 0.4118,
"num_tokens": 1234465242.0,
"step": 3230
},
{
"epoch": 2.2081911262798632,
"grad_norm": 0.2681846880059678,
"learning_rate": 1.305385785716548e-05,
"loss": 0.4051,
"num_tokens": 1236387820.0,
"step": 3235
},
{
"epoch": 2.21160409556314,
"grad_norm": 0.25338427225837795,
"learning_rate": 1.2989059475299137e-05,
"loss": 0.3926,
"num_tokens": 1238270889.0,
"step": 3240
},
{
"epoch": 2.2150170648464163,
"grad_norm": 0.24393136100195462,
"learning_rate": 1.2924466503556523e-05,
"loss": 0.4102,
"num_tokens": 1240290068.0,
"step": 3245
},
{
"epoch": 2.218430034129693,
"grad_norm": 0.2528007996358032,
"learning_rate": 1.2860079856285717e-05,
"loss": 0.3913,
"num_tokens": 1242113295.0,
"step": 3250
},
{
"epoch": 2.2218430034129693,
"grad_norm": 0.2578860500257866,
"learning_rate": 1.279590044491414e-05,
"loss": 0.4018,
"num_tokens": 1244058774.0,
"step": 3255
},
{
"epoch": 2.2252559726962455,
"grad_norm": 0.2556123598222948,
"learning_rate": 1.2731929177935664e-05,
"loss": 0.4069,
"num_tokens": 1245986085.0,
"step": 3260
},
{
"epoch": 2.2286689419795223,
"grad_norm": 0.24628583265829654,
"learning_rate": 1.2668166960897815e-05,
"loss": 0.4175,
"num_tokens": 1248054717.0,
"step": 3265
},
{
"epoch": 2.2320819112627985,
"grad_norm": 0.2539187999798937,
"learning_rate": 1.2604614696388855e-05,
"loss": 0.3911,
"num_tokens": 1249915944.0,
"step": 3270
},
{
"epoch": 2.2354948805460753,
"grad_norm": 0.28754364964508844,
"learning_rate": 1.2541273284025088e-05,
"loss": 0.3939,
"num_tokens": 1251667373.0,
"step": 3275
},
{
"epoch": 2.2389078498293515,
"grad_norm": 0.26299909392718523,
"learning_rate": 1.247814362043808e-05,
"loss": 0.4014,
"num_tokens": 1253547204.0,
"step": 3280
},
{
"epoch": 2.242320819112628,
"grad_norm": 0.25498776876513835,
"learning_rate": 1.2415226599261972e-05,
"loss": 0.4059,
"num_tokens": 1255555709.0,
"step": 3285
},
{
"epoch": 2.2457337883959045,
"grad_norm": 0.2524752384826022,
"learning_rate": 1.2352523111120858e-05,
"loss": 0.3885,
"num_tokens": 1257467951.0,
"step": 3290
},
{
"epoch": 2.249146757679181,
"grad_norm": 0.2636786776353791,
"learning_rate": 1.2290034043616148e-05,
"loss": 0.4067,
"num_tokens": 1259415847.0,
"step": 3295
},
{
"epoch": 2.252559726962457,
"grad_norm": 0.24510895102259947,
"learning_rate": 1.2227760281314001e-05,
"loss": 0.4056,
"num_tokens": 1261370375.0,
"step": 3300
},
{
"epoch": 2.255972696245734,
"grad_norm": 0.2685313662473033,
"learning_rate": 1.216570270573284e-05,
"loss": 0.396,
"num_tokens": 1263209852.0,
"step": 3305
},
{
"epoch": 2.25938566552901,
"grad_norm": 0.2912242466154553,
"learning_rate": 1.2103862195330833e-05,
"loss": 0.4135,
"num_tokens": 1265148638.0,
"step": 3310
},
{
"epoch": 2.262798634812287,
"grad_norm": 0.2508984374513887,
"learning_rate": 1.2042239625493465e-05,
"loss": 0.3856,
"num_tokens": 1267037306.0,
"step": 3315
},
{
"epoch": 2.266211604095563,
"grad_norm": 0.72856347101417,
"learning_rate": 1.1980835868521188e-05,
"loss": 0.4029,
"num_tokens": 1268983314.0,
"step": 3320
},
{
"epoch": 2.26962457337884,
"grad_norm": 0.25899903691774656,
"learning_rate": 1.1919651793617011e-05,
"loss": 0.3838,
"num_tokens": 1270826829.0,
"step": 3325
},
{
"epoch": 2.273037542662116,
"grad_norm": 0.25418536949443005,
"learning_rate": 1.185868826687424e-05,
"loss": 0.3927,
"num_tokens": 1272699748.0,
"step": 3330
},
{
"epoch": 2.2764505119453924,
"grad_norm": 0.2454564607450951,
"learning_rate": 1.1797946151264186e-05,
"loss": 0.3889,
"num_tokens": 1274682677.0,
"step": 3335
},
{
"epoch": 2.279863481228669,
"grad_norm": 0.2401309320335627,
"learning_rate": 1.1737426306623996e-05,
"loss": 0.3964,
"num_tokens": 1276615688.0,
"step": 3340
},
{
"epoch": 2.2832764505119454,
"grad_norm": 0.262862375486088,
"learning_rate": 1.1677129589644446e-05,
"loss": 0.412,
"num_tokens": 1278546081.0,
"step": 3345
},
{
"epoch": 2.2866894197952217,
"grad_norm": 0.2453847358307599,
"learning_rate": 1.1617056853857787e-05,
"loss": 0.3943,
"num_tokens": 1280497422.0,
"step": 3350
},
{
"epoch": 2.2901023890784984,
"grad_norm": 0.27755459465541255,
"learning_rate": 1.1557208949625736e-05,
"loss": 0.4032,
"num_tokens": 1282340664.0,
"step": 3355
},
{
"epoch": 2.2935153583617747,
"grad_norm": 0.24394967766877984,
"learning_rate": 1.1497586724127396e-05,
"loss": 0.3937,
"num_tokens": 1284342162.0,
"step": 3360
},
{
"epoch": 2.296928327645051,
"grad_norm": 0.23730769947011054,
"learning_rate": 1.143819102134723e-05,
"loss": 0.4012,
"num_tokens": 1286272914.0,
"step": 3365
},
{
"epoch": 2.3003412969283277,
"grad_norm": 0.2742813641159952,
"learning_rate": 1.1379022682063195e-05,
"loss": 0.3933,
"num_tokens": 1288199996.0,
"step": 3370
},
{
"epoch": 2.303754266211604,
"grad_norm": 0.2642748821784269,
"learning_rate": 1.1320082543834764e-05,
"loss": 0.4001,
"num_tokens": 1290121821.0,
"step": 3375
},
{
"epoch": 2.3071672354948807,
"grad_norm": 0.2554580567261532,
"learning_rate": 1.1261371440991137e-05,
"loss": 0.4088,
"num_tokens": 1292096399.0,
"step": 3380
},
{
"epoch": 2.310580204778157,
"grad_norm": 0.25534131996398396,
"learning_rate": 1.1202890204619353e-05,
"loss": 0.3855,
"num_tokens": 1293977825.0,
"step": 3385
},
{
"epoch": 2.3139931740614337,
"grad_norm": 0.2511343749324971,
"learning_rate": 1.1144639662552592e-05,
"loss": 0.4002,
"num_tokens": 1295850805.0,
"step": 3390
},
{
"epoch": 2.31740614334471,
"grad_norm": 0.24497341281835422,
"learning_rate": 1.1086620639358442e-05,
"loss": 0.3844,
"num_tokens": 1297791728.0,
"step": 3395
},
{
"epoch": 2.3208191126279862,
"grad_norm": 0.24993434873174314,
"learning_rate": 1.1028833956327198e-05,
"loss": 0.393,
"num_tokens": 1299649317.0,
"step": 3400
},
{
"epoch": 2.324232081911263,
"grad_norm": 0.2766260502313979,
"learning_rate": 1.0971280431460257e-05,
"loss": 0.4048,
"num_tokens": 1301526586.0,
"step": 3405
},
{
"epoch": 2.3276450511945392,
"grad_norm": 0.23439107923909808,
"learning_rate": 1.0913960879458557e-05,
"loss": 0.4113,
"num_tokens": 1303510626.0,
"step": 3410
},
{
"epoch": 2.3310580204778155,
"grad_norm": 0.24827116673836433,
"learning_rate": 1.0856876111711003e-05,
"loss": 0.3924,
"num_tokens": 1305289984.0,
"step": 3415
},
{
"epoch": 2.3344709897610922,
"grad_norm": 0.25114000454246466,
"learning_rate": 1.0800026936283011e-05,
"loss": 0.4038,
"num_tokens": 1307208316.0,
"step": 3420
},
{
"epoch": 2.3378839590443685,
"grad_norm": 0.26280740614744735,
"learning_rate": 1.074341415790507e-05,
"loss": 0.3893,
"num_tokens": 1309117594.0,
"step": 3425
},
{
"epoch": 2.3412969283276452,
"grad_norm": 0.25721690855657364,
"learning_rate": 1.0687038577961334e-05,
"loss": 0.3987,
"num_tokens": 1310930129.0,
"step": 3430
},
{
"epoch": 2.3447098976109215,
"grad_norm": 0.2648598938841444,
"learning_rate": 1.0630900994478271e-05,
"loss": 0.4002,
"num_tokens": 1312822051.0,
"step": 3435
},
{
"epoch": 2.348122866894198,
"grad_norm": 0.26164289201070806,
"learning_rate": 1.0575002202113422e-05,
"loss": 0.3905,
"num_tokens": 1314614448.0,
"step": 3440
},
{
"epoch": 2.3515358361774745,
"grad_norm": 0.23672854825147213,
"learning_rate": 1.0519342992144073e-05,
"loss": 0.3897,
"num_tokens": 1316495391.0,
"step": 3445
},
{
"epoch": 2.354948805460751,
"grad_norm": 0.25335620270381537,
"learning_rate": 1.0463924152456117e-05,
"loss": 0.3894,
"num_tokens": 1318368009.0,
"step": 3450
},
{
"epoch": 2.3583617747440275,
"grad_norm": 0.23819804416780682,
"learning_rate": 1.0408746467532864e-05,
"loss": 0.3888,
"num_tokens": 1320174846.0,
"step": 3455
},
{
"epoch": 2.361774744027304,
"grad_norm": 0.24041042748052557,
"learning_rate": 1.0353810718443949e-05,
"loss": 0.3917,
"num_tokens": 1322107997.0,
"step": 3460
},
{
"epoch": 2.36518771331058,
"grad_norm": 0.25302315227083694,
"learning_rate": 1.0299117682834295e-05,
"loss": 0.3926,
"num_tokens": 1323934060.0,
"step": 3465
},
{
"epoch": 2.368600682593857,
"grad_norm": 0.24411931442269785,
"learning_rate": 1.0244668134913053e-05,
"loss": 0.3974,
"num_tokens": 1325905507.0,
"step": 3470
},
{
"epoch": 2.372013651877133,
"grad_norm": 0.24821036313437658,
"learning_rate": 1.0190462845442702e-05,
"loss": 0.3924,
"num_tokens": 1327780847.0,
"step": 3475
},
{
"epoch": 2.3754266211604094,
"grad_norm": 0.23916482562392274,
"learning_rate": 1.0136502581728109e-05,
"loss": 0.3895,
"num_tokens": 1329668788.0,
"step": 3480
},
{
"epoch": 2.378839590443686,
"grad_norm": 0.25207319235021297,
"learning_rate": 1.0082788107605665e-05,
"loss": 0.3874,
"num_tokens": 1331519325.0,
"step": 3485
},
{
"epoch": 2.3822525597269624,
"grad_norm": 0.2649045025779577,
"learning_rate": 1.0029320183432468e-05,
"loss": 0.387,
"num_tokens": 1333459697.0,
"step": 3490
},
{
"epoch": 2.385665529010239,
"grad_norm": 0.2722179095902226,
"learning_rate": 9.976099566075591e-06,
"loss": 0.3961,
"num_tokens": 1335261419.0,
"step": 3495
},
{
"epoch": 2.3890784982935154,
"grad_norm": 0.2631815719468567,
"learning_rate": 9.923127008901334e-06,
"loss": 0.4047,
"num_tokens": 1337245592.0,
"step": 3500
},
{
"epoch": 2.3924914675767917,
"grad_norm": 0.24412031355907363,
"learning_rate": 9.87040326176457e-06,
"loss": 0.3937,
"num_tokens": 1339175144.0,
"step": 3505
},
{
"epoch": 2.3959044368600684,
"grad_norm": 0.24694376687052613,
"learning_rate": 9.817929070998133e-06,
"loss": 0.3992,
"num_tokens": 1341031356.0,
"step": 3510
},
{
"epoch": 2.3993174061433447,
"grad_norm": 0.24983130243543367,
"learning_rate": 9.765705179402262e-06,
"loss": 0.4056,
"num_tokens": 1342981326.0,
"step": 3515
},
{
"epoch": 2.4027303754266214,
"grad_norm": 0.27290814627861304,
"learning_rate": 9.713732326234085e-06,
"loss": 0.4064,
"num_tokens": 1344847445.0,
"step": 3520
},
{
"epoch": 2.4061433447098977,
"grad_norm": 0.2735478214317808,
"learning_rate": 9.662011247197111e-06,
"loss": 0.409,
"num_tokens": 1346880201.0,
"step": 3525
},
{
"epoch": 2.409556313993174,
"grad_norm": 0.2676701207069597,
"learning_rate": 9.610542674430893e-06,
"loss": 0.411,
"num_tokens": 1348775492.0,
"step": 3530
},
{
"epoch": 2.4129692832764507,
"grad_norm": 0.2535137200935169,
"learning_rate": 9.559327336500597e-06,
"loss": 0.3916,
"num_tokens": 1350779236.0,
"step": 3535
},
{
"epoch": 2.416382252559727,
"grad_norm": 0.2568613906059044,
"learning_rate": 9.508365958386714e-06,
"loss": 0.3925,
"num_tokens": 1352677486.0,
"step": 3540
},
{
"epoch": 2.419795221843003,
"grad_norm": 0.24833002064524512,
"learning_rate": 9.457659261474821e-06,
"loss": 0.3989,
"num_tokens": 1354567655.0,
"step": 3545
},
{
"epoch": 2.42320819112628,
"grad_norm": 0.25899774223190003,
"learning_rate": 9.407207963545322e-06,
"loss": 0.4025,
"num_tokens": 1356434352.0,
"step": 3550
},
{
"epoch": 2.426621160409556,
"grad_norm": 0.2464870171227337,
"learning_rate": 9.357012778763327e-06,
"loss": 0.3976,
"num_tokens": 1358296741.0,
"step": 3555
},
{
"epoch": 2.430034129692833,
"grad_norm": 0.23706503790371114,
"learning_rate": 9.307074417668519e-06,
"loss": 0.4037,
"num_tokens": 1360327394.0,
"step": 3560
},
{
"epoch": 2.4334470989761092,
"grad_norm": 0.24878886578696077,
"learning_rate": 9.2573935871651e-06,
"loss": 0.397,
"num_tokens": 1362266965.0,
"step": 3565
},
{
"epoch": 2.4368600682593855,
"grad_norm": 0.2527659055892693,
"learning_rate": 9.207970990511808e-06,
"loss": 0.3931,
"num_tokens": 1364208448.0,
"step": 3570
},
{
"epoch": 2.4402730375426622,
"grad_norm": 0.2584388941834717,
"learning_rate": 9.158807327311925e-06,
"loss": 0.3982,
"num_tokens": 1366138625.0,
"step": 3575
},
{
"epoch": 2.4436860068259385,
"grad_norm": 0.2538965465130803,
"learning_rate": 9.109903293503386e-06,
"loss": 0.4053,
"num_tokens": 1368129796.0,
"step": 3580
},
{
"epoch": 2.4470989761092152,
"grad_norm": 0.2473126356674962,
"learning_rate": 9.061259581348966e-06,
"loss": 0.4024,
"num_tokens": 1370084334.0,
"step": 3585
},
{
"epoch": 2.4505119453924915,
"grad_norm": 0.25579986929786075,
"learning_rate": 9.01287687942641e-06,
"loss": 0.3974,
"num_tokens": 1371929041.0,
"step": 3590
},
{
"epoch": 2.453924914675768,
"grad_norm": 0.24496560953291152,
"learning_rate": 8.964755872618739e-06,
"loss": 0.3945,
"num_tokens": 1373930783.0,
"step": 3595
},
{
"epoch": 2.4573378839590445,
"grad_norm": 0.23916550521261395,
"learning_rate": 8.916897242104547e-06,
"loss": 0.3964,
"num_tokens": 1375701367.0,
"step": 3600
},
{
"epoch": 2.460750853242321,
"grad_norm": 0.27385157987053893,
"learning_rate": 8.869301665348344e-06,
"loss": 0.3882,
"num_tokens": 1377461734.0,
"step": 3605
},
{
"epoch": 2.464163822525597,
"grad_norm": 0.25702930592535705,
"learning_rate": 8.821969816090966e-06,
"loss": 0.3927,
"num_tokens": 1379370664.0,
"step": 3610
},
{
"epoch": 2.467576791808874,
"grad_norm": 0.2525995216753665,
"learning_rate": 8.774902364340062e-06,
"loss": 0.3914,
"num_tokens": 1381309974.0,
"step": 3615
},
{
"epoch": 2.47098976109215,
"grad_norm": 0.2513274331105906,
"learning_rate": 8.728099976360573e-06,
"loss": 0.3993,
"num_tokens": 1383131522.0,
"step": 3620
},
{
"epoch": 2.474402730375427,
"grad_norm": 0.23420881905848545,
"learning_rate": 8.68156331466535e-06,
"loss": 0.3939,
"num_tokens": 1385084855.0,
"step": 3625
},
{
"epoch": 2.477815699658703,
"grad_norm": 0.28788382628340653,
"learning_rate": 8.635293038005704e-06,
"loss": 0.3957,
"num_tokens": 1386955618.0,
"step": 3630
},
{
"epoch": 2.4812286689419794,
"grad_norm": 0.2587696251636566,
"learning_rate": 8.58928980136216e-06,
"loss": 0.394,
"num_tokens": 1388905153.0,
"step": 3635
},
{
"epoch": 2.484641638225256,
"grad_norm": 0.2590768818161258,
"learning_rate": 8.543554255935143e-06,
"loss": 0.4054,
"num_tokens": 1390842205.0,
"step": 3640
},
{
"epoch": 2.4880546075085324,
"grad_norm": 0.24772041281038729,
"learning_rate": 8.498087049135738e-06,
"loss": 0.3858,
"num_tokens": 1392714747.0,
"step": 3645
},
{
"epoch": 2.491467576791809,
"grad_norm": 0.24778534706466526,
"learning_rate": 8.452888824576588e-06,
"loss": 0.385,
"num_tokens": 1394469454.0,
"step": 3650
},
{
"epoch": 2.4948805460750854,
"grad_norm": 0.24364215488154967,
"learning_rate": 8.407960222062734e-06,
"loss": 0.3941,
"num_tokens": 1396446146.0,
"step": 3655
},
{
"epoch": 2.4982935153583616,
"grad_norm": 0.25058147643890444,
"learning_rate": 8.363301877582572e-06,
"loss": 0.3849,
"num_tokens": 1398333563.0,
"step": 3660
},
{
"epoch": 2.5017064846416384,
"grad_norm": 0.25450013492478507,
"learning_rate": 8.318914423298848e-06,
"loss": 0.399,
"num_tokens": 1400273261.0,
"step": 3665
},
{
"epoch": 2.5051194539249146,
"grad_norm": 0.24566497134038118,
"learning_rate": 8.274798487539715e-06,
"loss": 0.4041,
"num_tokens": 1402142031.0,
"step": 3670
},
{
"epoch": 2.508532423208191,
"grad_norm": 0.2611873096022051,
"learning_rate": 8.23095469478984e-06,
"loss": 0.3979,
"num_tokens": 1404063953.0,
"step": 3675
},
{
"epoch": 2.5119453924914676,
"grad_norm": 0.24351692997405652,
"learning_rate": 8.18738366568157e-06,
"loss": 0.4078,
"num_tokens": 1406021491.0,
"step": 3680
},
{
"epoch": 2.515358361774744,
"grad_norm": 0.2660498083738705,
"learning_rate": 8.144086016986098e-06,
"loss": 0.3933,
"num_tokens": 1407825519.0,
"step": 3685
},
{
"epoch": 2.51877133105802,
"grad_norm": 0.23795381298513307,
"learning_rate": 8.10106236160482e-06,
"loss": 0.3922,
"num_tokens": 1409756663.0,
"step": 3690
},
{
"epoch": 2.522184300341297,
"grad_norm": 0.22685355776603627,
"learning_rate": 8.05831330856058e-06,
"loss": 0.3956,
"num_tokens": 1411714094.0,
"step": 3695
},
{
"epoch": 2.5255972696245736,
"grad_norm": 0.24879932954473205,
"learning_rate": 8.01583946298908e-06,
"loss": 0.4137,
"num_tokens": 1413598256.0,
"step": 3700
},
{
"epoch": 2.52901023890785,
"grad_norm": 0.25134610896136816,
"learning_rate": 7.97364142613033e-06,
"loss": 0.3961,
"num_tokens": 1415457816.0,
"step": 3705
},
{
"epoch": 2.532423208191126,
"grad_norm": 0.24514468786362295,
"learning_rate": 7.9317197953201e-06,
"loss": 0.397,
"num_tokens": 1417330639.0,
"step": 3710
},
{
"epoch": 2.535836177474403,
"grad_norm": 0.23174407336003164,
"learning_rate": 7.890075163981505e-06,
"loss": 0.3903,
"num_tokens": 1419297418.0,
"step": 3715
},
{
"epoch": 2.539249146757679,
"grad_norm": 0.2510853584108354,
"learning_rate": 7.848708121616567e-06,
"loss": 0.4062,
"num_tokens": 1421278815.0,
"step": 3720
},
{
"epoch": 2.5426621160409555,
"grad_norm": 0.25550059792582824,
"learning_rate": 7.807619253797891e-06,
"loss": 0.4022,
"num_tokens": 1423300631.0,
"step": 3725
},
{
"epoch": 2.546075085324232,
"grad_norm": 0.24251183778596538,
"learning_rate": 7.766809142160385e-06,
"loss": 0.3899,
"num_tokens": 1425213662.0,
"step": 3730
},
{
"epoch": 2.5494880546075085,
"grad_norm": 0.25490145483738413,
"learning_rate": 7.726278364393e-06,
"loss": 0.3934,
"num_tokens": 1427257799.0,
"step": 3735
},
{
"epoch": 2.5529010238907848,
"grad_norm": 0.24924750644971594,
"learning_rate": 7.686027494230566e-06,
"loss": 0.3915,
"num_tokens": 1429239975.0,
"step": 3740
},
{
"epoch": 2.5563139931740615,
"grad_norm": 0.2449260738764112,
"learning_rate": 7.646057101445686e-06,
"loss": 0.386,
"num_tokens": 1431117805.0,
"step": 3745
},
{
"epoch": 2.5597269624573378,
"grad_norm": 0.23586099566737673,
"learning_rate": 7.606367751840644e-06,
"loss": 0.4013,
"num_tokens": 1433152174.0,
"step": 3750
},
{
"epoch": 2.5631399317406145,
"grad_norm": 0.23854900024400358,
"learning_rate": 7.566960007239405e-06,
"loss": 0.3911,
"num_tokens": 1435008887.0,
"step": 3755
},
{
"epoch": 2.5665529010238908,
"grad_norm": 0.2500987961586113,
"learning_rate": 7.5278344254796764e-06,
"loss": 0.4031,
"num_tokens": 1436955687.0,
"step": 3760
},
{
"epoch": 2.5699658703071675,
"grad_norm": 0.2706863126827021,
"learning_rate": 7.48899156040499e-06,
"loss": 0.3973,
"num_tokens": 1438848557.0,
"step": 3765
},
{
"epoch": 2.573378839590444,
"grad_norm": 0.2543704542807227,
"learning_rate": 7.450431961856869e-06,
"loss": 0.4031,
"num_tokens": 1440739976.0,
"step": 3770
},
{
"epoch": 2.57679180887372,
"grad_norm": 0.2669655519777121,
"learning_rate": 7.412156175667064e-06,
"loss": 0.3943,
"num_tokens": 1442599846.0,
"step": 3775
},
{
"epoch": 2.580204778156997,
"grad_norm": 0.24836800578210708,
"learning_rate": 7.3741647436497846e-06,
"loss": 0.3946,
"num_tokens": 1444571386.0,
"step": 3780
},
{
"epoch": 2.583617747440273,
"grad_norm": 0.2600600913802552,
"learning_rate": 7.336458203594086e-06,
"loss": 0.3937,
"num_tokens": 1446445368.0,
"step": 3785
},
{
"epoch": 2.5870307167235493,
"grad_norm": 0.24666717274669178,
"learning_rate": 7.299037089256197e-06,
"loss": 0.4076,
"num_tokens": 1448440439.0,
"step": 3790
},
{
"epoch": 2.590443686006826,
"grad_norm": 0.25452167411429993,
"learning_rate": 7.2619019303520065e-06,
"loss": 0.4054,
"num_tokens": 1450323674.0,
"step": 3795
},
{
"epoch": 2.5938566552901023,
"grad_norm": 0.26154476417880795,
"learning_rate": 7.225053252549556e-06,
"loss": 0.3907,
"num_tokens": 1452237710.0,
"step": 3800
},
{
"epoch": 2.5972696245733786,
"grad_norm": 0.2750355702856808,
"learning_rate": 7.188491577461573e-06,
"loss": 0.4003,
"num_tokens": 1454072198.0,
"step": 3805
},
{
"epoch": 2.6006825938566553,
"grad_norm": 0.26966622870777457,
"learning_rate": 7.1522174226381315e-06,
"loss": 0.4121,
"num_tokens": 1455997298.0,
"step": 3810
},
{
"epoch": 2.6040955631399316,
"grad_norm": 0.23947567913720003,
"learning_rate": 7.116231301559292e-06,
"loss": 0.3939,
"num_tokens": 1457990359.0,
"step": 3815
},
{
"epoch": 2.6075085324232083,
"grad_norm": 0.25804521315764417,
"learning_rate": 7.080533723627844e-06,
"loss": 0.392,
"num_tokens": 1459817918.0,
"step": 3820
},
{
"epoch": 2.6109215017064846,
"grad_norm": 0.2439185854647888,
"learning_rate": 7.045125194162096e-06,
"loss": 0.3891,
"num_tokens": 1461691998.0,
"step": 3825
},
{
"epoch": 2.6143344709897613,
"grad_norm": 0.2332908442050089,
"learning_rate": 7.010006214388713e-06,
"loss": 0.388,
"num_tokens": 1463593827.0,
"step": 3830
},
{
"epoch": 2.6177474402730376,
"grad_norm": 0.25261750552287104,
"learning_rate": 6.975177281435641e-06,
"loss": 0.3915,
"num_tokens": 1465426370.0,
"step": 3835
},
{
"epoch": 2.621160409556314,
"grad_norm": 0.23818001032099148,
"learning_rate": 6.9406388883250545e-06,
"loss": 0.3964,
"num_tokens": 1467336173.0,
"step": 3840
},
{
"epoch": 2.6245733788395906,
"grad_norm": 0.25882672475274643,
"learning_rate": 6.906391523966373e-06,
"loss": 0.3931,
"num_tokens": 1469211173.0,
"step": 3845
},
{
"epoch": 2.627986348122867,
"grad_norm": 0.2541572268544839,
"learning_rate": 6.872435673149356e-06,
"loss": 0.391,
"num_tokens": 1471172352.0,
"step": 3850
},
{
"epoch": 2.631399317406143,
"grad_norm": 0.2613262919497558,
"learning_rate": 6.838771816537246e-06,
"loss": 0.3925,
"num_tokens": 1473141245.0,
"step": 3855
},
{
"epoch": 2.63481228668942,
"grad_norm": 0.2664766003389638,
"learning_rate": 6.805400430659915e-06,
"loss": 0.3871,
"num_tokens": 1474940344.0,
"step": 3860
},
{
"epoch": 2.638225255972696,
"grad_norm": 0.25284839351036326,
"learning_rate": 6.772321987907193e-06,
"loss": 0.3921,
"num_tokens": 1476903937.0,
"step": 3865
},
{
"epoch": 2.6416382252559725,
"grad_norm": 0.2719817319285575,
"learning_rate": 6.739536956522123e-06,
"loss": 0.4109,
"num_tokens": 1478813688.0,
"step": 3870
},
{
"epoch": 2.645051194539249,
"grad_norm": 0.2645200858858097,
"learning_rate": 6.707045800594355e-06,
"loss": 0.4038,
"num_tokens": 1480766595.0,
"step": 3875
},
{
"epoch": 2.6484641638225255,
"grad_norm": 0.26001001469112844,
"learning_rate": 6.674848980053584e-06,
"loss": 0.3966,
"num_tokens": 1482642747.0,
"step": 3880
},
{
"epoch": 2.651877133105802,
"grad_norm": 0.24500708714170433,
"learning_rate": 6.642946950663017e-06,
"loss": 0.396,
"num_tokens": 1484419324.0,
"step": 3885
},
{
"epoch": 2.6552901023890785,
"grad_norm": 0.23023591716339004,
"learning_rate": 6.611340164012951e-06,
"loss": 0.3902,
"num_tokens": 1486408843.0,
"step": 3890
},
{
"epoch": 2.658703071672355,
"grad_norm": 0.2563250156761757,
"learning_rate": 6.580029067514346e-06,
"loss": 0.3975,
"num_tokens": 1488281943.0,
"step": 3895
},
{
"epoch": 2.6621160409556315,
"grad_norm": 0.2318318316850287,
"learning_rate": 6.549014104392517e-06,
"loss": 0.3859,
"num_tokens": 1490170627.0,
"step": 3900
},
{
"epoch": 2.6655290102389078,
"grad_norm": 0.24688799503819153,
"learning_rate": 6.518295713680865e-06,
"loss": 0.4052,
"num_tokens": 1492100727.0,
"step": 3905
},
{
"epoch": 2.6689419795221845,
"grad_norm": 0.2678221813912207,
"learning_rate": 6.487874330214634e-06,
"loss": 0.3893,
"num_tokens": 1493930672.0,
"step": 3910
},
{
"epoch": 2.6723549488054608,
"grad_norm": 0.24999129402505174,
"learning_rate": 6.4577503846247705e-06,
"loss": 0.3937,
"num_tokens": 1495878482.0,
"step": 3915
},
{
"epoch": 2.675767918088737,
"grad_norm": 0.24973780539752083,
"learning_rate": 6.427924303331842e-06,
"loss": 0.3828,
"num_tokens": 1497662915.0,
"step": 3920
},
{
"epoch": 2.6791808873720138,
"grad_norm": 0.2346890895753833,
"learning_rate": 6.398396508539978e-06,
"loss": 0.4061,
"num_tokens": 1499588241.0,
"step": 3925
},
{
"epoch": 2.68259385665529,
"grad_norm": 0.24884364729155156,
"learning_rate": 6.369167418230905e-06,
"loss": 0.3998,
"num_tokens": 1501588836.0,
"step": 3930
},
{
"epoch": 2.6860068259385663,
"grad_norm": 0.23641487250123258,
"learning_rate": 6.340237446158029e-06,
"loss": 0.4003,
"num_tokens": 1503502828.0,
"step": 3935
},
{
"epoch": 2.689419795221843,
"grad_norm": 0.258701689896867,
"learning_rate": 6.31160700184058e-06,
"loss": 0.3897,
"num_tokens": 1505280996.0,
"step": 3940
},
{
"epoch": 2.6928327645051193,
"grad_norm": 0.24492715549764293,
"learning_rate": 6.283276490557805e-06,
"loss": 0.3874,
"num_tokens": 1507079279.0,
"step": 3945
},
{
"epoch": 2.696245733788396,
"grad_norm": 0.258468233589569,
"learning_rate": 6.255246313343244e-06,
"loss": 0.3901,
"num_tokens": 1508954940.0,
"step": 3950
},
{
"epoch": 2.6996587030716723,
"grad_norm": 0.24437266193161206,
"learning_rate": 6.227516866979042e-06,
"loss": 0.3992,
"num_tokens": 1510879092.0,
"step": 3955
},
{
"epoch": 2.703071672354949,
"grad_norm": 0.2297482842865935,
"learning_rate": 6.200088543990355e-06,
"loss": 0.3907,
"num_tokens": 1512799299.0,
"step": 3960
},
{
"epoch": 2.7064846416382253,
"grad_norm": 0.2468930948842974,
"learning_rate": 6.1729617326397484e-06,
"loss": 0.3923,
"num_tokens": 1514827859.0,
"step": 3965
},
{
"epoch": 2.7098976109215016,
"grad_norm": 0.25174441050573704,
"learning_rate": 6.1461368169217515e-06,
"loss": 0.3992,
"num_tokens": 1516677372.0,
"step": 3970
},
{
"epoch": 2.7133105802047783,
"grad_norm": 0.23550127912898583,
"learning_rate": 6.119614176557399e-06,
"loss": 0.3983,
"num_tokens": 1518630477.0,
"step": 3975
},
{
"epoch": 2.7167235494880546,
"grad_norm": 0.25435494543972065,
"learning_rate": 6.093394186988837e-06,
"loss": 0.3941,
"num_tokens": 1520552814.0,
"step": 3980
},
{
"epoch": 2.720136518771331,
"grad_norm": 0.24689339414271427,
"learning_rate": 6.0674772193740485e-06,
"loss": 0.3951,
"num_tokens": 1522494886.0,
"step": 3985
},
{
"epoch": 2.7235494880546076,
"grad_norm": 0.23307385388488122,
"learning_rate": 6.041863640581571e-06,
"loss": 0.4008,
"num_tokens": 1524429474.0,
"step": 3990
},
{
"epoch": 2.726962457337884,
"grad_norm": 0.2605356057614315,
"learning_rate": 6.016553813185308e-06,
"loss": 0.3893,
"num_tokens": 1526216361.0,
"step": 3995
},
{
"epoch": 2.73037542662116,
"grad_norm": 0.2523506025794084,
"learning_rate": 5.991548095459404e-06,
"loss": 0.4007,
"num_tokens": 1528138941.0,
"step": 4000
},
{
"epoch": 2.733788395904437,
"grad_norm": 0.2499474707985221,
"learning_rate": 5.966846841373165e-06,
"loss": 0.4069,
"num_tokens": 1530018296.0,
"step": 4005
},
{
"epoch": 2.737201365187713,
"grad_norm": 0.24921605460943203,
"learning_rate": 5.942450400586057e-06,
"loss": 0.4049,
"num_tokens": 1532049388.0,
"step": 4010
},
{
"epoch": 2.74061433447099,
"grad_norm": 0.2498586016104822,
"learning_rate": 5.9183591184427425e-06,
"loss": 0.4045,
"num_tokens": 1534051867.0,
"step": 4015
},
{
"epoch": 2.744027303754266,
"grad_norm": 0.24618708809713136,
"learning_rate": 5.894573335968203e-06,
"loss": 0.3853,
"num_tokens": 1535873961.0,
"step": 4020
},
{
"epoch": 2.747440273037543,
"grad_norm": 0.2505546224989872,
"learning_rate": 5.8710933898629166e-06,
"loss": 0.3837,
"num_tokens": 1537679516.0,
"step": 4025
},
{
"epoch": 2.750853242320819,
"grad_norm": 0.23197077317386236,
"learning_rate": 5.847919612498076e-06,
"loss": 0.3853,
"num_tokens": 1539735224.0,
"step": 4030
},
{
"epoch": 2.7542662116040955,
"grad_norm": 0.2439125129228283,
"learning_rate": 5.825052331910887e-06,
"loss": 0.4079,
"num_tokens": 1541826635.0,
"step": 4035
},
{
"epoch": 2.757679180887372,
"grad_norm": 0.23851878381859146,
"learning_rate": 5.8024918717999445e-06,
"loss": 0.3995,
"num_tokens": 1543746445.0,
"step": 4040
},
{
"epoch": 2.7610921501706485,
"grad_norm": 0.24287441981308092,
"learning_rate": 5.780238551520622e-06,
"loss": 0.4013,
"num_tokens": 1545660100.0,
"step": 4045
},
{
"epoch": 2.7645051194539247,
"grad_norm": 0.27270907920127346,
"learning_rate": 5.75829268608057e-06,
"loss": 0.3939,
"num_tokens": 1547633390.0,
"step": 4050
},
{
"epoch": 2.7679180887372015,
"grad_norm": 0.2215511372126067,
"learning_rate": 5.7366545861352515e-06,
"loss": 0.3934,
"num_tokens": 1549636484.0,
"step": 4055
},
{
"epoch": 2.7713310580204777,
"grad_norm": 0.23824072596286747,
"learning_rate": 5.715324557983544e-06,
"loss": 0.3985,
"num_tokens": 1551617632.0,
"step": 4060
},
{
"epoch": 2.774744027303754,
"grad_norm": 0.2500935931294177,
"learning_rate": 5.694302903563405e-06,
"loss": 0.4064,
"num_tokens": 1553671734.0,
"step": 4065
},
{
"epoch": 2.7781569965870307,
"grad_norm": 0.26068724299179286,
"learning_rate": 5.673589920447592e-06,
"loss": 0.3916,
"num_tokens": 1555493757.0,
"step": 4070
},
{
"epoch": 2.781569965870307,
"grad_norm": 0.2541875918112447,
"learning_rate": 5.653185901839459e-06,
"loss": 0.3856,
"num_tokens": 1557404748.0,
"step": 4075
},
{
"epoch": 2.7849829351535837,
"grad_norm": 0.24522050182761787,
"learning_rate": 5.6330911365688025e-06,
"loss": 0.387,
"num_tokens": 1559334578.0,
"step": 4080
},
{
"epoch": 2.78839590443686,
"grad_norm": 0.2494978034193831,
"learning_rate": 5.613305909087776e-06,
"loss": 0.4005,
"num_tokens": 1561255667.0,
"step": 4085
},
{
"epoch": 2.7918088737201368,
"grad_norm": 0.2585725559042003,
"learning_rate": 5.593830499466846e-06,
"loss": 0.3983,
"num_tokens": 1563224423.0,
"step": 4090
},
{
"epoch": 2.795221843003413,
"grad_norm": 0.24273735328231116,
"learning_rate": 5.574665183390861e-06,
"loss": 0.4039,
"num_tokens": 1565191375.0,
"step": 4095
},
{
"epoch": 2.7986348122866893,
"grad_norm": 0.2717130897992212,
"learning_rate": 5.5558102321551155e-06,
"loss": 0.3982,
"num_tokens": 1567008310.0,
"step": 4100
},
{
"epoch": 2.802047781569966,
"grad_norm": 0.25253291196009875,
"learning_rate": 5.537265912661524e-06,
"loss": 0.4122,
"num_tokens": 1568946206.0,
"step": 4105
},
{
"epoch": 2.8054607508532423,
"grad_norm": 0.2619145601499863,
"learning_rate": 5.519032487414857e-06,
"loss": 0.3993,
"num_tokens": 1570872759.0,
"step": 4110
},
{
"epoch": 2.8088737201365186,
"grad_norm": 0.2686949634633545,
"learning_rate": 5.501110214518992e-06,
"loss": 0.3937,
"num_tokens": 1572860132.0,
"step": 4115
},
{
"epoch": 2.8122866894197953,
"grad_norm": 0.23455834216077803,
"learning_rate": 5.483499347673291e-06,
"loss": 0.3819,
"num_tokens": 1574790093.0,
"step": 4120
},
{
"epoch": 2.8156996587030716,
"grad_norm": 0.2354856688706412,
"learning_rate": 5.466200136168988e-06,
"loss": 0.3994,
"num_tokens": 1576855613.0,
"step": 4125
},
{
"epoch": 2.819112627986348,
"grad_norm": 0.2404835881052928,
"learning_rate": 5.449212824885679e-06,
"loss": 0.3945,
"num_tokens": 1578697231.0,
"step": 4130
},
{
"epoch": 2.8225255972696246,
"grad_norm": 0.24782491779127389,
"learning_rate": 5.432537654287839e-06,
"loss": 0.3926,
"num_tokens": 1580557801.0,
"step": 4135
},
{
"epoch": 2.825938566552901,
"grad_norm": 0.25709964300656496,
"learning_rate": 5.416174860421423e-06,
"loss": 0.4062,
"num_tokens": 1582409503.0,
"step": 4140
},
{
"epoch": 2.8293515358361776,
"grad_norm": 0.24193936395054547,
"learning_rate": 5.400124674910531e-06,
"loss": 0.3838,
"num_tokens": 1584225497.0,
"step": 4145
},
{
"epoch": 2.832764505119454,
"grad_norm": 0.24785601077252273,
"learning_rate": 5.384387324954123e-06,
"loss": 0.3823,
"num_tokens": 1586141543.0,
"step": 4150
},
{
"epoch": 2.8361774744027306,
"grad_norm": 0.24772540511880872,
"learning_rate": 5.368963033322803e-06,
"loss": 0.395,
"num_tokens": 1588008199.0,
"step": 4155
},
{
"epoch": 2.839590443686007,
"grad_norm": 0.2616741371511031,
"learning_rate": 5.353852018355671e-06,
"loss": 0.3858,
"num_tokens": 1589775524.0,
"step": 4160
},
{
"epoch": 2.843003412969283,
"grad_norm": 0.23998611616488993,
"learning_rate": 5.339054493957223e-06,
"loss": 0.4036,
"num_tokens": 1591713597.0,
"step": 4165
},
{
"epoch": 2.84641638225256,
"grad_norm": 0.2473340920387729,
"learning_rate": 5.324570669594329e-06,
"loss": 0.3987,
"num_tokens": 1593620097.0,
"step": 4170
},
{
"epoch": 2.849829351535836,
"grad_norm": 0.24291234707437337,
"learning_rate": 5.310400750293274e-06,
"loss": 0.3885,
"num_tokens": 1595431634.0,
"step": 4175
},
{
"epoch": 2.8532423208191124,
"grad_norm": 0.25195102405447184,
"learning_rate": 5.296544936636839e-06,
"loss": 0.3991,
"num_tokens": 1597345689.0,
"step": 4180
},
{
"epoch": 2.856655290102389,
"grad_norm": 0.24244585678705705,
"learning_rate": 5.283003424761481e-06,
"loss": 0.4016,
"num_tokens": 1599337430.0,
"step": 4185
},
{
"epoch": 2.8600682593856654,
"grad_norm": 0.2420260300363789,
"learning_rate": 5.269776406354538e-06,
"loss": 0.3888,
"num_tokens": 1601383218.0,
"step": 4190
},
{
"epoch": 2.8634812286689417,
"grad_norm": 0.23531179171366776,
"learning_rate": 5.25686406865153e-06,
"loss": 0.3986,
"num_tokens": 1603461399.0,
"step": 4195
},
{
"epoch": 2.8668941979522184,
"grad_norm": 0.23924316659304262,
"learning_rate": 5.244266594433509e-06,
"loss": 0.403,
"num_tokens": 1605406571.0,
"step": 4200
},
{
"epoch": 2.8703071672354947,
"grad_norm": 0.25159268694642145,
"learning_rate": 5.231984162024453e-06,
"loss": 0.4079,
"num_tokens": 1607383394.0,
"step": 4205
},
{
"epoch": 2.8737201365187715,
"grad_norm": 0.26246430303018,
"learning_rate": 5.220016945288762e-06,
"loss": 0.3947,
"num_tokens": 1609294043.0,
"step": 4210
},
{
"epoch": 2.8771331058020477,
"grad_norm": 0.2583127082260596,
"learning_rate": 5.208365113628795e-06,
"loss": 0.4073,
"num_tokens": 1611262646.0,
"step": 4215
},
{
"epoch": 2.8805460750853245,
"grad_norm": 0.2703860048405685,
"learning_rate": 5.197028831982456e-06,
"loss": 0.3951,
"num_tokens": 1613173875.0,
"step": 4220
},
{
"epoch": 2.8839590443686007,
"grad_norm": 0.24985368025540616,
"learning_rate": 5.186008260820875e-06,
"loss": 0.3894,
"num_tokens": 1615060675.0,
"step": 4225
},
{
"epoch": 2.887372013651877,
"grad_norm": 0.24347269615277906,
"learning_rate": 5.17530355614613e-06,
"loss": 0.3865,
"num_tokens": 1616973984.0,
"step": 4230
},
{
"epoch": 2.8907849829351537,
"grad_norm": 0.25860445333919463,
"learning_rate": 5.164914869489042e-06,
"loss": 0.3886,
"num_tokens": 1618849994.0,
"step": 4235
},
{
"epoch": 2.89419795221843,
"grad_norm": 0.25992126090841056,
"learning_rate": 5.154842347907027e-06,
"loss": 0.3949,
"num_tokens": 1620722746.0,
"step": 4240
},
{
"epoch": 2.8976109215017063,
"grad_norm": 0.2507997098825153,
"learning_rate": 5.145086133982016e-06,
"loss": 0.4036,
"num_tokens": 1622535501.0,
"step": 4245
},
{
"epoch": 2.901023890784983,
"grad_norm": 0.25556096631117053,
"learning_rate": 5.1356463658184294e-06,
"loss": 0.4005,
"num_tokens": 1624444287.0,
"step": 4250
},
{
"epoch": 2.9044368600682593,
"grad_norm": 0.247634476621531,
"learning_rate": 5.126523177041238e-06,
"loss": 0.4,
"num_tokens": 1626411719.0,
"step": 4255
},
{
"epoch": 2.9078498293515356,
"grad_norm": 0.24948299890290146,
"learning_rate": 5.117716696794059e-06,
"loss": 0.3981,
"num_tokens": 1628332813.0,
"step": 4260
},
{
"epoch": 2.9112627986348123,
"grad_norm": 0.23498087131331388,
"learning_rate": 5.109227049737329e-06,
"loss": 0.3856,
"num_tokens": 1630193509.0,
"step": 4265
},
{
"epoch": 2.9146757679180886,
"grad_norm": 0.2359135742357886,
"learning_rate": 5.101054356046542e-06,
"loss": 0.4039,
"num_tokens": 1632145825.0,
"step": 4270
},
{
"epoch": 2.9180887372013653,
"grad_norm": 0.2453789165871806,
"learning_rate": 5.093198731410548e-06,
"loss": 0.4038,
"num_tokens": 1634164850.0,
"step": 4275
},
{
"epoch": 2.9215017064846416,
"grad_norm": 0.2615881798789952,
"learning_rate": 5.085660287029918e-06,
"loss": 0.3906,
"num_tokens": 1635989796.0,
"step": 4280
},
{
"epoch": 2.9249146757679183,
"grad_norm": 0.26649221952806657,
"learning_rate": 5.078439129615357e-06,
"loss": 0.383,
"num_tokens": 1637928342.0,
"step": 4285
},
{
"epoch": 2.9283276450511946,
"grad_norm": 0.2612187057511615,
"learning_rate": 5.071535361386216e-06,
"loss": 0.3944,
"num_tokens": 1639831973.0,
"step": 4290
},
{
"epoch": 2.931740614334471,
"grad_norm": 0.2573017244859236,
"learning_rate": 5.064949080069025e-06,
"loss": 0.3888,
"num_tokens": 1641670758.0,
"step": 4295
},
{
"epoch": 2.9351535836177476,
"grad_norm": 0.2513577594767894,
"learning_rate": 5.058680378896119e-06,
"loss": 0.4059,
"num_tokens": 1643634455.0,
"step": 4300
},
{
"epoch": 2.938566552901024,
"grad_norm": 0.24800683186403535,
"learning_rate": 5.0527293466043126e-06,
"loss": 0.3838,
"num_tokens": 1645383276.0,
"step": 4305
},
{
"epoch": 2.9419795221843,
"grad_norm": 0.2433798047992357,
"learning_rate": 5.047096067433657e-06,
"loss": 0.394,
"num_tokens": 1647349819.0,
"step": 4310
},
{
"epoch": 2.945392491467577,
"grad_norm": 0.24355801889840303,
"learning_rate": 5.0417806211262245e-06,
"loss": 0.3859,
"num_tokens": 1649242582.0,
"step": 4315
},
{
"epoch": 2.948805460750853,
"grad_norm": 0.25828743538143506,
"learning_rate": 5.036783082925003e-06,
"loss": 0.4005,
"num_tokens": 1651206051.0,
"step": 4320
},
{
"epoch": 2.9522184300341294,
"grad_norm": 0.2514852306063191,
"learning_rate": 5.032103523572822e-06,
"loss": 0.3914,
"num_tokens": 1653037850.0,
"step": 4325
},
{
"epoch": 2.955631399317406,
"grad_norm": 0.26158521210090824,
"learning_rate": 5.027742009311342e-06,
"loss": 0.3972,
"num_tokens": 1654950263.0,
"step": 4330
},
{
"epoch": 2.9590443686006824,
"grad_norm": 0.2302195005923823,
"learning_rate": 5.023698601880131e-06,
"loss": 0.3953,
"num_tokens": 1656844160.0,
"step": 4335
},
{
"epoch": 2.962457337883959,
"grad_norm": 0.25697806948819085,
"learning_rate": 5.019973358515785e-06,
"loss": 0.3881,
"num_tokens": 1658618203.0,
"step": 4340
},
{
"epoch": 2.9658703071672354,
"grad_norm": 0.26230031669230236,
"learning_rate": 5.016566331951116e-06,
"loss": 0.3995,
"num_tokens": 1660479956.0,
"step": 4345
},
{
"epoch": 2.969283276450512,
"grad_norm": 0.24647721425219105,
"learning_rate": 5.013477570414405e-06,
"loss": 0.3947,
"num_tokens": 1662435735.0,
"step": 4350
},
{
"epoch": 2.9726962457337884,
"grad_norm": 0.24145722179190598,
"learning_rate": 5.010707117628725e-06,
"loss": 0.3984,
"num_tokens": 1664298270.0,
"step": 4355
},
{
"epoch": 2.9761092150170647,
"grad_norm": 0.2541130044127246,
"learning_rate": 5.008255012811318e-06,
"loss": 0.3881,
"num_tokens": 1666219228.0,
"step": 4360
},
{
"epoch": 2.9795221843003414,
"grad_norm": 0.2554311961056411,
"learning_rate": 5.006121290673037e-06,
"loss": 0.4008,
"num_tokens": 1668243685.0,
"step": 4365
},
{
"epoch": 2.9829351535836177,
"grad_norm": 0.236207532281216,
"learning_rate": 5.004305981417863e-06,
"loss": 0.3904,
"num_tokens": 1670202629.0,
"step": 4370
},
{
"epoch": 2.986348122866894,
"grad_norm": 0.23352563711926605,
"learning_rate": 5.002809110742464e-06,
"loss": 0.3968,
"num_tokens": 1672186589.0,
"step": 4375
},
{
"epoch": 2.9897610921501707,
"grad_norm": 0.26559636339552617,
"learning_rate": 5.001630699835849e-06,
"loss": 0.4007,
"num_tokens": 1674108971.0,
"step": 4380
},
{
"epoch": 2.993174061433447,
"grad_norm": 0.23381874902902047,
"learning_rate": 5.000770765379057e-06,
"loss": 0.3962,
"num_tokens": 1676074131.0,
"step": 4385
},
{
"epoch": 2.9965870307167233,
"grad_norm": 0.24395543004381443,
"learning_rate": 5.000229319544913e-06,
"loss": 0.3862,
"num_tokens": 1677958772.0,
"step": 4390
},
{
"epoch": 3.0,
"grad_norm": 0.24316523321927602,
"learning_rate": 5.0000063699978795e-06,
"loss": 0.3918,
"num_tokens": 1679763258.0,
"step": 4395
},
{
"epoch": 3.0,
"step": 4395,
"total_flos": 3436437652111360.0,
"train_loss": 0.4596632290483199,
"train_runtime": 36029.2862,
"train_samples_per_second": 7.805,
"train_steps_per_second": 0.122
}
],
"logging_steps": 5,
"max_steps": 4395,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3436437652111360.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}