Qwen2.5-VL-7B-Instruct-SFT / trainer_state.json
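For reference, the records below follow the standard Hugging Face Trainer log_history layout: per-logging-step dicts with epoch/step/loss/learning_rate/grad_norm, plus periodic eval records carrying eval_loss and throughput fields. A minimal sketch for pulling the train and eval loss curves back out of a file like this with plain Python (the local filename is an assumption; point it at your own copy):

import json

# Load the Trainer checkpoint state and split log_history into
# training-loss points and eval-loss points keyed by step.
with open("trainer_state.json") as f:
    state = json.load(f)

train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"logged {len(train_curve)} training points up to step {state['global_step']}")
for step, loss in eval_curve:
    print(f"eval @ step {step}: loss = {loss:.4f}")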
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998919736415686,
"eval_steps": 500,
"global_step": 3471,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014403514457527638,
"grad_norm": 2.62252908668901,
"learning_rate": 2.2988505747126437e-07,
"loss": 3.4281,
"step": 5
},
{
"epoch": 0.0028807028915055276,
"grad_norm": 2.6537402724592827,
"learning_rate": 5.172413793103449e-07,
"loss": 3.4375,
"step": 10
},
{
"epoch": 0.004321054337258291,
"grad_norm": 2.6461606608506973,
"learning_rate": 8.045977011494253e-07,
"loss": 3.4271,
"step": 15
},
{
"epoch": 0.005761405783011055,
"grad_norm": 2.6482412561136113,
"learning_rate": 1.0919540229885058e-06,
"loss": 3.4316,
"step": 20
},
{
"epoch": 0.007201757228763818,
"grad_norm": 2.7090530181422676,
"learning_rate": 1.3793103448275862e-06,
"loss": 3.4271,
"step": 25
},
{
"epoch": 0.008642108674516582,
"grad_norm": 2.6957713098098757,
"learning_rate": 1.6666666666666667e-06,
"loss": 3.4222,
"step": 30
},
{
"epoch": 0.010082460120269346,
"grad_norm": 2.823975463539696,
"learning_rate": 1.9540229885057475e-06,
"loss": 3.4122,
"step": 35
},
{
"epoch": 0.01152281156602211,
"grad_norm": 2.906829198673145,
"learning_rate": 2.241379310344828e-06,
"loss": 3.3972,
"step": 40
},
{
"epoch": 0.012963163011774873,
"grad_norm": 2.959760309292492,
"learning_rate": 2.5287356321839083e-06,
"loss": 3.3748,
"step": 45
},
{
"epoch": 0.014403514457527637,
"grad_norm": 3.0140610424144882,
"learning_rate": 2.8160919540229887e-06,
"loss": 3.3312,
"step": 50
},
{
"epoch": 0.0158438659032804,
"grad_norm": 3.0958728084508227,
"learning_rate": 3.103448275862069e-06,
"loss": 3.2621,
"step": 55
},
{
"epoch": 0.017284217349033165,
"grad_norm": 3.0789510621647116,
"learning_rate": 3.3908045977011496e-06,
"loss": 3.1805,
"step": 60
},
{
"epoch": 0.01872456879478593,
"grad_norm": 3.060095215151247,
"learning_rate": 3.67816091954023e-06,
"loss": 3.0475,
"step": 65
},
{
"epoch": 0.020164920240538693,
"grad_norm": 2.862619501338678,
"learning_rate": 3.96551724137931e-06,
"loss": 2.8765,
"step": 70
},
{
"epoch": 0.021605271686291457,
"grad_norm": 2.4529701308424627,
"learning_rate": 4.252873563218391e-06,
"loss": 2.6438,
"step": 75
},
{
"epoch": 0.02304562313204422,
"grad_norm": 1.9259597661186747,
"learning_rate": 4.540229885057471e-06,
"loss": 2.4074,
"step": 80
},
{
"epoch": 0.02448597457779698,
"grad_norm": 1.1944899501060937,
"learning_rate": 4.8275862068965525e-06,
"loss": 2.1598,
"step": 85
},
{
"epoch": 0.025926326023549745,
"grad_norm": 0.7043736273275593,
"learning_rate": 5.114942528735632e-06,
"loss": 1.986,
"step": 90
},
{
"epoch": 0.02736667746930251,
"grad_norm": 0.4401877202562375,
"learning_rate": 5.402298850574713e-06,
"loss": 1.8752,
"step": 95
},
{
"epoch": 0.028807028915055273,
"grad_norm": 0.29913771411892215,
"learning_rate": 5.689655172413794e-06,
"loss": 1.8032,
"step": 100
},
{
"epoch": 0.030247380360808037,
"grad_norm": 0.2501344036852339,
"learning_rate": 5.977011494252874e-06,
"loss": 1.762,
"step": 105
},
{
"epoch": 0.0316877318065608,
"grad_norm": 0.22028235935534177,
"learning_rate": 6.264367816091954e-06,
"loss": 1.7284,
"step": 110
},
{
"epoch": 0.03312808325231356,
"grad_norm": 0.20809861418922077,
"learning_rate": 6.551724137931035e-06,
"loss": 1.7045,
"step": 115
},
{
"epoch": 0.03456843469806633,
"grad_norm": 0.20520136163663816,
"learning_rate": 6.839080459770115e-06,
"loss": 1.6829,
"step": 120
},
{
"epoch": 0.03600878614381909,
"grad_norm": 0.20174383175553343,
"learning_rate": 7.126436781609196e-06,
"loss": 1.6605,
"step": 125
},
{
"epoch": 0.03744913758957186,
"grad_norm": 0.20319553328609763,
"learning_rate": 7.413793103448277e-06,
"loss": 1.6379,
"step": 130
},
{
"epoch": 0.03888948903532462,
"grad_norm": 0.2038342591080777,
"learning_rate": 7.701149425287356e-06,
"loss": 1.6107,
"step": 135
},
{
"epoch": 0.040329840481077385,
"grad_norm": 0.20440794132298992,
"learning_rate": 7.988505747126438e-06,
"loss": 1.5846,
"step": 140
},
{
"epoch": 0.041770191926830146,
"grad_norm": 0.20590893215016112,
"learning_rate": 8.275862068965518e-06,
"loss": 1.5547,
"step": 145
},
{
"epoch": 0.04321054337258291,
"grad_norm": 0.20185248245660797,
"learning_rate": 8.563218390804599e-06,
"loss": 1.526,
"step": 150
},
{
"epoch": 0.044650894818335674,
"grad_norm": 0.1991870123255739,
"learning_rate": 8.85057471264368e-06,
"loss": 1.4937,
"step": 155
},
{
"epoch": 0.04609124626408844,
"grad_norm": 0.19465384170558617,
"learning_rate": 9.13793103448276e-06,
"loss": 1.4598,
"step": 160
},
{
"epoch": 0.0475315977098412,
"grad_norm": 0.18797574210376147,
"learning_rate": 9.42528735632184e-06,
"loss": 1.4343,
"step": 165
},
{
"epoch": 0.04897194915559396,
"grad_norm": 0.18212469780560764,
"learning_rate": 9.71264367816092e-06,
"loss": 1.3986,
"step": 170
},
{
"epoch": 0.05041230060134673,
"grad_norm": 0.1851275242281937,
"learning_rate": 1e-05,
"loss": 1.3663,
"step": 175
},
{
"epoch": 0.05185265204709949,
"grad_norm": 0.1894926644233431,
"learning_rate": 1.0287356321839081e-05,
"loss": 1.3365,
"step": 180
},
{
"epoch": 0.05329300349285226,
"grad_norm": 0.19239941503471172,
"learning_rate": 1.0574712643678162e-05,
"loss": 1.3029,
"step": 185
},
{
"epoch": 0.05473335493860502,
"grad_norm": 0.1983109078749674,
"learning_rate": 1.0862068965517242e-05,
"loss": 1.2663,
"step": 190
},
{
"epoch": 0.056173706384357786,
"grad_norm": 0.20536206577251548,
"learning_rate": 1.1149425287356324e-05,
"loss": 1.2238,
"step": 195
},
{
"epoch": 0.057614057830110546,
"grad_norm": 0.20960804903533373,
"learning_rate": 1.1436781609195405e-05,
"loss": 1.1841,
"step": 200
},
{
"epoch": 0.059054409275863314,
"grad_norm": 0.2188932735565814,
"learning_rate": 1.1724137931034483e-05,
"loss": 1.1332,
"step": 205
},
{
"epoch": 0.060494760721616074,
"grad_norm": 0.22876533797063386,
"learning_rate": 1.2011494252873564e-05,
"loss": 1.0851,
"step": 210
},
{
"epoch": 0.061935112167368835,
"grad_norm": 0.23174345011010952,
"learning_rate": 1.2298850574712644e-05,
"loss": 1.0293,
"step": 215
},
{
"epoch": 0.0633754636131216,
"grad_norm": 0.23330633292890332,
"learning_rate": 1.2586206896551725e-05,
"loss": 0.9716,
"step": 220
},
{
"epoch": 0.06481581505887436,
"grad_norm": 0.24464497890825554,
"learning_rate": 1.2873563218390805e-05,
"loss": 0.9035,
"step": 225
},
{
"epoch": 0.06625616650462712,
"grad_norm": 0.25134894132627916,
"learning_rate": 1.3160919540229885e-05,
"loss": 0.8296,
"step": 230
},
{
"epoch": 0.0676965179503799,
"grad_norm": 0.2586116096970685,
"learning_rate": 1.3448275862068967e-05,
"loss": 0.749,
"step": 235
},
{
"epoch": 0.06913686939613266,
"grad_norm": 0.25755530564734574,
"learning_rate": 1.3735632183908048e-05,
"loss": 0.657,
"step": 240
},
{
"epoch": 0.07057722084188542,
"grad_norm": 0.24571389947226407,
"learning_rate": 1.4022988505747128e-05,
"loss": 0.5611,
"step": 245
},
{
"epoch": 0.07201757228763818,
"grad_norm": 0.2161386991422811,
"learning_rate": 1.4310344827586209e-05,
"loss": 0.4684,
"step": 250
},
{
"epoch": 0.07345792373339095,
"grad_norm": 0.1845577123208276,
"learning_rate": 1.459770114942529e-05,
"loss": 0.3929,
"step": 255
},
{
"epoch": 0.07489827517914371,
"grad_norm": 0.15938611776040426,
"learning_rate": 1.4885057471264368e-05,
"loss": 0.3274,
"step": 260
},
{
"epoch": 0.07633862662489647,
"grad_norm": 0.13184840828375916,
"learning_rate": 1.5172413793103448e-05,
"loss": 0.273,
"step": 265
},
{
"epoch": 0.07777897807064924,
"grad_norm": 0.10547360872838579,
"learning_rate": 1.545977011494253e-05,
"loss": 0.2341,
"step": 270
},
{
"epoch": 0.079219329516402,
"grad_norm": 0.08378821715636167,
"learning_rate": 1.574712643678161e-05,
"loss": 0.2152,
"step": 275
},
{
"epoch": 0.08065968096215477,
"grad_norm": 0.06639227255563633,
"learning_rate": 1.603448275862069e-05,
"loss": 0.2028,
"step": 280
},
{
"epoch": 0.08210003240790753,
"grad_norm": 0.05326270333294462,
"learning_rate": 1.632183908045977e-05,
"loss": 0.1915,
"step": 285
},
{
"epoch": 0.08354038385366029,
"grad_norm": 0.045250863339317625,
"learning_rate": 1.6609195402298854e-05,
"loss": 0.1831,
"step": 290
},
{
"epoch": 0.08498073529941305,
"grad_norm": 0.03997876406980682,
"learning_rate": 1.6896551724137932e-05,
"loss": 0.1783,
"step": 295
},
{
"epoch": 0.08642108674516583,
"grad_norm": 0.03683411390731312,
"learning_rate": 1.7183908045977015e-05,
"loss": 0.1803,
"step": 300
},
{
"epoch": 0.08786143819091859,
"grad_norm": 0.033494444332579816,
"learning_rate": 1.7471264367816093e-05,
"loss": 0.1792,
"step": 305
},
{
"epoch": 0.08930178963667135,
"grad_norm": 0.027545194459475855,
"learning_rate": 1.7758620689655175e-05,
"loss": 0.1776,
"step": 310
},
{
"epoch": 0.09074214108242411,
"grad_norm": 0.025534566575995447,
"learning_rate": 1.8045977011494254e-05,
"loss": 0.1627,
"step": 315
},
{
"epoch": 0.09218249252817688,
"grad_norm": 0.02432314883206978,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.1694,
"step": 320
},
{
"epoch": 0.09362284397392964,
"grad_norm": 0.02296007196296768,
"learning_rate": 1.8620689655172415e-05,
"loss": 0.1632,
"step": 325
},
{
"epoch": 0.0950631954196824,
"grad_norm": 0.02231231517863281,
"learning_rate": 1.8908045977011497e-05,
"loss": 0.1603,
"step": 330
},
{
"epoch": 0.09650354686543516,
"grad_norm": 0.020592453220906963,
"learning_rate": 1.9195402298850576e-05,
"loss": 0.1578,
"step": 335
},
{
"epoch": 0.09794389831118792,
"grad_norm": 0.020380116268697165,
"learning_rate": 1.9482758620689658e-05,
"loss": 0.1568,
"step": 340
},
{
"epoch": 0.0993842497569407,
"grad_norm": 0.02128754387207953,
"learning_rate": 1.9770114942528737e-05,
"loss": 0.1577,
"step": 345
},
{
"epoch": 0.10082460120269346,
"grad_norm": 0.020077957237521784,
"learning_rate": 1.9999994940288617e-05,
"loss": 0.1645,
"step": 350
},
{
"epoch": 0.10226495264844622,
"grad_norm": 0.02081981445796709,
"learning_rate": 1.999981785092774e-05,
"loss": 0.1458,
"step": 355
},
{
"epoch": 0.10370530409419898,
"grad_norm": 0.019642666023756323,
"learning_rate": 1.9999387781117715e-05,
"loss": 0.1515,
"step": 360
},
{
"epoch": 0.10514565553995175,
"grad_norm": 0.01993767310786855,
"learning_rate": 1.9998704741738657e-05,
"loss": 0.1558,
"step": 365
},
{
"epoch": 0.10658600698570452,
"grad_norm": 0.018085760313418967,
"learning_rate": 1.9997768750070442e-05,
"loss": 0.1553,
"step": 370
},
{
"epoch": 0.10802635843145728,
"grad_norm": 0.0203452490973548,
"learning_rate": 1.9996579829792263e-05,
"loss": 0.15,
"step": 375
},
{
"epoch": 0.10946670987721004,
"grad_norm": 0.018590604714013987,
"learning_rate": 1.9995138010982028e-05,
"loss": 0.1492,
"step": 380
},
{
"epoch": 0.1109070613229628,
"grad_norm": 0.017630128077178075,
"learning_rate": 1.9993443330115592e-05,
"loss": 0.1474,
"step": 385
},
{
"epoch": 0.11234741276871557,
"grad_norm": 0.019441688441053217,
"learning_rate": 1.9991495830065857e-05,
"loss": 0.1509,
"step": 390
},
{
"epoch": 0.11378776421446833,
"grad_norm": 0.023197073067636517,
"learning_rate": 1.9989295560101656e-05,
"loss": 0.1512,
"step": 395
},
{
"epoch": 0.11522811566022109,
"grad_norm": 0.022598358372408592,
"learning_rate": 1.998684257588654e-05,
"loss": 0.1457,
"step": 400
},
{
"epoch": 0.11666846710597385,
"grad_norm": 0.018391884570447603,
"learning_rate": 1.9984136939477333e-05,
"loss": 0.1487,
"step": 405
},
{
"epoch": 0.11810881855172663,
"grad_norm": 0.01730298666489709,
"learning_rate": 1.9981178719322606e-05,
"loss": 0.1408,
"step": 410
},
{
"epoch": 0.11954916999747939,
"grad_norm": 0.018077200725101054,
"learning_rate": 1.99779679902609e-05,
"loss": 0.1543,
"step": 415
},
{
"epoch": 0.12098952144323215,
"grad_norm": 0.018875155851426084,
"learning_rate": 1.9974504833518863e-05,
"loss": 0.1526,
"step": 420
},
{
"epoch": 0.12242987288898491,
"grad_norm": 0.01550384193559367,
"learning_rate": 1.9970789336709185e-05,
"loss": 0.1503,
"step": 425
},
{
"epoch": 0.12387022433473767,
"grad_norm": 0.01563256542028533,
"learning_rate": 1.9966821593828393e-05,
"loss": 0.1475,
"step": 430
},
{
"epoch": 0.12531057578049043,
"grad_norm": 0.015801051344428534,
"learning_rate": 1.9962601705254442e-05,
"loss": 0.1384,
"step": 435
},
{
"epoch": 0.1267509272262432,
"grad_norm": 0.018919536477648418,
"learning_rate": 1.995812977774421e-05,
"loss": 0.146,
"step": 440
},
{
"epoch": 0.12819127867199598,
"grad_norm": 0.01564902677446783,
"learning_rate": 1.995340592443078e-05,
"loss": 0.1446,
"step": 445
},
{
"epoch": 0.12963163011774873,
"grad_norm": 0.01649586526356844,
"learning_rate": 1.9948430264820588e-05,
"loss": 0.1443,
"step": 450
},
{
"epoch": 0.1310719815635015,
"grad_norm": 0.016233025873589237,
"learning_rate": 1.994320292479038e-05,
"loss": 0.1398,
"step": 455
},
{
"epoch": 0.13251233300925425,
"grad_norm": 0.018370727196108234,
"learning_rate": 1.993772403658405e-05,
"loss": 0.1413,
"step": 460
},
{
"epoch": 0.13395268445500702,
"grad_norm": 0.015413145114948405,
"learning_rate": 1.9931993738809288e-05,
"loss": 0.1377,
"step": 465
},
{
"epoch": 0.1353930359007598,
"grad_norm": 0.01579533249322328,
"learning_rate": 1.9926012176434054e-05,
"loss": 0.1343,
"step": 470
},
{
"epoch": 0.13683338734651254,
"grad_norm": 0.016623384137655177,
"learning_rate": 1.991977950078295e-05,
"loss": 0.1362,
"step": 475
},
{
"epoch": 0.13827373879226532,
"grad_norm": 0.016561476846420342,
"learning_rate": 1.9913295869533345e-05,
"loss": 0.1383,
"step": 480
},
{
"epoch": 0.13971409023801806,
"grad_norm": 0.016900232091944686,
"learning_rate": 1.990656144671143e-05,
"loss": 0.1448,
"step": 485
},
{
"epoch": 0.14115444168377084,
"grad_norm": 0.018123435569779828,
"learning_rate": 1.9899576402688038e-05,
"loss": 0.1364,
"step": 490
},
{
"epoch": 0.1425947931295236,
"grad_norm": 0.01869327376515684,
"learning_rate": 1.9892340914174344e-05,
"loss": 0.1457,
"step": 495
},
{
"epoch": 0.14403514457527636,
"grad_norm": 0.017383446282339288,
"learning_rate": 1.988485516421739e-05,
"loss": 0.1369,
"step": 500
},
{
"epoch": 0.14403514457527636,
"eval_loss": 0.13828575611114502,
"eval_runtime": 863.0505,
"eval_samples_per_second": 2.09,
"eval_steps_per_second": 0.523,
"step": 500
},
{
"epoch": 0.14547549602102913,
"grad_norm": 0.01727758799136872,
"learning_rate": 1.9877119342195478e-05,
"loss": 0.141,
"step": 505
},
{
"epoch": 0.1469158474667819,
"grad_norm": 0.020724335957621343,
"learning_rate": 1.986913364381333e-05,
"loss": 0.1417,
"step": 510
},
{
"epoch": 0.14835619891253465,
"grad_norm": 0.015171059899028828,
"learning_rate": 1.9860898271097194e-05,
"loss": 0.1361,
"step": 515
},
{
"epoch": 0.14979655035828743,
"grad_norm": 0.019695421273977713,
"learning_rate": 1.9852413432389685e-05,
"loss": 0.1314,
"step": 520
},
{
"epoch": 0.15123690180404017,
"grad_norm": 0.016105641997242213,
"learning_rate": 1.984367934234455e-05,
"loss": 0.1355,
"step": 525
},
{
"epoch": 0.15267725324979295,
"grad_norm": 0.017169458650654956,
"learning_rate": 1.9834696221921213e-05,
"loss": 0.135,
"step": 530
},
{
"epoch": 0.15411760469554572,
"grad_norm": 0.016758621454569307,
"learning_rate": 1.98254642983792e-05,
"loss": 0.1319,
"step": 535
},
{
"epoch": 0.15555795614129847,
"grad_norm": 0.018163113510772863,
"learning_rate": 1.9815983805272378e-05,
"loss": 0.1303,
"step": 540
},
{
"epoch": 0.15699830758705124,
"grad_norm": 0.017990247516200045,
"learning_rate": 1.980625498244306e-05,
"loss": 0.139,
"step": 545
},
{
"epoch": 0.158438659032804,
"grad_norm": 0.015967634026842183,
"learning_rate": 1.9796278076015924e-05,
"loss": 0.1319,
"step": 550
},
{
"epoch": 0.15987901047855677,
"grad_norm": 0.014829856672342604,
"learning_rate": 1.9786053338391792e-05,
"loss": 0.1352,
"step": 555
},
{
"epoch": 0.16131936192430954,
"grad_norm": 0.0172321563882635,
"learning_rate": 1.9775581028241253e-05,
"loss": 0.1253,
"step": 560
},
{
"epoch": 0.1627597133700623,
"grad_norm": 0.018596015242561326,
"learning_rate": 1.97648614104981e-05,
"loss": 0.1281,
"step": 565
},
{
"epoch": 0.16420006481581506,
"grad_norm": 0.018018526896851798,
"learning_rate": 1.9753894756352643e-05,
"loss": 0.1345,
"step": 570
},
{
"epoch": 0.16564041626156784,
"grad_norm": 0.016819098654316897,
"learning_rate": 1.9742681343244853e-05,
"loss": 0.1248,
"step": 575
},
{
"epoch": 0.16708076770732058,
"grad_norm": 0.015776677724923773,
"learning_rate": 1.9731221454857322e-05,
"loss": 0.1313,
"step": 580
},
{
"epoch": 0.16852111915307336,
"grad_norm": 0.017615190905280357,
"learning_rate": 1.9719515381108093e-05,
"loss": 0.1368,
"step": 585
},
{
"epoch": 0.1699614705988261,
"grad_norm": 0.01586176965856958,
"learning_rate": 1.970756341814335e-05,
"loss": 0.1308,
"step": 590
},
{
"epoch": 0.17140182204457888,
"grad_norm": 0.01582181491527461,
"learning_rate": 1.9695365868329895e-05,
"loss": 0.1334,
"step": 595
},
{
"epoch": 0.17284217349033165,
"grad_norm": 0.01832860755228363,
"learning_rate": 1.9682923040247513e-05,
"loss": 0.1303,
"step": 600
},
{
"epoch": 0.1742825249360844,
"grad_norm": 0.01608043519957003,
"learning_rate": 1.9670235248681154e-05,
"loss": 0.128,
"step": 605
},
{
"epoch": 0.17572287638183717,
"grad_norm": 0.015462595330781244,
"learning_rate": 1.965730281461299e-05,
"loss": 0.1318,
"step": 610
},
{
"epoch": 0.17716322782758992,
"grad_norm": 0.01829748858884658,
"learning_rate": 1.964412606521428e-05,
"loss": 0.128,
"step": 615
},
{
"epoch": 0.1786035792733427,
"grad_norm": 0.01741437351404647,
"learning_rate": 1.9630705333837096e-05,
"loss": 0.129,
"step": 620
},
{
"epoch": 0.18004393071909547,
"grad_norm": 0.01714971316628412,
"learning_rate": 1.9617040960005883e-05,
"loss": 0.1274,
"step": 625
},
{
"epoch": 0.18148428216484822,
"grad_norm": 0.017341345727170193,
"learning_rate": 1.9603133289408883e-05,
"loss": 0.1285,
"step": 630
},
{
"epoch": 0.182924633610601,
"grad_norm": 0.018545013302290372,
"learning_rate": 1.9588982673889373e-05,
"loss": 0.129,
"step": 635
},
{
"epoch": 0.18436498505635376,
"grad_norm": 0.01917072532666851,
"learning_rate": 1.9574589471436794e-05,
"loss": 0.1246,
"step": 640
},
{
"epoch": 0.1858053365021065,
"grad_norm": 0.017610415450957716,
"learning_rate": 1.955995404617765e-05,
"loss": 0.1296,
"step": 645
},
{
"epoch": 0.18724568794785929,
"grad_norm": 0.017348619206735666,
"learning_rate": 1.9545076768366336e-05,
"loss": 0.1201,
"step": 650
},
{
"epoch": 0.18868603939361203,
"grad_norm": 0.018402248035990453,
"learning_rate": 1.9529958014375748e-05,
"loss": 0.1288,
"step": 655
},
{
"epoch": 0.1901263908393648,
"grad_norm": 0.017347671383931933,
"learning_rate": 1.9514598166687772e-05,
"loss": 0.1322,
"step": 660
},
{
"epoch": 0.19156674228511758,
"grad_norm": 0.01862965351110765,
"learning_rate": 1.9498997613883597e-05,
"loss": 0.1368,
"step": 665
},
{
"epoch": 0.19300709373087033,
"grad_norm": 0.016601710581881732,
"learning_rate": 1.9483156750633906e-05,
"loss": 0.1326,
"step": 670
},
{
"epoch": 0.1944474451766231,
"grad_norm": 0.015529988935684115,
"learning_rate": 1.946707597768886e-05,
"loss": 0.1224,
"step": 675
},
{
"epoch": 0.19588779662237585,
"grad_norm": 0.02017712606978146,
"learning_rate": 1.9450755701867994e-05,
"loss": 0.1291,
"step": 680
},
{
"epoch": 0.19732814806812862,
"grad_norm": 0.0167232245419185,
"learning_rate": 1.9434196336049897e-05,
"loss": 0.1201,
"step": 685
},
{
"epoch": 0.1987684995138814,
"grad_norm": 0.019048290037720794,
"learning_rate": 1.941739829916177e-05,
"loss": 0.1241,
"step": 690
},
{
"epoch": 0.20020885095963414,
"grad_norm": 0.017899758339100985,
"learning_rate": 1.940036201616886e-05,
"loss": 0.1255,
"step": 695
},
{
"epoch": 0.20164920240538692,
"grad_norm": 0.018753604988563825,
"learning_rate": 1.9383087918063662e-05,
"loss": 0.1279,
"step": 700
},
{
"epoch": 0.20308955385113966,
"grad_norm": 0.016720002206463015,
"learning_rate": 1.9365576441855046e-05,
"loss": 0.1265,
"step": 705
},
{
"epoch": 0.20452990529689244,
"grad_norm": 0.01598156321133028,
"learning_rate": 1.9347828030557196e-05,
"loss": 0.1247,
"step": 710
},
{
"epoch": 0.2059702567426452,
"grad_norm": 0.015176452766976297,
"learning_rate": 1.932984313317839e-05,
"loss": 0.1269,
"step": 715
},
{
"epoch": 0.20741060818839796,
"grad_norm": 0.01743363415449858,
"learning_rate": 1.931162220470967e-05,
"loss": 0.1252,
"step": 720
},
{
"epoch": 0.20885095963415073,
"grad_norm": 0.020263642348868235,
"learning_rate": 1.9293165706113287e-05,
"loss": 0.1245,
"step": 725
},
{
"epoch": 0.2102913110799035,
"grad_norm": 0.016973378857152475,
"learning_rate": 1.9274474104311083e-05,
"loss": 0.1229,
"step": 730
},
{
"epoch": 0.21173166252565626,
"grad_norm": 0.01631083673087625,
"learning_rate": 1.925554787217265e-05,
"loss": 0.124,
"step": 735
},
{
"epoch": 0.21317201397140903,
"grad_norm": 0.019965918558652533,
"learning_rate": 1.9236387488503378e-05,
"loss": 0.1223,
"step": 740
},
{
"epoch": 0.21461236541716178,
"grad_norm": 0.021848215070733194,
"learning_rate": 1.921699343803235e-05,
"loss": 0.1138,
"step": 745
},
{
"epoch": 0.21605271686291455,
"grad_norm": 0.018291368502582516,
"learning_rate": 1.9197366211400058e-05,
"loss": 0.1241,
"step": 750
},
{
"epoch": 0.21749306830866733,
"grad_norm": 0.019059330643677385,
"learning_rate": 1.9177506305146008e-05,
"loss": 0.1268,
"step": 755
},
{
"epoch": 0.21893341975442007,
"grad_norm": 0.017080958515049458,
"learning_rate": 1.9157414221696155e-05,
"loss": 0.1206,
"step": 760
},
{
"epoch": 0.22037377120017285,
"grad_norm": 0.017957542038359017,
"learning_rate": 1.9137090469350185e-05,
"loss": 0.1244,
"step": 765
},
{
"epoch": 0.2218141226459256,
"grad_norm": 0.01802711782041451,
"learning_rate": 1.9116535562268658e-05,
"loss": 0.1206,
"step": 770
},
{
"epoch": 0.22325447409167837,
"grad_norm": 0.019287045682764212,
"learning_rate": 1.9095750020460017e-05,
"loss": 0.1144,
"step": 775
},
{
"epoch": 0.22469482553743114,
"grad_norm": 0.01879207545472608,
"learning_rate": 1.9074734369767407e-05,
"loss": 0.1192,
"step": 780
},
{
"epoch": 0.2261351769831839,
"grad_norm": 0.019461601988728348,
"learning_rate": 1.9053489141855386e-05,
"loss": 0.125,
"step": 785
},
{
"epoch": 0.22757552842893666,
"grad_norm": 0.019720624818386844,
"learning_rate": 1.9032014874196476e-05,
"loss": 0.1269,
"step": 790
},
{
"epoch": 0.22901587987468944,
"grad_norm": 0.018318990062697504,
"learning_rate": 1.901031211005756e-05,
"loss": 0.1203,
"step": 795
},
{
"epoch": 0.23045623132044218,
"grad_norm": 0.019164348842944955,
"learning_rate": 1.898838139848614e-05,
"loss": 0.1213,
"step": 800
},
{
"epoch": 0.23189658276619496,
"grad_norm": 0.02213208810579334,
"learning_rate": 1.8966223294296445e-05,
"loss": 0.1213,
"step": 805
},
{
"epoch": 0.2333369342119477,
"grad_norm": 0.018200798899224192,
"learning_rate": 1.8943838358055403e-05,
"loss": 0.1275,
"step": 810
},
{
"epoch": 0.23477728565770048,
"grad_norm": 0.018399660059020002,
"learning_rate": 1.892122715606846e-05,
"loss": 0.1226,
"step": 815
},
{
"epoch": 0.23621763710345325,
"grad_norm": 0.017674503935350343,
"learning_rate": 1.8898390260365227e-05,
"loss": 0.1175,
"step": 820
},
{
"epoch": 0.237657988549206,
"grad_norm": 0.019168528020990075,
"learning_rate": 1.8875328248685047e-05,
"loss": 0.1228,
"step": 825
},
{
"epoch": 0.23909833999495878,
"grad_norm": 0.01707510254798075,
"learning_rate": 1.885204170446235e-05,
"loss": 0.1229,
"step": 830
},
{
"epoch": 0.24053869144071152,
"grad_norm": 0.017143155499374434,
"learning_rate": 1.8828531216811912e-05,
"loss": 0.1163,
"step": 835
},
{
"epoch": 0.2419790428864643,
"grad_norm": 0.01972260397119316,
"learning_rate": 1.8804797380513944e-05,
"loss": 0.1151,
"step": 840
},
{
"epoch": 0.24341939433221707,
"grad_norm": 0.017300491914188608,
"learning_rate": 1.878084079599903e-05,
"loss": 0.1197,
"step": 845
},
{
"epoch": 0.24485974577796982,
"grad_norm": 0.017415521885546983,
"learning_rate": 1.8756662069332966e-05,
"loss": 0.12,
"step": 850
},
{
"epoch": 0.2463000972237226,
"grad_norm": 0.020650688836768367,
"learning_rate": 1.8732261812201408e-05,
"loss": 0.1159,
"step": 855
},
{
"epoch": 0.24774044866947534,
"grad_norm": 0.022394976870943134,
"learning_rate": 1.8707640641894395e-05,
"loss": 0.1182,
"step": 860
},
{
"epoch": 0.2491808001152281,
"grad_norm": 0.018481000136883082,
"learning_rate": 1.8682799181290747e-05,
"loss": 0.117,
"step": 865
},
{
"epoch": 0.25062115156098086,
"grad_norm": 0.020812551732477143,
"learning_rate": 1.86577380588423e-05,
"loss": 0.1174,
"step": 870
},
{
"epoch": 0.25206150300673363,
"grad_norm": 0.019146509121457756,
"learning_rate": 1.8632457908558006e-05,
"loss": 0.1128,
"step": 875
},
{
"epoch": 0.2535018544524864,
"grad_norm": 0.02088872645363683,
"learning_rate": 1.8606959369987885e-05,
"loss": 0.1258,
"step": 880
},
{
"epoch": 0.2549422058982392,
"grad_norm": 0.020975712319762363,
"learning_rate": 1.8581243088206865e-05,
"loss": 0.1252,
"step": 885
},
{
"epoch": 0.25638255734399196,
"grad_norm": 0.022104093266461965,
"learning_rate": 1.8555309713798445e-05,
"loss": 0.1175,
"step": 890
},
{
"epoch": 0.2578229087897447,
"grad_norm": 0.01935436681312094,
"learning_rate": 1.8529159902838253e-05,
"loss": 0.1155,
"step": 895
},
{
"epoch": 0.25926326023549745,
"grad_norm": 0.02103943800048682,
"learning_rate": 1.8502794316877423e-05,
"loss": 0.1163,
"step": 900
},
{
"epoch": 0.2607036116812502,
"grad_norm": 0.02064951760573597,
"learning_rate": 1.8476213622925885e-05,
"loss": 0.1151,
"step": 905
},
{
"epoch": 0.262143963127003,
"grad_norm": 0.02391509020619219,
"learning_rate": 1.844941849343548e-05,
"loss": 0.1154,
"step": 910
},
{
"epoch": 0.2635843145727558,
"grad_norm": 0.019299067245624735,
"learning_rate": 1.842240960628294e-05,
"loss": 0.118,
"step": 915
},
{
"epoch": 0.2650246660185085,
"grad_norm": 0.018999660132613904,
"learning_rate": 1.8395187644752756e-05,
"loss": 0.1121,
"step": 920
},
{
"epoch": 0.26646501746426127,
"grad_norm": 0.019465104345890567,
"learning_rate": 1.8367753297519874e-05,
"loss": 0.1147,
"step": 925
},
{
"epoch": 0.26790536891001404,
"grad_norm": 0.017798540374291294,
"learning_rate": 1.8340107258632288e-05,
"loss": 0.1192,
"step": 930
},
{
"epoch": 0.2693457203557668,
"grad_norm": 0.022579684079520896,
"learning_rate": 1.831225022749347e-05,
"loss": 0.1127,
"step": 935
},
{
"epoch": 0.2707860718015196,
"grad_norm": 0.019409928418770574,
"learning_rate": 1.828418290884468e-05,
"loss": 0.1141,
"step": 940
},
{
"epoch": 0.2722264232472723,
"grad_norm": 0.019982776222918414,
"learning_rate": 1.8255906012747137e-05,
"loss": 0.1107,
"step": 945
},
{
"epoch": 0.2736667746930251,
"grad_norm": 0.019327451944760602,
"learning_rate": 1.8227420254564066e-05,
"loss": 0.1088,
"step": 950
},
{
"epoch": 0.27510712613877786,
"grad_norm": 0.02195579382510436,
"learning_rate": 1.819872635494258e-05,
"loss": 0.1149,
"step": 955
},
{
"epoch": 0.27654747758453063,
"grad_norm": 0.019331091074997867,
"learning_rate": 1.816982503979546e-05,
"loss": 0.1187,
"step": 960
},
{
"epoch": 0.2779878290302834,
"grad_norm": 0.0198267215148203,
"learning_rate": 1.8140717040282797e-05,
"loss": 0.1133,
"step": 965
},
{
"epoch": 0.2794281804760361,
"grad_norm": 0.021536265466576106,
"learning_rate": 1.811140309279348e-05,
"loss": 0.1169,
"step": 970
},
{
"epoch": 0.2808685319217889,
"grad_norm": 0.02522795945903232,
"learning_rate": 1.808188393892658e-05,
"loss": 0.1162,
"step": 975
},
{
"epoch": 0.2823088833675417,
"grad_norm": 0.021202903232191053,
"learning_rate": 1.805216032547258e-05,
"loss": 0.1177,
"step": 980
},
{
"epoch": 0.28374923481329445,
"grad_norm": 0.02708284619788456,
"learning_rate": 1.8022233004394487e-05,
"loss": 0.1189,
"step": 985
},
{
"epoch": 0.2851895862590472,
"grad_norm": 0.02891719428802073,
"learning_rate": 1.7992102732808798e-05,
"loss": 0.116,
"step": 990
},
{
"epoch": 0.2866299377048,
"grad_norm": 0.020017133174120256,
"learning_rate": 1.796177027296637e-05,
"loss": 0.1192,
"step": 995
},
{
"epoch": 0.2880702891505527,
"grad_norm": 0.022814840523695007,
"learning_rate": 1.79312363922331e-05,
"loss": 0.1196,
"step": 1000
},
{
"epoch": 0.2880702891505527,
"eval_loss": 0.1148042306303978,
"eval_runtime": 862.8924,
"eval_samples_per_second": 2.091,
"eval_steps_per_second": 0.523,
"step": 1000
},
{
"epoch": 0.2895106405963055,
"grad_norm": 0.020165318251700003,
"learning_rate": 1.7900501863070552e-05,
"loss": 0.1092,
"step": 1005
},
{
"epoch": 0.29095099204205827,
"grad_norm": 0.020276491773740017,
"learning_rate": 1.7869567463016394e-05,
"loss": 0.108,
"step": 1010
},
{
"epoch": 0.29239134348781104,
"grad_norm": 0.025430907092320074,
"learning_rate": 1.7838433974664714e-05,
"loss": 0.1198,
"step": 1015
},
{
"epoch": 0.2938316949335638,
"grad_norm": 0.02600654612802859,
"learning_rate": 1.7807102185646247e-05,
"loss": 0.1164,
"step": 1020
},
{
"epoch": 0.29527204637931653,
"grad_norm": 0.0193830589897909,
"learning_rate": 1.7775572888608438e-05,
"loss": 0.1151,
"step": 1025
},
{
"epoch": 0.2967123978250693,
"grad_norm": 0.023599658745454725,
"learning_rate": 1.774384688119539e-05,
"loss": 0.1183,
"step": 1030
},
{
"epoch": 0.2981527492708221,
"grad_norm": 0.021674931585504164,
"learning_rate": 1.7711924966027678e-05,
"loss": 0.1141,
"step": 1035
},
{
"epoch": 0.29959310071657486,
"grad_norm": 0.023299214871684407,
"learning_rate": 1.767980795068206e-05,
"loss": 0.1194,
"step": 1040
},
{
"epoch": 0.30103345216232763,
"grad_norm": 0.02147089523180919,
"learning_rate": 1.7647496647671033e-05,
"loss": 0.1123,
"step": 1045
},
{
"epoch": 0.30247380360808035,
"grad_norm": 0.021457716083701384,
"learning_rate": 1.761499187442228e-05,
"loss": 0.1058,
"step": 1050
},
{
"epoch": 0.3039141550538331,
"grad_norm": 0.01969637917151117,
"learning_rate": 1.7582294453257996e-05,
"loss": 0.1207,
"step": 1055
},
{
"epoch": 0.3053545064995859,
"grad_norm": 0.023839310938145133,
"learning_rate": 1.7549405211374072e-05,
"loss": 0.1146,
"step": 1060
},
{
"epoch": 0.3067948579453387,
"grad_norm": 0.01832092911473367,
"learning_rate": 1.7516324980819185e-05,
"loss": 0.113,
"step": 1065
},
{
"epoch": 0.30823520939109145,
"grad_norm": 0.02124767953755396,
"learning_rate": 1.7483054598473734e-05,
"loss": 0.1213,
"step": 1070
},
{
"epoch": 0.30967556083684417,
"grad_norm": 0.0247043392947563,
"learning_rate": 1.7449594906028684e-05,
"loss": 0.1121,
"step": 1075
},
{
"epoch": 0.31111591228259694,
"grad_norm": 0.022139995366462776,
"learning_rate": 1.7415946749964252e-05,
"loss": 0.1132,
"step": 1080
},
{
"epoch": 0.3125562637283497,
"grad_norm": 0.01979540959946202,
"learning_rate": 1.7382110981528506e-05,
"loss": 0.1085,
"step": 1085
},
{
"epoch": 0.3139966151741025,
"grad_norm": 0.029924025152490485,
"learning_rate": 1.734808845671583e-05,
"loss": 0.1054,
"step": 1090
},
{
"epoch": 0.31543696661985526,
"grad_norm": 0.023093772502185278,
"learning_rate": 1.7313880036245257e-05,
"loss": 0.112,
"step": 1095
},
{
"epoch": 0.316877318065608,
"grad_norm": 0.023473379108296808,
"learning_rate": 1.7279486585538712e-05,
"loss": 0.1135,
"step": 1100
},
{
"epoch": 0.31831766951136076,
"grad_norm": 0.021733393242624945,
"learning_rate": 1.7244908974699112e-05,
"loss": 0.1134,
"step": 1105
},
{
"epoch": 0.31975802095711353,
"grad_norm": 0.021271929042974584,
"learning_rate": 1.721014807848833e-05,
"loss": 0.1164,
"step": 1110
},
{
"epoch": 0.3211983724028663,
"grad_norm": 0.020487120222693576,
"learning_rate": 1.7175204776305102e-05,
"loss": 0.1077,
"step": 1115
},
{
"epoch": 0.3226387238486191,
"grad_norm": 0.021171582329437753,
"learning_rate": 1.7140079952162765e-05,
"loss": 0.1102,
"step": 1120
},
{
"epoch": 0.32407907529437185,
"grad_norm": 0.02144446773648067,
"learning_rate": 1.7104774494666877e-05,
"loss": 0.1092,
"step": 1125
},
{
"epoch": 0.3255194267401246,
"grad_norm": 0.019040039144316633,
"learning_rate": 1.7069289296992756e-05,
"loss": 0.112,
"step": 1130
},
{
"epoch": 0.32695977818587735,
"grad_norm": 0.021286149625859865,
"learning_rate": 1.703362525686288e-05,
"loss": 0.1106,
"step": 1135
},
{
"epoch": 0.3284001296316301,
"grad_norm": 0.02183865564006019,
"learning_rate": 1.6997783276524177e-05,
"loss": 0.1134,
"step": 1140
},
{
"epoch": 0.3298404810773829,
"grad_norm": 0.021387717982638638,
"learning_rate": 1.6961764262725187e-05,
"loss": 0.1063,
"step": 1145
},
{
"epoch": 0.33128083252313567,
"grad_norm": 0.024865173996299898,
"learning_rate": 1.6925569126693135e-05,
"loss": 0.1051,
"step": 1150
},
{
"epoch": 0.3327211839688884,
"grad_norm": 0.023180712867703127,
"learning_rate": 1.6889198784110883e-05,
"loss": 0.1058,
"step": 1155
},
{
"epoch": 0.33416153541464116,
"grad_norm": 0.02218441265430226,
"learning_rate": 1.6852654155093745e-05,
"loss": 0.1117,
"step": 1160
},
{
"epoch": 0.33560188686039394,
"grad_norm": 0.02165110006319973,
"learning_rate": 1.681593616416623e-05,
"loss": 0.112,
"step": 1165
},
{
"epoch": 0.3370422383061467,
"grad_norm": 0.02200923978026833,
"learning_rate": 1.6779045740238643e-05,
"loss": 0.114,
"step": 1170
},
{
"epoch": 0.3384825897518995,
"grad_norm": 0.02197865224880007,
"learning_rate": 1.6741983816583583e-05,
"loss": 0.111,
"step": 1175
},
{
"epoch": 0.3399229411976522,
"grad_norm": 0.019554896311713962,
"learning_rate": 1.6704751330812342e-05,
"loss": 0.1019,
"step": 1180
},
{
"epoch": 0.341363292643405,
"grad_norm": 0.024134291900428952,
"learning_rate": 1.666734922485117e-05,
"loss": 0.1092,
"step": 1185
},
{
"epoch": 0.34280364408915776,
"grad_norm": 0.022673698274783088,
"learning_rate": 1.662977844491746e-05,
"loss": 0.113,
"step": 1190
},
{
"epoch": 0.34424399553491053,
"grad_norm": 0.02603003971238878,
"learning_rate": 1.6592039941495803e-05,
"loss": 0.1036,
"step": 1195
},
{
"epoch": 0.3456843469806633,
"grad_norm": 0.024554078349019966,
"learning_rate": 1.6554134669313943e-05,
"loss": 0.1121,
"step": 1200
},
{
"epoch": 0.347124698426416,
"grad_norm": 0.019717425570643314,
"learning_rate": 1.6516063587318627e-05,
"loss": 0.1068,
"step": 1205
},
{
"epoch": 0.3485650498721688,
"grad_norm": 0.027429511589759405,
"learning_rate": 1.647782765865134e-05,
"loss": 0.106,
"step": 1210
},
{
"epoch": 0.35000540131792157,
"grad_norm": 0.026628925564535798,
"learning_rate": 1.6439427850623944e-05,
"loss": 0.1123,
"step": 1215
},
{
"epoch": 0.35144575276367435,
"grad_norm": 0.026461597229239776,
"learning_rate": 1.64008651346942e-05,
"loss": 0.1099,
"step": 1220
},
{
"epoch": 0.3528861042094271,
"grad_norm": 0.02584266985928258,
"learning_rate": 1.63621404864412e-05,
"loss": 0.1107,
"step": 1225
},
{
"epoch": 0.35432645565517984,
"grad_norm": 0.019646883384690197,
"learning_rate": 1.6323254885540672e-05,
"loss": 0.1121,
"step": 1230
},
{
"epoch": 0.3557668071009326,
"grad_norm": 0.024026012804653142,
"learning_rate": 1.6284209315740225e-05,
"loss": 0.1047,
"step": 1235
},
{
"epoch": 0.3572071585466854,
"grad_norm": 0.024476248259603612,
"learning_rate": 1.6245004764834423e-05,
"loss": 0.109,
"step": 1240
},
{
"epoch": 0.35864750999243816,
"grad_norm": 0.022810087697062553,
"learning_rate": 1.620564222463982e-05,
"loss": 0.1102,
"step": 1245
},
{
"epoch": 0.36008786143819094,
"grad_norm": 0.024569475912673062,
"learning_rate": 1.6166122690969872e-05,
"loss": 0.1057,
"step": 1250
},
{
"epoch": 0.36152821288394366,
"grad_norm": 0.026160188797535698,
"learning_rate": 1.612644716360972e-05,
"loss": 0.0995,
"step": 1255
},
{
"epoch": 0.36296856432969643,
"grad_norm": 0.025602704231673833,
"learning_rate": 1.6086616646290926e-05,
"loss": 0.116,
"step": 1260
},
{
"epoch": 0.3644089157754492,
"grad_norm": 0.02307850757189033,
"learning_rate": 1.6046632146666056e-05,
"loss": 0.1088,
"step": 1265
},
{
"epoch": 0.365849267221202,
"grad_norm": 0.024139060873310914,
"learning_rate": 1.60064946762832e-05,
"loss": 0.1044,
"step": 1270
},
{
"epoch": 0.36728961866695475,
"grad_norm": 0.023315249645455345,
"learning_rate": 1.5966205250560393e-05,
"loss": 0.1119,
"step": 1275
},
{
"epoch": 0.36872997011270753,
"grad_norm": 0.021641266926506458,
"learning_rate": 1.592576488875989e-05,
"loss": 0.1094,
"step": 1280
},
{
"epoch": 0.37017032155846025,
"grad_norm": 0.022047478963055464,
"learning_rate": 1.5885174613962427e-05,
"loss": 0.1083,
"step": 1285
},
{
"epoch": 0.371610673004213,
"grad_norm": 0.02400330121786872,
"learning_rate": 1.5844435453041294e-05,
"loss": 0.1139,
"step": 1290
},
{
"epoch": 0.3730510244499658,
"grad_norm": 0.022595131555882185,
"learning_rate": 1.5803548436636394e-05,
"loss": 0.108,
"step": 1295
},
{
"epoch": 0.37449137589571857,
"grad_norm": 0.02791419433017582,
"learning_rate": 1.576251459912814e-05,
"loss": 0.1134,
"step": 1300
},
{
"epoch": 0.37593172734147134,
"grad_norm": 0.026002680251400723,
"learning_rate": 1.5721334978611307e-05,
"loss": 0.1107,
"step": 1305
},
{
"epoch": 0.37737207878722406,
"grad_norm": 0.022484735806660563,
"learning_rate": 1.5680010616868762e-05,
"loss": 0.1076,
"step": 1310
},
{
"epoch": 0.37881243023297684,
"grad_norm": 0.021483070318961833,
"learning_rate": 1.5638542559345106e-05,
"loss": 0.1034,
"step": 1315
},
{
"epoch": 0.3802527816787296,
"grad_norm": 0.023871643967031505,
"learning_rate": 1.559693185512023e-05,
"loss": 0.1106,
"step": 1320
},
{
"epoch": 0.3816931331244824,
"grad_norm": 0.021906873584555547,
"learning_rate": 1.555517955688277e-05,
"loss": 0.1077,
"step": 1325
},
{
"epoch": 0.38313348457023516,
"grad_norm": 0.023737665896899265,
"learning_rate": 1.5513286720903488e-05,
"loss": 0.1153,
"step": 1330
},
{
"epoch": 0.3845738360159879,
"grad_norm": 0.022433920258610178,
"learning_rate": 1.5471254407008526e-05,
"loss": 0.1037,
"step": 1335
},
{
"epoch": 0.38601418746174065,
"grad_norm": 0.02261886005036298,
"learning_rate": 1.542908367855263e-05,
"loss": 0.1088,
"step": 1340
},
{
"epoch": 0.38745453890749343,
"grad_norm": 0.021421057577652544,
"learning_rate": 1.53867756023922e-05,
"loss": 0.1047,
"step": 1345
},
{
"epoch": 0.3888948903532462,
"grad_norm": 0.021709509569311863,
"learning_rate": 1.534433124885836e-05,
"loss": 0.1033,
"step": 1350
},
{
"epoch": 0.390335241798999,
"grad_norm": 0.024820443979506786,
"learning_rate": 1.530175169172982e-05,
"loss": 0.1097,
"step": 1355
},
{
"epoch": 0.3917755932447517,
"grad_norm": 0.025227643673449818,
"learning_rate": 1.525903800820575e-05,
"loss": 0.1065,
"step": 1360
},
{
"epoch": 0.39321594469050447,
"grad_norm": 0.022487840207955263,
"learning_rate": 1.5216191278878522e-05,
"loss": 0.1064,
"step": 1365
},
{
"epoch": 0.39465629613625725,
"grad_norm": 0.025854963249344314,
"learning_rate": 1.517321258770636e-05,
"loss": 0.1043,
"step": 1370
},
{
"epoch": 0.39609664758201,
"grad_norm": 0.02260216955018553,
"learning_rate": 1.5130103021985929e-05,
"loss": 0.1126,
"step": 1375
},
{
"epoch": 0.3975369990277628,
"grad_norm": 0.02491639306364899,
"learning_rate": 1.5086863672324826e-05,
"loss": 0.1039,
"step": 1380
},
{
"epoch": 0.3989773504735155,
"grad_norm": 0.022899995574611955,
"learning_rate": 1.5043495632613982e-05,
"loss": 0.1066,
"step": 1385
},
{
"epoch": 0.4004177019192683,
"grad_norm": 0.024707905868095445,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.1106,
"step": 1390
},
{
"epoch": 0.40185805336502106,
"grad_norm": 0.0241361491731915,
"learning_rate": 1.4956377874857395e-05,
"loss": 0.1001,
"step": 1395
},
{
"epoch": 0.40329840481077384,
"grad_norm": 0.025677892301256368,
"learning_rate": 1.4912630360760743e-05,
"loss": 0.1092,
"step": 1400
},
{
"epoch": 0.4047387562565266,
"grad_norm": 0.024135758692475445,
"learning_rate": 1.4868758564456785e-05,
"loss": 0.1116,
"step": 1405
},
{
"epoch": 0.40617910770227933,
"grad_norm": 0.024527918000747655,
"learning_rate": 1.4824763595836404e-05,
"loss": 0.1065,
"step": 1410
},
{
"epoch": 0.4076194591480321,
"grad_norm": 0.023548606886532042,
"learning_rate": 1.4780646567906571e-05,
"loss": 0.1009,
"step": 1415
},
{
"epoch": 0.4090598105937849,
"grad_norm": 0.026633678739013535,
"learning_rate": 1.473640859676217e-05,
"loss": 0.1059,
"step": 1420
},
{
"epoch": 0.41050016203953765,
"grad_norm": 0.02593995796499118,
"learning_rate": 1.4692050801557769e-05,
"loss": 0.107,
"step": 1425
},
{
"epoch": 0.4119405134852904,
"grad_norm": 0.02413535724648029,
"learning_rate": 1.4647574304479295e-05,
"loss": 0.1115,
"step": 1430
},
{
"epoch": 0.4133808649310432,
"grad_norm": 0.02297878729642286,
"learning_rate": 1.4602980230715674e-05,
"loss": 0.1083,
"step": 1435
},
{
"epoch": 0.4148212163767959,
"grad_norm": 0.02697991820571532,
"learning_rate": 1.4558269708430333e-05,
"loss": 0.1051,
"step": 1440
},
{
"epoch": 0.4162615678225487,
"grad_norm": 0.025004735182102,
"learning_rate": 1.4513443868732674e-05,
"loss": 0.1103,
"step": 1445
},
{
"epoch": 0.41770191926830147,
"grad_norm": 0.024578324660308907,
"learning_rate": 1.4468503845649462e-05,
"loss": 0.108,
"step": 1450
},
{
"epoch": 0.41914227071405424,
"grad_norm": 0.024290400744172484,
"learning_rate": 1.4423450776096122e-05,
"loss": 0.102,
"step": 1455
},
{
"epoch": 0.420582622159807,
"grad_norm": 0.025652851866900635,
"learning_rate": 1.4378285799848004e-05,
"loss": 0.1139,
"step": 1460
},
{
"epoch": 0.42202297360555974,
"grad_norm": 0.022542605080051747,
"learning_rate": 1.4333010059511505e-05,
"loss": 0.1048,
"step": 1465
},
{
"epoch": 0.4234633250513125,
"grad_norm": 0.022564970591101585,
"learning_rate": 1.4287624700495211e-05,
"loss": 0.1053,
"step": 1470
},
{
"epoch": 0.4249036764970653,
"grad_norm": 0.024332336625839772,
"learning_rate": 1.4242130870980882e-05,
"loss": 0.1035,
"step": 1475
},
{
"epoch": 0.42634402794281806,
"grad_norm": 0.02565641763751157,
"learning_rate": 1.4196529721894427e-05,
"loss": 0.1054,
"step": 1480
},
{
"epoch": 0.42778437938857083,
"grad_norm": 0.022329221067199423,
"learning_rate": 1.4150822406876774e-05,
"loss": 0.1064,
"step": 1485
},
{
"epoch": 0.42922473083432355,
"grad_norm": 0.02525079350860865,
"learning_rate": 1.4105010082254697e-05,
"loss": 0.1089,
"step": 1490
},
{
"epoch": 0.43066508228007633,
"grad_norm": 0.025470140748940266,
"learning_rate": 1.4059093907011552e-05,
"loss": 0.103,
"step": 1495
},
{
"epoch": 0.4321054337258291,
"grad_norm": 0.03889636426311705,
"learning_rate": 1.401307504275796e-05,
"loss": 0.106,
"step": 1500
},
{
"epoch": 0.4321054337258291,
"eval_loss": 0.10669519007205963,
"eval_runtime": 863.9882,
"eval_samples_per_second": 2.088,
"eval_steps_per_second": 0.522,
"step": 1500
},
{
"epoch": 0.4335457851715819,
"grad_norm": 0.03107255388021069,
"learning_rate": 1.3966954653702423e-05,
"loss": 0.1086,
"step": 1505
},
{
"epoch": 0.43498613661733465,
"grad_norm": 0.029782191799490893,
"learning_rate": 1.3920733906621861e-05,
"loss": 0.1056,
"step": 1510
},
{
"epoch": 0.43642648806308737,
"grad_norm": 0.03596510921750815,
"learning_rate": 1.3874413970832123e-05,
"loss": 0.1057,
"step": 1515
},
{
"epoch": 0.43786683950884014,
"grad_norm": 0.02690673392192122,
"learning_rate": 1.3827996018158356e-05,
"loss": 0.1009,
"step": 1520
},
{
"epoch": 0.4393071909545929,
"grad_norm": 0.02415574342794587,
"learning_rate": 1.378148122290541e-05,
"loss": 0.1092,
"step": 1525
},
{
"epoch": 0.4407475424003457,
"grad_norm": 0.025267363786179952,
"learning_rate": 1.3734870761828095e-05,
"loss": 0.1046,
"step": 1530
},
{
"epoch": 0.44218789384609847,
"grad_norm": 0.02251899862464666,
"learning_rate": 1.368816581410143e-05,
"loss": 0.1026,
"step": 1535
},
{
"epoch": 0.4436282452918512,
"grad_norm": 0.024093194788389363,
"learning_rate": 1.3641367561290795e-05,
"loss": 0.1069,
"step": 1540
},
{
"epoch": 0.44506859673760396,
"grad_norm": 0.02865301350630449,
"learning_rate": 1.3594477187322065e-05,
"loss": 0.101,
"step": 1545
},
{
"epoch": 0.44650894818335674,
"grad_norm": 0.025991749936729126,
"learning_rate": 1.3547495878451635e-05,
"loss": 0.104,
"step": 1550
},
{
"epoch": 0.4479492996291095,
"grad_norm": 0.02427741302197225,
"learning_rate": 1.3500424823236413e-05,
"loss": 0.1097,
"step": 1555
},
{
"epoch": 0.4493896510748623,
"grad_norm": 0.025286951386541058,
"learning_rate": 1.3453265212503756e-05,
"loss": 0.1025,
"step": 1560
},
{
"epoch": 0.450830002520615,
"grad_norm": 0.02293738313311065,
"learning_rate": 1.340601823932135e-05,
"loss": 0.1026,
"step": 1565
},
{
"epoch": 0.4522703539663678,
"grad_norm": 0.029909633188798766,
"learning_rate": 1.335868509896702e-05,
"loss": 0.1093,
"step": 1570
},
{
"epoch": 0.45371070541212055,
"grad_norm": 0.02489864832302057,
"learning_rate": 1.3311266988898477e-05,
"loss": 0.1075,
"step": 1575
},
{
"epoch": 0.4551510568578733,
"grad_norm": 0.02626703651775361,
"learning_rate": 1.3263765108723061e-05,
"loss": 0.1052,
"step": 1580
},
{
"epoch": 0.4565914083036261,
"grad_norm": 0.028232314920455838,
"learning_rate": 1.3216180660167355e-05,
"loss": 0.103,
"step": 1585
},
{
"epoch": 0.4580317597493789,
"grad_norm": 0.02893621210041688,
"learning_rate": 1.3168514847046802e-05,
"loss": 0.102,
"step": 1590
},
{
"epoch": 0.4594721111951316,
"grad_norm": 0.027792315343772488,
"learning_rate": 1.3120768875235252e-05,
"loss": 0.1006,
"step": 1595
},
{
"epoch": 0.46091246264088437,
"grad_norm": 0.022941453299660446,
"learning_rate": 1.3072943952634446e-05,
"loss": 0.101,
"step": 1600
},
{
"epoch": 0.46235281408663714,
"grad_norm": 0.02754317441994994,
"learning_rate": 1.3025041289143459e-05,
"loss": 0.1014,
"step": 1605
},
{
"epoch": 0.4637931655323899,
"grad_norm": 0.023409812236038566,
"learning_rate": 1.2977062096628096e-05,
"loss": 0.1018,
"step": 1610
},
{
"epoch": 0.4652335169781427,
"grad_norm": 0.02736910879882447,
"learning_rate": 1.2929007588890241e-05,
"loss": 0.1029,
"step": 1615
},
{
"epoch": 0.4666738684238954,
"grad_norm": 0.024599830487748515,
"learning_rate": 1.2880878981637129e-05,
"loss": 0.1076,
"step": 1620
},
{
"epoch": 0.4681142198696482,
"grad_norm": 0.030038012520127225,
"learning_rate": 1.2832677492450602e-05,
"loss": 0.1058,
"step": 1625
},
{
"epoch": 0.46955457131540096,
"grad_norm": 0.027368913953633472,
"learning_rate": 1.2784404340756315e-05,
"loss": 0.1082,
"step": 1630
},
{
"epoch": 0.47099492276115373,
"grad_norm": 0.02334017117279336,
"learning_rate": 1.2736060747792877e-05,
"loss": 0.106,
"step": 1635
},
{
"epoch": 0.4724352742069065,
"grad_norm": 0.02870373822056335,
"learning_rate": 1.268764793658094e-05,
"loss": 0.1099,
"step": 1640
},
{
"epoch": 0.4738756256526592,
"grad_norm": 0.029235939335636004,
"learning_rate": 1.2639167131892294e-05,
"loss": 0.0973,
"step": 1645
},
{
"epoch": 0.475315977098412,
"grad_norm": 0.026146387485585586,
"learning_rate": 1.2590619560218851e-05,
"loss": 0.1087,
"step": 1650
},
{
"epoch": 0.4767563285441648,
"grad_norm": 0.02842049987719084,
"learning_rate": 1.2542006449741631e-05,
"loss": 0.1061,
"step": 1655
},
{
"epoch": 0.47819667998991755,
"grad_norm": 0.024518713018732985,
"learning_rate": 1.249332903029969e-05,
"loss": 0.1051,
"step": 1660
},
{
"epoch": 0.4796370314356703,
"grad_norm": 0.02484058398034788,
"learning_rate": 1.2444588533358996e-05,
"loss": 0.1022,
"step": 1665
},
{
"epoch": 0.48107738288142304,
"grad_norm": 0.03724000521562738,
"learning_rate": 1.23957861919813e-05,
"loss": 0.1056,
"step": 1670
},
{
"epoch": 0.4825177343271758,
"grad_norm": 0.03293802155567722,
"learning_rate": 1.2346923240792907e-05,
"loss": 0.1092,
"step": 1675
},
{
"epoch": 0.4839580857729286,
"grad_norm": 0.026650789981673747,
"learning_rate": 1.229800091595347e-05,
"loss": 0.1008,
"step": 1680
},
{
"epoch": 0.48539843721868137,
"grad_norm": 0.02758805567603314,
"learning_rate": 1.2249020455124703e-05,
"loss": 0.1024,
"step": 1685
},
{
"epoch": 0.48683878866443414,
"grad_norm": 0.0289561047779867,
"learning_rate": 1.2199983097439079e-05,
"loss": 0.0974,
"step": 1690
},
{
"epoch": 0.48827914011018686,
"grad_norm": 0.02472150617570116,
"learning_rate": 1.2150890083468465e-05,
"loss": 0.1017,
"step": 1695
},
{
"epoch": 0.48971949155593963,
"grad_norm": 0.026933463536791147,
"learning_rate": 1.2101742655192761e-05,
"loss": 0.107,
"step": 1700
},
{
"epoch": 0.4911598430016924,
"grad_norm": 0.02704841529205261,
"learning_rate": 1.2052542055968461e-05,
"loss": 0.1049,
"step": 1705
},
{
"epoch": 0.4926001944474452,
"grad_norm": 0.029435211264724145,
"learning_rate": 1.2003289530497206e-05,
"loss": 0.1097,
"step": 1710
},
{
"epoch": 0.49404054589319796,
"grad_norm": 0.023704099909174187,
"learning_rate": 1.1953986324794295e-05,
"loss": 0.1027,
"step": 1715
},
{
"epoch": 0.4954808973389507,
"grad_norm": 0.024368103569431525,
"learning_rate": 1.1904633686157158e-05,
"loss": 0.1078,
"step": 1720
},
{
"epoch": 0.49692124878470345,
"grad_norm": 0.024017154477919515,
"learning_rate": 1.1855232863133809e-05,
"loss": 0.1009,
"step": 1725
},
{
"epoch": 0.4983616002304562,
"grad_norm": 0.03151983841636764,
"learning_rate": 1.1805785105491247e-05,
"loss": 0.1021,
"step": 1730
},
{
"epoch": 0.499801951676209,
"grad_norm": 0.02991694116471432,
"learning_rate": 1.1756291664183858e-05,
"loss": 0.1076,
"step": 1735
},
{
"epoch": 0.5012423031219617,
"grad_norm": 0.03657521191046876,
"learning_rate": 1.1706753791321748e-05,
"loss": 0.1042,
"step": 1740
},
{
"epoch": 0.5026826545677145,
"grad_norm": 0.0279482378393294,
"learning_rate": 1.1657172740139074e-05,
"loss": 0.1017,
"step": 1745
},
{
"epoch": 0.5041230060134673,
"grad_norm": 0.030069339039746814,
"learning_rate": 1.1607549764962342e-05,
"loss": 0.1042,
"step": 1750
},
{
"epoch": 0.5055633574592201,
"grad_norm": 0.026487495544772206,
"learning_rate": 1.1557886121178683e-05,
"loss": 0.1079,
"step": 1755
},
{
"epoch": 0.5070037089049728,
"grad_norm": 0.024984858540099013,
"learning_rate": 1.1508183065204066e-05,
"loss": 0.1037,
"step": 1760
},
{
"epoch": 0.5084440603507255,
"grad_norm": 0.026914213670237396,
"learning_rate": 1.1458441854451539e-05,
"loss": 0.1001,
"step": 1765
},
{
"epoch": 0.5098844117964784,
"grad_norm": 0.029179893358272806,
"learning_rate": 1.1408663747299409e-05,
"loss": 0.1002,
"step": 1770
},
{
"epoch": 0.5113247632422311,
"grad_norm": 0.039586221088331666,
"learning_rate": 1.13588500030594e-05,
"loss": 0.105,
"step": 1775
},
{
"epoch": 0.5127651146879839,
"grad_norm": 0.03249368292701708,
"learning_rate": 1.130900188194481e-05,
"loss": 0.1031,
"step": 1780
},
{
"epoch": 0.5142054661337366,
"grad_norm": 0.024055776588308493,
"learning_rate": 1.1259120645038612e-05,
"loss": 0.1013,
"step": 1785
},
{
"epoch": 0.5156458175794894,
"grad_norm": 0.025863370886123525,
"learning_rate": 1.1209207554261573e-05,
"loss": 0.1,
"step": 1790
},
{
"epoch": 0.5170861690252422,
"grad_norm": 0.025434390514411754,
"learning_rate": 1.1159263872340293e-05,
"loss": 0.1007,
"step": 1795
},
{
"epoch": 0.5185265204709949,
"grad_norm": 0.02745842348721705,
"learning_rate": 1.1109290862775307e-05,
"loss": 0.096,
"step": 1800
},
{
"epoch": 0.5199668719167477,
"grad_norm": 0.027615229169377083,
"learning_rate": 1.1059289789809071e-05,
"loss": 0.0998,
"step": 1805
},
{
"epoch": 0.5214072233625004,
"grad_norm": 0.027453704008570706,
"learning_rate": 1.1009261918394028e-05,
"loss": 0.0972,
"step": 1810
},
{
"epoch": 0.5228475748082532,
"grad_norm": 0.024700516688583,
"learning_rate": 1.0959208514160561e-05,
"loss": 0.1055,
"step": 1815
},
{
"epoch": 0.524287926254006,
"grad_norm": 0.032125782290431566,
"learning_rate": 1.0909130843385009e-05,
"loss": 0.0979,
"step": 1820
},
{
"epoch": 0.5257282776997587,
"grad_norm": 0.027487991421855863,
"learning_rate": 1.085903017295761e-05,
"loss": 0.0992,
"step": 1825
},
{
"epoch": 0.5271686291455115,
"grad_norm": 0.0284479042668781,
"learning_rate": 1.0808907770350463e-05,
"loss": 0.098,
"step": 1830
},
{
"epoch": 0.5286089805912643,
"grad_norm": 0.02721705482015998,
"learning_rate": 1.0758764903585457e-05,
"loss": 0.1025,
"step": 1835
},
{
"epoch": 0.530049332037017,
"grad_norm": 0.028561724576000452,
"learning_rate": 1.070860284120219e-05,
"loss": 0.1002,
"step": 1840
},
{
"epoch": 0.5314896834827698,
"grad_norm": 0.030215649153775434,
"learning_rate": 1.0658422852225889e-05,
"loss": 0.113,
"step": 1845
},
{
"epoch": 0.5329300349285225,
"grad_norm": 0.023003505373751384,
"learning_rate": 1.0608226206135292e-05,
"loss": 0.1004,
"step": 1850
},
{
"epoch": 0.5343703863742754,
"grad_norm": 0.032778870546567584,
"learning_rate": 1.0558014172830537e-05,
"loss": 0.1026,
"step": 1855
},
{
"epoch": 0.5358107378200281,
"grad_norm": 0.024228471157562194,
"learning_rate": 1.0507788022601033e-05,
"loss": 0.0959,
"step": 1860
},
{
"epoch": 0.5372510892657808,
"grad_norm": 0.027347316934608,
"learning_rate": 1.0457549026093338e-05,
"loss": 0.0994,
"step": 1865
},
{
"epoch": 0.5386914407115336,
"grad_norm": 0.02492119460153467,
"learning_rate": 1.0407298454278983e-05,
"loss": 0.096,
"step": 1870
},
{
"epoch": 0.5401317921572864,
"grad_norm": 0.02730713294802567,
"learning_rate": 1.0357037578422349e-05,
"loss": 0.1063,
"step": 1875
},
{
"epoch": 0.5415721436030392,
"grad_norm": 0.02753505593775209,
"learning_rate": 1.0306767670048497e-05,
"loss": 0.1004,
"step": 1880
},
{
"epoch": 0.5430124950487919,
"grad_norm": 0.027125750017208197,
"learning_rate": 1.0256490000910986e-05,
"loss": 0.1033,
"step": 1885
},
{
"epoch": 0.5444528464945446,
"grad_norm": 0.02526923620994891,
"learning_rate": 1.0206205842959718e-05,
"loss": 0.1044,
"step": 1890
},
{
"epoch": 0.5458931979402974,
"grad_norm": 0.03456611614472051,
"learning_rate": 1.0155916468308749e-05,
"loss": 0.1018,
"step": 1895
},
{
"epoch": 0.5473335493860502,
"grad_norm": 0.02687752666688773,
"learning_rate": 1.0105623149204118e-05,
"loss": 0.1071,
"step": 1900
},
{
"epoch": 0.548773900831803,
"grad_norm": 0.0348842935948145,
"learning_rate": 1.0055327157991639e-05,
"loss": 0.1018,
"step": 1905
},
{
"epoch": 0.5502142522775557,
"grad_norm": 0.03047489251408279,
"learning_rate": 1.0005029767084739e-05,
"loss": 0.1029,
"step": 1910
},
{
"epoch": 0.5516546037233084,
"grad_norm": 0.03295680008205151,
"learning_rate": 9.954732248932243e-06,
"loss": 0.1042,
"step": 1915
},
{
"epoch": 0.5530949551690613,
"grad_norm": 0.026523017503203378,
"learning_rate": 9.904435875986213e-06,
"loss": 0.0989,
"step": 1920
},
{
"epoch": 0.554535306614814,
"grad_norm": 0.03199960655757483,
"learning_rate": 9.85414192066972e-06,
"loss": 0.1034,
"step": 1925
},
{
"epoch": 0.5559756580605668,
"grad_norm": 0.03321351438487483,
"learning_rate": 9.803851655344682e-06,
"loss": 0.0954,
"step": 1930
},
{
"epoch": 0.5574160095063195,
"grad_norm": 0.026379907025225486,
"learning_rate": 9.75356635227966e-06,
"loss": 0.1007,
"step": 1935
},
{
"epoch": 0.5588563609520723,
"grad_norm": 0.03215809081400103,
"learning_rate": 9.70328728361769e-06,
"loss": 0.1041,
"step": 1940
},
{
"epoch": 0.5602967123978251,
"grad_norm": 0.028935560901012115,
"learning_rate": 9.653015721344073e-06,
"loss": 0.104,
"step": 1945
},
{
"epoch": 0.5617370638435778,
"grad_norm": 0.026479371672269946,
"learning_rate": 9.602752937254215e-06,
"loss": 0.0987,
"step": 1950
},
{
"epoch": 0.5631774152893306,
"grad_norm": 0.02735022790208615,
"learning_rate": 9.552500202921449e-06,
"loss": 0.1052,
"step": 1955
},
{
"epoch": 0.5646177667350833,
"grad_norm": 0.029074239132167123,
"learning_rate": 9.502258789664865e-06,
"loss": 0.1069,
"step": 1960
},
{
"epoch": 0.5660581181808362,
"grad_norm": 0.03497134937258824,
"learning_rate": 9.45202996851714e-06,
"loss": 0.1068,
"step": 1965
},
{
"epoch": 0.5674984696265889,
"grad_norm": 0.03121000622963146,
"learning_rate": 9.401815010192388e-06,
"loss": 0.1058,
"step": 1970
},
{
"epoch": 0.5689388210723416,
"grad_norm": 0.029762366581440217,
"learning_rate": 9.351615185054029e-06,
"loss": 0.1027,
"step": 1975
},
{
"epoch": 0.5703791725180944,
"grad_norm": 0.031933869149627876,
"learning_rate": 9.301431763082623e-06,
"loss": 0.1001,
"step": 1980
},
{
"epoch": 0.5718195239638472,
"grad_norm": 0.024595673857129345,
"learning_rate": 9.251266013843757e-06,
"loss": 0.0993,
"step": 1985
},
{
"epoch": 0.5732598754096,
"grad_norm": 0.02931011040644607,
"learning_rate": 9.201119206455927e-06,
"loss": 0.107,
"step": 1990
},
{
"epoch": 0.5747002268553527,
"grad_norm": 0.02841549880839795,
"learning_rate": 9.150992609558425e-06,
"loss": 0.1013,
"step": 1995
},
{
"epoch": 0.5761405783011054,
"grad_norm": 0.029699203984976344,
"learning_rate": 9.10088749127926e-06,
"loss": 0.1003,
"step": 2000
},
{
"epoch": 0.5761405783011054,
"eval_loss": 0.10270440578460693,
"eval_runtime": 862.9139,
"eval_samples_per_second": 2.091,
"eval_steps_per_second": 0.523,
"step": 2000
},
{
"epoch": 0.5775809297468583,
"grad_norm": 0.02555642653773717,
"learning_rate": 9.050805119203035e-06,
"loss": 0.0964,
"step": 2005
},
{
"epoch": 0.579021281192611,
"grad_norm": 0.03206482511392409,
"learning_rate": 9.000746760338938e-06,
"loss": 0.1061,
"step": 2010
},
{
"epoch": 0.5804616326383638,
"grad_norm": 0.029335358877190958,
"learning_rate": 8.950713681088647e-06,
"loss": 0.0989,
"step": 2015
},
{
"epoch": 0.5819019840841165,
"grad_norm": 0.029061894519663043,
"learning_rate": 8.900707147214301e-06,
"loss": 0.1041,
"step": 2020
},
{
"epoch": 0.5833423355298692,
"grad_norm": 0.027789281243274542,
"learning_rate": 8.850728423806487e-06,
"loss": 0.1002,
"step": 2025
},
{
"epoch": 0.5847826869756221,
"grad_norm": 0.028006016395305748,
"learning_rate": 8.800778775252221e-06,
"loss": 0.0971,
"step": 2030
},
{
"epoch": 0.5862230384213748,
"grad_norm": 0.029135758984445807,
"learning_rate": 8.75085946520298e-06,
"loss": 0.1089,
"step": 2035
},
{
"epoch": 0.5876633898671276,
"grad_norm": 0.028264974680774237,
"learning_rate": 8.700971756542707e-06,
"loss": 0.0999,
"step": 2040
},
{
"epoch": 0.5891037413128803,
"grad_norm": 0.02787134319938327,
"learning_rate": 8.65111691135589e-06,
"loss": 0.0996,
"step": 2045
},
{
"epoch": 0.5905440927586331,
"grad_norm": 0.02930265638543135,
"learning_rate": 8.601296190895611e-06,
"loss": 0.0998,
"step": 2050
},
{
"epoch": 0.5919844442043859,
"grad_norm": 0.028136073425125765,
"learning_rate": 8.551510855551658e-06,
"loss": 0.1022,
"step": 2055
},
{
"epoch": 0.5934247956501386,
"grad_norm": 0.030075075847505933,
"learning_rate": 8.501762164818615e-06,
"loss": 0.1011,
"step": 2060
},
{
"epoch": 0.5948651470958914,
"grad_norm": 0.02509232015436124,
"learning_rate": 8.452051377264025e-06,
"loss": 0.1089,
"step": 2065
},
{
"epoch": 0.5963054985416442,
"grad_norm": 0.029019522531841563,
"learning_rate": 8.402379750496535e-06,
"loss": 0.1043,
"step": 2070
},
{
"epoch": 0.5977458499873969,
"grad_norm": 0.02946442999816807,
"learning_rate": 8.35274854113407e-06,
"loss": 0.1033,
"step": 2075
},
{
"epoch": 0.5991862014331497,
"grad_norm": 0.028470170352907883,
"learning_rate": 8.303159004772065e-06,
"loss": 0.1015,
"step": 2080
},
{
"epoch": 0.6006265528789024,
"grad_norm": 0.027074007980222393,
"learning_rate": 8.253612395951697e-06,
"loss": 0.1077,
"step": 2085
},
{
"epoch": 0.6020669043246553,
"grad_norm": 0.02594322523858361,
"learning_rate": 8.204109968128128e-06,
"loss": 0.1052,
"step": 2090
},
{
"epoch": 0.603507255770408,
"grad_norm": 0.027981527626155637,
"learning_rate": 8.15465297363881e-06,
"loss": 0.1076,
"step": 2095
},
{
"epoch": 0.6049476072161607,
"grad_norm": 0.027128594062002694,
"learning_rate": 8.105242663671807e-06,
"loss": 0.1019,
"step": 2100
},
{
"epoch": 0.6063879586619135,
"grad_norm": 0.02399321575005835,
"learning_rate": 8.055880288234127e-06,
"loss": 0.1014,
"step": 2105
},
{
"epoch": 0.6078283101076662,
"grad_norm": 0.030384147510662384,
"learning_rate": 8.006567096120103e-06,
"loss": 0.102,
"step": 2110
},
{
"epoch": 0.6092686615534191,
"grad_norm": 0.0275729769794035,
"learning_rate": 7.957304334879809e-06,
"loss": 0.1066,
"step": 2115
},
{
"epoch": 0.6107090129991718,
"grad_norm": 0.02827555110704183,
"learning_rate": 7.908093250787496e-06,
"loss": 0.1026,
"step": 2120
},
{
"epoch": 0.6121493644449245,
"grad_norm": 0.0280447976215247,
"learning_rate": 7.858935088810054e-06,
"loss": 0.1041,
"step": 2125
},
{
"epoch": 0.6135897158906773,
"grad_norm": 0.026871658750667743,
"learning_rate": 7.809831092575528e-06,
"loss": 0.1033,
"step": 2130
},
{
"epoch": 0.6150300673364301,
"grad_norm": 0.029232871601405933,
"learning_rate": 7.760782504341644e-06,
"loss": 0.1047,
"step": 2135
},
{
"epoch": 0.6164704187821829,
"grad_norm": 0.023805191017251096,
"learning_rate": 7.7117905649644e-06,
"loss": 0.0983,
"step": 2140
},
{
"epoch": 0.6179107702279356,
"grad_norm": 0.03103222984517563,
"learning_rate": 7.662856513866643e-06,
"loss": 0.0968,
"step": 2145
},
{
"epoch": 0.6193511216736883,
"grad_norm": 0.028105627160877313,
"learning_rate": 7.613981589006754e-06,
"loss": 0.099,
"step": 2150
},
{
"epoch": 0.6207914731194412,
"grad_norm": 0.030597646841460515,
"learning_rate": 7.565167026847294e-06,
"loss": 0.0979,
"step": 2155
},
{
"epoch": 0.6222318245651939,
"grad_norm": 0.02970594057212837,
"learning_rate": 7.5164140623237454e-06,
"loss": 0.0975,
"step": 2160
},
{
"epoch": 0.6236721760109467,
"grad_norm": 0.027159849019462532,
"learning_rate": 7.467723928813262e-06,
"loss": 0.1036,
"step": 2165
},
{
"epoch": 0.6251125274566994,
"grad_norm": 0.03257508113397351,
"learning_rate": 7.419097858103464e-06,
"loss": 0.0997,
"step": 2170
},
{
"epoch": 0.6265528789024521,
"grad_norm": 0.029509877961538443,
"learning_rate": 7.370537080361282e-06,
"loss": 0.0977,
"step": 2175
},
{
"epoch": 0.627993230348205,
"grad_norm": 0.02914606675872259,
"learning_rate": 7.3220428241018225e-06,
"loss": 0.0961,
"step": 2180
},
{
"epoch": 0.6294335817939577,
"grad_norm": 0.028245193180560037,
"learning_rate": 7.273616316157312e-06,
"loss": 0.1009,
"step": 2185
},
{
"epoch": 0.6308739332397105,
"grad_norm": 0.031258732018464554,
"learning_rate": 7.225258781646036e-06,
"loss": 0.0959,
"step": 2190
},
{
"epoch": 0.6323142846854632,
"grad_norm": 0.029187877030262547,
"learning_rate": 7.176971443941359e-06,
"loss": 0.0984,
"step": 2195
},
{
"epoch": 0.633754636131216,
"grad_norm": 0.027481848533676127,
"learning_rate": 7.128755524640771e-06,
"loss": 0.1023,
"step": 2200
},
{
"epoch": 0.6351949875769688,
"grad_norm": 0.028467126169710644,
"learning_rate": 7.080612243534981e-06,
"loss": 0.104,
"step": 2205
},
{
"epoch": 0.6366353390227215,
"grad_norm": 0.02584742929400372,
"learning_rate": 7.032542818577066e-06,
"loss": 0.0968,
"step": 2210
},
{
"epoch": 0.6380756904684743,
"grad_norm": 0.024188950488983806,
"learning_rate": 6.984548465851641e-06,
"loss": 0.0965,
"step": 2215
},
{
"epoch": 0.6395160419142271,
"grad_norm": 0.028851010852631174,
"learning_rate": 6.93663039954412e-06,
"loss": 0.1007,
"step": 2220
},
{
"epoch": 0.6409563933599798,
"grad_norm": 0.025328516360271506,
"learning_rate": 6.888789831909972e-06,
"loss": 0.101,
"step": 2225
},
{
"epoch": 0.6423967448057326,
"grad_norm": 0.03102519024419765,
"learning_rate": 6.841027973244077e-06,
"loss": 0.098,
"step": 2230
},
{
"epoch": 0.6438370962514853,
"grad_norm": 0.037884633433375385,
"learning_rate": 6.793346031850085e-06,
"loss": 0.0987,
"step": 2235
},
{
"epoch": 0.6452774476972382,
"grad_norm": 0.03111478820563393,
"learning_rate": 6.745745214009866e-06,
"loss": 0.0984,
"step": 2240
},
{
"epoch": 0.6467177991429909,
"grad_norm": 0.02761748015741369,
"learning_rate": 6.698226723952985e-06,
"loss": 0.0996,
"step": 2245
},
{
"epoch": 0.6481581505887437,
"grad_norm": 0.02546847357291813,
"learning_rate": 6.65079176382623e-06,
"loss": 0.1027,
"step": 2250
},
{
"epoch": 0.6495985020344964,
"grad_norm": 0.028225946201781264,
"learning_rate": 6.603441533663214e-06,
"loss": 0.1052,
"step": 2255
},
{
"epoch": 0.6510388534802491,
"grad_norm": 0.028818402757150797,
"learning_rate": 6.556177231354003e-06,
"loss": 0.103,
"step": 2260
},
{
"epoch": 0.652479204926002,
"grad_norm": 0.027109318220465493,
"learning_rate": 6.509000052614824e-06,
"loss": 0.1043,
"step": 2265
},
{
"epoch": 0.6539195563717547,
"grad_norm": 0.02806157217123479,
"learning_rate": 6.4619111909577995e-06,
"loss": 0.0962,
"step": 2270
},
{
"epoch": 0.6553599078175075,
"grad_norm": 0.029350220106511105,
"learning_rate": 6.414911837660768e-06,
"loss": 0.0997,
"step": 2275
},
{
"epoch": 0.6568002592632602,
"grad_norm": 0.02863157441645804,
"learning_rate": 6.36800318173714e-06,
"loss": 0.1004,
"step": 2280
},
{
"epoch": 0.658240610709013,
"grad_norm": 0.030549045202254512,
"learning_rate": 6.321186409905812e-06,
"loss": 0.0995,
"step": 2285
},
{
"epoch": 0.6596809621547658,
"grad_norm": 0.03217236024450049,
"learning_rate": 6.274462706561153e-06,
"loss": 0.1008,
"step": 2290
},
{
"epoch": 0.6611213136005185,
"grad_norm": 0.02822548256388095,
"learning_rate": 6.227833253743045e-06,
"loss": 0.0962,
"step": 2295
},
{
"epoch": 0.6625616650462713,
"grad_norm": 0.02614875322287693,
"learning_rate": 6.181299231106963e-06,
"loss": 0.0987,
"step": 2300
},
{
"epoch": 0.6640020164920241,
"grad_norm": 0.02541722852556565,
"learning_rate": 6.134861815894146e-06,
"loss": 0.0967,
"step": 2305
},
{
"epoch": 0.6654423679377768,
"grad_norm": 0.02850719972847212,
"learning_rate": 6.08852218290181e-06,
"loss": 0.0947,
"step": 2310
},
{
"epoch": 0.6668827193835296,
"grad_norm": 0.02908443110544134,
"learning_rate": 6.0422815044534265e-06,
"loss": 0.1037,
"step": 2315
},
{
"epoch": 0.6683230708292823,
"grad_norm": 0.029315617072749215,
"learning_rate": 5.9961409503690605e-06,
"loss": 0.0963,
"step": 2320
},
{
"epoch": 0.6697634222750352,
"grad_norm": 0.030033042623501118,
"learning_rate": 5.950101687935783e-06,
"loss": 0.1027,
"step": 2325
},
{
"epoch": 0.6712037737207879,
"grad_norm": 0.02915046992535852,
"learning_rate": 5.904164881878143e-06,
"loss": 0.1055,
"step": 2330
},
{
"epoch": 0.6726441251665406,
"grad_norm": 0.025236471457550445,
"learning_rate": 5.858331694328686e-06,
"loss": 0.0982,
"step": 2335
},
{
"epoch": 0.6740844766122934,
"grad_norm": 0.03401543932639446,
"learning_rate": 5.812603284798575e-06,
"loss": 0.0949,
"step": 2340
},
{
"epoch": 0.6755248280580461,
"grad_norm": 0.028026329462367786,
"learning_rate": 5.766980810148238e-06,
"loss": 0.1,
"step": 2345
},
{
"epoch": 0.676965179503799,
"grad_norm": 0.02874201116713264,
"learning_rate": 5.721465424558111e-06,
"loss": 0.0958,
"step": 2350
},
{
"epoch": 0.6784055309495517,
"grad_norm": 0.028963974093940446,
"learning_rate": 5.676058279499438e-06,
"loss": 0.0989,
"step": 2355
},
{
"epoch": 0.6798458823953044,
"grad_norm": 0.029496429086982617,
"learning_rate": 5.630760523705137e-06,
"loss": 0.0969,
"step": 2360
},
{
"epoch": 0.6812862338410572,
"grad_norm": 0.02472939959975296,
"learning_rate": 5.585573303140741e-06,
"loss": 0.1005,
"step": 2365
},
{
"epoch": 0.68272658528681,
"grad_norm": 0.02943173144839556,
"learning_rate": 5.540497760975412e-06,
"loss": 0.0991,
"step": 2370
},
{
"epoch": 0.6841669367325628,
"grad_norm": 0.027039554476503823,
"learning_rate": 5.495535037553003e-06,
"loss": 0.0967,
"step": 2375
},
{
"epoch": 0.6856072881783155,
"grad_norm": 0.03130069538494267,
"learning_rate": 5.450686270363244e-06,
"loss": 0.104,
"step": 2380
},
{
"epoch": 0.6870476396240682,
"grad_norm": 0.02792170269632637,
"learning_rate": 5.405952594012921e-06,
"loss": 0.0971,
"step": 2385
},
{
"epoch": 0.6884879910698211,
"grad_norm": 0.026536938565135147,
"learning_rate": 5.361335140197199e-06,
"loss": 0.0998,
"step": 2390
},
{
"epoch": 0.6899283425155738,
"grad_norm": 0.030495085484657028,
"learning_rate": 5.316835037670987e-06,
"loss": 0.0989,
"step": 2395
},
{
"epoch": 0.6913686939613266,
"grad_norm": 0.025636540838806603,
"learning_rate": 5.272453412220389e-06,
"loss": 0.1005,
"step": 2400
},
{
"epoch": 0.6928090454070793,
"grad_norm": 0.029276688852897748,
"learning_rate": 5.228191386634212e-06,
"loss": 0.096,
"step": 2405
},
{
"epoch": 0.694249396852832,
"grad_norm": 0.028892951411287924,
"learning_rate": 5.184050080675558e-06,
"loss": 0.0982,
"step": 2410
},
{
"epoch": 0.6956897482985849,
"grad_norm": 0.029668168507339023,
"learning_rate": 5.140030611053512e-06,
"loss": 0.0983,
"step": 2415
},
{
"epoch": 0.6971300997443376,
"grad_norm": 0.029331223786389957,
"learning_rate": 5.096134091394879e-06,
"loss": 0.1039,
"step": 2420
},
{
"epoch": 0.6985704511900904,
"grad_norm": 0.028007436697092276,
"learning_rate": 5.052361632216009e-06,
"loss": 0.0952,
"step": 2425
},
{
"epoch": 0.7000108026358431,
"grad_norm": 0.028924268558975137,
"learning_rate": 5.008714340894716e-06,
"loss": 0.1025,
"step": 2430
},
{
"epoch": 0.7014511540815959,
"grad_norm": 0.028546541140812298,
"learning_rate": 4.965193321642245e-06,
"loss": 0.1051,
"step": 2435
},
{
"epoch": 0.7028915055273487,
"grad_norm": 0.03228047586525536,
"learning_rate": 4.9217996754753505e-06,
"loss": 0.1042,
"step": 2440
},
{
"epoch": 0.7043318569731014,
"grad_norm": 0.026889053511112666,
"learning_rate": 4.878534500188443e-06,
"loss": 0.0979,
"step": 2445
},
{
"epoch": 0.7057722084188542,
"grad_norm": 0.024493931252593228,
"learning_rate": 4.835398890325806e-06,
"loss": 0.1006,
"step": 2450
},
{
"epoch": 0.707212559864607,
"grad_norm": 0.031072455568760707,
"learning_rate": 4.792393937153914e-06,
"loss": 0.1,
"step": 2455
},
{
"epoch": 0.7086529113103597,
"grad_norm": 0.03526863560562569,
"learning_rate": 4.749520728633812e-06,
"loss": 0.1004,
"step": 2460
},
{
"epoch": 0.7100932627561125,
"grad_norm": 0.027375718409826295,
"learning_rate": 4.706780349393621e-06,
"loss": 0.099,
"step": 2465
},
{
"epoch": 0.7115336142018652,
"grad_norm": 0.029579970429738103,
"learning_rate": 4.664173880701065e-06,
"loss": 0.1008,
"step": 2470
},
{
"epoch": 0.7129739656476181,
"grad_norm": 0.031758604921792485,
"learning_rate": 4.62170240043614e-06,
"loss": 0.0983,
"step": 2475
},
{
"epoch": 0.7144143170933708,
"grad_norm": 0.02656899768489825,
"learning_rate": 4.579366983063829e-06,
"loss": 0.0997,
"step": 2480
},
{
"epoch": 0.7158546685391235,
"grad_norm": 0.02566962597338034,
"learning_rate": 4.537168699606928e-06,
"loss": 0.1015,
"step": 2485
},
{
"epoch": 0.7172950199848763,
"grad_norm": 0.028715798458422183,
"learning_rate": 4.4951086176189666e-06,
"loss": 0.107,
"step": 2490
},
{
"epoch": 0.718735371430629,
"grad_norm": 0.02887905888346922,
"learning_rate": 4.453187801157155e-06,
"loss": 0.0984,
"step": 2495
},
{
"epoch": 0.7201757228763819,
"grad_norm": 0.028332689752910086,
"learning_rate": 4.411407310755513e-06,
"loss": 0.1007,
"step": 2500
},
{
"epoch": 0.7201757228763819,
"eval_loss": 0.10072407871484756,
"eval_runtime": 862.0549,
"eval_samples_per_second": 2.093,
"eval_steps_per_second": 0.523,
"step": 2500
},
{
"epoch": 0.7216160743221346,
"grad_norm": 0.02833390362499488,
"learning_rate": 4.369768203398014e-06,
"loss": 0.0957,
"step": 2505
},
{
"epoch": 0.7230564257678873,
"grad_norm": 0.030945876414709347,
"learning_rate": 4.328271532491859e-06,
"loss": 0.102,
"step": 2510
},
{
"epoch": 0.7244967772136401,
"grad_norm": 0.029283752981204014,
"learning_rate": 4.2869183478408125e-06,
"loss": 0.0985,
"step": 2515
},
{
"epoch": 0.7259371286593929,
"grad_norm": 0.027428758917667755,
"learning_rate": 4.2457096956186525e-06,
"loss": 0.1018,
"step": 2520
},
{
"epoch": 0.7273774801051457,
"grad_norm": 0.03315591743445466,
"learning_rate": 4.2046466183427035e-06,
"loss": 0.1075,
"step": 2525
},
{
"epoch": 0.7288178315508984,
"grad_norm": 0.030025762956048904,
"learning_rate": 4.163730154847462e-06,
"loss": 0.0929,
"step": 2530
},
{
"epoch": 0.7302581829966511,
"grad_norm": 0.030103449099341675,
"learning_rate": 4.122961340258312e-06,
"loss": 0.0964,
"step": 2535
},
{
"epoch": 0.731698534442404,
"grad_norm": 0.02917754573138957,
"learning_rate": 4.082341205965344e-06,
"loss": 0.0993,
"step": 2540
},
{
"epoch": 0.7331388858881567,
"grad_norm": 0.02789369472409532,
"learning_rate": 4.0418707795972575e-06,
"loss": 0.0995,
"step": 2545
},
{
"epoch": 0.7345792373339095,
"grad_norm": 0.029979889727084676,
"learning_rate": 4.001551084995363e-06,
"loss": 0.1031,
"step": 2550
},
{
"epoch": 0.7360195887796622,
"grad_norm": 0.027974195684437115,
"learning_rate": 3.961383142187691e-06,
"loss": 0.1021,
"step": 2555
},
{
"epoch": 0.7374599402254151,
"grad_norm": 0.030070715679730322,
"learning_rate": 3.9213679673631745e-06,
"loss": 0.1029,
"step": 2560
},
{
"epoch": 0.7389002916711678,
"grad_norm": 0.027066646976217957,
"learning_rate": 3.881506572845933e-06,
"loss": 0.1056,
"step": 2565
},
{
"epoch": 0.7403406431169205,
"grad_norm": 0.0284274448607563,
"learning_rate": 3.841799967069686e-06,
"loss": 0.1075,
"step": 2570
},
{
"epoch": 0.7417809945626733,
"grad_norm": 0.028074887598320608,
"learning_rate": 3.8022491545522346e-06,
"loss": 0.103,
"step": 2575
},
{
"epoch": 0.743221346008426,
"grad_norm": 0.03333775657686949,
"learning_rate": 3.7628551358700303e-06,
"loss": 0.0997,
"step": 2580
},
{
"epoch": 0.7446616974541789,
"grad_norm": 0.027376231042512,
"learning_rate": 3.723618907632882e-06,
"loss": 0.1024,
"step": 2585
},
{
"epoch": 0.7461020488999316,
"grad_norm": 0.031319210954457215,
"learning_rate": 3.6845414624587326e-06,
"loss": 0.1032,
"step": 2590
},
{
"epoch": 0.7475424003456843,
"grad_norm": 0.028908867069655585,
"learning_rate": 3.6456237889485547e-06,
"loss": 0.1023,
"step": 2595
},
{
"epoch": 0.7489827517914371,
"grad_norm": 0.027194027978343076,
"learning_rate": 3.606866871661333e-06,
"loss": 0.0976,
"step": 2600
},
{
"epoch": 0.7504231032371899,
"grad_norm": 0.028443079367809734,
"learning_rate": 3.5682716910891613e-06,
"loss": 0.0986,
"step": 2605
},
{
"epoch": 0.7518634546829427,
"grad_norm": 0.030731013519078837,
"learning_rate": 3.5298392236324365e-06,
"loss": 0.1009,
"step": 2610
},
{
"epoch": 0.7533038061286954,
"grad_norm": 0.0285250306739096,
"learning_rate": 3.491570441575154e-06,
"loss": 0.0989,
"step": 2615
},
{
"epoch": 0.7547441575744481,
"grad_norm": 0.02742316326439345,
"learning_rate": 3.453466313060322e-06,
"loss": 0.0997,
"step": 2620
},
{
"epoch": 0.756184509020201,
"grad_norm": 0.03242870927580512,
"learning_rate": 3.4155278020654502e-06,
"loss": 0.101,
"step": 2625
},
{
"epoch": 0.7576248604659537,
"grad_norm": 0.02846968206424331,
"learning_rate": 3.3777558683781795e-06,
"loss": 0.1002,
"step": 2630
},
{
"epoch": 0.7590652119117065,
"grad_norm": 0.025257296374579147,
"learning_rate": 3.3401514675719815e-06,
"loss": 0.0969,
"step": 2635
},
{
"epoch": 0.7605055633574592,
"grad_norm": 0.02845603161574245,
"learning_rate": 3.302715550982014e-06,
"loss": 0.0998,
"step": 2640
},
{
"epoch": 0.7619459148032119,
"grad_norm": 0.029606705311034962,
"learning_rate": 3.2654490656810256e-06,
"loss": 0.0965,
"step": 2645
},
{
"epoch": 0.7633862662489648,
"grad_norm": 0.029342507148503363,
"learning_rate": 3.228352954455406e-06,
"loss": 0.0966,
"step": 2650
},
{
"epoch": 0.7648266176947175,
"grad_norm": 0.02856949602254836,
"learning_rate": 3.1914281557813386e-06,
"loss": 0.0966,
"step": 2655
},
{
"epoch": 0.7662669691404703,
"grad_norm": 0.03497921521636249,
"learning_rate": 3.1546756038010507e-06,
"loss": 0.0981,
"step": 2660
},
{
"epoch": 0.767707320586223,
"grad_norm": 0.030940811674726062,
"learning_rate": 3.1180962282991976e-06,
"loss": 0.1008,
"step": 2665
},
{
"epoch": 0.7691476720319758,
"grad_norm": 0.02732290490010739,
"learning_rate": 3.081690954679313e-06,
"loss": 0.0968,
"step": 2670
},
{
"epoch": 0.7705880234777286,
"grad_norm": 0.029043196623878744,
"learning_rate": 3.0454607039404206e-06,
"loss": 0.0957,
"step": 2675
},
{
"epoch": 0.7720283749234813,
"grad_norm": 0.029602576168767845,
"learning_rate": 3.0094063926537233e-06,
"loss": 0.0993,
"step": 2680
},
{
"epoch": 0.7734687263692341,
"grad_norm": 0.02834425216635453,
"learning_rate": 2.973528932939429e-06,
"loss": 0.0954,
"step": 2685
},
{
"epoch": 0.7749090778149869,
"grad_norm": 0.02696740012171304,
"learning_rate": 2.937829232443654e-06,
"loss": 0.0974,
"step": 2690
},
{
"epoch": 0.7763494292607396,
"grad_norm": 0.029300105549739816,
"learning_rate": 2.9023081943154753e-06,
"loss": 0.1012,
"step": 2695
},
{
"epoch": 0.7777897807064924,
"grad_norm": 0.025590035684474745,
"learning_rate": 2.86696671718408e-06,
"loss": 0.0954,
"step": 2700
},
{
"epoch": 0.7792301321522451,
"grad_norm": 0.028340578845450066,
"learning_rate": 2.8318056951360294e-06,
"loss": 0.0978,
"step": 2705
},
{
"epoch": 0.780670483597998,
"grad_norm": 0.027993276158026614,
"learning_rate": 2.7968260176926407e-06,
"loss": 0.1014,
"step": 2710
},
{
"epoch": 0.7821108350437507,
"grad_norm": 0.03153867114181106,
"learning_rate": 2.762028569787485e-06,
"loss": 0.1051,
"step": 2715
},
{
"epoch": 0.7835511864895034,
"grad_norm": 0.03024494353093925,
"learning_rate": 2.7274142317439956e-06,
"loss": 0.0979,
"step": 2720
},
{
"epoch": 0.7849915379352562,
"grad_norm": 0.02889386857808468,
"learning_rate": 2.6929838792532035e-06,
"loss": 0.102,
"step": 2725
},
{
"epoch": 0.7864318893810089,
"grad_norm": 0.028444248618397415,
"learning_rate": 2.6587383833515746e-06,
"loss": 0.097,
"step": 2730
},
{
"epoch": 0.7878722408267618,
"grad_norm": 0.03018963515700107,
"learning_rate": 2.6246786103989887e-06,
"loss": 0.0969,
"step": 2735
},
{
"epoch": 0.7893125922725145,
"grad_norm": 0.028448487698969887,
"learning_rate": 2.590805422056807e-06,
"loss": 0.0976,
"step": 2740
},
{
"epoch": 0.7907529437182672,
"grad_norm": 0.025980334968211127,
"learning_rate": 2.5571196752660733e-06,
"loss": 0.098,
"step": 2745
},
{
"epoch": 0.79219329516402,
"grad_norm": 0.0279120349990706,
"learning_rate": 2.5236222222258455e-06,
"loss": 0.1006,
"step": 2750
},
{
"epoch": 0.7936336466097728,
"grad_norm": 0.03043213886693375,
"learning_rate": 2.4903139103716365e-06,
"loss": 0.0961,
"step": 2755
},
{
"epoch": 0.7950739980555256,
"grad_norm": 0.027713327075462212,
"learning_rate": 2.4571955823539617e-06,
"loss": 0.0927,
"step": 2760
},
{
"epoch": 0.7965143495012783,
"grad_norm": 0.03200709702140973,
"learning_rate": 2.424268076017032e-06,
"loss": 0.099,
"step": 2765
},
{
"epoch": 0.797954700947031,
"grad_norm": 0.02862031576494855,
"learning_rate": 2.3915322243775564e-06,
"loss": 0.095,
"step": 2770
},
{
"epoch": 0.7993950523927839,
"grad_norm": 0.030880108006987813,
"learning_rate": 2.3589888556036623e-06,
"loss": 0.1012,
"step": 2775
},
{
"epoch": 0.8008354038385366,
"grad_norm": 0.028581095269047594,
"learning_rate": 2.3266387929939525e-06,
"loss": 0.1004,
"step": 2780
},
{
"epoch": 0.8022757552842894,
"grad_norm": 0.029454825149171658,
"learning_rate": 2.294482854956672e-06,
"loss": 0.0984,
"step": 2785
},
{
"epoch": 0.8037161067300421,
"grad_norm": 0.03006431233771053,
"learning_rate": 2.2625218549890014e-06,
"loss": 0.0995,
"step": 2790
},
{
"epoch": 0.8051564581757948,
"grad_norm": 0.027799037780853072,
"learning_rate": 2.230756601656481e-06,
"loss": 0.1018,
"step": 2795
},
{
"epoch": 0.8065968096215477,
"grad_norm": 0.03649536849392369,
"learning_rate": 2.1991878985725566e-06,
"loss": 0.0936,
"step": 2800
},
{
"epoch": 0.8080371610673004,
"grad_norm": 0.028269913936191833,
"learning_rate": 2.167816544378244e-06,
"loss": 0.1009,
"step": 2805
},
{
"epoch": 0.8094775125130532,
"grad_norm": 0.028604988396104106,
"learning_rate": 2.1366433327219284e-06,
"loss": 0.1025,
"step": 2810
},
{
"epoch": 0.8109178639588059,
"grad_norm": 0.02876137453572099,
"learning_rate": 2.105669052239274e-06,
"loss": 0.1029,
"step": 2815
},
{
"epoch": 0.8123582154045587,
"grad_norm": 0.028447843531367326,
"learning_rate": 2.0748944865333033e-06,
"loss": 0.0989,
"step": 2820
},
{
"epoch": 0.8137985668503115,
"grad_norm": 0.02868959528432683,
"learning_rate": 2.0443204141545393e-06,
"loss": 0.0991,
"step": 2825
},
{
"epoch": 0.8152389182960642,
"grad_norm": 0.030631865795776963,
"learning_rate": 2.013947608581327e-06,
"loss": 0.0988,
"step": 2830
},
{
"epoch": 0.816679269741817,
"grad_norm": 0.028274634163983327,
"learning_rate": 1.983776838200262e-06,
"loss": 0.0967,
"step": 2835
},
{
"epoch": 0.8181196211875698,
"grad_norm": 0.03127569269957067,
"learning_rate": 1.9538088662867495e-06,
"loss": 0.0989,
"step": 2840
},
{
"epoch": 0.8195599726333225,
"grad_norm": 0.028435906385402127,
"learning_rate": 1.924044450985706e-06,
"loss": 0.1035,
"step": 2845
},
{
"epoch": 0.8210003240790753,
"grad_norm": 0.030905808512586234,
"learning_rate": 1.8944843452923546e-06,
"loss": 0.1031,
"step": 2850
},
{
"epoch": 0.822440675524828,
"grad_norm": 0.02821159641559338,
"learning_rate": 1.8651292970332003e-06,
"loss": 0.0975,
"step": 2855
},
{
"epoch": 0.8238810269705809,
"grad_norm": 0.026301972641988965,
"learning_rate": 1.835980048847098e-06,
"loss": 0.1016,
"step": 2860
},
{
"epoch": 0.8253213784163336,
"grad_norm": 0.02850407318953127,
"learning_rate": 1.8070373381664752e-06,
"loss": 0.0957,
"step": 2865
},
{
"epoch": 0.8267617298620864,
"grad_norm": 0.03287640405240904,
"learning_rate": 1.77830189719866e-06,
"loss": 0.0974,
"step": 2870
},
{
"epoch": 0.8282020813078391,
"grad_norm": 0.02979639575966765,
"learning_rate": 1.7497744529073712e-06,
"loss": 0.0962,
"step": 2875
},
{
"epoch": 0.8296424327535918,
"grad_norm": 0.028883353854663875,
"learning_rate": 1.721455726994321e-06,
"loss": 0.1033,
"step": 2880
},
{
"epoch": 0.8310827841993447,
"grad_norm": 0.027623128069878516,
"learning_rate": 1.6933464358809593e-06,
"loss": 0.0915,
"step": 2885
},
{
"epoch": 0.8325231356450974,
"grad_norm": 0.031518406493856295,
"learning_rate": 1.6654472906903486e-06,
"loss": 0.0953,
"step": 2890
},
{
"epoch": 0.8339634870908502,
"grad_norm": 0.025571212721863242,
"learning_rate": 1.637758997229173e-06,
"loss": 0.0939,
"step": 2895
},
{
"epoch": 0.8354038385366029,
"grad_norm": 0.026694901604252565,
"learning_rate": 1.6102822559698828e-06,
"loss": 0.0947,
"step": 2900
},
{
"epoch": 0.8368441899823557,
"grad_norm": 0.0277750741435232,
"learning_rate": 1.5830177620329712e-06,
"loss": 0.0989,
"step": 2905
},
{
"epoch": 0.8382845414281085,
"grad_norm": 0.02924974903749895,
"learning_rate": 1.5559662051694002e-06,
"loss": 0.0986,
"step": 2910
},
{
"epoch": 0.8397248928738612,
"grad_norm": 0.032423419131996915,
"learning_rate": 1.5291282697431353e-06,
"loss": 0.0989,
"step": 2915
},
{
"epoch": 0.841165244319614,
"grad_norm": 0.030642374976224046,
"learning_rate": 1.502504634713835e-06,
"loss": 0.0963,
"step": 2920
},
{
"epoch": 0.8426055957653668,
"grad_norm": 0.02981716833954207,
"learning_rate": 1.4760959736196834e-06,
"loss": 0.0961,
"step": 2925
},
{
"epoch": 0.8440459472111195,
"grad_norm": 0.028126597558143722,
"learning_rate": 1.4499029545603472e-06,
"loss": 0.0996,
"step": 2930
},
{
"epoch": 0.8454862986568723,
"grad_norm": 0.030061407833917978,
"learning_rate": 1.423926240180068e-06,
"loss": 0.1027,
"step": 2935
},
{
"epoch": 0.846926650102625,
"grad_norm": 0.030976625673774977,
"learning_rate": 1.3981664876509028e-06,
"loss": 0.0995,
"step": 2940
},
{
"epoch": 0.8483670015483779,
"grad_norm": 0.030419140568639338,
"learning_rate": 1.3726243486560975e-06,
"loss": 0.1049,
"step": 2945
},
{
"epoch": 0.8498073529941306,
"grad_norm": 0.02669531962544557,
"learning_rate": 1.3473004693736037e-06,
"loss": 0.1026,
"step": 2950
},
{
"epoch": 0.8512477044398833,
"grad_norm": 0.028011138842770462,
"learning_rate": 1.3221954904597256e-06,
"loss": 0.0957,
"step": 2955
},
{
"epoch": 0.8526880558856361,
"grad_norm": 0.029219102403077252,
"learning_rate": 1.2973100470329159e-06,
"loss": 0.0989,
"step": 2960
},
{
"epoch": 0.8541284073313888,
"grad_norm": 0.028252079128452696,
"learning_rate": 1.272644768657707e-06,
"loss": 0.096,
"step": 2965
},
{
"epoch": 0.8555687587771417,
"grad_norm": 0.028035615158873324,
"learning_rate": 1.248200279328784e-06,
"loss": 0.0985,
"step": 2970
},
{
"epoch": 0.8570091102228944,
"grad_norm": 0.03053162765030892,
"learning_rate": 1.223977197455204e-06,
"loss": 0.1006,
"step": 2975
},
{
"epoch": 0.8584494616686471,
"grad_norm": 0.028648955173488484,
"learning_rate": 1.1999761358447403e-06,
"loss": 0.0994,
"step": 2980
},
{
"epoch": 0.8598898131143999,
"grad_norm": 0.02748195928805478,
"learning_rate": 1.1761977016883897e-06,
"loss": 0.0958,
"step": 2985
},
{
"epoch": 0.8613301645601527,
"grad_norm": 0.028621187530613948,
"learning_rate": 1.152642496544998e-06,
"loss": 0.0958,
"step": 2990
},
{
"epoch": 0.8627705160059055,
"grad_norm": 0.027853067803610992,
"learning_rate": 1.1293111163260639e-06,
"loss": 0.0997,
"step": 2995
},
{
"epoch": 0.8642108674516582,
"grad_norm": 0.027187976560834638,
"learning_rate": 1.1062041512806409e-06,
"loss": 0.1028,
"step": 3000
},
{
"epoch": 0.8642108674516582,
"eval_loss": 0.09996497631072998,
"eval_runtime": 862.256,
"eval_samples_per_second": 2.092,
"eval_steps_per_second": 0.523,
"step": 3000
},
{
"epoch": 0.8656512188974109,
"grad_norm": 0.026511113426348355,
"learning_rate": 1.0833221859804188e-06,
"loss": 0.0976,
"step": 3005
},
{
"epoch": 0.8670915703431638,
"grad_norm": 0.02987993551142259,
"learning_rate": 1.0606657993049253e-06,
"loss": 0.0966,
"step": 3010
},
{
"epoch": 0.8685319217889165,
"grad_norm": 0.029689732788246524,
"learning_rate": 1.0382355644268871e-06,
"loss": 0.1041,
"step": 3015
},
{
"epoch": 0.8699722732346693,
"grad_norm": 0.0267869386208169,
"learning_rate": 1.0160320487977349e-06,
"loss": 0.0966,
"step": 3020
},
{
"epoch": 0.871412624680422,
"grad_norm": 0.032293109106566735,
"learning_rate": 9.940558141332323e-07,
"loss": 0.1048,
"step": 3025
},
{
"epoch": 0.8728529761261747,
"grad_norm": 0.026383256040976497,
"learning_rate": 9.723074163992774e-07,
"loss": 0.0988,
"step": 3030
},
{
"epoch": 0.8742933275719276,
"grad_norm": 0.028451018898224757,
"learning_rate": 9.507874057978339e-07,
"loss": 0.0974,
"step": 3035
},
{
"epoch": 0.8757336790176803,
"grad_norm": 0.03180099043766917,
"learning_rate": 9.294963267530177e-07,
"loss": 0.097,
"step": 3040
},
{
"epoch": 0.8771740304634331,
"grad_norm": 0.030029444501747108,
"learning_rate": 9.084347178973107e-07,
"loss": 0.0963,
"step": 3045
},
{
"epoch": 0.8786143819091858,
"grad_norm": 0.030459369725609208,
"learning_rate": 8.876031120579454e-07,
"loss": 0.0985,
"step": 3050
},
{
"epoch": 0.8800547333549386,
"grad_norm": 0.030593158083603012,
"learning_rate": 8.670020362434229e-07,
"loss": 0.0975,
"step": 3055
},
{
"epoch": 0.8814950848006914,
"grad_norm": 0.02863412252326483,
"learning_rate": 8.466320116301752e-07,
"loss": 0.0959,
"step": 3060
},
{
"epoch": 0.8829354362464441,
"grad_norm": 0.028283718935748024,
"learning_rate": 8.264935535493879e-07,
"loss": 0.0956,
"step": 3065
},
{
"epoch": 0.8843757876921969,
"grad_norm": 0.032405657391703896,
"learning_rate": 8.065871714739581e-07,
"loss": 0.1016,
"step": 3070
},
{
"epoch": 0.8858161391379497,
"grad_norm": 0.02841639634176925,
"learning_rate": 7.869133690056063e-07,
"loss": 0.0982,
"step": 3075
},
{
"epoch": 0.8872564905837024,
"grad_norm": 0.026380393134296947,
"learning_rate": 7.67472643862136e-07,
"loss": 0.0949,
"step": 3080
},
{
"epoch": 0.8886968420294552,
"grad_norm": 0.03051349318569555,
"learning_rate": 7.482654878648465e-07,
"loss": 0.1076,
"step": 3085
},
{
"epoch": 0.8901371934752079,
"grad_norm": 0.026798564091589188,
"learning_rate": 7.292923869260837e-07,
"loss": 0.1009,
"step": 3090
},
{
"epoch": 0.8915775449209608,
"grad_norm": 0.026243273967039876,
"learning_rate": 7.105538210369467e-07,
"loss": 0.0937,
"step": 3095
},
{
"epoch": 0.8930178963667135,
"grad_norm": 0.027316057706649032,
"learning_rate": 6.920502642551519e-07,
"loss": 0.0981,
"step": 3100
},
{
"epoch": 0.8944582478124662,
"grad_norm": 0.030461472244114565,
"learning_rate": 6.737821846930403e-07,
"loss": 0.1013,
"step": 3105
},
{
"epoch": 0.895898599258219,
"grad_norm": 0.029378550013335232,
"learning_rate": 6.557500445057252e-07,
"loss": 0.1032,
"step": 3110
},
{
"epoch": 0.8973389507039717,
"grad_norm": 0.03205510198240369,
"learning_rate": 6.379542998794086e-07,
"loss": 0.0942,
"step": 3115
},
{
"epoch": 0.8987793021497246,
"grad_norm": 0.02919082875531507,
"learning_rate": 6.203954010198387e-07,
"loss": 0.1016,
"step": 3120
},
{
"epoch": 0.9002196535954773,
"grad_norm": 0.028309778648473748,
"learning_rate": 6.030737921409169e-07,
"loss": 0.0972,
"step": 3125
},
{
"epoch": 0.90166000504123,
"grad_norm": 0.02666645822561416,
"learning_rate": 5.859899114534662e-07,
"loss": 0.0958,
"step": 3130
},
{
"epoch": 0.9031003564869828,
"grad_norm": 0.02914798086446344,
"learning_rate": 5.691441911541385e-07,
"loss": 0.0989,
"step": 3135
},
{
"epoch": 0.9045407079327356,
"grad_norm": 0.029321419229845342,
"learning_rate": 5.525370574144873e-07,
"loss": 0.0943,
"step": 3140
},
{
"epoch": 0.9059810593784884,
"grad_norm": 0.029224751094801393,
"learning_rate": 5.361689303701767e-07,
"loss": 0.0927,
"step": 3145
},
{
"epoch": 0.9074214108242411,
"grad_norm": 0.028920548399641463,
"learning_rate": 5.200402241103674e-07,
"loss": 0.0973,
"step": 3150
},
{
"epoch": 0.9088617622699939,
"grad_norm": 0.026508682335799016,
"learning_rate": 5.041513466672254e-07,
"loss": 0.0911,
"step": 3155
},
{
"epoch": 0.9103021137157467,
"grad_norm": 0.0286913163547185,
"learning_rate": 4.885027000056075e-07,
"loss": 0.1003,
"step": 3160
},
{
"epoch": 0.9117424651614994,
"grad_norm": 0.028408650516893636,
"learning_rate": 4.730946800128888e-07,
"loss": 0.0991,
"step": 3165
},
{
"epoch": 0.9131828166072522,
"grad_norm": 0.027915924830151973,
"learning_rate": 4.5792767648895396e-07,
"loss": 0.1008,
"step": 3170
},
{
"epoch": 0.9146231680530049,
"grad_norm": 0.03016832518348953,
"learning_rate": 4.4300207313632713e-07,
"loss": 0.0979,
"step": 3175
},
{
"epoch": 0.9160635194987578,
"grad_norm": 0.025345673276194875,
"learning_rate": 4.2831824755046994e-07,
"loss": 0.0963,
"step": 3180
},
{
"epoch": 0.9175038709445105,
"grad_norm": 0.026699911404801765,
"learning_rate": 4.138765712102299e-07,
"loss": 0.0949,
"step": 3185
},
{
"epoch": 0.9189442223902632,
"grad_norm": 0.028093887453801158,
"learning_rate": 3.9967740946843523e-07,
"loss": 0.0962,
"step": 3190
},
{
"epoch": 0.920384573836016,
"grad_norm": 0.030027557409672574,
"learning_rate": 3.8572112154266593e-07,
"loss": 0.1003,
"step": 3195
},
{
"epoch": 0.9218249252817687,
"grad_norm": 0.030720426408657343,
"learning_rate": 3.7200806050614714e-07,
"loss": 0.0966,
"step": 3200
},
{
"epoch": 0.9232652767275216,
"grad_norm": 0.028926025323559437,
"learning_rate": 3.585385732788327e-07,
"loss": 0.0957,
"step": 3205
},
{
"epoch": 0.9247056281732743,
"grad_norm": 0.03702215777030734,
"learning_rate": 3.453130006186234e-07,
"loss": 0.0982,
"step": 3210
},
{
"epoch": 0.926145979619027,
"grad_norm": 0.029009798104548613,
"learning_rate": 3.3233167711274496e-07,
"loss": 0.1018,
"step": 3215
},
{
"epoch": 0.9275863310647798,
"grad_norm": 0.026198880889564888,
"learning_rate": 3.1959493116928473e-07,
"loss": 0.0987,
"step": 3220
},
{
"epoch": 0.9290266825105326,
"grad_norm": 0.028949046641791966,
"learning_rate": 3.0710308500888184e-07,
"loss": 0.1009,
"step": 3225
},
{
"epoch": 0.9304670339562854,
"grad_norm": 0.03211145771064907,
"learning_rate": 2.948564546565791e-07,
"loss": 0.0964,
"step": 3230
},
{
"epoch": 0.9319073854020381,
"grad_norm": 0.031242518818499414,
"learning_rate": 2.828553499338227e-07,
"loss": 0.0946,
"step": 3235
},
{
"epoch": 0.9333477368477908,
"grad_norm": 0.031264506938530154,
"learning_rate": 2.71100074450632e-07,
"loss": 0.0958,
"step": 3240
},
{
"epoch": 0.9347880882935437,
"grad_norm": 0.029366673066917617,
"learning_rate": 2.595909255979079e-07,
"loss": 0.0981,
"step": 3245
},
{
"epoch": 0.9362284397392964,
"grad_norm": 0.0265988622897014,
"learning_rate": 2.4832819453992073e-07,
"loss": 0.099,
"step": 3250
},
{
"epoch": 0.9376687911850492,
"grad_norm": 0.027150460711859906,
"learning_rate": 2.3731216620693554e-07,
"loss": 0.1026,
"step": 3255
},
{
"epoch": 0.9391091426308019,
"grad_norm": 0.03127106760239122,
"learning_rate": 2.2654311928800965e-07,
"loss": 0.093,
"step": 3260
},
{
"epoch": 0.9405494940765546,
"grad_norm": 0.030758685060327442,
"learning_rate": 2.1602132622393745e-07,
"loss": 0.1015,
"step": 3265
},
{
"epoch": 0.9419898455223075,
"grad_norm": 0.028184366940994177,
"learning_rate": 2.0574705320036025e-07,
"loss": 0.1074,
"step": 3270
},
{
"epoch": 0.9434301969680602,
"grad_norm": 0.02849030323321046,
"learning_rate": 1.9572056014103281e-07,
"loss": 0.1001,
"step": 3275
},
{
"epoch": 0.944870548413813,
"grad_norm": 0.03266549303159609,
"learning_rate": 1.8594210070124852e-07,
"loss": 0.1045,
"step": 3280
},
{
"epoch": 0.9463108998595657,
"grad_norm": 0.026988212769410665,
"learning_rate": 1.7641192226141913e-07,
"loss": 0.0937,
"step": 3285
},
{
"epoch": 0.9477512513053185,
"grad_norm": 0.02529946877891728,
"learning_rate": 1.671302659208185e-07,
"loss": 0.098,
"step": 3290
},
{
"epoch": 0.9491916027510713,
"grad_norm": 0.03314606321848586,
"learning_rate": 1.58097366491482e-07,
"loss": 0.1009,
"step": 3295
},
{
"epoch": 0.950631954196824,
"grad_norm": 0.027540135720360067,
"learning_rate": 1.4931345249226792e-07,
"loss": 0.0967,
"step": 3300
},
{
"epoch": 0.9520723056425768,
"grad_norm": 0.028602952429147437,
"learning_rate": 1.407787461430743e-07,
"loss": 0.1042,
"step": 3305
},
{
"epoch": 0.9535126570883296,
"grad_norm": 0.028879029680259325,
"learning_rate": 1.324934633592201e-07,
"loss": 0.0945,
"step": 3310
},
{
"epoch": 0.9549530085340823,
"grad_norm": 0.02860475239355599,
"learning_rate": 1.2445781374597842e-07,
"loss": 0.1017,
"step": 3315
},
{
"epoch": 0.9563933599798351,
"grad_norm": 0.026737800323851892,
"learning_rate": 1.1667200059327644e-07,
"loss": 0.0971,
"step": 3320
},
{
"epoch": 0.9578337114255878,
"grad_norm": 0.029364181788732343,
"learning_rate": 1.0913622087055264e-07,
"loss": 0.1005,
"step": 3325
},
{
"epoch": 0.9592740628713406,
"grad_norm": 0.03131909258187323,
"learning_rate": 1.0185066522177545e-07,
"loss": 0.092,
"step": 3330
},
{
"epoch": 0.9607144143170934,
"grad_norm": 0.029983775954717285,
"learning_rate": 9.481551796061472e-08,
"loss": 0.0989,
"step": 3335
},
{
"epoch": 0.9621547657628461,
"grad_norm": 0.030901618242872846,
"learning_rate": 8.803095706578335e-08,
"loss": 0.0987,
"step": 3340
},
{
"epoch": 0.9635951172085989,
"grad_norm": 0.027894545371751872,
"learning_rate": 8.149715417653414e-08,
"loss": 0.0969,
"step": 3345
},
{
"epoch": 0.9650354686543516,
"grad_norm": 0.027564458728654692,
"learning_rate": 7.521427458831776e-08,
"loss": 0.099,
"step": 3350
},
{
"epoch": 0.9664758201001045,
"grad_norm": 0.025676377662014388,
"learning_rate": 6.918247724859939e-08,
"loss": 0.0941,
"step": 3355
},
{
"epoch": 0.9679161715458572,
"grad_norm": 0.030832764931546873,
"learning_rate": 6.340191475283753e-08,
"loss": 0.0988,
"step": 3360
},
{
"epoch": 0.9693565229916099,
"grad_norm": 0.03090619704744881,
"learning_rate": 5.787273334062593e-08,
"loss": 0.1,
"step": 3365
},
{
"epoch": 0.9707968744373627,
"grad_norm": 0.028989234177967914,
"learning_rate": 5.259507289199328e-08,
"loss": 0.1042,
"step": 3370
},
{
"epoch": 0.9722372258831155,
"grad_norm": 0.031096318018671423,
"learning_rate": 4.756906692386043e-08,
"loss": 0.1015,
"step": 3375
},
{
"epoch": 0.9736775773288683,
"grad_norm": 0.031160065189087587,
"learning_rate": 4.2794842586670884e-08,
"loss": 0.0987,
"step": 3380
},
{
"epoch": 0.975117928774621,
"grad_norm": 0.027856429944083137,
"learning_rate": 3.827252066116338e-08,
"loss": 0.0934,
"step": 3385
},
{
"epoch": 0.9765582802203737,
"grad_norm": 0.0292530610363585,
"learning_rate": 3.400221555532768e-08,
"loss": 0.0931,
"step": 3390
},
{
"epoch": 0.9779986316661265,
"grad_norm": 0.030499523184600807,
"learning_rate": 2.998403530150018e-08,
"loss": 0.1011,
"step": 3395
},
{
"epoch": 0.9794389831118793,
"grad_norm": 0.030425018653641535,
"learning_rate": 2.6218081553638363e-08,
"loss": 0.1015,
"step": 3400
},
{
"epoch": 0.9808793345576321,
"grad_norm": 0.028935900226943704,
"learning_rate": 2.2704449584745046e-08,
"loss": 0.1015,
"step": 3405
},
{
"epoch": 0.9823196860033848,
"grad_norm": 0.031000647192898592,
"learning_rate": 1.9443228284455882e-08,
"loss": 0.0967,
"step": 3410
},
{
"epoch": 0.9837600374491375,
"grad_norm": 0.02615522374107861,
"learning_rate": 1.6434500156800037e-08,
"loss": 0.098,
"step": 3415
},
{
"epoch": 0.9852003888948904,
"grad_norm": 0.0296512129505122,
"learning_rate": 1.3678341318100751e-08,
"loss": 0.0941,
"step": 3420
},
{
"epoch": 0.9866407403406431,
"grad_norm": 0.027814116523255324,
"learning_rate": 1.1174821495059106e-08,
"loss": 0.0959,
"step": 3425
},
{
"epoch": 0.9880810917863959,
"grad_norm": 0.03492776644651804,
"learning_rate": 8.924004022986543e-09,
"loss": 0.0953,
"step": 3430
},
{
"epoch": 0.9895214432321486,
"grad_norm": 0.02820257683900699,
"learning_rate": 6.9259458442005875e-09,
"loss": 0.1026,
"step": 3435
},
{
"epoch": 0.9909617946779014,
"grad_norm": 0.030304015663341097,
"learning_rate": 5.180697506587118e-09,
"loss": 0.0955,
"step": 3440
},
{
"epoch": 0.9924021461236542,
"grad_norm": 0.026993412863199263,
"learning_rate": 3.688303162322493e-09,
"loss": 0.0969,
"step": 3445
},
{
"epoch": 0.9938424975694069,
"grad_norm": 0.02671367368351453,
"learning_rate": 2.44880056675334e-09,
"loss": 0.0921,
"step": 3450
},
{
"epoch": 0.9952828490151597,
"grad_norm": 0.027626479574708512,
"learning_rate": 1.4622210774428714e-09,
"loss": 0.0986,
"step": 3455
},
{
"epoch": 0.9967232004609125,
"grad_norm": 0.02933812040981167,
"learning_rate": 7.285896533770765e-10,
"loss": 0.0992,
"step": 3460
},
{
"epoch": 0.9981635519066653,
"grad_norm": 0.027258112232128342,
"learning_rate": 2.479248543363344e-10,
"loss": 0.0967,
"step": 3465
},
{
"epoch": 0.999603903352418,
"grad_norm": 0.029806576902039788,
"learning_rate": 2.0238840421349382e-11,
"loss": 0.0979,
"step": 3470
},
{
"epoch": 0.9998919736415686,
"step": 3471,
"total_flos": 2.660820529446912e+16,
"train_loss": 0.2432632086317191,
"train_runtime": 212607.6799,
"train_samples_per_second": 1.045,
"train_steps_per_second": 0.016
}
],
"logging_steps": 5,
"max_steps": 3471,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.660820529446912e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}