Lansechen's picture
Model save
436404b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 100,
"global_step": 850,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.029411764705882353,
"grad_norm": 7.394163236707443,
"learning_rate": 1.1764705882352942e-06,
"loss": 1.4868,
"mean_token_accuracy": 0.670187404697577,
"step": 5
},
{
"epoch": 0.058823529411764705,
"grad_norm": 4.989051492464897,
"learning_rate": 2.3529411764705885e-06,
"loss": 1.4548,
"mean_token_accuracy": 0.6696601003430951,
"step": 10
},
{
"epoch": 0.08823529411764706,
"grad_norm": 1.718958499756161,
"learning_rate": 3.529411764705883e-06,
"loss": 1.2494,
"mean_token_accuracy": 0.6895946883758232,
"step": 15
},
{
"epoch": 0.11764705882352941,
"grad_norm": 1.7334017715973813,
"learning_rate": 4.705882352941177e-06,
"loss": 1.1266,
"mean_token_accuracy": 0.7048006769329966,
"step": 20
},
{
"epoch": 0.14705882352941177,
"grad_norm": 0.764379154080371,
"learning_rate": 5.882352941176471e-06,
"loss": 1.0738,
"mean_token_accuracy": 0.7112541876090825,
"step": 25
},
{
"epoch": 0.17647058823529413,
"grad_norm": 0.5574786594203679,
"learning_rate": 7.058823529411766e-06,
"loss": 0.9891,
"mean_token_accuracy": 0.7278180810901669,
"step": 30
},
{
"epoch": 0.20588235294117646,
"grad_norm": 0.43685454872404356,
"learning_rate": 8.23529411764706e-06,
"loss": 0.9706,
"mean_token_accuracy": 0.7296801992148935,
"step": 35
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.3720554551518854,
"learning_rate": 9.411764705882354e-06,
"loss": 0.933,
"mean_token_accuracy": 0.738009820036878,
"step": 40
},
{
"epoch": 0.2647058823529412,
"grad_norm": 0.32483974488667805,
"learning_rate": 1.0588235294117648e-05,
"loss": 0.9158,
"mean_token_accuracy": 0.7409755301059546,
"step": 45
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.3218526340694397,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.9203,
"mean_token_accuracy": 0.7389487349847822,
"step": 50
},
{
"epoch": 0.3235294117647059,
"grad_norm": 0.3002154739759154,
"learning_rate": 1.2941176470588238e-05,
"loss": 0.8935,
"mean_token_accuracy": 0.7449524112208951,
"step": 55
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.30954774782779204,
"learning_rate": 1.4117647058823532e-05,
"loss": 0.8859,
"mean_token_accuracy": 0.7456974429587235,
"step": 60
},
{
"epoch": 0.38235294117647056,
"grad_norm": 0.31990114149137194,
"learning_rate": 1.5294117647058822e-05,
"loss": 0.8857,
"mean_token_accuracy": 0.7454657299610173,
"step": 65
},
{
"epoch": 0.4117647058823529,
"grad_norm": 0.29621110088082725,
"learning_rate": 1.647058823529412e-05,
"loss": 0.863,
"mean_token_accuracy": 0.751065675239264,
"step": 70
},
{
"epoch": 0.4411764705882353,
"grad_norm": 0.31927741229254775,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.8662,
"mean_token_accuracy": 0.7495714356533111,
"step": 75
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.3005392170981355,
"learning_rate": 1.8823529411764708e-05,
"loss": 0.8481,
"mean_token_accuracy": 0.753874134893318,
"step": 80
},
{
"epoch": 0.5,
"grad_norm": 0.29730380637608206,
"learning_rate": 2e-05,
"loss": 0.8395,
"mean_token_accuracy": 0.755573912125712,
"step": 85
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.3669506861420523,
"learning_rate": 1.9997891995035914e-05,
"loss": 0.8465,
"mean_token_accuracy": 0.7536556086477052,
"step": 90
},
{
"epoch": 0.5588235294117647,
"grad_norm": 0.29666794474782,
"learning_rate": 1.999156886888064e-05,
"loss": 0.8272,
"mean_token_accuracy": 0.7588607737716251,
"step": 95
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.30382605735837637,
"learning_rate": 1.9981033287370443e-05,
"loss": 0.8219,
"mean_token_accuracy": 0.760143495941066,
"step": 100
},
{
"epoch": 0.6176470588235294,
"grad_norm": 0.29012444795471226,
"learning_rate": 1.9966289692316944e-05,
"loss": 0.8311,
"mean_token_accuracy": 0.7565005350426584,
"step": 105
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.2912583936011533,
"learning_rate": 1.9947344299634464e-05,
"loss": 0.8222,
"mean_token_accuracy": 0.7590272071446524,
"step": 110
},
{
"epoch": 0.6764705882352942,
"grad_norm": 0.30772503851162025,
"learning_rate": 1.992420509671936e-05,
"loss": 0.82,
"mean_token_accuracy": 0.7592413729046079,
"step": 115
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.37056353510208045,
"learning_rate": 1.9896881839082554e-05,
"loss": 0.8212,
"mean_token_accuracy": 0.759279812103064,
"step": 120
},
{
"epoch": 0.7352941176470589,
"grad_norm": 0.29893908828157484,
"learning_rate": 1.9865386046236597e-05,
"loss": 0.8256,
"mean_token_accuracy": 0.7575601720590184,
"step": 125
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.3074055212225788,
"learning_rate": 1.982973099683902e-05,
"loss": 0.8155,
"mean_token_accuracy": 0.75987807875422,
"step": 130
},
{
"epoch": 0.7941176470588235,
"grad_norm": 0.3054700413716101,
"learning_rate": 1.9789931723094046e-05,
"loss": 0.8145,
"mean_token_accuracy": 0.7605635562676385,
"step": 135
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.31001811075570757,
"learning_rate": 1.9746005004415004e-05,
"loss": 0.8039,
"mean_token_accuracy": 0.7631607094065346,
"step": 140
},
{
"epoch": 0.8529411764705882,
"grad_norm": 0.3136963713557885,
"learning_rate": 1.9697969360350098e-05,
"loss": 0.8049,
"mean_token_accuracy": 0.7624575022466116,
"step": 145
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.3191880097370058,
"learning_rate": 1.9645845042774555e-05,
"loss": 0.802,
"mean_token_accuracy": 0.7633903493459862,
"step": 150
},
{
"epoch": 0.9117647058823529,
"grad_norm": 0.31057235371298675,
"learning_rate": 1.9589654027352412e-05,
"loss": 0.7986,
"mean_token_accuracy": 0.7643766169115257,
"step": 155
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.29582364900334857,
"learning_rate": 1.9529420004271568e-05,
"loss": 0.7914,
"mean_token_accuracy": 0.7649156224508902,
"step": 160
},
{
"epoch": 0.9705882352941176,
"grad_norm": 0.3193491594935024,
"learning_rate": 1.9465168368255946e-05,
"loss": 0.8003,
"mean_token_accuracy": 0.762817530530919,
"step": 165
},
{
"epoch": 1.0,
"grad_norm": 0.3110939132941867,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.79,
"mean_token_accuracy": 0.7647602598149285,
"step": 170
},
{
"epoch": 1.0294117647058822,
"grad_norm": 0.3633335573513833,
"learning_rate": 1.932472229404356e-05,
"loss": 0.7299,
"mean_token_accuracy": 0.7798239661883426,
"step": 175
},
{
"epoch": 1.0588235294117647,
"grad_norm": 0.3285229069104878,
"learning_rate": 1.924858706805112e-05,
"loss": 0.7229,
"mean_token_accuracy": 0.7820958235079569,
"step": 180
},
{
"epoch": 1.088235294117647,
"grad_norm": 0.3128912088038598,
"learning_rate": 1.9168552628568632e-05,
"loss": 0.7271,
"mean_token_accuracy": 0.7804769391368682,
"step": 185
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.3081304554701627,
"learning_rate": 1.9084652718195237e-05,
"loss": 0.7312,
"mean_token_accuracy": 0.7789700891513861,
"step": 190
},
{
"epoch": 1.1470588235294117,
"grad_norm": 0.3276014451232185,
"learning_rate": 1.8996922709216456e-05,
"loss": 0.7198,
"mean_token_accuracy": 0.7823098470986267,
"step": 195
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.30110481549521834,
"learning_rate": 1.8905399588691165e-05,
"loss": 0.7243,
"mean_token_accuracy": 0.7809635103462556,
"step": 200
},
{
"epoch": 1.2058823529411764,
"grad_norm": 0.30156642595064437,
"learning_rate": 1.8810121942857848e-05,
"loss": 0.7087,
"mean_token_accuracy": 0.7859269243460426,
"step": 205
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.34077822382920914,
"learning_rate": 1.8711129940866577e-05,
"loss": 0.7248,
"mean_token_accuracy": 0.7809192289978137,
"step": 210
},
{
"epoch": 1.2647058823529411,
"grad_norm": 0.32911315395038965,
"learning_rate": 1.860846531784368e-05,
"loss": 0.7093,
"mean_token_accuracy": 0.785446429758738,
"step": 215
},
{
"epoch": 1.2941176470588236,
"grad_norm": 0.3157810716541353,
"learning_rate": 1.8502171357296144e-05,
"loss": 0.7278,
"mean_token_accuracy": 0.7799419555666971,
"step": 220
},
{
"epoch": 1.3235294117647058,
"grad_norm": 0.30424816114084574,
"learning_rate": 1.839229287286327e-05,
"loss": 0.7275,
"mean_token_accuracy": 0.7794343137623393,
"step": 225
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.2984011834982273,
"learning_rate": 1.827887618942318e-05,
"loss": 0.7175,
"mean_token_accuracy": 0.7828088672818859,
"step": 230
},
{
"epoch": 1.3823529411764706,
"grad_norm": 0.33971195458260056,
"learning_rate": 1.816196912356222e-05,
"loss": 0.7172,
"mean_token_accuracy": 0.7824906691918898,
"step": 235
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.32633469157456657,
"learning_rate": 1.8041620963415418e-05,
"loss": 0.7208,
"mean_token_accuracy": 0.7814289254041461,
"step": 240
},
{
"epoch": 1.4411764705882353,
"grad_norm": 0.31489767315655054,
"learning_rate": 1.7917882447886585e-05,
"loss": 0.7365,
"mean_token_accuracy": 0.7766203149595159,
"step": 245
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.32277097391893517,
"learning_rate": 1.7790805745256703e-05,
"loss": 0.7177,
"mean_token_accuracy": 0.782642169037961,
"step": 250
},
{
"epoch": 1.5,
"grad_norm": 0.30987881280212765,
"learning_rate": 1.766044443118978e-05,
"loss": 0.727,
"mean_token_accuracy": 0.779525294630621,
"step": 255
},
{
"epoch": 1.5294117647058822,
"grad_norm": 0.31448685091412465,
"learning_rate": 1.7526853466145248e-05,
"loss": 0.721,
"mean_token_accuracy": 0.7812619303947332,
"step": 260
},
{
"epoch": 1.5588235294117647,
"grad_norm": 0.32175478306501826,
"learning_rate": 1.7390089172206594e-05,
"loss": 0.7241,
"mean_token_accuracy": 0.780364964868133,
"step": 265
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.2981306127426255,
"learning_rate": 1.725020920933593e-05,
"loss": 0.7267,
"mean_token_accuracy": 0.7790934213702954,
"step": 270
},
{
"epoch": 1.6176470588235294,
"grad_norm": 0.3033949440928067,
"learning_rate": 1.710727255106447e-05,
"loss": 0.7257,
"mean_token_accuracy": 0.7797863888193627,
"step": 275
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.30974296262432216,
"learning_rate": 1.696133945962927e-05,
"loss": 0.7235,
"mean_token_accuracy": 0.7804705115541349,
"step": 280
},
{
"epoch": 1.6764705882352942,
"grad_norm": 0.31142423335839425,
"learning_rate": 1.681247146056654e-05,
"loss": 0.7086,
"mean_token_accuracy": 0.7846730427727773,
"step": 285
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.3071934896661938,
"learning_rate": 1.6660731316772503e-05,
"loss": 0.7174,
"mean_token_accuracy": 0.7824091019225257,
"step": 290
},
{
"epoch": 1.7352941176470589,
"grad_norm": 0.3386291051319253,
"learning_rate": 1.650618300204242e-05,
"loss": 0.7232,
"mean_token_accuracy": 0.7803308030095385,
"step": 295
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.3140186357800053,
"learning_rate": 1.634889167409923e-05,
"loss": 0.7129,
"mean_token_accuracy": 0.782982412331459,
"step": 300
},
{
"epoch": 1.7941176470588234,
"grad_norm": 0.3021409601720461,
"learning_rate": 1.6188923647122946e-05,
"loss": 0.7107,
"mean_token_accuracy": 0.7836278096533718,
"step": 305
},
{
"epoch": 1.8235294117647058,
"grad_norm": 0.314246176884694,
"learning_rate": 1.6026346363792565e-05,
"loss": 0.7049,
"mean_token_accuracy": 0.7850809024928556,
"step": 310
},
{
"epoch": 1.8529411764705883,
"grad_norm": 0.30224040461662033,
"learning_rate": 1.5861228366852148e-05,
"loss": 0.7007,
"mean_token_accuracy": 0.7861974119207782,
"step": 315
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.3247059160222777,
"learning_rate": 1.5693639270213138e-05,
"loss": 0.7185,
"mean_token_accuracy": 0.7818327782869858,
"step": 320
},
{
"epoch": 1.9117647058823528,
"grad_norm": 0.2990975208169173,
"learning_rate": 1.552364972960506e-05,
"loss": 0.7105,
"mean_token_accuracy": 0.7837350647310662,
"step": 325
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.292315327238204,
"learning_rate": 1.5351331412787004e-05,
"loss": 0.7062,
"mean_token_accuracy": 0.7850761466894601,
"step": 330
},
{
"epoch": 1.9705882352941178,
"grad_norm": 0.34311073870983216,
"learning_rate": 1.5176756969332428e-05,
"loss": 0.7103,
"mean_token_accuracy": 0.7839965589810655,
"step": 335
},
{
"epoch": 2.0,
"grad_norm": 0.29664361496826974,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.7133,
"mean_token_accuracy": 0.7825006235163733,
"step": 340
},
{
"epoch": 2.0294117647058822,
"grad_norm": 0.3924789864040961,
"learning_rate": 1.4821135025703491e-05,
"loss": 0.6271,
"mean_token_accuracy": 0.8049891882579882,
"step": 345
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.37701624048661836,
"learning_rate": 1.4640237456093636e-05,
"loss": 0.6164,
"mean_token_accuracy": 0.8081895456693632,
"step": 350
},
{
"epoch": 2.088235294117647,
"grad_norm": 0.33511734135583915,
"learning_rate": 1.4457383557765385e-05,
"loss": 0.6166,
"mean_token_accuracy": 0.8076739145094626,
"step": 355
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.33243930178538456,
"learning_rate": 1.427265042210381e-05,
"loss": 0.6243,
"mean_token_accuracy": 0.8049194097878407,
"step": 360
},
{
"epoch": 2.1470588235294117,
"grad_norm": 0.33350676253837797,
"learning_rate": 1.4086115932782316e-05,
"loss": 0.6084,
"mean_token_accuracy": 0.810022229810283,
"step": 365
},
{
"epoch": 2.176470588235294,
"grad_norm": 0.3634421833067062,
"learning_rate": 1.3897858732926794e-05,
"loss": 0.6214,
"mean_token_accuracy": 0.8059747023198034,
"step": 370
},
{
"epoch": 2.2058823529411766,
"grad_norm": 0.32882544775622274,
"learning_rate": 1.3707958191959609e-05,
"loss": 0.6226,
"mean_token_accuracy": 0.8054682041286021,
"step": 375
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.3305436002182192,
"learning_rate": 1.3516494372137368e-05,
"loss": 0.6112,
"mean_token_accuracy": 0.8089070977046144,
"step": 380
},
{
"epoch": 2.264705882352941,
"grad_norm": 0.33722751621182623,
"learning_rate": 1.3323547994796597e-05,
"loss": 0.628,
"mean_token_accuracy": 0.8039877302887909,
"step": 385
},
{
"epoch": 2.2941176470588234,
"grad_norm": 0.33248407945905123,
"learning_rate": 1.3129200406321545e-05,
"loss": 0.617,
"mean_token_accuracy": 0.8073568639992759,
"step": 390
},
{
"epoch": 2.323529411764706,
"grad_norm": 0.32142510146932524,
"learning_rate": 1.2933533543848462e-05,
"loss": 0.6215,
"mean_token_accuracy": 0.8055732590173094,
"step": 395
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.33654477807126704,
"learning_rate": 1.2736629900720832e-05,
"loss": 0.6157,
"mean_token_accuracy": 0.8077722830589826,
"step": 400
},
{
"epoch": 2.3823529411764706,
"grad_norm": 0.3235396120568174,
"learning_rate": 1.2538572491710079e-05,
"loss": 0.6175,
"mean_token_accuracy": 0.8072083426089869,
"step": 405
},
{
"epoch": 2.411764705882353,
"grad_norm": 0.32834778791268815,
"learning_rate": 1.2339444818016488e-05,
"loss": 0.609,
"mean_token_accuracy": 0.8097089774864203,
"step": 410
},
{
"epoch": 2.4411764705882355,
"grad_norm": 0.3256651992427977,
"learning_rate": 1.2139330832064975e-05,
"loss": 0.6093,
"mean_token_accuracy": 0.809542989158514,
"step": 415
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.32713429702744246,
"learning_rate": 1.1938314902110701e-05,
"loss": 0.6221,
"mean_token_accuracy": 0.80594164904137,
"step": 420
},
{
"epoch": 2.5,
"grad_norm": 0.32892077647415724,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.623,
"mean_token_accuracy": 0.8050380371141161,
"step": 425
},
{
"epoch": 2.5294117647058822,
"grad_norm": 0.3351510239392618,
"learning_rate": 1.1533916548786856e-05,
"loss": 0.6221,
"mean_token_accuracy": 0.8059383027153586,
"step": 430
},
{
"epoch": 2.5588235294117645,
"grad_norm": 0.32376353924923923,
"learning_rate": 1.133070462016454e-05,
"loss": 0.6296,
"mean_token_accuracy": 0.8035830262709686,
"step": 435
},
{
"epoch": 2.588235294117647,
"grad_norm": 0.33352315854264303,
"learning_rate": 1.1126931665153213e-05,
"loss": 0.6303,
"mean_token_accuracy": 0.8032632915017253,
"step": 440
},
{
"epoch": 2.6176470588235294,
"grad_norm": 0.33504031536917384,
"learning_rate": 1.092268359463302e-05,
"loss": 0.6162,
"mean_token_accuracy": 0.8078497177646451,
"step": 445
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.32470133593647843,
"learning_rate": 1.0718046519793276e-05,
"loss": 0.6199,
"mean_token_accuracy": 0.8059828691740311,
"step": 450
},
{
"epoch": 2.6764705882352944,
"grad_norm": 0.3311817254874853,
"learning_rate": 1.0513106715827897e-05,
"loss": 0.6157,
"mean_token_accuracy": 0.8077946153778027,
"step": 455
},
{
"epoch": 2.7058823529411766,
"grad_norm": 0.32993180062270316,
"learning_rate": 1.0307950585561705e-05,
"loss": 0.6132,
"mean_token_accuracy": 0.8084912132776563,
"step": 460
},
{
"epoch": 2.735294117647059,
"grad_norm": 0.325542621113992,
"learning_rate": 1.01026646230229e-05,
"loss": 0.6227,
"mean_token_accuracy": 0.805343767865405,
"step": 465
},
{
"epoch": 2.764705882352941,
"grad_norm": 0.3173303746756242,
"learning_rate": 9.897335376977104e-06,
"loss": 0.6234,
"mean_token_accuracy": 0.8049582474536037,
"step": 470
},
{
"epoch": 2.7941176470588234,
"grad_norm": 0.31358035777765325,
"learning_rate": 9.692049414438298e-06,
"loss": 0.6189,
"mean_token_accuracy": 0.806300440606665,
"step": 475
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.3115874102200609,
"learning_rate": 9.486893284172103e-06,
"loss": 0.611,
"mean_token_accuracy": 0.8087930759771561,
"step": 480
},
{
"epoch": 2.8529411764705883,
"grad_norm": 0.32045359513311766,
"learning_rate": 9.281953480206725e-06,
"loss": 0.617,
"mean_token_accuracy": 0.806921570390956,
"step": 485
},
{
"epoch": 2.8823529411764706,
"grad_norm": 0.31864642830498724,
"learning_rate": 9.07731640536698e-06,
"loss": 0.6174,
"mean_token_accuracy": 0.8067054562694989,
"step": 490
},
{
"epoch": 2.911764705882353,
"grad_norm": 0.321661470925901,
"learning_rate": 8.87306833484679e-06,
"loss": 0.6177,
"mean_token_accuracy": 0.8073543308517797,
"step": 495
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.31775074388030633,
"learning_rate": 8.669295379835467e-06,
"loss": 0.6104,
"mean_token_accuracy": 0.8088693944661568,
"step": 500
},
{
"epoch": 2.9705882352941178,
"grad_norm": 0.31639792662745364,
"learning_rate": 8.466083451213145e-06,
"loss": 0.6137,
"mean_token_accuracy": 0.8079348085921285,
"step": 505
},
{
"epoch": 3.0,
"grad_norm": 0.3133845393789324,
"learning_rate": 8.263518223330698e-06,
"loss": 0.6199,
"mean_token_accuracy": 0.8058531411884824,
"step": 510
},
{
"epoch": 3.0294117647058822,
"grad_norm": 0.4515133376678954,
"learning_rate": 8.0616850978893e-06,
"loss": 0.5429,
"mean_token_accuracy": 0.8280749968215273,
"step": 515
},
{
"epoch": 3.0588235294117645,
"grad_norm": 0.43255149163083684,
"learning_rate": 7.860669167935028e-06,
"loss": 0.5424,
"mean_token_accuracy": 0.8279721150148507,
"step": 520
},
{
"epoch": 3.088235294117647,
"grad_norm": 0.3814367503664446,
"learning_rate": 7.660555181983517e-06,
"loss": 0.5369,
"mean_token_accuracy": 0.8295389031535093,
"step": 525
},
{
"epoch": 3.1176470588235294,
"grad_norm": 0.3585734376658857,
"learning_rate": 7.461427508289922e-06,
"loss": 0.5324,
"mean_token_accuracy": 0.8305589141009605,
"step": 530
},
{
"epoch": 3.1470588235294117,
"grad_norm": 0.3516058935793569,
"learning_rate": 7.263370099279173e-06,
"loss": 0.5295,
"mean_token_accuracy": 0.8319955163114205,
"step": 535
},
{
"epoch": 3.176470588235294,
"grad_norm": 0.3515670124714757,
"learning_rate": 7.066466456151541e-06,
"loss": 0.5409,
"mean_token_accuracy": 0.827999215139253,
"step": 540
},
{
"epoch": 3.2058823529411766,
"grad_norm": 0.3425326731760254,
"learning_rate": 6.870799593678459e-06,
"loss": 0.5343,
"mean_token_accuracy": 0.8301978132314783,
"step": 545
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.4045712289767328,
"learning_rate": 6.6764520052034054e-06,
"loss": 0.5265,
"mean_token_accuracy": 0.8329135799939124,
"step": 550
},
{
"epoch": 3.264705882352941,
"grad_norm": 0.3466050915406474,
"learning_rate": 6.483505627862632e-06,
"loss": 0.5305,
"mean_token_accuracy": 0.8308944335567917,
"step": 555
},
{
"epoch": 3.2941176470588234,
"grad_norm": 0.3480592845097812,
"learning_rate": 6.292041808040393e-06,
"loss": 0.5312,
"mean_token_accuracy": 0.8311541824181544,
"step": 560
},
{
"epoch": 3.323529411764706,
"grad_norm": 0.3476466845454666,
"learning_rate": 6.102141267073207e-06,
"loss": 0.5294,
"mean_token_accuracy": 0.8318951179091675,
"step": 565
},
{
"epoch": 3.3529411764705883,
"grad_norm": 0.5241508868023107,
"learning_rate": 5.913884067217686e-06,
"loss": 0.5528,
"mean_token_accuracy": 0.824183505569461,
"step": 570
},
{
"epoch": 3.3823529411764706,
"grad_norm": 0.35062363129549806,
"learning_rate": 5.727349577896194e-06,
"loss": 0.5397,
"mean_token_accuracy": 0.8283691203284675,
"step": 575
},
{
"epoch": 3.411764705882353,
"grad_norm": 0.3347143755417769,
"learning_rate": 5.542616442234618e-06,
"loss": 0.5346,
"mean_token_accuracy": 0.8302798791058473,
"step": 580
},
{
"epoch": 3.4411764705882355,
"grad_norm": 0.35274599242118565,
"learning_rate": 5.3597625439063685e-06,
"loss": 0.5367,
"mean_token_accuracy": 0.8289645071509449,
"step": 585
},
{
"epoch": 3.4705882352941178,
"grad_norm": 0.3420158033073416,
"learning_rate": 5.178864974296511e-06,
"loss": 0.5337,
"mean_token_accuracy": 0.8301539169568171,
"step": 590
},
{
"epoch": 3.5,
"grad_norm": 0.3454361511498841,
"learning_rate": 5.000000000000003e-06,
"loss": 0.5221,
"mean_token_accuracy": 0.8342361720674832,
"step": 595
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.33581256443628404,
"learning_rate": 4.823243030667576e-06,
"loss": 0.5441,
"mean_token_accuracy": 0.8269164229088842,
"step": 600
},
{
"epoch": 3.5588235294117645,
"grad_norm": 0.34226274444007343,
"learning_rate": 4.648668587212998e-06,
"loss": 0.54,
"mean_token_accuracy": 0.8282965213330156,
"step": 605
},
{
"epoch": 3.588235294117647,
"grad_norm": 0.3500745490149332,
"learning_rate": 4.476350270394942e-06,
"loss": 0.5365,
"mean_token_accuracy": 0.8292353029731997,
"step": 610
},
{
"epoch": 3.6176470588235294,
"grad_norm": 0.3397747170148032,
"learning_rate": 4.306360729786867e-06,
"loss": 0.531,
"mean_token_accuracy": 0.8312139710361102,
"step": 615
},
{
"epoch": 3.6470588235294117,
"grad_norm": 0.3441348350064406,
"learning_rate": 4.138771633147856e-06,
"loss": 0.5401,
"mean_token_accuracy": 0.8284560238696452,
"step": 620
},
{
"epoch": 3.6764705882352944,
"grad_norm": 0.34066437903851693,
"learning_rate": 3.973653636207437e-06,
"loss": 0.5389,
"mean_token_accuracy": 0.8286709573008434,
"step": 625
},
{
"epoch": 3.7058823529411766,
"grad_norm": 0.3435442485915975,
"learning_rate": 3.8110763528770543e-06,
"loss": 0.5328,
"mean_token_accuracy": 0.8307493440913218,
"step": 630
},
{
"epoch": 3.735294117647059,
"grad_norm": 0.33871491756685385,
"learning_rate": 3.651108325900773e-06,
"loss": 0.5396,
"mean_token_accuracy": 0.8285779483750313,
"step": 635
},
{
"epoch": 3.764705882352941,
"grad_norm": 0.3322716149770833,
"learning_rate": 3.493816997957582e-06,
"loss": 0.5281,
"mean_token_accuracy": 0.8320226506861953,
"step": 640
},
{
"epoch": 3.7941176470588234,
"grad_norm": 0.3408913341899395,
"learning_rate": 3.339268683227499e-06,
"loss": 0.5276,
"mean_token_accuracy": 0.8323494901801878,
"step": 645
},
{
"epoch": 3.8235294117647056,
"grad_norm": 0.33820541461548986,
"learning_rate": 3.1875285394334575e-06,
"loss": 0.5382,
"mean_token_accuracy": 0.8290421087951524,
"step": 650
},
{
"epoch": 3.8529411764705883,
"grad_norm": 0.3274191844325364,
"learning_rate": 3.0386605403707347e-06,
"loss": 0.5314,
"mean_token_accuracy": 0.8310860589974585,
"step": 655
},
{
"epoch": 3.8823529411764706,
"grad_norm": 0.3290584011649337,
"learning_rate": 2.8927274489355296e-06,
"loss": 0.5423,
"mean_token_accuracy": 0.8274875320572039,
"step": 660
},
{
"epoch": 3.911764705882353,
"grad_norm": 0.331251976751198,
"learning_rate": 2.749790790664074e-06,
"loss": 0.5321,
"mean_token_accuracy": 0.8305707371259743,
"step": 665
},
{
"epoch": 3.9411764705882355,
"grad_norm": 0.3357781073651013,
"learning_rate": 2.6099108277934105e-06,
"loss": 0.53,
"mean_token_accuracy": 0.8316958534732434,
"step": 670
},
{
"epoch": 3.9705882352941178,
"grad_norm": 0.36159917760376725,
"learning_rate": 2.4731465338547556e-06,
"loss": 0.5347,
"mean_token_accuracy": 0.8300342318417423,
"step": 675
},
{
"epoch": 4.0,
"grad_norm": 0.3308064618286195,
"learning_rate": 2.339555568810221e-06,
"loss": 0.5379,
"mean_token_accuracy": 0.8293708444456257,
"step": 680
},
{
"epoch": 4.029411764705882,
"grad_norm": 0.40943584673721745,
"learning_rate": 2.209194254743295e-06,
"loss": 0.4867,
"mean_token_accuracy": 0.8454998608013117,
"step": 685
},
{
"epoch": 4.0588235294117645,
"grad_norm": 0.45370431209989043,
"learning_rate": 2.0821175521134208e-06,
"loss": 0.4913,
"mean_token_accuracy": 0.8430829216918625,
"step": 690
},
{
"epoch": 4.088235294117647,
"grad_norm": 0.37717302254595253,
"learning_rate": 1.9583790365845823e-06,
"loss": 0.4873,
"mean_token_accuracy": 0.8438923476557101,
"step": 695
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.37698941484112286,
"learning_rate": 1.8380308764377841e-06,
"loss": 0.4836,
"mean_token_accuracy": 0.8452770314043809,
"step": 700
},
{
"epoch": 4.147058823529412,
"grad_norm": 0.34607890458991425,
"learning_rate": 1.7211238105768213e-06,
"loss": 0.4981,
"mean_token_accuracy": 0.8410555546918426,
"step": 705
},
{
"epoch": 4.176470588235294,
"grad_norm": 0.35787679802722067,
"learning_rate": 1.607707127136734e-06,
"loss": 0.4906,
"mean_token_accuracy": 0.8431163617148336,
"step": 710
},
{
"epoch": 4.205882352941177,
"grad_norm": 0.34960387882531924,
"learning_rate": 1.4978286427038602e-06,
"loss": 0.4842,
"mean_token_accuracy": 0.8452673287333395,
"step": 715
},
{
"epoch": 4.235294117647059,
"grad_norm": 0.3499750570371467,
"learning_rate": 1.3915346821563235e-06,
"loss": 0.4905,
"mean_token_accuracy": 0.8434864507543587,
"step": 720
},
{
"epoch": 4.264705882352941,
"grad_norm": 0.3491087040345698,
"learning_rate": 1.2888700591334225e-06,
"loss": 0.4853,
"mean_token_accuracy": 0.8449809552096237,
"step": 725
},
{
"epoch": 4.294117647058823,
"grad_norm": 0.34592474384102057,
"learning_rate": 1.1898780571421554e-06,
"loss": 0.4959,
"mean_token_accuracy": 0.8415862048252866,
"step": 730
},
{
"epoch": 4.323529411764706,
"grad_norm": 0.35193064175626443,
"learning_rate": 1.0946004113088381e-06,
"loss": 0.4779,
"mean_token_accuracy": 0.8476395134466322,
"step": 735
},
{
"epoch": 4.352941176470588,
"grad_norm": 0.3494094701083019,
"learning_rate": 1.0030772907835484e-06,
"loss": 0.4811,
"mean_token_accuracy": 0.8462678766991031,
"step": 740
},
{
"epoch": 4.382352941176471,
"grad_norm": 0.3418341304288849,
"learning_rate": 9.153472818047627e-07,
"loss": 0.4784,
"mean_token_accuracy": 0.8473868594161684,
"step": 745
},
{
"epoch": 4.411764705882353,
"grad_norm": 0.46400060241908164,
"learning_rate": 8.31447371431372e-07,
"loss": 0.4902,
"mean_token_accuracy": 0.8432360408003623,
"step": 750
},
{
"epoch": 4.4411764705882355,
"grad_norm": 0.34372164358517193,
"learning_rate": 7.514129319488839e-07,
"loss": 0.4901,
"mean_token_accuracy": 0.8436843577092926,
"step": 755
},
{
"epoch": 4.470588235294118,
"grad_norm": 0.3433127486725145,
"learning_rate": 6.752777059564431e-07,
"loss": 0.4876,
"mean_token_accuracy": 0.8438663347463565,
"step": 760
},
{
"epoch": 4.5,
"grad_norm": 0.3512358511844125,
"learning_rate": 6.030737921409169e-07,
"loss": 0.4761,
"mean_token_accuracy": 0.8475494992121411,
"step": 765
},
{
"epoch": 4.529411764705882,
"grad_norm": 0.47038019186648355,
"learning_rate": 5.348316317440549e-07,
"loss": 0.4856,
"mean_token_accuracy": 0.8448994452927483,
"step": 770
},
{
"epoch": 4.5588235294117645,
"grad_norm": 1.9678500078373677,
"learning_rate": 4.7057999572843516e-07,
"loss": 0.4939,
"mean_token_accuracy": 0.8421438229981003,
"step": 775
},
{
"epoch": 4.588235294117647,
"grad_norm": 0.3407337138700973,
"learning_rate": 4.103459726475889e-07,
"loss": 0.4827,
"mean_token_accuracy": 0.845469943467501,
"step": 780
},
{
"epoch": 4.617647058823529,
"grad_norm": 0.3414928705541955,
"learning_rate": 3.541549572254488e-07,
"loss": 0.4888,
"mean_token_accuracy": 0.8437554763885953,
"step": 785
},
{
"epoch": 4.647058823529412,
"grad_norm": 0.34079162834625754,
"learning_rate": 3.020306396499062e-07,
"loss": 0.4824,
"mean_token_accuracy": 0.8457629333628377,
"step": 790
},
{
"epoch": 4.676470588235294,
"grad_norm": 0.3367904168904605,
"learning_rate": 2.539949955849985e-07,
"loss": 0.4891,
"mean_token_accuracy": 0.8440021811993889,
"step": 795
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.3439387496930163,
"learning_rate": 2.1006827690595478e-07,
"loss": 0.4829,
"mean_token_accuracy": 0.8455398197222916,
"step": 800
},
{
"epoch": 4.735294117647059,
"grad_norm": 0.348751695583376,
"learning_rate": 1.7026900316098217e-07,
"loss": 0.4905,
"mean_token_accuracy": 0.8436304876776248,
"step": 805
},
{
"epoch": 4.764705882352941,
"grad_norm": 0.34277352712561143,
"learning_rate": 1.3461395376340502e-07,
"loss": 0.4795,
"mean_token_accuracy": 0.8471346559845427,
"step": 810
},
{
"epoch": 4.794117647058823,
"grad_norm": 0.34865285423056336,
"learning_rate": 1.0311816091744698e-07,
"loss": 0.4887,
"mean_token_accuracy": 0.8439427585239713,
"step": 815
},
{
"epoch": 4.823529411764706,
"grad_norm": 0.3510593438971943,
"learning_rate": 7.579490328064265e-08,
"loss": 0.4833,
"mean_token_accuracy": 0.8458809389490854,
"step": 820
},
{
"epoch": 4.852941176470588,
"grad_norm": 0.3463646553260396,
"learning_rate": 5.265570036553813e-08,
"loss": 0.4891,
"mean_token_accuracy": 0.8441209681319158,
"step": 825
},
{
"epoch": 4.882352941176471,
"grad_norm": 0.3425423146063826,
"learning_rate": 3.371030768305583e-08,
"loss": 0.4918,
"mean_token_accuracy": 0.8428475797274417,
"step": 830
},
{
"epoch": 4.911764705882353,
"grad_norm": 0.3417126353423841,
"learning_rate": 1.896671262955896e-08,
"loss": 0.4892,
"mean_token_accuracy": 0.8436010165370922,
"step": 835
},
{
"epoch": 4.9411764705882355,
"grad_norm": 0.3578021424188356,
"learning_rate": 8.431131119361891e-09,
"loss": 0.4852,
"mean_token_accuracy": 0.8451092251253867,
"step": 840
},
{
"epoch": 4.970588235294118,
"grad_norm": 0.3424641211768565,
"learning_rate": 2.108004964086474e-09,
"loss": 0.4894,
"mean_token_accuracy": 0.8437161999232033,
"step": 845
},
{
"epoch": 5.0,
"grad_norm": 0.3466047787737126,
"learning_rate": 0.0,
"loss": 0.4888,
"mean_token_accuracy": 0.8429997643925609,
"step": 850
},
{
"epoch": 5.0,
"step": 850,
"total_flos": 355893069742080.0,
"train_loss": 0.6545549502092249,
"train_runtime": 47744.7612,
"train_samples_per_second": 2.278,
"train_steps_per_second": 0.018
}
],
"logging_steps": 5,
"max_steps": 850,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 355893069742080.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}