|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 100, |
|
"global_step": 850, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.029411764705882353, |
|
"grad_norm": 7.394163236707443, |
|
"learning_rate": 1.1764705882352942e-06, |
|
"loss": 1.4868, |
|
"mean_token_accuracy": 0.670187404697577, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.058823529411764705, |
|
"grad_norm": 4.989051492464897, |
|
"learning_rate": 2.3529411764705885e-06, |
|
"loss": 1.4548, |
|
"mean_token_accuracy": 0.6696601003430951, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08823529411764706, |
|
"grad_norm": 1.718958499756161, |
|
"learning_rate": 3.529411764705883e-06, |
|
"loss": 1.2494, |
|
"mean_token_accuracy": 0.6895946883758232, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.11764705882352941, |
|
"grad_norm": 1.7334017715973813, |
|
"learning_rate": 4.705882352941177e-06, |
|
"loss": 1.1266, |
|
"mean_token_accuracy": 0.7048006769329966, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.14705882352941177, |
|
"grad_norm": 0.764379154080371, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 1.0738, |
|
"mean_token_accuracy": 0.7112541876090825, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.17647058823529413, |
|
"grad_norm": 0.5574786594203679, |
|
"learning_rate": 7.058823529411766e-06, |
|
"loss": 0.9891, |
|
"mean_token_accuracy": 0.7278180810901669, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.20588235294117646, |
|
"grad_norm": 0.43685454872404356, |
|
"learning_rate": 8.23529411764706e-06, |
|
"loss": 0.9706, |
|
"mean_token_accuracy": 0.7296801992148935, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.23529411764705882, |
|
"grad_norm": 0.3720554551518854, |
|
"learning_rate": 9.411764705882354e-06, |
|
"loss": 0.933, |
|
"mean_token_accuracy": 0.738009820036878, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2647058823529412, |
|
"grad_norm": 0.32483974488667805, |
|
"learning_rate": 1.0588235294117648e-05, |
|
"loss": 0.9158, |
|
"mean_token_accuracy": 0.7409755301059546, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.29411764705882354, |
|
"grad_norm": 0.3218526340694397, |
|
"learning_rate": 1.1764705882352942e-05, |
|
"loss": 0.9203, |
|
"mean_token_accuracy": 0.7389487349847822, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3235294117647059, |
|
"grad_norm": 0.3002154739759154, |
|
"learning_rate": 1.2941176470588238e-05, |
|
"loss": 0.8935, |
|
"mean_token_accuracy": 0.7449524112208951, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.35294117647058826, |
|
"grad_norm": 0.30954774782779204, |
|
"learning_rate": 1.4117647058823532e-05, |
|
"loss": 0.8859, |
|
"mean_token_accuracy": 0.7456974429587235, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.38235294117647056, |
|
"grad_norm": 0.31990114149137194, |
|
"learning_rate": 1.5294117647058822e-05, |
|
"loss": 0.8857, |
|
"mean_token_accuracy": 0.7454657299610173, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4117647058823529, |
|
"grad_norm": 0.29621110088082725, |
|
"learning_rate": 1.647058823529412e-05, |
|
"loss": 0.863, |
|
"mean_token_accuracy": 0.751065675239264, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4411764705882353, |
|
"grad_norm": 0.31927741229254775, |
|
"learning_rate": 1.7647058823529414e-05, |
|
"loss": 0.8662, |
|
"mean_token_accuracy": 0.7495714356533111, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 0.3005392170981355, |
|
"learning_rate": 1.8823529411764708e-05, |
|
"loss": 0.8481, |
|
"mean_token_accuracy": 0.753874134893318, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.29730380637608206, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8395, |
|
"mean_token_accuracy": 0.755573912125712, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5294117647058824, |
|
"grad_norm": 0.3669506861420523, |
|
"learning_rate": 1.9997891995035914e-05, |
|
"loss": 0.8465, |
|
"mean_token_accuracy": 0.7536556086477052, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5588235294117647, |
|
"grad_norm": 0.29666794474782, |
|
"learning_rate": 1.999156886888064e-05, |
|
"loss": 0.8272, |
|
"mean_token_accuracy": 0.7588607737716251, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.30382605735837637, |
|
"learning_rate": 1.9981033287370443e-05, |
|
"loss": 0.8219, |
|
"mean_token_accuracy": 0.760143495941066, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6176470588235294, |
|
"grad_norm": 0.29012444795471226, |
|
"learning_rate": 1.9966289692316944e-05, |
|
"loss": 0.8311, |
|
"mean_token_accuracy": 0.7565005350426584, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6470588235294118, |
|
"grad_norm": 0.2912583936011533, |
|
"learning_rate": 1.9947344299634464e-05, |
|
"loss": 0.8222, |
|
"mean_token_accuracy": 0.7590272071446524, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6764705882352942, |
|
"grad_norm": 0.30772503851162025, |
|
"learning_rate": 1.992420509671936e-05, |
|
"loss": 0.82, |
|
"mean_token_accuracy": 0.7592413729046079, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 0.37056353510208045, |
|
"learning_rate": 1.9896881839082554e-05, |
|
"loss": 0.8212, |
|
"mean_token_accuracy": 0.759279812103064, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7352941176470589, |
|
"grad_norm": 0.29893908828157484, |
|
"learning_rate": 1.9865386046236597e-05, |
|
"loss": 0.8256, |
|
"mean_token_accuracy": 0.7575601720590184, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7647058823529411, |
|
"grad_norm": 0.3074055212225788, |
|
"learning_rate": 1.982973099683902e-05, |
|
"loss": 0.8155, |
|
"mean_token_accuracy": 0.75987807875422, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7941176470588235, |
|
"grad_norm": 0.3054700413716101, |
|
"learning_rate": 1.9789931723094046e-05, |
|
"loss": 0.8145, |
|
"mean_token_accuracy": 0.7605635562676385, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8235294117647058, |
|
"grad_norm": 0.31001811075570757, |
|
"learning_rate": 1.9746005004415004e-05, |
|
"loss": 0.8039, |
|
"mean_token_accuracy": 0.7631607094065346, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8529411764705882, |
|
"grad_norm": 0.3136963713557885, |
|
"learning_rate": 1.9697969360350098e-05, |
|
"loss": 0.8049, |
|
"mean_token_accuracy": 0.7624575022466116, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.8823529411764706, |
|
"grad_norm": 0.3191880097370058, |
|
"learning_rate": 1.9645845042774555e-05, |
|
"loss": 0.802, |
|
"mean_token_accuracy": 0.7633903493459862, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9117647058823529, |
|
"grad_norm": 0.31057235371298675, |
|
"learning_rate": 1.9589654027352412e-05, |
|
"loss": 0.7986, |
|
"mean_token_accuracy": 0.7643766169115257, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 0.29582364900334857, |
|
"learning_rate": 1.9529420004271568e-05, |
|
"loss": 0.7914, |
|
"mean_token_accuracy": 0.7649156224508902, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9705882352941176, |
|
"grad_norm": 0.3193491594935024, |
|
"learning_rate": 1.9465168368255946e-05, |
|
"loss": 0.8003, |
|
"mean_token_accuracy": 0.762817530530919, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.3110939132941867, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.79, |
|
"mean_token_accuracy": 0.7647602598149285, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0294117647058822, |
|
"grad_norm": 0.3633335573513833, |
|
"learning_rate": 1.932472229404356e-05, |
|
"loss": 0.7299, |
|
"mean_token_accuracy": 0.7798239661883426, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.0588235294117647, |
|
"grad_norm": 0.3285229069104878, |
|
"learning_rate": 1.924858706805112e-05, |
|
"loss": 0.7229, |
|
"mean_token_accuracy": 0.7820958235079569, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.088235294117647, |
|
"grad_norm": 0.3128912088038598, |
|
"learning_rate": 1.9168552628568632e-05, |
|
"loss": 0.7271, |
|
"mean_token_accuracy": 0.7804769391368682, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.1176470588235294, |
|
"grad_norm": 0.3081304554701627, |
|
"learning_rate": 1.9084652718195237e-05, |
|
"loss": 0.7312, |
|
"mean_token_accuracy": 0.7789700891513861, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1470588235294117, |
|
"grad_norm": 0.3276014451232185, |
|
"learning_rate": 1.8996922709216456e-05, |
|
"loss": 0.7198, |
|
"mean_token_accuracy": 0.7823098470986267, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.1764705882352942, |
|
"grad_norm": 0.30110481549521834, |
|
"learning_rate": 1.8905399588691165e-05, |
|
"loss": 0.7243, |
|
"mean_token_accuracy": 0.7809635103462556, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.2058823529411764, |
|
"grad_norm": 0.30156642595064437, |
|
"learning_rate": 1.8810121942857848e-05, |
|
"loss": 0.7087, |
|
"mean_token_accuracy": 0.7859269243460426, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.2352941176470589, |
|
"grad_norm": 0.34077822382920914, |
|
"learning_rate": 1.8711129940866577e-05, |
|
"loss": 0.7248, |
|
"mean_token_accuracy": 0.7809192289978137, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.2647058823529411, |
|
"grad_norm": 0.32911315395038965, |
|
"learning_rate": 1.860846531784368e-05, |
|
"loss": 0.7093, |
|
"mean_token_accuracy": 0.785446429758738, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.2941176470588236, |
|
"grad_norm": 0.3157810716541353, |
|
"learning_rate": 1.8502171357296144e-05, |
|
"loss": 0.7278, |
|
"mean_token_accuracy": 0.7799419555666971, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.3235294117647058, |
|
"grad_norm": 0.30424816114084574, |
|
"learning_rate": 1.839229287286327e-05, |
|
"loss": 0.7275, |
|
"mean_token_accuracy": 0.7794343137623393, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.3529411764705883, |
|
"grad_norm": 0.2984011834982273, |
|
"learning_rate": 1.827887618942318e-05, |
|
"loss": 0.7175, |
|
"mean_token_accuracy": 0.7828088672818859, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.3823529411764706, |
|
"grad_norm": 0.33971195458260056, |
|
"learning_rate": 1.816196912356222e-05, |
|
"loss": 0.7172, |
|
"mean_token_accuracy": 0.7824906691918898, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.4117647058823528, |
|
"grad_norm": 0.32633469157456657, |
|
"learning_rate": 1.8041620963415418e-05, |
|
"loss": 0.7208, |
|
"mean_token_accuracy": 0.7814289254041461, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4411764705882353, |
|
"grad_norm": 0.31489767315655054, |
|
"learning_rate": 1.7917882447886585e-05, |
|
"loss": 0.7365, |
|
"mean_token_accuracy": 0.7766203149595159, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.4705882352941178, |
|
"grad_norm": 0.32277097391893517, |
|
"learning_rate": 1.7790805745256703e-05, |
|
"loss": 0.7177, |
|
"mean_token_accuracy": 0.782642169037961, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.30987881280212765, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 0.727, |
|
"mean_token_accuracy": 0.779525294630621, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.5294117647058822, |
|
"grad_norm": 0.31448685091412465, |
|
"learning_rate": 1.7526853466145248e-05, |
|
"loss": 0.721, |
|
"mean_token_accuracy": 0.7812619303947332, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.5588235294117647, |
|
"grad_norm": 0.32175478306501826, |
|
"learning_rate": 1.7390089172206594e-05, |
|
"loss": 0.7241, |
|
"mean_token_accuracy": 0.780364964868133, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.5882352941176472, |
|
"grad_norm": 0.2981306127426255, |
|
"learning_rate": 1.725020920933593e-05, |
|
"loss": 0.7267, |
|
"mean_token_accuracy": 0.7790934213702954, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.6176470588235294, |
|
"grad_norm": 0.3033949440928067, |
|
"learning_rate": 1.710727255106447e-05, |
|
"loss": 0.7257, |
|
"mean_token_accuracy": 0.7797863888193627, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.6470588235294117, |
|
"grad_norm": 0.30974296262432216, |
|
"learning_rate": 1.696133945962927e-05, |
|
"loss": 0.7235, |
|
"mean_token_accuracy": 0.7804705115541349, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.6764705882352942, |
|
"grad_norm": 0.31142423335839425, |
|
"learning_rate": 1.681247146056654e-05, |
|
"loss": 0.7086, |
|
"mean_token_accuracy": 0.7846730427727773, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.7058823529411766, |
|
"grad_norm": 0.3071934896661938, |
|
"learning_rate": 1.6660731316772503e-05, |
|
"loss": 0.7174, |
|
"mean_token_accuracy": 0.7824091019225257, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.7352941176470589, |
|
"grad_norm": 0.3386291051319253, |
|
"learning_rate": 1.650618300204242e-05, |
|
"loss": 0.7232, |
|
"mean_token_accuracy": 0.7803308030095385, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.7647058823529411, |
|
"grad_norm": 0.3140186357800053, |
|
"learning_rate": 1.634889167409923e-05, |
|
"loss": 0.7129, |
|
"mean_token_accuracy": 0.782982412331459, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.7941176470588234, |
|
"grad_norm": 0.3021409601720461, |
|
"learning_rate": 1.6188923647122946e-05, |
|
"loss": 0.7107, |
|
"mean_token_accuracy": 0.7836278096533718, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.8235294117647058, |
|
"grad_norm": 0.314246176884694, |
|
"learning_rate": 1.6026346363792565e-05, |
|
"loss": 0.7049, |
|
"mean_token_accuracy": 0.7850809024928556, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.8529411764705883, |
|
"grad_norm": 0.30224040461662033, |
|
"learning_rate": 1.5861228366852148e-05, |
|
"loss": 0.7007, |
|
"mean_token_accuracy": 0.7861974119207782, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.8823529411764706, |
|
"grad_norm": 0.3247059160222777, |
|
"learning_rate": 1.5693639270213138e-05, |
|
"loss": 0.7185, |
|
"mean_token_accuracy": 0.7818327782869858, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.9117647058823528, |
|
"grad_norm": 0.2990975208169173, |
|
"learning_rate": 1.552364972960506e-05, |
|
"loss": 0.7105, |
|
"mean_token_accuracy": 0.7837350647310662, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.9411764705882353, |
|
"grad_norm": 0.292315327238204, |
|
"learning_rate": 1.5351331412787004e-05, |
|
"loss": 0.7062, |
|
"mean_token_accuracy": 0.7850761466894601, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.9705882352941178, |
|
"grad_norm": 0.34311073870983216, |
|
"learning_rate": 1.5176756969332428e-05, |
|
"loss": 0.7103, |
|
"mean_token_accuracy": 0.7839965589810655, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.29664361496826974, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.7133, |
|
"mean_token_accuracy": 0.7825006235163733, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.0294117647058822, |
|
"grad_norm": 0.3924789864040961, |
|
"learning_rate": 1.4821135025703491e-05, |
|
"loss": 0.6271, |
|
"mean_token_accuracy": 0.8049891882579882, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.0588235294117645, |
|
"grad_norm": 0.37701624048661836, |
|
"learning_rate": 1.4640237456093636e-05, |
|
"loss": 0.6164, |
|
"mean_token_accuracy": 0.8081895456693632, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.088235294117647, |
|
"grad_norm": 0.33511734135583915, |
|
"learning_rate": 1.4457383557765385e-05, |
|
"loss": 0.6166, |
|
"mean_token_accuracy": 0.8076739145094626, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.1176470588235294, |
|
"grad_norm": 0.33243930178538456, |
|
"learning_rate": 1.427265042210381e-05, |
|
"loss": 0.6243, |
|
"mean_token_accuracy": 0.8049194097878407, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.1470588235294117, |
|
"grad_norm": 0.33350676253837797, |
|
"learning_rate": 1.4086115932782316e-05, |
|
"loss": 0.6084, |
|
"mean_token_accuracy": 0.810022229810283, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.176470588235294, |
|
"grad_norm": 0.3634421833067062, |
|
"learning_rate": 1.3897858732926794e-05, |
|
"loss": 0.6214, |
|
"mean_token_accuracy": 0.8059747023198034, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.2058823529411766, |
|
"grad_norm": 0.32882544775622274, |
|
"learning_rate": 1.3707958191959609e-05, |
|
"loss": 0.6226, |
|
"mean_token_accuracy": 0.8054682041286021, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.235294117647059, |
|
"grad_norm": 0.3305436002182192, |
|
"learning_rate": 1.3516494372137368e-05, |
|
"loss": 0.6112, |
|
"mean_token_accuracy": 0.8089070977046144, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.264705882352941, |
|
"grad_norm": 0.33722751621182623, |
|
"learning_rate": 1.3323547994796597e-05, |
|
"loss": 0.628, |
|
"mean_token_accuracy": 0.8039877302887909, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.2941176470588234, |
|
"grad_norm": 0.33248407945905123, |
|
"learning_rate": 1.3129200406321545e-05, |
|
"loss": 0.617, |
|
"mean_token_accuracy": 0.8073568639992759, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.323529411764706, |
|
"grad_norm": 0.32142510146932524, |
|
"learning_rate": 1.2933533543848462e-05, |
|
"loss": 0.6215, |
|
"mean_token_accuracy": 0.8055732590173094, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 0.33654477807126704, |
|
"learning_rate": 1.2736629900720832e-05, |
|
"loss": 0.6157, |
|
"mean_token_accuracy": 0.8077722830589826, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.3823529411764706, |
|
"grad_norm": 0.3235396120568174, |
|
"learning_rate": 1.2538572491710079e-05, |
|
"loss": 0.6175, |
|
"mean_token_accuracy": 0.8072083426089869, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.411764705882353, |
|
"grad_norm": 0.32834778791268815, |
|
"learning_rate": 1.2339444818016488e-05, |
|
"loss": 0.609, |
|
"mean_token_accuracy": 0.8097089774864203, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.4411764705882355, |
|
"grad_norm": 0.3256651992427977, |
|
"learning_rate": 1.2139330832064975e-05, |
|
"loss": 0.6093, |
|
"mean_token_accuracy": 0.809542989158514, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.4705882352941178, |
|
"grad_norm": 0.32713429702744246, |
|
"learning_rate": 1.1938314902110701e-05, |
|
"loss": 0.6221, |
|
"mean_token_accuracy": 0.80594164904137, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.32892077647415724, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 0.623, |
|
"mean_token_accuracy": 0.8050380371141161, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.5294117647058822, |
|
"grad_norm": 0.3351510239392618, |
|
"learning_rate": 1.1533916548786856e-05, |
|
"loss": 0.6221, |
|
"mean_token_accuracy": 0.8059383027153586, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.5588235294117645, |
|
"grad_norm": 0.32376353924923923, |
|
"learning_rate": 1.133070462016454e-05, |
|
"loss": 0.6296, |
|
"mean_token_accuracy": 0.8035830262709686, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.588235294117647, |
|
"grad_norm": 0.33352315854264303, |
|
"learning_rate": 1.1126931665153213e-05, |
|
"loss": 0.6303, |
|
"mean_token_accuracy": 0.8032632915017253, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.6176470588235294, |
|
"grad_norm": 0.33504031536917384, |
|
"learning_rate": 1.092268359463302e-05, |
|
"loss": 0.6162, |
|
"mean_token_accuracy": 0.8078497177646451, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.6470588235294117, |
|
"grad_norm": 0.32470133593647843, |
|
"learning_rate": 1.0718046519793276e-05, |
|
"loss": 0.6199, |
|
"mean_token_accuracy": 0.8059828691740311, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.6764705882352944, |
|
"grad_norm": 0.3311817254874853, |
|
"learning_rate": 1.0513106715827897e-05, |
|
"loss": 0.6157, |
|
"mean_token_accuracy": 0.8077946153778027, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.7058823529411766, |
|
"grad_norm": 0.32993180062270316, |
|
"learning_rate": 1.0307950585561705e-05, |
|
"loss": 0.6132, |
|
"mean_token_accuracy": 0.8084912132776563, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.735294117647059, |
|
"grad_norm": 0.325542621113992, |
|
"learning_rate": 1.01026646230229e-05, |
|
"loss": 0.6227, |
|
"mean_token_accuracy": 0.805343767865405, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.764705882352941, |
|
"grad_norm": 0.3173303746756242, |
|
"learning_rate": 9.897335376977104e-06, |
|
"loss": 0.6234, |
|
"mean_token_accuracy": 0.8049582474536037, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.7941176470588234, |
|
"grad_norm": 0.31358035777765325, |
|
"learning_rate": 9.692049414438298e-06, |
|
"loss": 0.6189, |
|
"mean_token_accuracy": 0.806300440606665, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.8235294117647056, |
|
"grad_norm": 0.3115874102200609, |
|
"learning_rate": 9.486893284172103e-06, |
|
"loss": 0.611, |
|
"mean_token_accuracy": 0.8087930759771561, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.8529411764705883, |
|
"grad_norm": 0.32045359513311766, |
|
"learning_rate": 9.281953480206725e-06, |
|
"loss": 0.617, |
|
"mean_token_accuracy": 0.806921570390956, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.8823529411764706, |
|
"grad_norm": 0.31864642830498724, |
|
"learning_rate": 9.07731640536698e-06, |
|
"loss": 0.6174, |
|
"mean_token_accuracy": 0.8067054562694989, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.911764705882353, |
|
"grad_norm": 0.321661470925901, |
|
"learning_rate": 8.87306833484679e-06, |
|
"loss": 0.6177, |
|
"mean_token_accuracy": 0.8073543308517797, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 0.31775074388030633, |
|
"learning_rate": 8.669295379835467e-06, |
|
"loss": 0.6104, |
|
"mean_token_accuracy": 0.8088693944661568, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.9705882352941178, |
|
"grad_norm": 0.31639792662745364, |
|
"learning_rate": 8.466083451213145e-06, |
|
"loss": 0.6137, |
|
"mean_token_accuracy": 0.8079348085921285, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.3133845393789324, |
|
"learning_rate": 8.263518223330698e-06, |
|
"loss": 0.6199, |
|
"mean_token_accuracy": 0.8058531411884824, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.0294117647058822, |
|
"grad_norm": 0.4515133376678954, |
|
"learning_rate": 8.0616850978893e-06, |
|
"loss": 0.5429, |
|
"mean_token_accuracy": 0.8280749968215273, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.0588235294117645, |
|
"grad_norm": 0.43255149163083684, |
|
"learning_rate": 7.860669167935028e-06, |
|
"loss": 0.5424, |
|
"mean_token_accuracy": 0.8279721150148507, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.088235294117647, |
|
"grad_norm": 0.3814367503664446, |
|
"learning_rate": 7.660555181983517e-06, |
|
"loss": 0.5369, |
|
"mean_token_accuracy": 0.8295389031535093, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.1176470588235294, |
|
"grad_norm": 0.3585734376658857, |
|
"learning_rate": 7.461427508289922e-06, |
|
"loss": 0.5324, |
|
"mean_token_accuracy": 0.8305589141009605, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.1470588235294117, |
|
"grad_norm": 0.3516058935793569, |
|
"learning_rate": 7.263370099279173e-06, |
|
"loss": 0.5295, |
|
"mean_token_accuracy": 0.8319955163114205, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 3.176470588235294, |
|
"grad_norm": 0.3515670124714757, |
|
"learning_rate": 7.066466456151541e-06, |
|
"loss": 0.5409, |
|
"mean_token_accuracy": 0.827999215139253, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.2058823529411766, |
|
"grad_norm": 0.3425326731760254, |
|
"learning_rate": 6.870799593678459e-06, |
|
"loss": 0.5343, |
|
"mean_token_accuracy": 0.8301978132314783, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 3.235294117647059, |
|
"grad_norm": 0.4045712289767328, |
|
"learning_rate": 6.6764520052034054e-06, |
|
"loss": 0.5265, |
|
"mean_token_accuracy": 0.8329135799939124, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.264705882352941, |
|
"grad_norm": 0.3466050915406474, |
|
"learning_rate": 6.483505627862632e-06, |
|
"loss": 0.5305, |
|
"mean_token_accuracy": 0.8308944335567917, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 3.2941176470588234, |
|
"grad_norm": 0.3480592845097812, |
|
"learning_rate": 6.292041808040393e-06, |
|
"loss": 0.5312, |
|
"mean_token_accuracy": 0.8311541824181544, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.323529411764706, |
|
"grad_norm": 0.3476466845454666, |
|
"learning_rate": 6.102141267073207e-06, |
|
"loss": 0.5294, |
|
"mean_token_accuracy": 0.8318951179091675, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 3.3529411764705883, |
|
"grad_norm": 0.5241508868023107, |
|
"learning_rate": 5.913884067217686e-06, |
|
"loss": 0.5528, |
|
"mean_token_accuracy": 0.824183505569461, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.3823529411764706, |
|
"grad_norm": 0.35062363129549806, |
|
"learning_rate": 5.727349577896194e-06, |
|
"loss": 0.5397, |
|
"mean_token_accuracy": 0.8283691203284675, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.411764705882353, |
|
"grad_norm": 0.3347143755417769, |
|
"learning_rate": 5.542616442234618e-06, |
|
"loss": 0.5346, |
|
"mean_token_accuracy": 0.8302798791058473, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.4411764705882355, |
|
"grad_norm": 0.35274599242118565, |
|
"learning_rate": 5.3597625439063685e-06, |
|
"loss": 0.5367, |
|
"mean_token_accuracy": 0.8289645071509449, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 3.4705882352941178, |
|
"grad_norm": 0.3420158033073416, |
|
"learning_rate": 5.178864974296511e-06, |
|
"loss": 0.5337, |
|
"mean_token_accuracy": 0.8301539169568171, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.3454361511498841, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 0.5221, |
|
"mean_token_accuracy": 0.8342361720674832, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 3.5294117647058822, |
|
"grad_norm": 0.33581256443628404, |
|
"learning_rate": 4.823243030667576e-06, |
|
"loss": 0.5441, |
|
"mean_token_accuracy": 0.8269164229088842, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.5588235294117645, |
|
"grad_norm": 0.34226274444007343, |
|
"learning_rate": 4.648668587212998e-06, |
|
"loss": 0.54, |
|
"mean_token_accuracy": 0.8282965213330156, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 3.588235294117647, |
|
"grad_norm": 0.3500745490149332, |
|
"learning_rate": 4.476350270394942e-06, |
|
"loss": 0.5365, |
|
"mean_token_accuracy": 0.8292353029731997, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.6176470588235294, |
|
"grad_norm": 0.3397747170148032, |
|
"learning_rate": 4.306360729786867e-06, |
|
"loss": 0.531, |
|
"mean_token_accuracy": 0.8312139710361102, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 3.6470588235294117, |
|
"grad_norm": 0.3441348350064406, |
|
"learning_rate": 4.138771633147856e-06, |
|
"loss": 0.5401, |
|
"mean_token_accuracy": 0.8284560238696452, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.6764705882352944, |
|
"grad_norm": 0.34066437903851693, |
|
"learning_rate": 3.973653636207437e-06, |
|
"loss": 0.5389, |
|
"mean_token_accuracy": 0.8286709573008434, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 3.7058823529411766, |
|
"grad_norm": 0.3435442485915975, |
|
"learning_rate": 3.8110763528770543e-06, |
|
"loss": 0.5328, |
|
"mean_token_accuracy": 0.8307493440913218, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.735294117647059, |
|
"grad_norm": 0.33871491756685385, |
|
"learning_rate": 3.651108325900773e-06, |
|
"loss": 0.5396, |
|
"mean_token_accuracy": 0.8285779483750313, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 3.764705882352941, |
|
"grad_norm": 0.3322716149770833, |
|
"learning_rate": 3.493816997957582e-06, |
|
"loss": 0.5281, |
|
"mean_token_accuracy": 0.8320226506861953, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.7941176470588234, |
|
"grad_norm": 0.3408913341899395, |
|
"learning_rate": 3.339268683227499e-06, |
|
"loss": 0.5276, |
|
"mean_token_accuracy": 0.8323494901801878, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 3.8235294117647056, |
|
"grad_norm": 0.33820541461548986, |
|
"learning_rate": 3.1875285394334575e-06, |
|
"loss": 0.5382, |
|
"mean_token_accuracy": 0.8290421087951524, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.8529411764705883, |
|
"grad_norm": 0.3274191844325364, |
|
"learning_rate": 3.0386605403707347e-06, |
|
"loss": 0.5314, |
|
"mean_token_accuracy": 0.8310860589974585, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 3.8823529411764706, |
|
"grad_norm": 0.3290584011649337, |
|
"learning_rate": 2.8927274489355296e-06, |
|
"loss": 0.5423, |
|
"mean_token_accuracy": 0.8274875320572039, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.911764705882353, |
|
"grad_norm": 0.331251976751198, |
|
"learning_rate": 2.749790790664074e-06, |
|
"loss": 0.5321, |
|
"mean_token_accuracy": 0.8305707371259743, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 3.9411764705882355, |
|
"grad_norm": 0.3357781073651013, |
|
"learning_rate": 2.6099108277934105e-06, |
|
"loss": 0.53, |
|
"mean_token_accuracy": 0.8316958534732434, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.9705882352941178, |
|
"grad_norm": 0.36159917760376725, |
|
"learning_rate": 2.4731465338547556e-06, |
|
"loss": 0.5347, |
|
"mean_token_accuracy": 0.8300342318417423, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.3308064618286195, |
|
"learning_rate": 2.339555568810221e-06, |
|
"loss": 0.5379, |
|
"mean_token_accuracy": 0.8293708444456257, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.029411764705882, |
|
"grad_norm": 0.40943584673721745, |
|
"learning_rate": 2.209194254743295e-06, |
|
"loss": 0.4867, |
|
"mean_token_accuracy": 0.8454998608013117, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 4.0588235294117645, |
|
"grad_norm": 0.45370431209989043, |
|
"learning_rate": 2.0821175521134208e-06, |
|
"loss": 0.4913, |
|
"mean_token_accuracy": 0.8430829216918625, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.088235294117647, |
|
"grad_norm": 0.37717302254595253, |
|
"learning_rate": 1.9583790365845823e-06, |
|
"loss": 0.4873, |
|
"mean_token_accuracy": 0.8438923476557101, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 4.117647058823529, |
|
"grad_norm": 0.37698941484112286, |
|
"learning_rate": 1.8380308764377841e-06, |
|
"loss": 0.4836, |
|
"mean_token_accuracy": 0.8452770314043809, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.147058823529412, |
|
"grad_norm": 0.34607890458991425, |
|
"learning_rate": 1.7211238105768213e-06, |
|
"loss": 0.4981, |
|
"mean_token_accuracy": 0.8410555546918426, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 4.176470588235294, |
|
"grad_norm": 0.35787679802722067, |
|
"learning_rate": 1.607707127136734e-06, |
|
"loss": 0.4906, |
|
"mean_token_accuracy": 0.8431163617148336, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.205882352941177, |
|
"grad_norm": 0.34960387882531924, |
|
"learning_rate": 1.4978286427038602e-06, |
|
"loss": 0.4842, |
|
"mean_token_accuracy": 0.8452673287333395, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 4.235294117647059, |
|
"grad_norm": 0.3499750570371467, |
|
"learning_rate": 1.3915346821563235e-06, |
|
"loss": 0.4905, |
|
"mean_token_accuracy": 0.8434864507543587, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.264705882352941, |
|
"grad_norm": 0.3491087040345698, |
|
"learning_rate": 1.2888700591334225e-06, |
|
"loss": 0.4853, |
|
"mean_token_accuracy": 0.8449809552096237, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.294117647058823, |
|
"grad_norm": 0.34592474384102057, |
|
"learning_rate": 1.1898780571421554e-06, |
|
"loss": 0.4959, |
|
"mean_token_accuracy": 0.8415862048252866, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 4.323529411764706, |
|
"grad_norm": 0.35193064175626443, |
|
"learning_rate": 1.0946004113088381e-06, |
|
"loss": 0.4779, |
|
"mean_token_accuracy": 0.8476395134466322, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 4.352941176470588, |
|
"grad_norm": 0.3494094701083019, |
|
"learning_rate": 1.0030772907835484e-06, |
|
"loss": 0.4811, |
|
"mean_token_accuracy": 0.8462678766991031, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.382352941176471, |
|
"grad_norm": 0.3418341304288849, |
|
"learning_rate": 9.153472818047627e-07, |
|
"loss": 0.4784, |
|
"mean_token_accuracy": 0.8473868594161684, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 4.411764705882353, |
|
"grad_norm": 0.46400060241908164, |
|
"learning_rate": 8.31447371431372e-07, |
|
"loss": 0.4902, |
|
"mean_token_accuracy": 0.8432360408003623, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.4411764705882355, |
|
"grad_norm": 0.34372164358517193, |
|
"learning_rate": 7.514129319488839e-07, |
|
"loss": 0.4901, |
|
"mean_token_accuracy": 0.8436843577092926, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 4.470588235294118, |
|
"grad_norm": 0.3433127486725145, |
|
"learning_rate": 6.752777059564431e-07, |
|
"loss": 0.4876, |
|
"mean_token_accuracy": 0.8438663347463565, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.3512358511844125, |
|
"learning_rate": 6.030737921409169e-07, |
|
"loss": 0.4761, |
|
"mean_token_accuracy": 0.8475494992121411, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 4.529411764705882, |
|
"grad_norm": 0.47038019186648355, |
|
"learning_rate": 5.348316317440549e-07, |
|
"loss": 0.4856, |
|
"mean_token_accuracy": 0.8448994452927483, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 4.5588235294117645, |
|
"grad_norm": 1.9678500078373677, |
|
"learning_rate": 4.7057999572843516e-07, |
|
"loss": 0.4939, |
|
"mean_token_accuracy": 0.8421438229981003, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 4.588235294117647, |
|
"grad_norm": 0.3407337138700973, |
|
"learning_rate": 4.103459726475889e-07, |
|
"loss": 0.4827, |
|
"mean_token_accuracy": 0.845469943467501, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 4.617647058823529, |
|
"grad_norm": 0.3414928705541955, |
|
"learning_rate": 3.541549572254488e-07, |
|
"loss": 0.4888, |
|
"mean_token_accuracy": 0.8437554763885953, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 4.647058823529412, |
|
"grad_norm": 0.34079162834625754, |
|
"learning_rate": 3.020306396499062e-07, |
|
"loss": 0.4824, |
|
"mean_token_accuracy": 0.8457629333628377, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 4.676470588235294, |
|
"grad_norm": 0.3367904168904605, |
|
"learning_rate": 2.539949955849985e-07, |
|
"loss": 0.4891, |
|
"mean_token_accuracy": 0.8440021811993889, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 4.705882352941177, |
|
"grad_norm": 0.3439387496930163, |
|
"learning_rate": 2.1006827690595478e-07, |
|
"loss": 0.4829, |
|
"mean_token_accuracy": 0.8455398197222916, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.735294117647059, |
|
"grad_norm": 0.348751695583376, |
|
"learning_rate": 1.7026900316098217e-07, |
|
"loss": 0.4905, |
|
"mean_token_accuracy": 0.8436304876776248, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 4.764705882352941, |
|
"grad_norm": 0.34277352712561143, |
|
"learning_rate": 1.3461395376340502e-07, |
|
"loss": 0.4795, |
|
"mean_token_accuracy": 0.8471346559845427, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 4.794117647058823, |
|
"grad_norm": 0.34865285423056336, |
|
"learning_rate": 1.0311816091744698e-07, |
|
"loss": 0.4887, |
|
"mean_token_accuracy": 0.8439427585239713, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 4.823529411764706, |
|
"grad_norm": 0.3510593438971943, |
|
"learning_rate": 7.579490328064265e-08, |
|
"loss": 0.4833, |
|
"mean_token_accuracy": 0.8458809389490854, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 4.852941176470588, |
|
"grad_norm": 0.3463646553260396, |
|
"learning_rate": 5.265570036553813e-08, |
|
"loss": 0.4891, |
|
"mean_token_accuracy": 0.8441209681319158, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 4.882352941176471, |
|
"grad_norm": 0.3425423146063826, |
|
"learning_rate": 3.371030768305583e-08, |
|
"loss": 0.4918, |
|
"mean_token_accuracy": 0.8428475797274417, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 4.911764705882353, |
|
"grad_norm": 0.3417126353423841, |
|
"learning_rate": 1.896671262955896e-08, |
|
"loss": 0.4892, |
|
"mean_token_accuracy": 0.8436010165370922, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 4.9411764705882355, |
|
"grad_norm": 0.3578021424188356, |
|
"learning_rate": 8.431131119361891e-09, |
|
"loss": 0.4852, |
|
"mean_token_accuracy": 0.8451092251253867, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 4.970588235294118, |
|
"grad_norm": 0.3424641211768565, |
|
"learning_rate": 2.108004964086474e-09, |
|
"loss": 0.4894, |
|
"mean_token_accuracy": 0.8437161999232033, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.3466047787737126, |
|
"learning_rate": 0.0, |
|
"loss": 0.4888, |
|
"mean_token_accuracy": 0.8429997643925609, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 850, |
|
"total_flos": 355893069742080.0, |
|
"train_loss": 0.6545549502092249, |
|
"train_runtime": 47744.7612, |
|
"train_samples_per_second": 2.278, |
|
"train_steps_per_second": 0.018 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 850, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 355893069742080.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|