{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9946777054997042,
  "eval_steps": 500,
  "global_step": 1266,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02365464222353637,
      "grad_norm": 1.5665423397587683,
      "learning_rate": 5e-06,
      "loss": 0.8891,
      "step": 10
    },
    {
      "epoch": 0.04730928444707274,
      "grad_norm": 1.0589767378232773,
      "learning_rate": 5e-06,
      "loss": 0.7949,
      "step": 20
    },
    {
      "epoch": 0.0709639266706091,
      "grad_norm": 0.8838983419870171,
      "learning_rate": 5e-06,
      "loss": 0.7599,
      "step": 30
    },
    {
      "epoch": 0.09461856889414548,
      "grad_norm": 0.9501620192903069,
      "learning_rate": 5e-06,
      "loss": 0.7391,
      "step": 40
    },
    {
      "epoch": 0.11827321111768184,
      "grad_norm": 0.7634636002964827,
      "learning_rate": 5e-06,
      "loss": 0.7272,
      "step": 50
    },
    {
      "epoch": 0.1419278533412182,
      "grad_norm": 0.6753406016779676,
      "learning_rate": 5e-06,
      "loss": 0.7159,
      "step": 60
    },
    {
      "epoch": 0.16558249556475457,
      "grad_norm": 0.6614325908691252,
      "learning_rate": 5e-06,
      "loss": 0.7118,
      "step": 70
    },
    {
      "epoch": 0.18923713778829096,
      "grad_norm": 0.7719184686061289,
      "learning_rate": 5e-06,
      "loss": 0.7019,
      "step": 80
    },
    {
      "epoch": 0.21289178001182732,
      "grad_norm": 0.7524231413666251,
      "learning_rate": 5e-06,
      "loss": 0.6872,
      "step": 90
    },
    {
      "epoch": 0.23654642223536368,
      "grad_norm": 0.8110229180711027,
      "learning_rate": 5e-06,
      "loss": 0.6853,
      "step": 100
    },
    {
      "epoch": 0.26020106445890007,
      "grad_norm": 0.4971482406443161,
      "learning_rate": 5e-06,
      "loss": 0.6909,
      "step": 110
    },
    {
      "epoch": 0.2838557066824364,
      "grad_norm": 0.9464051864732146,
      "learning_rate": 5e-06,
      "loss": 0.6782,
      "step": 120
    },
    {
      "epoch": 0.3075103489059728,
      "grad_norm": 0.5071907565316945,
      "learning_rate": 5e-06,
      "loss": 0.6744,
      "step": 130
    },
    {
      "epoch": 0.33116499112950915,
      "grad_norm": 0.9224817650868851,
      "learning_rate": 5e-06,
      "loss": 0.6828,
      "step": 140
    },
    {
      "epoch": 0.35481963335304556,
      "grad_norm": 0.8885554899363501,
      "learning_rate": 5e-06,
      "loss": 0.6661,
      "step": 150
    },
    {
      "epoch": 0.3784742755765819,
      "grad_norm": 0.4781037463107871,
      "learning_rate": 5e-06,
      "loss": 0.6701,
      "step": 160
    },
    {
      "epoch": 0.4021289178001183,
      "grad_norm": 0.6023227514998295,
      "learning_rate": 5e-06,
      "loss": 0.6677,
      "step": 170
    },
    {
      "epoch": 0.42578356002365464,
      "grad_norm": 0.5603961021476538,
      "learning_rate": 5e-06,
      "loss": 0.6742,
      "step": 180
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 0.4984868681428728,
      "learning_rate": 5e-06,
      "loss": 0.6711,
      "step": 190
    },
    {
      "epoch": 0.47309284447072736,
      "grad_norm": 0.6312594557804502,
      "learning_rate": 5e-06,
      "loss": 0.6689,
      "step": 200
    },
    {
      "epoch": 0.4967474866942638,
      "grad_norm": 0.9550758404721197,
      "learning_rate": 5e-06,
      "loss": 0.6707,
      "step": 210
    },
    {
      "epoch": 0.5204021289178001,
      "grad_norm": 0.6169594889049308,
      "learning_rate": 5e-06,
      "loss": 0.6699,
      "step": 220
    },
    {
      "epoch": 0.5440567711413364,
      "grad_norm": 0.46636130254439206,
      "learning_rate": 5e-06,
      "loss": 0.6615,
      "step": 230
    },
    {
      "epoch": 0.5677114133648729,
      "grad_norm": 0.5215033819505257,
      "learning_rate": 5e-06,
      "loss": 0.6657,
      "step": 240
    },
    {
      "epoch": 0.5913660555884093,
      "grad_norm": 0.5111456784576656,
      "learning_rate": 5e-06,
      "loss": 0.6633,
      "step": 250
    },
    {
      "epoch": 0.6150206978119456,
      "grad_norm": 0.517604760100503,
      "learning_rate": 5e-06,
      "loss": 0.6531,
      "step": 260
    },
    {
      "epoch": 0.638675340035482,
      "grad_norm": 0.6357411311158894,
      "learning_rate": 5e-06,
      "loss": 0.6652,
      "step": 270
    },
    {
      "epoch": 0.6623299822590183,
      "grad_norm": 0.6476086780436829,
      "learning_rate": 5e-06,
      "loss": 0.6676,
      "step": 280
    },
    {
      "epoch": 0.6859846244825547,
      "grad_norm": 0.46209126279187906,
      "learning_rate": 5e-06,
      "loss": 0.6601,
      "step": 290
    },
    {
      "epoch": 0.7096392667060911,
      "grad_norm": 0.521950153282793,
      "learning_rate": 5e-06,
      "loss": 0.6554,
      "step": 300
    },
    {
      "epoch": 0.7332939089296274,
      "grad_norm": 0.47211201942387604,
      "learning_rate": 5e-06,
      "loss": 0.6617,
      "step": 310
    },
    {
      "epoch": 0.7569485511531638,
      "grad_norm": 0.5341122235375386,
      "learning_rate": 5e-06,
      "loss": 0.6563,
      "step": 320
    },
    {
      "epoch": 0.7806031933767001,
      "grad_norm": 0.5708931772664904,
      "learning_rate": 5e-06,
      "loss": 0.654,
      "step": 330
    },
    {
      "epoch": 0.8042578356002366,
      "grad_norm": 0.6195560307499095,
      "learning_rate": 5e-06,
      "loss": 0.6543,
      "step": 340
    },
    {
      "epoch": 0.8279124778237729,
      "grad_norm": 0.5625110791293699,
      "learning_rate": 5e-06,
      "loss": 0.6621,
      "step": 350
    },
    {
      "epoch": 0.8515671200473093,
      "grad_norm": 0.4510061091403496,
      "learning_rate": 5e-06,
      "loss": 0.6546,
      "step": 360
    },
    {
      "epoch": 0.8752217622708457,
      "grad_norm": 0.46670984016010486,
      "learning_rate": 5e-06,
      "loss": 0.6576,
      "step": 370
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 0.5044563574335732,
      "learning_rate": 5e-06,
      "loss": 0.6489,
      "step": 380
    },
    {
      "epoch": 0.9225310467179184,
      "grad_norm": 0.6276485740632245,
      "learning_rate": 5e-06,
      "loss": 0.6548,
      "step": 390
    },
    {
      "epoch": 0.9461856889414547,
      "grad_norm": 0.4849232290323015,
      "learning_rate": 5e-06,
      "loss": 0.6473,
      "step": 400
    },
    {
      "epoch": 0.9698403311649911,
      "grad_norm": 0.47066212467927304,
      "learning_rate": 5e-06,
      "loss": 0.6504,
      "step": 410
    },
    {
      "epoch": 0.9934949733885275,
      "grad_norm": 0.5498376915549745,
      "learning_rate": 5e-06,
      "loss": 0.6489,
      "step": 420
    },
    {
      "epoch": 0.9982259018332348,
      "eval_loss": 0.6508141756057739,
      "eval_runtime": 226.6795,
      "eval_samples_per_second": 50.243,
      "eval_steps_per_second": 0.393,
      "step": 422
    },
    {
      "epoch": 1.0171496156120639,
      "grad_norm": 0.5216981625298186,
      "learning_rate": 5e-06,
      "loss": 0.6225,
      "step": 430
    },
    {
      "epoch": 1.0408042578356003,
      "grad_norm": 0.5287278292361843,
      "learning_rate": 5e-06,
      "loss": 0.6077,
      "step": 440
    },
    {
      "epoch": 1.0644589000591367,
      "grad_norm": 0.7304842680236713,
      "learning_rate": 5e-06,
      "loss": 0.6058,
      "step": 450
    },
    {
      "epoch": 1.0881135422826729,
      "grad_norm": 0.5644902109246774,
      "learning_rate": 5e-06,
      "loss": 0.6138,
      "step": 460
    },
    {
      "epoch": 1.1117681845062093,
      "grad_norm": 0.4753652134651053,
      "learning_rate": 5e-06,
      "loss": 0.6129,
      "step": 470
    },
    {
      "epoch": 1.1354228267297457,
      "grad_norm": 0.7165345582019033,
      "learning_rate": 5e-06,
      "loss": 0.6125,
      "step": 480
    },
    {
      "epoch": 1.1590774689532821,
      "grad_norm": 0.4641812019487026,
      "learning_rate": 5e-06,
      "loss": 0.6106,
      "step": 490
    },
    {
      "epoch": 1.1827321111768185,
      "grad_norm": 0.4846409340358244,
      "learning_rate": 5e-06,
      "loss": 0.6065,
      "step": 500
    },
    {
      "epoch": 1.2063867534003547,
      "grad_norm": 0.5624538054911821,
      "learning_rate": 5e-06,
      "loss": 0.6058,
      "step": 510
    },
    {
      "epoch": 1.2300413956238911,
      "grad_norm": 0.5069531857984526,
      "learning_rate": 5e-06,
      "loss": 0.6114,
      "step": 520
    },
    {
      "epoch": 1.2536960378474276,
      "grad_norm": 0.5605749208808489,
      "learning_rate": 5e-06,
      "loss": 0.6117,
      "step": 530
    },
    {
      "epoch": 1.277350680070964,
      "grad_norm": 0.5081677690669225,
      "learning_rate": 5e-06,
      "loss": 0.6133,
      "step": 540
    },
    {
      "epoch": 1.3010053222945004,
      "grad_norm": 0.5493923133675747,
      "learning_rate": 5e-06,
      "loss": 0.6101,
      "step": 550
    },
    {
      "epoch": 1.3246599645180366,
      "grad_norm": 0.44898345952125707,
      "learning_rate": 5e-06,
      "loss": 0.6096,
      "step": 560
    },
    {
      "epoch": 1.348314606741573,
      "grad_norm": 0.6655768726116028,
      "learning_rate": 5e-06,
      "loss": 0.6094,
      "step": 570
    },
    {
      "epoch": 1.3719692489651094,
      "grad_norm": 0.4649052455972333,
      "learning_rate": 5e-06,
      "loss": 0.614,
      "step": 580
    },
    {
      "epoch": 1.3956238911886458,
      "grad_norm": 0.43539888089438844,
      "learning_rate": 5e-06,
      "loss": 0.6138,
      "step": 590
    },
    {
      "epoch": 1.4192785334121822,
      "grad_norm": 0.5826398325628406,
      "learning_rate": 5e-06,
      "loss": 0.601,
      "step": 600
    },
    {
      "epoch": 1.4429331756357184,
      "grad_norm": 0.4128719638682357,
      "learning_rate": 5e-06,
      "loss": 0.6022,
      "step": 610
    },
    {
      "epoch": 1.4665878178592548,
      "grad_norm": 0.45529767094349805,
      "learning_rate": 5e-06,
      "loss": 0.6052,
      "step": 620
    },
    {
      "epoch": 1.4902424600827913,
      "grad_norm": 0.4330632579708222,
      "learning_rate": 5e-06,
      "loss": 0.6156,
      "step": 630
    },
    {
      "epoch": 1.5138971023063275,
      "grad_norm": 0.44938881612128456,
      "learning_rate": 5e-06,
      "loss": 0.6152,
      "step": 640
    },
    {
      "epoch": 1.537551744529864,
      "grad_norm": 0.46628155511438146,
      "learning_rate": 5e-06,
      "loss": 0.6041,
      "step": 650
    },
    {
      "epoch": 1.5612063867534003,
      "grad_norm": 0.5843372770531808,
      "learning_rate": 5e-06,
      "loss": 0.6091,
      "step": 660
    },
    {
      "epoch": 1.5848610289769367,
      "grad_norm": 0.5865805864061058,
      "learning_rate": 5e-06,
      "loss": 0.6055,
      "step": 670
    },
    {
      "epoch": 1.6085156712004731,
      "grad_norm": 0.6180152508932449,
      "learning_rate": 5e-06,
      "loss": 0.6098,
      "step": 680
    },
    {
      "epoch": 1.6321703134240093,
      "grad_norm": 0.5250880775614019,
      "learning_rate": 5e-06,
      "loss": 0.6035,
      "step": 690
    },
    {
      "epoch": 1.655824955647546,
      "grad_norm": 0.478978441549023,
      "learning_rate": 5e-06,
      "loss": 0.6107,
      "step": 700
    },
    {
      "epoch": 1.6794795978710821,
      "grad_norm": 0.5161048108398052,
      "learning_rate": 5e-06,
      "loss": 0.6106,
      "step": 710
    },
    {
      "epoch": 1.7031342400946186,
      "grad_norm": 0.5444636992966723,
      "learning_rate": 5e-06,
      "loss": 0.6123,
      "step": 720
    },
    {
      "epoch": 1.726788882318155,
      "grad_norm": 0.49014325863511293,
      "learning_rate": 5e-06,
      "loss": 0.6131,
      "step": 730
    },
    {
      "epoch": 1.7504435245416912,
      "grad_norm": 0.6313993012312799,
      "learning_rate": 5e-06,
      "loss": 0.6059,
      "step": 740
    },
    {
      "epoch": 1.7740981667652278,
      "grad_norm": 0.5158091820023666,
      "learning_rate": 5e-06,
      "loss": 0.6116,
      "step": 750
    },
    {
      "epoch": 1.797752808988764,
      "grad_norm": 0.47029237511942096,
      "learning_rate": 5e-06,
      "loss": 0.617,
      "step": 760
    },
    {
      "epoch": 1.8214074512123004,
      "grad_norm": 0.4280088668307211,
      "learning_rate": 5e-06,
      "loss": 0.6167,
      "step": 770
    },
    {
      "epoch": 1.8450620934358368,
      "grad_norm": 0.42280798454326274,
      "learning_rate": 5e-06,
      "loss": 0.6094,
      "step": 780
    },
    {
      "epoch": 1.868716735659373,
      "grad_norm": 0.5015010241124669,
      "learning_rate": 5e-06,
      "loss": 0.606,
      "step": 790
    },
    {
      "epoch": 1.8923713778829097,
      "grad_norm": 0.4647527250682549,
      "learning_rate": 5e-06,
      "loss": 0.614,
      "step": 800
    },
    {
      "epoch": 1.9160260201064458,
      "grad_norm": 0.45335597096154956,
      "learning_rate": 5e-06,
      "loss": 0.6142,
      "step": 810
    },
    {
      "epoch": 1.9396806623299823,
      "grad_norm": 0.4377674877634198,
      "learning_rate": 5e-06,
      "loss": 0.6009,
      "step": 820
    },
    {
      "epoch": 1.9633353045535187,
      "grad_norm": 0.4922578419992401,
      "learning_rate": 5e-06,
      "loss": 0.6108,
      "step": 830
    },
    {
      "epoch": 1.9869899467770549,
      "grad_norm": 0.4440378004846697,
      "learning_rate": 5e-06,
      "loss": 0.5988,
      "step": 840
    },
    {
      "epoch": 1.9988172678888232,
      "eval_loss": 0.6402984857559204,
      "eval_runtime": 227.8867,
      "eval_samples_per_second": 49.977,
      "eval_steps_per_second": 0.391,
      "step": 845
    },
    {
      "epoch": 2.0106445890005915,
      "grad_norm": 0.6249254373352294,
      "learning_rate": 5e-06,
      "loss": 0.5864,
      "step": 850
    },
    {
      "epoch": 2.0342992312241277,
      "grad_norm": 0.5229366411844095,
      "learning_rate": 5e-06,
      "loss": 0.5668,
      "step": 860
    },
    {
      "epoch": 2.057953873447664,
      "grad_norm": 0.5273129182521392,
      "learning_rate": 5e-06,
      "loss": 0.569,
      "step": 870
    },
    {
      "epoch": 2.0816085156712005,
      "grad_norm": 0.5807006714823286,
      "learning_rate": 5e-06,
      "loss": 0.5605,
      "step": 880
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 0.5316074498783832,
      "learning_rate": 5e-06,
      "loss": 0.5644,
      "step": 890
    },
    {
      "epoch": 2.1289178001182734,
      "grad_norm": 0.571954438502547,
      "learning_rate": 5e-06,
      "loss": 0.5674,
      "step": 900
    },
    {
      "epoch": 2.1525724423418096,
      "grad_norm": 0.4503797360191138,
      "learning_rate": 5e-06,
      "loss": 0.5625,
      "step": 910
    },
    {
      "epoch": 2.1762270845653457,
      "grad_norm": 0.49334190713186854,
      "learning_rate": 5e-06,
      "loss": 0.5648,
      "step": 920
    },
    {
      "epoch": 2.1998817267888824,
      "grad_norm": 0.5335672716582398,
      "learning_rate": 5e-06,
      "loss": 0.5647,
      "step": 930
    },
    {
      "epoch": 2.2235363690124186,
      "grad_norm": 0.5933317390318844,
      "learning_rate": 5e-06,
      "loss": 0.5662,
      "step": 940
    },
    {
      "epoch": 2.247191011235955,
      "grad_norm": 0.5891028799938252,
      "learning_rate": 5e-06,
      "loss": 0.5716,
      "step": 950
    },
    {
      "epoch": 2.2708456534594914,
      "grad_norm": 0.5459106423210626,
      "learning_rate": 5e-06,
      "loss": 0.5651,
      "step": 960
    },
    {
      "epoch": 2.2945002956830276,
      "grad_norm": 0.5290952299819603,
      "learning_rate": 5e-06,
      "loss": 0.568,
      "step": 970
    },
    {
      "epoch": 2.3181549379065642,
      "grad_norm": 0.612560828315047,
      "learning_rate": 5e-06,
      "loss": 0.5695,
      "step": 980
    },
    {
      "epoch": 2.3418095801301004,
      "grad_norm": 0.4941237525745503,
      "learning_rate": 5e-06,
      "loss": 0.5689,
      "step": 990
    },
    {
      "epoch": 2.365464222353637,
      "grad_norm": 0.5650925849128444,
      "learning_rate": 5e-06,
      "loss": 0.5688,
      "step": 1000
    },
    {
      "epoch": 2.3891188645771733,
      "grad_norm": 0.4960493342953518,
      "learning_rate": 5e-06,
      "loss": 0.568,
      "step": 1010
    },
    {
      "epoch": 2.4127735068007095,
      "grad_norm": 0.5675556976300212,
      "learning_rate": 5e-06,
      "loss": 0.5656,
      "step": 1020
    },
    {
      "epoch": 2.436428149024246,
      "grad_norm": 0.4679420975237042,
      "learning_rate": 5e-06,
      "loss": 0.5658,
      "step": 1030
    },
    {
      "epoch": 2.4600827912477823,
      "grad_norm": 0.5934681961848346,
      "learning_rate": 5e-06,
      "loss": 0.5699,
      "step": 1040
    },
    {
      "epoch": 2.483737433471319,
      "grad_norm": 0.4994164056392902,
      "learning_rate": 5e-06,
      "loss": 0.5705,
      "step": 1050
    },
    {
      "epoch": 2.507392075694855,
      "grad_norm": 0.553203563454583,
      "learning_rate": 5e-06,
      "loss": 0.5692,
      "step": 1060
    },
    {
      "epoch": 2.5310467179183913,
      "grad_norm": 0.4828644604519897,
      "learning_rate": 5e-06,
      "loss": 0.5741,
      "step": 1070
    },
    {
      "epoch": 2.554701360141928,
      "grad_norm": 0.49982783159481065,
      "learning_rate": 5e-06,
      "loss": 0.5729,
      "step": 1080
    },
    {
      "epoch": 2.578356002365464,
      "grad_norm": 0.5028480314612812,
      "learning_rate": 5e-06,
      "loss": 0.5688,
      "step": 1090
    },
    {
      "epoch": 2.6020106445890008,
      "grad_norm": 0.47037814252850013,
      "learning_rate": 5e-06,
      "loss": 0.5693,
      "step": 1100
    },
    {
      "epoch": 2.625665286812537,
      "grad_norm": 0.5106941109306207,
      "learning_rate": 5e-06,
      "loss": 0.5671,
      "step": 1110
    },
    {
      "epoch": 2.649319929036073,
      "grad_norm": 0.6215547951000225,
      "learning_rate": 5e-06,
      "loss": 0.5792,
      "step": 1120
    },
    {
      "epoch": 2.67297457125961,
      "grad_norm": 0.5708225359288293,
      "learning_rate": 5e-06,
      "loss": 0.5667,
      "step": 1130
    },
    {
      "epoch": 2.696629213483146,
      "grad_norm": 0.4694908632163934,
      "learning_rate": 5e-06,
      "loss": 0.5666,
      "step": 1140
    },
    {
      "epoch": 2.7202838557066826,
      "grad_norm": 0.5487886870493401,
      "learning_rate": 5e-06,
      "loss": 0.569,
      "step": 1150
    },
    {
      "epoch": 2.743938497930219,
      "grad_norm": 0.5178726800202738,
      "learning_rate": 5e-06,
      "loss": 0.5755,
      "step": 1160
    },
    {
      "epoch": 2.767593140153755,
      "grad_norm": 0.572400968003655,
      "learning_rate": 5e-06,
      "loss": 0.5687,
      "step": 1170
    },
    {
      "epoch": 2.7912477823772917,
      "grad_norm": 0.4505102109858482,
      "learning_rate": 5e-06,
      "loss": 0.5694,
      "step": 1180
    },
    {
      "epoch": 2.814902424600828,
      "grad_norm": 0.4721423964551259,
      "learning_rate": 5e-06,
      "loss": 0.5711,
      "step": 1190
    },
    {
      "epoch": 2.8385570668243645,
      "grad_norm": 0.45313707571101325,
      "learning_rate": 5e-06,
      "loss": 0.568,
      "step": 1200
    },
    {
      "epoch": 2.8622117090479007,
      "grad_norm": 0.4905572612804298,
      "learning_rate": 5e-06,
      "loss": 0.5708,
      "step": 1210
    },
    {
      "epoch": 2.885866351271437,
      "grad_norm": 0.4488283221129483,
      "learning_rate": 5e-06,
      "loss": 0.5644,
      "step": 1220
    },
    {
      "epoch": 2.9095209934949735,
      "grad_norm": 0.5086777707270197,
      "learning_rate": 5e-06,
      "loss": 0.57,
      "step": 1230
    },
    {
      "epoch": 2.9331756357185097,
      "grad_norm": 0.5014035211864805,
      "learning_rate": 5e-06,
      "loss": 0.5719,
      "step": 1240
    },
    {
      "epoch": 2.9568302779420463,
      "grad_norm": 0.46427442721882733,
      "learning_rate": 5e-06,
      "loss": 0.5812,
      "step": 1250
    },
    {
      "epoch": 2.9804849201655825,
      "grad_norm": 0.4933592342573123,
      "learning_rate": 5e-06,
      "loss": 0.5728,
      "step": 1260
    },
    {
      "epoch": 2.9946777054997042,
      "eval_loss": 0.6413019895553589,
      "eval_runtime": 227.2584,
      "eval_samples_per_second": 50.115,
      "eval_steps_per_second": 0.392,
      "step": 1266
    },
    {
      "epoch": 2.9946777054997042,
      "step": 1266,
      "total_flos": 2120178393415680.0,
      "train_loss": 0.6201754979801027,
      "train_runtime": 38082.1567,
      "train_samples_per_second": 17.045,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 1266,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2120178393415680.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|