|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9961549837326236, |
|
"eval_steps": 500, |
|
"global_step": 1266, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.023661638568470866, |
|
"grad_norm": 5.716220186648855, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8897, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04732327713694173, |
|
"grad_norm": 0.8490804690852378, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7877, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0709849157054126, |
|
"grad_norm": 0.8359497191663584, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7533, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09464655427388347, |
|
"grad_norm": 2.872573622069312, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7354, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11830819284235433, |
|
"grad_norm": 1.2235372630281218, |
|
"learning_rate": 5e-06, |
|
"loss": 0.726, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1419698314108252, |
|
"grad_norm": 0.7914495021160732, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7152, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16563146997929606, |
|
"grad_norm": 0.6440056541571992, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7101, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.18929310854776693, |
|
"grad_norm": 1.2255311186058144, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7007, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2129547471162378, |
|
"grad_norm": 0.5863287056241707, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6856, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.23661638568470866, |
|
"grad_norm": 0.6673942726540693, |
|
"learning_rate": 5e-06, |
|
"loss": 0.685, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26027802425317953, |
|
"grad_norm": 0.6052355221282993, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6898, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2839396628216504, |
|
"grad_norm": 0.6230907972132269, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6772, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.30760130139012126, |
|
"grad_norm": 0.5418727176057381, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6733, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.33126293995859213, |
|
"grad_norm": 0.6020619371732497, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6819, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.354924578527063, |
|
"grad_norm": 0.8518067973190109, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6648, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.37858621709553386, |
|
"grad_norm": 0.5115922350127938, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6691, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4022478556640047, |
|
"grad_norm": 0.7022530591693269, |
|
"learning_rate": 5e-06, |
|
"loss": 0.667, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4259094942324756, |
|
"grad_norm": 0.5834704070363212, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6739, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.44957113280094646, |
|
"grad_norm": 0.4653603355749383, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6698, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4732327713694173, |
|
"grad_norm": 0.568232089382732, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6688, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4968944099378882, |
|
"grad_norm": 1.0573970656294764, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6697, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5205560485063591, |
|
"grad_norm": 0.7158198022991291, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6681, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.54421768707483, |
|
"grad_norm": 0.43788884341448125, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6603, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5678793256433008, |
|
"grad_norm": 0.4725698050393869, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6645, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5915409642117717, |
|
"grad_norm": 0.6197309721248793, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6616, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6152026027802425, |
|
"grad_norm": 0.5419735846310992, |
|
"learning_rate": 5e-06, |
|
"loss": 0.652, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6388642413487134, |
|
"grad_norm": 0.7492458117282386, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6641, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6625258799171843, |
|
"grad_norm": 0.6330397603875498, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6652, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6861875184856552, |
|
"grad_norm": 0.5065865148226276, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6597, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.709849157054126, |
|
"grad_norm": 0.6357867453691719, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6535, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7335107956225969, |
|
"grad_norm": 0.5149227689012754, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6619, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7571724341910677, |
|
"grad_norm": 0.44037598215405893, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6556, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7808340727595386, |
|
"grad_norm": 0.5181894268105267, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6541, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8044957113280095, |
|
"grad_norm": 0.6098945421576618, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6538, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8281573498964804, |
|
"grad_norm": 0.5384169075272255, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6602, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8518189884649512, |
|
"grad_norm": 0.47214684121487954, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6538, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8754806270334221, |
|
"grad_norm": 0.4904597024034496, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6567, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.8991422656018929, |
|
"grad_norm": 0.4775889282770322, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6478, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9228039041703638, |
|
"grad_norm": 0.6785347844213175, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6553, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9464655427388347, |
|
"grad_norm": 0.48131943447081366, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6466, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9701271813073056, |
|
"grad_norm": 0.4819882000845211, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6491, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9937888198757764, |
|
"grad_norm": 0.48326887732044893, |
|
"learning_rate": 5e-06, |
|
"loss": 0.648, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.9985211475894705, |
|
"eval_loss": 0.6503860950469971, |
|
"eval_runtime": 295.5868, |
|
"eval_samples_per_second": 38.53, |
|
"eval_steps_per_second": 0.602, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.0177462289263532, |
|
"grad_norm": 0.5034024020075702, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6519, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.041407867494824, |
|
"grad_norm": 0.5181852823487637, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6054, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.0650695060632949, |
|
"grad_norm": 0.8086692425168592, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6036, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.0887311446317658, |
|
"grad_norm": 0.7087824451639375, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6138, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.1123927832002367, |
|
"grad_norm": 0.6293561803711789, |
|
"learning_rate": 5e-06, |
|
"loss": 0.611, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.1360544217687074, |
|
"grad_norm": 0.8003707364488958, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6114, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.1597160603371783, |
|
"grad_norm": 0.5556436203083284, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6083, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.1833776989056493, |
|
"grad_norm": 0.5222890058260059, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6038, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2070393374741202, |
|
"grad_norm": 0.5834395725334883, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6038, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.2307009760425909, |
|
"grad_norm": 0.6523486232667094, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6103, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.2543626146110618, |
|
"grad_norm": 0.509097134236641, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6129, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.2780242531795327, |
|
"grad_norm": 0.6109585275321274, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6125, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.3016858917480034, |
|
"grad_norm": 0.4671830560905703, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6094, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.3253475303164743, |
|
"grad_norm": 0.43619649641699987, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6076, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.3490091688849453, |
|
"grad_norm": 0.9334517894855655, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6086, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.3726708074534162, |
|
"grad_norm": 0.6158441071105143, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6114, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.396332446021887, |
|
"grad_norm": 0.4177935072924951, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6118, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.4199940845903578, |
|
"grad_norm": 0.5261498840709943, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6012, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.4436557231588287, |
|
"grad_norm": 0.4118403315344034, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6006, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.4673173617272997, |
|
"grad_norm": 0.4500509679639768, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6041, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.4909790002957704, |
|
"grad_norm": 0.47480411557819086, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6161, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.5146406388642415, |
|
"grad_norm": 0.44192140481751196, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6104, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.5383022774327122, |
|
"grad_norm": 0.43019141037435493, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6044, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.5619639160011831, |
|
"grad_norm": 0.5928638480003088, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6093, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.585625554569654, |
|
"grad_norm": 0.680185794067505, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6018, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.6092871931381247, |
|
"grad_norm": 0.4446498601791125, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6103, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.6329488317065957, |
|
"grad_norm": 0.5328567150499224, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6013, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.6566104702750666, |
|
"grad_norm": 0.5840475503667699, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6109, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.6802721088435373, |
|
"grad_norm": 0.5235562677113405, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6094, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.7039337474120084, |
|
"grad_norm": 0.6484686160336842, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6122, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.7275953859804791, |
|
"grad_norm": 0.5923644814057659, |
|
"learning_rate": 5e-06, |
|
"loss": 0.612, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.75125702454895, |
|
"grad_norm": 0.4896255525659027, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6042, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.774918663117421, |
|
"grad_norm": 0.47313474988505516, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6105, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.7985803016858917, |
|
"grad_norm": 0.4971344193994014, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6146, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.8222419402543626, |
|
"grad_norm": 0.42650481882946567, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6161, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.8459035788228335, |
|
"grad_norm": 0.45409425789151847, |
|
"learning_rate": 5e-06, |
|
"loss": 0.607, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.8695652173913042, |
|
"grad_norm": 0.47548804917566284, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6057, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.8932268559597754, |
|
"grad_norm": 0.49936009431407224, |
|
"learning_rate": 5e-06, |
|
"loss": 0.614, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.916888494528246, |
|
"grad_norm": 0.42095008482708873, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6118, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.940550133096717, |
|
"grad_norm": 0.469345021146904, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.964211771665188, |
|
"grad_norm": 0.4821117072104332, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6112, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.9878734102336586, |
|
"grad_norm": 0.42444873739826294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5984, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.999704229517894, |
|
"eval_loss": 0.640017032623291, |
|
"eval_runtime": 291.0784, |
|
"eval_samples_per_second": 39.127, |
|
"eval_steps_per_second": 0.612, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 2.0118308192842353, |
|
"grad_norm": 0.743615658405185, |
|
"learning_rate": 5e-06, |
|
"loss": 0.612, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.0354924578527065, |
|
"grad_norm": 0.6307060227030052, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5665, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.059154096421177, |
|
"grad_norm": 0.4909469687109977, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5669, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.082815734989648, |
|
"grad_norm": 0.4982651206960173, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5576, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.106477373558119, |
|
"grad_norm": 0.5189496162921208, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5632, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.1301390121265897, |
|
"grad_norm": 0.5290452662480181, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5673, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.1538006506950604, |
|
"grad_norm": 0.4298930748962063, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5608, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.1774622892635316, |
|
"grad_norm": 0.46653862184558975, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5622, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.2011239278320023, |
|
"grad_norm": 0.47565404277144174, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5634, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.2247855664004734, |
|
"grad_norm": 0.45977306537518353, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5642, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.248447204968944, |
|
"grad_norm": 0.6195624778560548, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5701, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.272108843537415, |
|
"grad_norm": 0.5625286018926492, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5624, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.295770482105886, |
|
"grad_norm": 0.5516894450156284, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5688, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.3194321206743567, |
|
"grad_norm": 0.46703213745702465, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5687, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.3430937592428274, |
|
"grad_norm": 0.4529577298482224, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5664, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.3667553978112985, |
|
"grad_norm": 0.5294804903105876, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5679, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.390417036379769, |
|
"grad_norm": 0.476104510415757, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5661, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.4140786749482404, |
|
"grad_norm": 0.50214160643677, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5632, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.437740313516711, |
|
"grad_norm": 0.4770392958453329, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5646, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.4614019520851818, |
|
"grad_norm": 0.4746961197857643, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5663, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.485063590653653, |
|
"grad_norm": 0.5075465690483602, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5705, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.5087252292221236, |
|
"grad_norm": 0.5735236324477879, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5692, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.5323868677905947, |
|
"grad_norm": 0.7357338280472365, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5693, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.5560485063590654, |
|
"grad_norm": 0.6312728091303266, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5721, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.579710144927536, |
|
"grad_norm": 0.6063258814160167, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5668, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.603371783496007, |
|
"grad_norm": 0.512061567286246, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5706, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.627033422064478, |
|
"grad_norm": 0.4805538505321553, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5643, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.6506950606329487, |
|
"grad_norm": 0.47434096978031415, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5784, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.67435669920142, |
|
"grad_norm": 0.48174002692295587, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5642, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.6980183377698905, |
|
"grad_norm": 0.5206174302871078, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5647, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.7216799763383612, |
|
"grad_norm": 0.5231066430924299, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5717, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.7453416149068324, |
|
"grad_norm": 0.5493755325406757, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5722, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.769003253475303, |
|
"grad_norm": 0.4710457798970187, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5673, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.792664892043774, |
|
"grad_norm": 0.6017685441197285, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5673, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.816326530612245, |
|
"grad_norm": 0.48667322523587175, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5712, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.8399881691807156, |
|
"grad_norm": 0.6136328812041214, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5689, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.8636498077491868, |
|
"grad_norm": 0.4848336832061156, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5683, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.8873114463176575, |
|
"grad_norm": 0.6143451558754445, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5611, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.9109730848861286, |
|
"grad_norm": 0.45186971294615735, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5713, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.9346347234545993, |
|
"grad_norm": 0.45118967147278705, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5703, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.95829636202307, |
|
"grad_norm": 0.5082137766462498, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5791, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.9819580005915407, |
|
"grad_norm": 0.5446211548606755, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5714, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.9961549837326236, |
|
"eval_loss": 0.6407967805862427, |
|
"eval_runtime": 281.5097, |
|
"eval_samples_per_second": 40.457, |
|
"eval_steps_per_second": 0.632, |
|
"step": 1266 |
|
}, |
|
{ |
|
"epoch": 2.9961549837326236, |
|
"step": 1266, |
|
"total_flos": 2120387773071360.0, |
|
"train_loss": 0.6192890667425701, |
|
"train_runtime": 41807.2835, |
|
"train_samples_per_second": 15.526, |
|
"train_steps_per_second": 0.03 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1266, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2120387773071360.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|