|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.903614457831326,
  "eval_steps": 500,
  "global_step": 822,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12048192771084337,
      "grad_norm": 2.6048758029937744,
      "learning_rate": 4.761904761904762e-05,
      "loss": 1.2322,
      "step": 10
    },
    {
      "epoch": 0.24096385542168675,
      "grad_norm": 2.073004722595215,
      "learning_rate": 9.523809523809524e-05,
      "loss": 0.4195,
      "step": 20
    },
    {
      "epoch": 0.3614457831325301,
      "grad_norm": 1.133252501487732,
      "learning_rate": 0.00014285714285714287,
      "loss": 0.1977,
      "step": 30
    },
    {
      "epoch": 0.4819277108433735,
      "grad_norm": 0.7059364914894104,
      "learning_rate": 0.00019047619047619048,
      "loss": 0.1374,
      "step": 40
    },
    {
      "epoch": 0.6024096385542169,
      "grad_norm": 0.7211808562278748,
      "learning_rate": 0.0001999480933568615,
      "loss": 0.1198,
      "step": 50
    },
    {
      "epoch": 0.7228915662650602,
      "grad_norm": 0.5436549782752991,
      "learning_rate": 0.00019973731496914914,
      "loss": 0.0994,
      "step": 60
    },
    {
      "epoch": 0.8433734939759037,
      "grad_norm": 0.612585186958313,
      "learning_rate": 0.00019936476229183133,
      "loss": 0.096,
      "step": 70
    },
    {
      "epoch": 0.963855421686747,
      "grad_norm": 0.6169544458389282,
      "learning_rate": 0.00019883103960671305,
      "loss": 0.086,
      "step": 80
    },
    {
      "epoch": 1.0843373493975903,
      "grad_norm": 0.5241321921348572,
      "learning_rate": 0.00019813701261394136,
      "loss": 0.0838,
      "step": 90
    },
    {
      "epoch": 1.2048192771084336,
      "grad_norm": 0.3530048429965973,
      "learning_rate": 0.00019728380702783643,
      "loss": 0.0685,
      "step": 100
    },
    {
      "epoch": 1.3253012048192772,
      "grad_norm": 0.4779577851295471,
      "learning_rate": 0.00019627280675097908,
      "loss": 0.0667,
      "step": 110
    },
    {
      "epoch": 1.4457831325301205,
      "grad_norm": 0.7595181465148926,
      "learning_rate": 0.00019510565162951537,
      "loss": 0.0626,
      "step": 120
    },
    {
      "epoch": 1.5662650602409638,
      "grad_norm": 0.4125242531299591,
      "learning_rate": 0.00019378423479332046,
      "loss": 0.0563,
      "step": 130
    },
    {
      "epoch": 1.6867469879518073,
      "grad_norm": 0.3203200697898865,
      "learning_rate": 0.0001923106995853349,
      "loss": 0.0621,
      "step": 140
    },
    {
      "epoch": 1.8072289156626506,
      "grad_norm": 0.3810570240020752,
      "learning_rate": 0.00019068743608505455,
      "loss": 0.0582,
      "step": 150
    },
    {
      "epoch": 1.927710843373494,
      "grad_norm": 0.2916155457496643,
      "learning_rate": 0.00018891707723181294,
      "loss": 0.0598,
      "step": 160
    },
    {
      "epoch": 2.0481927710843375,
      "grad_norm": 0.5267001986503601,
      "learning_rate": 0.00018700249455414394,
      "loss": 0.0541,
      "step": 170
    },
    {
      "epoch": 2.1686746987951806,
      "grad_norm": 0.33983558416366577,
      "learning_rate": 0.0001849467935121521,
      "loss": 0.0524,
      "step": 180
    },
    {
      "epoch": 2.289156626506024,
      "grad_norm": 0.36935296654701233,
      "learning_rate": 0.000182753308460445,
      "loss": 0.0526,
      "step": 190
    },
    {
      "epoch": 2.4096385542168672,
      "grad_norm": 0.4720439016819,
      "learning_rate": 0.0001804255972397977,
      "loss": 0.0478,
      "step": 200
    },
    {
      "epoch": 2.5301204819277108,
      "grad_norm": 0.572353720664978,
      "learning_rate": 0.00017796743540632223,
      "loss": 0.0504,
      "step": 210
    },
    {
      "epoch": 2.6506024096385543,
      "grad_norm": 0.35790249705314636,
      "learning_rate": 0.0001753828101075017,
      "loss": 0.0476,
      "step": 220
    },
    {
      "epoch": 2.7710843373493974,
      "grad_norm": 0.3537259101867676,
      "learning_rate": 0.00017267591361502232,
      "loss": 0.0456,
      "step": 230
    },
    {
      "epoch": 2.891566265060241,
      "grad_norm": 0.34377923607826233,
      "learning_rate": 0.00016985113652489374,
      "loss": 0.0473,
      "step": 240
    },
    {
      "epoch": 3.0120481927710845,
      "grad_norm": 0.32719871401786804,
      "learning_rate": 0.00016691306063588583,
      "loss": 0.046,
      "step": 250
    },
    {
      "epoch": 3.1325301204819276,
      "grad_norm": 0.30426010489463806,
      "learning_rate": 0.0001638664515178348,
      "loss": 0.045,
      "step": 260
    },
    {
      "epoch": 3.253012048192771,
      "grad_norm": 0.4393266439437866,
      "learning_rate": 0.00016071625078187114,
      "loss": 0.0407,
      "step": 270
    },
    {
      "epoch": 3.3734939759036147,
      "grad_norm": 0.48441895842552185,
      "learning_rate": 0.00015746756806510838,
      "loss": 0.0403,
      "step": 280
    },
    {
      "epoch": 3.4939759036144578,
      "grad_norm": 0.2893245220184326,
      "learning_rate": 0.00015412567274279316,
      "loss": 0.039,
      "step": 290
    },
    {
      "epoch": 3.6144578313253013,
      "grad_norm": 0.33335432410240173,
      "learning_rate": 0.00015069598538135906,
      "loss": 0.0374,
      "step": 300
    },
    {
      "epoch": 3.734939759036145,
      "grad_norm": 0.36576047539711,
      "learning_rate": 0.0001471840689462482,
      "loss": 0.0424,
      "step": 310
    },
    {
      "epoch": 3.855421686746988,
      "grad_norm": 0.2801503539085388,
      "learning_rate": 0.00014359561977876102,
      "loss": 0.0332,
      "step": 320
    },
    {
      "epoch": 3.9759036144578315,
      "grad_norm": 0.29735037684440613,
      "learning_rate": 0.00013993645835656953,
      "loss": 0.0355,
      "step": 330
    },
    {
      "epoch": 4.096385542168675,
      "grad_norm": 0.43135517835617065,
      "learning_rate": 0.0001362125198528817,
      "loss": 0.0442,
      "step": 340
    },
    {
      "epoch": 4.216867469879518,
      "grad_norm": 0.38527563214302063,
      "learning_rate": 0.00013242984450956828,
      "loss": 0.0356,
      "step": 350
    },
    {
      "epoch": 4.337349397590361,
      "grad_norm": 0.3635711073875427,
      "learning_rate": 0.00012859456783986893,
      "loss": 0.041,
      "step": 360
    },
    {
      "epoch": 4.457831325301205,
      "grad_norm": 0.2927173972129822,
      "learning_rate": 0.00012471291067656697,
      "loss": 0.034,
      "step": 370
    },
    {
      "epoch": 4.578313253012048,
      "grad_norm": 0.407322496175766,
      "learning_rate": 0.00012079116908177593,
      "loss": 0.0327,
      "step": 380
    },
    {
      "epoch": 4.698795180722891,
      "grad_norm": 0.33329063653945923,
      "learning_rate": 0.00011683570413470383,
      "loss": 0.0339,
      "step": 390
    },
    {
      "epoch": 4.8192771084337345,
      "grad_norm": 0.2710469663143158,
      "learning_rate": 0.00011285293161395946,
      "loss": 0.0299,
      "step": 400
    },
    {
      "epoch": 4.9397590361445785,
      "grad_norm": 0.308614045381546,
      "learning_rate": 0.00010884931159113586,
      "loss": 0.0386,
      "step": 410
    },
    {
      "epoch": 5.0602409638554215,
      "grad_norm": 0.3434526324272156,
      "learning_rate": 0.00010483133795255071,
      "loss": 0.0367,
      "step": 420
    },
    {
      "epoch": 5.180722891566265,
      "grad_norm": 0.3069072961807251,
      "learning_rate": 0.00010080552786613899,
      "loss": 0.0356,
      "step": 430
    },
    {
      "epoch": 5.301204819277109,
      "grad_norm": 0.2935517430305481,
      "learning_rate": 9.677841121058273e-05,
      "loss": 0.0304,
      "step": 440
    },
    {
      "epoch": 5.421686746987952,
      "grad_norm": 0.3041512072086334,
      "learning_rate": 9.275651998382377e-05,
      "loss": 0.0289,
      "step": 450
    },
    {
      "epoch": 5.542168674698795,
      "grad_norm": 0.30504339933395386,
      "learning_rate": 8.874637770813946e-05,
      "loss": 0.0302,
      "step": 460
    },
    {
      "epoch": 5.662650602409639,
      "grad_norm": 0.20522096753120422,
      "learning_rate": 8.475448884896547e-05,
      "loss": 0.029,
      "step": 470
    },
    {
      "epoch": 5.783132530120482,
      "grad_norm": 0.2943919897079468,
      "learning_rate": 8.078732826462915e-05,
      "loss": 0.0288,
      "step": 480
    },
    {
      "epoch": 5.903614457831325,
      "grad_norm": 0.328308641910553,
      "learning_rate": 7.685133070410571e-05,
      "loss": 0.0237,
      "step": 490
    },
    {
      "epoch": 6.024096385542169,
      "grad_norm": 0.425624281167984,
      "learning_rate": 7.295288036983163e-05,
      "loss": 0.0303,
      "step": 500
    },
    {
      "epoch": 6.144578313253012,
      "grad_norm": 0.2587184011936188,
      "learning_rate": 6.909830056250527e-05,
      "loss": 0.0246,
      "step": 510
    },
    {
      "epoch": 6.265060240963855,
      "grad_norm": 0.35053542256355286,
      "learning_rate": 6.52938434246697e-05,
      "loss": 0.0273,
      "step": 520
    },
    {
      "epoch": 6.385542168674699,
      "grad_norm": 0.2893577516078949,
      "learning_rate": 6.154567979971493e-05,
      "loss": 0.026,
      "step": 530
    },
    {
      "epoch": 6.506024096385542,
      "grad_norm": 0.22769379615783691,
      "learning_rate": 5.785988922274711e-05,
      "loss": 0.029,
      "step": 540
    },
    {
      "epoch": 6.626506024096385,
      "grad_norm": 0.3005225956439972,
      "learning_rate": 5.424245005956048e-05,
      "loss": 0.0246,
      "step": 550
    },
    {
      "epoch": 6.746987951807229,
      "grad_norm": 0.2721225321292877,
      "learning_rate": 5.069922980970626e-05,
      "loss": 0.0256,
      "step": 560
    },
    {
      "epoch": 6.867469879518072,
      "grad_norm": 0.23859107494354248,
      "learning_rate": 4.723597558938672e-05,
      "loss": 0.0295,
      "step": 570
    },
    {
      "epoch": 6.9879518072289155,
      "grad_norm": 0.2128613144159317,
      "learning_rate": 4.385830480961192e-05,
      "loss": 0.0271,
      "step": 580
    },
    {
      "epoch": 7.108433734939759,
      "grad_norm": 0.15234726667404175,
      "learning_rate": 4.057169606473827e-05,
      "loss": 0.0209,
      "step": 590
    },
    {
      "epoch": 7.228915662650603,
      "grad_norm": 0.1998305469751358,
      "learning_rate": 3.738148024616863e-05,
      "loss": 0.0236,
      "step": 600
    },
    {
      "epoch": 7.349397590361446,
      "grad_norm": 0.16628368198871613,
      "learning_rate": 3.429283189562694e-05,
      "loss": 0.022,
      "step": 610
    },
    {
      "epoch": 7.469879518072289,
      "grad_norm": 0.22349688410758972,
      "learning_rate": 3.131076081203247e-05,
      "loss": 0.0218,
      "step": 620
    },
    {
      "epoch": 7.590361445783133,
      "grad_norm": 0.1814289093017578,
      "learning_rate": 2.84401039255879e-05,
      "loss": 0.0201,
      "step": 630
    },
    {
      "epoch": 7.710843373493976,
      "grad_norm": 0.228335902094841,
      "learning_rate": 2.5685517452260567e-05,
      "loss": 0.0224,
      "step": 640
    },
    {
      "epoch": 7.831325301204819,
      "grad_norm": 0.1545294225215912,
      "learning_rate": 2.3051469341383402e-05,
      "loss": 0.0238,
      "step": 650
    },
    {
      "epoch": 7.951807228915663,
      "grad_norm": 0.1964937150478363,
      "learning_rate": 2.0542232028624586e-05,
      "loss": 0.0232,
      "step": 660
    },
    {
      "epoch": 8.072289156626505,
      "grad_norm": 0.15069320797920227,
      "learning_rate": 1.8161875506081293e-05,
      "loss": 0.0206,
      "step": 670
    },
    {
      "epoch": 8.19277108433735,
      "grad_norm": 0.13602782785892487,
      "learning_rate": 1.5914260720737795e-05,
      "loss": 0.0187,
      "step": 680
    },
    {
      "epoch": 8.313253012048193,
      "grad_norm": 0.19349129498004913,
      "learning_rate": 1.3803033311995072e-05,
      "loss": 0.0192,
      "step": 690
    },
    {
      "epoch": 8.433734939759036,
      "grad_norm": 0.1513078659772873,
      "learning_rate": 1.1831617698430609e-05,
      "loss": 0.0171,
      "step": 700
    },
    {
      "epoch": 8.55421686746988,
      "grad_norm": 0.15297076106071472,
      "learning_rate": 1.0003211523378796e-05,
      "loss": 0.0187,
      "step": 710
    },
    {
      "epoch": 8.674698795180722,
      "grad_norm": 0.1606190949678421,
      "learning_rate": 8.32078046834176e-06,
      "loss": 0.0198,
      "step": 720
    },
    {
      "epoch": 8.795180722891565,
      "grad_norm": 0.11450409889221191,
      "learning_rate": 6.787053442643232e-06,
      "loss": 0.0196,
      "step": 730
    },
    {
      "epoch": 8.91566265060241,
      "grad_norm": 0.23408889770507812,
      "learning_rate": 5.40451815712748e-06,
      "loss": 0.0197,
      "step": 740
    },
    {
      "epoch": 9.036144578313253,
      "grad_norm": 0.19213688373565674,
      "learning_rate": 4.175417089083378e-06,
      "loss": 0.0193,
      "step": 750
    },
    {
      "epoch": 9.156626506024097,
      "grad_norm": 0.13061603903770447,
      "learning_rate": 3.1017438449379434e-06,
      "loss": 0.0217,
      "step": 760
    },
    {
      "epoch": 9.27710843373494,
      "grad_norm": 0.20497524738311768,
      "learning_rate": 2.1852399266194314e-06,
      "loss": 0.0206,
      "step": 770
    },
    {
      "epoch": 9.397590361445783,
      "grad_norm": 0.16401517391204834,
      "learning_rate": 1.4273919068349184e-06,
      "loss": 0.0218,
      "step": 780
    },
    {
      "epoch": 9.518072289156626,
      "grad_norm": 0.08886178582906723,
      "learning_rate": 8.294290178437969e-07,
      "loss": 0.0181,
      "step": 790
    },
    {
      "epoch": 9.638554216867469,
      "grad_norm": 0.1821727752685547,
      "learning_rate": 3.923211576387087e-07,
      "loss": 0.0182,
      "step": 800
    },
    {
      "epoch": 9.759036144578314,
      "grad_norm": 0.17254865169525146,
      "learning_rate": 1.1677731676733584e-07,
      "loss": 0.0193,
      "step": 810
    },
    {
      "epoch": 9.879518072289157,
      "grad_norm": 0.18676318228244781,
      "learning_rate": 3.244428347204398e-09,
      "loss": 0.0161,
      "step": 820
    },
    {
      "epoch": 9.903614457831326,
      "step": 822,
      "total_flos": 1.0767742262918784e+17,
      "train_loss": 0.05996827910361934,
      "train_runtime": 931.6223,
      "train_samples_per_second": 56.469,
      "train_steps_per_second": 0.882
    }
  ],
  "logging_steps": 10,
  "max_steps": 822,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0767742262918784e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}
|
|