{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 48.57142857142857,
  "eval_steps": 500,
  "global_step": 3740,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 4405.81884765625,
      "learning_rate": 3.436363636363636e-07,
      "loss": 1357.5645,
      "step": 55
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 5894.37841796875,
      "learning_rate": 6.936363636363637e-07,
      "loss": 1290.9688,
      "step": 110
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 3667.852294921875,
      "learning_rate": 1.0436363636363635e-06,
      "loss": 1140.9515,
      "step": 165
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 3578.294677734375,
      "learning_rate": 1.3936363636363637e-06,
      "loss": 942.618,
      "step": 220
    },
    {
      "epoch": 3.571428571428571,
      "grad_norm": 2097.637939453125,
      "learning_rate": 1.7436363636363636e-06,
      "loss": 721.2856,
      "step": 275
    },
    {
      "epoch": 4.285714285714286,
      "grad_norm": 1840.612548828125,
      "learning_rate": 2.0936363636363636e-06,
      "loss": 525.8542,
      "step": 330
    },
    {
      "epoch": 5.0,
      "grad_norm": 1230.875,
      "learning_rate": 2.4436363636363636e-06,
      "loss": 397.7722,
      "step": 385
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 1280.699951171875,
      "learning_rate": 2.7936363636363637e-06,
      "loss": 313.8322,
      "step": 440
    },
    {
      "epoch": 6.428571428571429,
      "grad_norm": 4265.28759765625,
      "learning_rate": 3.1436363636363637e-06,
      "loss": 254.1449,
      "step": 495
    },
    {
      "epoch": 7.142857142857143,
      "grad_norm": 967.1757202148438,
      "learning_rate": 3.4936363636363633e-06,
      "loss": 218.4529,
      "step": 550
    },
    {
      "epoch": 7.857142857142857,
      "grad_norm": 765.2302856445312,
      "learning_rate": 3.843636363636364e-06,
      "loss": 197.915,
      "step": 605
    },
    {
      "epoch": 8.571428571428571,
      "grad_norm": 700.331787109375,
      "learning_rate": 4.193636363636364e-06,
      "loss": 170.9019,
      "step": 660
    },
    {
      "epoch": 9.285714285714286,
      "grad_norm": 613.8758544921875,
      "learning_rate": 4.543636363636363e-06,
      "loss": 154.6195,
      "step": 715
    },
    {
      "epoch": 10.0,
      "grad_norm": 3405.25732421875,
      "learning_rate": 4.893636363636364e-06,
      "loss": 136.8394,
      "step": 770
    },
    {
      "epoch": 10.714285714285714,
      "grad_norm": 1219.8994140625,
      "learning_rate": 5.243636363636364e-06,
      "loss": 116.0246,
      "step": 825
    },
    {
      "epoch": 11.428571428571429,
      "grad_norm": 306.2723083496094,
      "learning_rate": 5.593636363636363e-06,
      "loss": 104.2055,
      "step": 880
    },
    {
      "epoch": 12.142857142857142,
      "grad_norm": 288.2547302246094,
      "learning_rate": 5.943636363636364e-06,
      "loss": 88.9812,
      "step": 935
    },
    {
      "epoch": 12.857142857142858,
      "grad_norm": 1016.5924682617188,
      "learning_rate": 6.293636363636363e-06,
      "loss": 79.9017,
      "step": 990
    },
    {
      "epoch": 13.571428571428571,
      "grad_norm": 239.49191284179688,
      "learning_rate": 6.643636363636363e-06,
      "loss": 73.6932,
      "step": 1045
    },
    {
      "epoch": 14.285714285714286,
      "grad_norm": 193.92002868652344,
      "learning_rate": 6.993636363636364e-06,
      "loss": 62.2018,
      "step": 1100
    },
    {
      "epoch": 15.0,
      "grad_norm": 179.0568084716797,
      "learning_rate": 7.343636363636363e-06,
      "loss": 54.0213,
      "step": 1155
    },
    {
      "epoch": 15.714285714285714,
      "grad_norm": 166.02944946289062,
      "learning_rate": 7.693636363636364e-06,
      "loss": 48.1955,
      "step": 1210
    },
    {
      "epoch": 16.428571428571427,
      "grad_norm": 126.53916931152344,
      "learning_rate": 8.043636363636364e-06,
      "loss": 43.5431,
      "step": 1265
    },
    {
      "epoch": 17.142857142857142,
      "grad_norm": 145.61166381835938,
      "learning_rate": 8.393636363636363e-06,
      "loss": 41.4399,
      "step": 1320
    },
    {
      "epoch": 17.857142857142858,
      "grad_norm": 122.2297134399414,
      "learning_rate": 8.743636363636363e-06,
      "loss": 35.2278,
      "step": 1375
    },
    {
      "epoch": 18.571428571428573,
      "grad_norm": 117.88919067382812,
      "learning_rate": 9.093636363636363e-06,
      "loss": 31.2827,
      "step": 1430
    },
    {
      "epoch": 19.285714285714285,
      "grad_norm": 88.52986907958984,
      "learning_rate": 9.443636363636364e-06,
      "loss": 28.8076,
      "step": 1485
    },
    {
      "epoch": 20.0,
      "grad_norm": 88.49090576171875,
      "learning_rate": 9.793636363636364e-06,
      "loss": 28.232,
      "step": 1540
    },
    {
      "epoch": 20.714285714285715,
      "grad_norm": 114.45001983642578,
      "learning_rate": 1.0143636363636363e-05,
      "loss": 26.0885,
      "step": 1595
    },
    {
      "epoch": 21.428571428571427,
      "grad_norm": 74.16987609863281,
      "learning_rate": 1.0493636363636363e-05,
      "loss": 24.2689,
      "step": 1650
    },
    {
      "epoch": 22.142857142857142,
      "grad_norm": 82.25133514404297,
      "learning_rate": 1.0843636363636363e-05,
      "loss": 22.6963,
      "step": 1705
    },
    {
      "epoch": 22.857142857142858,
      "grad_norm": 74.76679229736328,
      "learning_rate": 1.1193636363636363e-05,
      "loss": 21.3257,
      "step": 1760
    },
    {
      "epoch": 23.571428571428573,
      "grad_norm": 68.91163635253906,
      "learning_rate": 1.1543636363636365e-05,
      "loss": 20.2199,
      "step": 1815
    },
    {
      "epoch": 24.285714285714285,
      "grad_norm": 59.86214828491211,
      "learning_rate": 1.1893636363636363e-05,
      "loss": 19.3769,
      "step": 1870
    },
    {
      "epoch": 25.0,
      "grad_norm": 76.9339370727539,
      "learning_rate": 1.2243636363636363e-05,
      "loss": 18.0471,
      "step": 1925
    },
    {
      "epoch": 25.714285714285715,
      "grad_norm": 66.16152954101562,
      "learning_rate": 1.2593636363636363e-05,
      "loss": 17.496,
      "step": 1980
    },
    {
      "epoch": 26.428571428571427,
      "grad_norm": 58.77778244018555,
      "learning_rate": 1.2943636363636363e-05,
      "loss": 16.9468,
      "step": 2035
    },
    {
      "epoch": 27.142857142857142,
      "grad_norm": 71.95783233642578,
      "learning_rate": 1.3293636363636363e-05,
      "loss": 16.3026,
      "step": 2090
    },
    {
      "epoch": 27.857142857142858,
      "grad_norm": 131.7191619873047,
      "learning_rate": 1.3643636363636363e-05,
      "loss": 16.008,
      "step": 2145
    },
    {
      "epoch": 28.571428571428573,
      "grad_norm": 81.78388977050781,
      "learning_rate": 1.3993636363636363e-05,
      "loss": 15.4616,
      "step": 2200
    },
    {
      "epoch": 29.285714285714285,
      "grad_norm": 60.10112380981445,
      "learning_rate": 1.3628501228501228e-05,
      "loss": 15.195,
      "step": 2255
    },
    {
      "epoch": 30.0,
      "grad_norm": 77.59001922607422,
      "learning_rate": 1.325012285012285e-05,
      "loss": 14.7557,
      "step": 2310
    },
    {
      "epoch": 30.714285714285715,
      "grad_norm": 77.3856201171875,
      "learning_rate": 1.287174447174447e-05,
      "loss": 14.2851,
      "step": 2365
    },
    {
      "epoch": 31.428571428571427,
      "grad_norm": 73.75492095947266,
      "learning_rate": 1.2493366093366094e-05,
      "loss": 13.867,
      "step": 2420
    },
    {
      "epoch": 32.142857142857146,
      "grad_norm": 78.76831817626953,
      "learning_rate": 1.2114987714987713e-05,
      "loss": 13.6677,
      "step": 2475
    },
    {
      "epoch": 32.857142857142854,
      "grad_norm": 92.99346160888672,
      "learning_rate": 1.1736609336609336e-05,
      "loss": 13.1851,
      "step": 2530
    },
    {
      "epoch": 33.57142857142857,
      "grad_norm": 97.57978820800781,
      "learning_rate": 1.1358230958230958e-05,
      "loss": 13.5215,
      "step": 2585
    },
    {
      "epoch": 34.285714285714285,
      "grad_norm": 63.118141174316406,
      "learning_rate": 1.0979852579852579e-05,
      "loss": 12.8525,
      "step": 2640
    },
    {
      "epoch": 35.0,
      "grad_norm": 63.68766403198242,
      "learning_rate": 1.06014742014742e-05,
      "loss": 12.441,
      "step": 2695
    },
    {
      "epoch": 35.714285714285715,
      "grad_norm": 70.33533477783203,
      "learning_rate": 1.0223095823095823e-05,
      "loss": 12.2331,
      "step": 2750
    },
    {
      "epoch": 36.42857142857143,
      "grad_norm": 64.16197967529297,
      "learning_rate": 9.844717444717445e-06,
      "loss": 11.7547,
      "step": 2805
    },
    {
      "epoch": 37.142857142857146,
      "grad_norm": 85.83612823486328,
      "learning_rate": 9.466339066339066e-06,
      "loss": 11.8633,
      "step": 2860
    },
    {
      "epoch": 37.857142857142854,
      "grad_norm": 89.40387725830078,
      "learning_rate": 9.087960687960689e-06,
      "loss": 11.5795,
      "step": 2915
    },
    {
      "epoch": 38.57142857142857,
      "grad_norm": 76.38651275634766,
      "learning_rate": 8.709582309582309e-06,
      "loss": 11.334,
      "step": 2970
    },
    {
      "epoch": 39.285714285714285,
      "grad_norm": 110.11483764648438,
      "learning_rate": 8.331203931203932e-06,
      "loss": 10.805,
      "step": 3025
    },
    {
      "epoch": 40.0,
      "grad_norm": 60.31315612792969,
      "learning_rate": 7.952825552825553e-06,
      "loss": 10.9056,
      "step": 3080
    },
    {
      "epoch": 40.714285714285715,
      "grad_norm": 107.96589660644531,
      "learning_rate": 7.574447174447175e-06,
      "loss": 10.5828,
      "step": 3135
    },
    {
      "epoch": 41.42857142857143,
      "grad_norm": 74.8004379272461,
      "learning_rate": 7.1960687960687955e-06,
      "loss": 10.2818,
      "step": 3190
    },
    {
      "epoch": 42.142857142857146,
      "grad_norm": 54.093475341796875,
      "learning_rate": 6.817690417690418e-06,
      "loss": 10.0785,
      "step": 3245
    },
    {
      "epoch": 42.857142857142854,
      "grad_norm": 115.28564453125,
      "learning_rate": 6.439312039312039e-06,
      "loss": 9.8794,
      "step": 3300
    },
    {
      "epoch": 43.57142857142857,
      "grad_norm": 52.57551956176758,
      "learning_rate": 6.06093366093366e-06,
      "loss": 9.7072,
      "step": 3355
    },
    {
      "epoch": 44.285714285714285,
      "grad_norm": 51.994075775146484,
      "learning_rate": 5.682555282555282e-06,
      "loss": 9.6015,
      "step": 3410
    },
    {
      "epoch": 45.0,
      "grad_norm": 73.08104705810547,
      "learning_rate": 5.304176904176904e-06,
      "loss": 9.305,
      "step": 3465
    },
    {
      "epoch": 45.714285714285715,
      "grad_norm": 75.5807876586914,
      "learning_rate": 4.925798525798525e-06,
      "loss": 9.2281,
      "step": 3520
    },
    {
      "epoch": 46.42857142857143,
      "grad_norm": 53.319637298583984,
      "learning_rate": 4.547420147420147e-06,
      "loss": 8.996,
      "step": 3575
    },
    {
      "epoch": 47.142857142857146,
      "grad_norm": 72.28289794921875,
      "learning_rate": 4.1690417690417685e-06,
      "loss": 9.0072,
      "step": 3630
    },
    {
      "epoch": 47.857142857142854,
      "grad_norm": 66.79710388183594,
      "learning_rate": 3.7906633906633902e-06,
      "loss": 8.7311,
      "step": 3685
    },
    {
      "epoch": 48.57142857142857,
      "grad_norm": 83.55674743652344,
      "learning_rate": 3.4122850122850124e-06,
      "loss": 8.5959,
      "step": 3740
    }
  ],
  "logging_steps": 55,
  "max_steps": 4235,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 55,
  "save_steps": 220,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.8876427477621146e+19,
  "train_batch_size": 28,
  "trial_name": null,
  "trial_params": null
}