{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 63.1578947368421,
  "eval_steps": 500,
  "global_step": 2400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 8679.8642578125,
      "learning_rate": 7.312500000000001e-07,
      "loss": 1384.9168,
      "step": 40
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 4251.486328125,
      "learning_rate": 1.48125e-06,
      "loss": 1290.1081,
      "step": 80
    },
    {
      "epoch": 3.1578947368421053,
      "grad_norm": 4015.17138671875,
      "learning_rate": 2.23125e-06,
      "loss": 1097.6352,
      "step": 120
    },
    {
      "epoch": 4.2105263157894735,
      "grad_norm": 2110.156982421875,
      "learning_rate": 2.98125e-06,
      "loss": 846.727,
      "step": 160
    },
    {
      "epoch": 5.2631578947368425,
      "grad_norm": 1568.4326171875,
      "learning_rate": 3.73125e-06,
      "loss": 578.2698,
      "step": 200
    },
    {
      "epoch": 6.315789473684211,
      "grad_norm": 1924.275146484375,
      "learning_rate": 4.4812500000000005e-06,
      "loss": 412.4179,
      "step": 240
    },
    {
      "epoch": 7.368421052631579,
      "grad_norm": 862.5057373046875,
      "learning_rate": 5.23125e-06,
      "loss": 311.6504,
      "step": 280
    },
    {
      "epoch": 8.421052631578947,
      "grad_norm": 1463.6671142578125,
      "learning_rate": 5.98125e-06,
      "loss": 251.8019,
      "step": 320
    },
    {
      "epoch": 9.473684210526315,
      "grad_norm": 1956.26708984375,
      "learning_rate": 6.73125e-06,
      "loss": 197.8886,
      "step": 360
    },
    {
      "epoch": 10.526315789473685,
      "grad_norm": 404.4883728027344,
      "learning_rate": 7.481250000000001e-06,
      "loss": 155.3046,
      "step": 400
    },
    {
      "epoch": 11.578947368421053,
      "grad_norm": 319.11956787109375,
      "learning_rate": 8.23125e-06,
      "loss": 127.7263,
      "step": 440
    },
    {
      "epoch": 12.631578947368421,
      "grad_norm": 289.92144775390625,
      "learning_rate": 8.98125e-06,
      "loss": 101.4958,
      "step": 480
    },
    {
      "epoch": 13.68421052631579,
      "grad_norm": 238.7841033935547,
      "learning_rate": 9.731250000000001e-06,
      "loss": 80.8442,
      "step": 520
    },
    {
      "epoch": 14.736842105263158,
      "grad_norm": 194.82733154296875,
      "learning_rate": 1.048125e-05,
      "loss": 62.7129,
      "step": 560
    },
    {
      "epoch": 15.789473684210526,
      "grad_norm": 154.37205505371094,
      "learning_rate": 1.123125e-05,
      "loss": 50.3538,
      "step": 600
    },
    {
      "epoch": 16.842105263157894,
      "grad_norm": 208.374267578125,
      "learning_rate": 1.198125e-05,
      "loss": 41.4226,
      "step": 640
    },
    {
      "epoch": 17.894736842105264,
      "grad_norm": 101.1446304321289,
      "learning_rate": 1.2731250000000001e-05,
      "loss": 32.4618,
      "step": 680
    },
    {
      "epoch": 18.94736842105263,
      "grad_norm": 147.55865478515625,
      "learning_rate": 1.348125e-05,
      "loss": 27.0828,
      "step": 720
    },
    {
      "epoch": 20.0,
      "grad_norm": 112.81380462646484,
      "learning_rate": 1.423125e-05,
      "loss": 22.4964,
      "step": 760
    },
    {
      "epoch": 21.05263157894737,
      "grad_norm": 73.30062866210938,
      "learning_rate": 1.4981250000000002e-05,
      "loss": 18.7236,
      "step": 800
    },
    {
      "epoch": 22.105263157894736,
      "grad_norm": 64.0571517944336,
      "learning_rate": 1.5731250000000003e-05,
      "loss": 16.3388,
      "step": 840
    },
    {
      "epoch": 23.157894736842106,
      "grad_norm": 76.62981414794922,
      "learning_rate": 1.6481249999999997e-05,
      "loss": 14.0038,
      "step": 880
    },
    {
      "epoch": 24.210526315789473,
      "grad_norm": 83.39350891113281,
      "learning_rate": 1.723125e-05,
      "loss": 12.6697,
      "step": 920
    },
    {
      "epoch": 25.263157894736842,
      "grad_norm": 49.30060577392578,
      "learning_rate": 1.798125e-05,
      "loss": 11.5521,
      "step": 960
    },
    {
      "epoch": 26.31578947368421,
      "grad_norm": 49.78151321411133,
      "learning_rate": 1.873125e-05,
      "loss": 10.8067,
      "step": 1000
    },
    {
      "epoch": 27.36842105263158,
      "grad_norm": 64.97428894042969,
      "learning_rate": 1.9481250000000003e-05,
      "loss": 9.9502,
      "step": 1040
    },
    {
      "epoch": 28.42105263157895,
      "grad_norm": 96.64849853515625,
      "learning_rate": 2.023125e-05,
      "loss": 9.6148,
      "step": 1080
    },
    {
      "epoch": 29.473684210526315,
      "grad_norm": 65.55039978027344,
      "learning_rate": 2.098125e-05,
      "loss": 9.462,
      "step": 1120
    },
    {
      "epoch": 30.526315789473685,
      "grad_norm": 85.08692169189453,
      "learning_rate": 2.173125e-05,
      "loss": 8.8002,
      "step": 1160
    },
    {
      "epoch": 31.57894736842105,
      "grad_norm": 124.7302017211914,
      "learning_rate": 2.248125e-05,
      "loss": 8.7508,
      "step": 1200
    },
    {
      "epoch": 32.63157894736842,
      "grad_norm": 152.3477020263672,
      "learning_rate": 2.3231250000000002e-05,
      "loss": 8.5803,
      "step": 1240
    },
    {
      "epoch": 33.68421052631579,
      "grad_norm": 162.75997924804688,
      "learning_rate": 2.398125e-05,
      "loss": 8.1324,
      "step": 1280
    },
    {
      "epoch": 34.73684210526316,
      "grad_norm": 78.276611328125,
      "learning_rate": 2.4731249999999998e-05,
      "loss": 8.4976,
      "step": 1320
    },
    {
      "epoch": 35.78947368421053,
      "grad_norm": 151.76458740234375,
      "learning_rate": 2.548125e-05,
      "loss": 8.9999,
      "step": 1360
    },
    {
      "epoch": 36.8421052631579,
      "grad_norm": 67.1139144897461,
      "learning_rate": 2.623125e-05,
      "loss": 8.0035,
      "step": 1400
    },
    {
      "epoch": 37.89473684210526,
      "grad_norm": 61.43220901489258,
      "learning_rate": 2.6981250000000002e-05,
      "loss": 7.8756,
      "step": 1440
    },
    {
      "epoch": 38.94736842105263,
      "grad_norm": 68.46588134765625,
      "learning_rate": 2.773125e-05,
      "loss": 7.6236,
      "step": 1480
    },
    {
      "epoch": 40.0,
      "grad_norm": 52.86519241333008,
      "learning_rate": 2.848125e-05,
      "loss": 7.3731,
      "step": 1520
    },
    {
      "epoch": 41.05263157894737,
      "grad_norm": 32.2613525390625,
      "learning_rate": 2.923125e-05,
      "loss": 7.5533,
      "step": 1560
    },
    {
      "epoch": 42.10526315789474,
      "grad_norm": 28.07533836364746,
      "learning_rate": 2.998125e-05,
      "loss": 7.2446,
      "step": 1600
    },
    {
      "epoch": 43.1578947368421,
      "grad_norm": 51.62834167480469,
      "learning_rate": 2.91875e-05,
      "loss": 7.2844,
      "step": 1640
    },
    {
      "epoch": 44.21052631578947,
      "grad_norm": 94.51573944091797,
      "learning_rate": 2.8354166666666667e-05,
      "loss": 7.3347,
      "step": 1680
    },
    {
      "epoch": 45.26315789473684,
      "grad_norm": 33.503990173339844,
      "learning_rate": 2.7520833333333333e-05,
      "loss": 7.1209,
      "step": 1720
    },
    {
      "epoch": 46.31578947368421,
      "grad_norm": 74.56465148925781,
      "learning_rate": 2.6687499999999998e-05,
      "loss": 6.7968,
      "step": 1760
    },
    {
      "epoch": 47.36842105263158,
      "grad_norm": 32.059810638427734,
      "learning_rate": 2.5854166666666667e-05,
      "loss": 6.8781,
      "step": 1800
    },
    {
      "epoch": 48.421052631578945,
      "grad_norm": 97.38982391357422,
      "learning_rate": 2.5020833333333336e-05,
      "loss": 6.7608,
      "step": 1840
    },
    {
      "epoch": 49.473684210526315,
      "grad_norm": 34.4022102355957,
      "learning_rate": 2.4187500000000002e-05,
      "loss": 6.4979,
      "step": 1880
    },
    {
      "epoch": 50.526315789473685,
      "grad_norm": 35.79924011230469,
      "learning_rate": 2.3354166666666667e-05,
      "loss": 6.5747,
      "step": 1920
    },
    {
      "epoch": 51.578947368421055,
      "grad_norm": 100.87942504882812,
      "learning_rate": 2.2520833333333333e-05,
      "loss": 6.1757,
      "step": 1960
    },
    {
      "epoch": 52.63157894736842,
      "grad_norm": 159.83860778808594,
      "learning_rate": 2.16875e-05,
      "loss": 6.4399,
      "step": 2000
    },
    {
      "epoch": 53.68421052631579,
      "grad_norm": 58.15259552001953,
      "learning_rate": 2.0854166666666668e-05,
      "loss": 6.3292,
      "step": 2040
    },
    {
      "epoch": 54.73684210526316,
      "grad_norm": 23.384042739868164,
      "learning_rate": 2.0020833333333333e-05,
      "loss": 6.2464,
      "step": 2080
    },
    {
      "epoch": 55.78947368421053,
      "grad_norm": 22.12693977355957,
      "learning_rate": 1.91875e-05,
      "loss": 6.1714,
      "step": 2120
    },
    {
      "epoch": 56.8421052631579,
      "grad_norm": 17.758420944213867,
      "learning_rate": 1.8354166666666668e-05,
      "loss": 5.9527,
      "step": 2160
    },
    {
      "epoch": 57.89473684210526,
      "grad_norm": 39.71998977661133,
      "learning_rate": 1.7520833333333333e-05,
      "loss": 5.9018,
      "step": 2200
    },
    {
      "epoch": 58.94736842105263,
      "grad_norm": 28.227684020996094,
      "learning_rate": 1.6687500000000002e-05,
      "loss": 5.8687,
      "step": 2240
    },
    {
      "epoch": 60.0,
      "grad_norm": 18.77845573425293,
      "learning_rate": 1.5854166666666668e-05,
      "loss": 5.7978,
      "step": 2280
    },
    {
      "epoch": 61.05263157894737,
      "grad_norm": 25.35810089111328,
      "learning_rate": 1.5020833333333334e-05,
      "loss": 5.8052,
      "step": 2320
    },
    {
      "epoch": 62.10526315789474,
      "grad_norm": 92.96112823486328,
      "learning_rate": 1.41875e-05,
      "loss": 5.7386,
      "step": 2360
    },
    {
      "epoch": 63.1578947368421,
      "grad_norm": 35.39582824707031,
      "learning_rate": 1.3354166666666667e-05,
      "loss": 5.6511,
      "step": 2400
    }
  ],
  "logging_steps": 40,
  "max_steps": 3040,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 80,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.310662478164132e+19,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}