diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0348ea97130c017c407fcfb6fd4003859f17b84c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +checkpoint-*/ \ No newline at end of file diff --git a/checkpoint-106496/config.json b/checkpoint-106496/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8dd006e2c45b4178c8c43247c12d14ba89fc4b44 --- /dev/null +++ b/checkpoint-106496/config.json @@ -0,0 +1,32 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 512, + "n_head": 8, + "n_inner": null, + "n_layer": 8, + "n_positions": 2048, + "pad_token_id": 1, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.26.0.dev0", + "use_cache": true, + "vocab_size": 299 +} diff --git a/checkpoint-106496/optimizer.pt b/checkpoint-106496/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..abfe06285cfa794aa6a95854fef530243663e007 --- /dev/null +++ b/checkpoint-106496/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8953e89f02666525bd9a498c7c03aaeec5b1e62af8b1eb3461cb6932dff6e5b0 +size 211432837 diff --git a/checkpoint-106496/pytorch_model.bin b/checkpoint-106496/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..50eec3428b49fa6f0437260c9dfa70f71c4a37cd --- /dev/null +++ b/checkpoint-106496/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5562c193530a7b41f4b1ea4f3c8f66a7c3ca0885b673e4653afbb4e4e42a466a +size 139279005 diff --git a/checkpoint-106496/rng_state.pth b/checkpoint-106496/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..83604cf0e773574a50b626ce0aba09a578ad966f --- /dev/null +++ b/checkpoint-106496/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04127a62b9953a815a2b8b0e80ea0aed36762c9a4402334a3728a971ad89d02a +size 15597 diff --git a/checkpoint-106496/scaler.pt b/checkpoint-106496/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d0bd6016a027a484b31b47e9d9cbddeba621e23 --- /dev/null +++ b/checkpoint-106496/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6757561c1e760e6da06aa4557681c0a6a9b0e2ee848768a273634675848aa501 +size 557 diff --git a/checkpoint-106496/scheduler.pt b/checkpoint-106496/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e17af72a2bcd899f423c04694115a7583b20b499 --- /dev/null +++ b/checkpoint-106496/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe670698963718269271b891ad22eb717988aca0bb6ff0290b216e356014cd1 +size 627 diff --git a/checkpoint-106496/trainer_state.json b/checkpoint-106496/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6756b0a4c29e3bc9a38f4cf35f4bdf03a9a62293 --- /dev/null +++ b/checkpoint-106496/trainer_state.json @@ -0,0 +1,744 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.183056792121948, + "global_step": 106496, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.14, + "learning_rate": 0.00020480000000000002, + "loss": 2.1719, + "step": 2048 + }, + { + "epoch": 0.14, + "eval_loss": 1.0833024978637695, + "eval_runtime": 15.7887, + "eval_samples_per_second": 132.5, + "eval_steps_per_second": 8.297, + "step": 2048 + }, + { + "epoch": 0.28, + "learning_rate": 0.00040960000000000004, + "loss": 0.9468, + "step": 4096 + }, + { + "epoch": 0.28, + "eval_loss": 0.8000818490982056, + "eval_runtime": 17.2632, + "eval_samples_per_second": 121.183, + "eval_steps_per_second": 7.588, + "step": 4096 + }, + { + "epoch": 0.41, + "learning_rate": 0.0004998749142723946, + "loss": 0.7542, + "step": 6144 + }, + { + "epoch": 0.41, + "eval_loss": 0.7439925074577332, + "eval_runtime": 17.4134, + "eval_samples_per_second": 120.138, + "eval_steps_per_second": 7.523, + "step": 6144 + }, + { + "epoch": 0.55, + "learning_rate": 0.0004990273340312486, + "loss": 0.6756, + "step": 8192 + }, + { + "epoch": 0.55, + "eval_loss": 0.7018134593963623, + "eval_runtime": 15.8601, + "eval_samples_per_second": 131.904, + "eval_steps_per_second": 8.26, + "step": 8192 + }, + { + "epoch": 0.69, + "learning_rate": 0.0004973820371438889, + "loss": 0.631, + "step": 10240 + }, + { + "epoch": 0.69, + "eval_loss": 0.684688925743103, + "eval_runtime": 18.0314, + "eval_samples_per_second": 116.02, + "eval_steps_per_second": 7.265, + "step": 10240 + }, + { + "epoch": 0.83, + "learning_rate": 0.0004949442940386407, + "loss": 0.5989, + "step": 12288 + }, + { + "epoch": 0.83, + "eval_loss": 0.6822534203529358, + "eval_runtime": 17.4182, + "eval_samples_per_second": 120.104, + "eval_steps_per_second": 7.521, + "step": 12288 + }, + { + "epoch": 0.97, + "learning_rate": 0.0004917201492451735, + "loss": 0.5776, + "step": 14336 + }, + { + "epoch": 0.97, + "eval_loss": 0.6674544215202332, + "eval_runtime": 17.7487, + "eval_samples_per_second": 117.868, + "eval_steps_per_second": 7.381, + "step": 14336 + }, + { + "epoch": 1.11, + "learning_rate": 0.0004877230785006913, + "loss": 0.5525, + "step": 16384 + }, + { + "epoch": 1.11, + "eval_loss": 0.6640163660049438, + "eval_runtime": 15.8951, + "eval_samples_per_second": 131.613, + "eval_steps_per_second": 8.242, + "step": 16384 + }, + { + "epoch": 1.24, + "learning_rate": 0.0004829670105577666, + "loss": 0.5373, + "step": 18432 + }, + { + "epoch": 1.24, + "eval_loss": 0.6663170456886292, + "eval_runtime": 17.5, + "eval_samples_per_second": 119.543, + "eval_steps_per_second": 7.486, + "step": 18432 + }, + { + "epoch": 1.38, + "learning_rate": 0.0004774596641323791, + "loss": 0.5261, + "step": 20480 + }, + { + "epoch": 1.38, + "eval_loss": 0.6518951654434204, + "eval_runtime": 17.1326, + "eval_samples_per_second": 122.107, + "eval_steps_per_second": 7.646, + "step": 20480 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004712261976082475, + "loss": 0.5115, + "step": 22528 + }, + { + "epoch": 1.52, + "eval_loss": 0.6658991575241089, + "eval_runtime": 17.6874, + "eval_samples_per_second": 118.277, + "eval_steps_per_second": 7.406, + "step": 22528 + }, + { + "epoch": 1.66, + "learning_rate": 0.00046428407064289515, + "loss": 0.5033, + "step": 24576 + }, + { + "epoch": 1.66, + "eval_loss": 0.6602604389190674, + "eval_runtime": 15.8845, + "eval_samples_per_second": 131.701, + "eval_steps_per_second": 8.247, + "step": 24576 + }, + { + "epoch": 1.8, + "learning_rate": 0.00045665163060732317, + "loss": 0.4925, + "step": 26624 + }, + { + "epoch": 1.8, + "eval_loss": 0.6551440954208374, + "eval_runtime": 17.7902, + "eval_samples_per_second": 117.593, + "eval_steps_per_second": 7.364, + "step": 26624 + }, + { + "epoch": 1.93, + "learning_rate": 0.000448360778288479, + "loss": 0.4868, + "step": 28672 + }, + { + "epoch": 1.93, + "eval_loss": 0.6580803394317627, + "eval_runtime": 15.7841, + "eval_samples_per_second": 132.538, + "eval_steps_per_second": 8.299, + "step": 28672 + }, + { + "epoch": 2.07, + "learning_rate": 0.00043942999964293453, + "loss": 0.4741, + "step": 30720 + }, + { + "epoch": 2.07, + "eval_loss": 0.661230742931366, + "eval_runtime": 18.015, + "eval_samples_per_second": 116.126, + "eval_steps_per_second": 7.272, + "step": 30720 + }, + { + "epoch": 2.21, + "learning_rate": 0.0004298966220346151, + "loss": 0.4632, + "step": 32768 + }, + { + "epoch": 2.21, + "eval_loss": 0.6618030071258545, + "eval_runtime": 17.9073, + "eval_samples_per_second": 116.824, + "eval_steps_per_second": 7.315, + "step": 32768 + }, + { + "epoch": 2.35, + "learning_rate": 0.00041978697624050446, + "loss": 0.4584, + "step": 34816 + }, + { + "epoch": 2.35, + "eval_loss": 0.6579039096832275, + "eval_runtime": 17.8241, + "eval_samples_per_second": 117.369, + "eval_steps_per_second": 7.35, + "step": 34816 + }, + { + "epoch": 2.49, + "learning_rate": 0.0004091334467888659, + "loss": 0.451, + "step": 36864 + }, + { + "epoch": 2.49, + "eval_loss": 0.6627541780471802, + "eval_runtime": 17.6822, + "eval_samples_per_second": 118.311, + "eval_steps_per_second": 7.409, + "step": 36864 + }, + { + "epoch": 2.62, + "learning_rate": 0.00039796458815002033, + "loss": 0.445, + "step": 38912 + }, + { + "epoch": 2.62, + "eval_loss": 0.656867265701294, + "eval_runtime": 15.8659, + "eval_samples_per_second": 131.855, + "eval_steps_per_second": 8.257, + "step": 38912 + }, + { + "epoch": 2.76, + "learning_rate": 0.0003863212870708643, + "loss": 0.4404, + "step": 40960 + }, + { + "epoch": 2.76, + "eval_loss": 0.6653844118118286, + "eval_runtime": 17.9192, + "eval_samples_per_second": 116.747, + "eval_steps_per_second": 7.311, + "step": 40960 + }, + { + "epoch": 2.9, + "learning_rate": 0.00037424687637876156, + "loss": 0.4334, + "step": 43008 + }, + { + "epoch": 2.9, + "eval_loss": 0.6553301811218262, + "eval_runtime": 15.9304, + "eval_samples_per_second": 131.321, + "eval_steps_per_second": 8.223, + "step": 43008 + }, + { + "epoch": 3.04, + "learning_rate": 0.00036178064571954134, + "loss": 0.4278, + "step": 45056 + }, + { + "epoch": 3.04, + "eval_loss": 0.6766741871833801, + "eval_runtime": 16.9834, + "eval_samples_per_second": 123.179, + "eval_steps_per_second": 7.713, + "step": 45056 + }, + { + "epoch": 3.18, + "learning_rate": 0.0003489439971353363, + "loss": 0.4152, + "step": 47104 + }, + { + "epoch": 3.18, + "eval_loss": 0.6645193696022034, + "eval_runtime": 15.816, + "eval_samples_per_second": 132.271, + "eval_steps_per_second": 8.283, + "step": 47104 + }, + { + "epoch": 3.32, + "learning_rate": 0.0003357965820476752, + "loss": 0.411, + "step": 49152 + }, + { + "epoch": 3.32, + "eval_loss": 0.6748972535133362, + "eval_runtime": 17.7507, + "eval_samples_per_second": 117.855, + "eval_steps_per_second": 7.38, + "step": 49152 + }, + { + "epoch": 3.45, + "learning_rate": 0.0003223743322236833, + "loss": 0.4085, + "step": 51200 + }, + { + "epoch": 3.45, + "eval_loss": 0.6703893542289734, + "eval_runtime": 17.341, + "eval_samples_per_second": 120.639, + "eval_steps_per_second": 7.554, + "step": 51200 + }, + { + "epoch": 3.59, + "learning_rate": 0.0003087269633577651, + "loss": 0.4037, + "step": 53248 + }, + { + "epoch": 3.59, + "eval_loss": 0.6561577916145325, + "eval_runtime": 15.9295, + "eval_samples_per_second": 131.329, + "eval_steps_per_second": 8.224, + "step": 53248 + }, + { + "epoch": 3.73, + "learning_rate": 0.0002948780545870247, + "loss": 0.3975, + "step": 55296 + }, + { + "epoch": 3.73, + "eval_loss": 0.6698916554450989, + "eval_runtime": 17.9008, + "eval_samples_per_second": 116.866, + "eval_steps_per_second": 7.318, + "step": 55296 + }, + { + "epoch": 3.87, + "learning_rate": 0.0002808921064161573, + "loss": 0.3917, + "step": 57344 + }, + { + "epoch": 3.87, + "eval_loss": 0.7029281854629517, + "eval_runtime": 17.8146, + "eval_samples_per_second": 117.432, + "eval_steps_per_second": 7.354, + "step": 57344 + }, + { + "epoch": 4.01, + "learning_rate": 0.00026680720064442787, + "loss": 0.3876, + "step": 59392 + }, + { + "epoch": 4.01, + "eval_loss": 0.6837398409843445, + "eval_runtime": 17.4354, + "eval_samples_per_second": 119.986, + "eval_steps_per_second": 7.513, + "step": 59392 + }, + { + "epoch": 4.14, + "learning_rate": 0.00025266845586830784, + "loss": 0.3751, + "step": 61440 + }, + { + "epoch": 4.14, + "eval_loss": 0.690500795841217, + "eval_runtime": 17.8964, + "eval_samples_per_second": 116.895, + "eval_steps_per_second": 7.32, + "step": 61440 + }, + { + "epoch": 4.28, + "learning_rate": 0.00023851425721450398, + "loss": 0.3719, + "step": 63488 + }, + { + "epoch": 4.28, + "eval_loss": 0.6938452124595642, + "eval_runtime": 15.7962, + "eval_samples_per_second": 132.437, + "eval_steps_per_second": 8.293, + "step": 63488 + }, + { + "epoch": 4.42, + "learning_rate": 0.0002244106409269819, + "loss": 0.3644, + "step": 65536 + }, + { + "epoch": 4.42, + "eval_loss": 0.6987010836601257, + "eval_runtime": 15.9556, + "eval_samples_per_second": 131.114, + "eval_steps_per_second": 8.21, + "step": 65536 + }, + { + "epoch": 4.56, + "learning_rate": 0.00021037526400320187, + "loss": 0.357, + "step": 67584 + }, + { + "epoch": 4.56, + "eval_loss": 0.6973423957824707, + "eval_runtime": 15.8273, + "eval_samples_per_second": 132.177, + "eval_steps_per_second": 8.277, + "step": 67584 + }, + { + "epoch": 4.7, + "learning_rate": 0.0001964736950807942, + "loss": 0.357, + "step": 69632 + }, + { + "epoch": 4.7, + "eval_loss": 0.6949540972709656, + "eval_runtime": 16.1065, + "eval_samples_per_second": 129.885, + "eval_steps_per_second": 8.133, + "step": 69632 + }, + { + "epoch": 4.83, + "learning_rate": 0.00018274358855873096, + "loss": 0.3502, + "step": 71680 + }, + { + "epoch": 4.83, + "eval_loss": 0.6959769129753113, + "eval_runtime": 16.2001, + "eval_samples_per_second": 129.135, + "eval_steps_per_second": 8.086, + "step": 71680 + }, + { + "epoch": 4.97, + "learning_rate": 0.00016922892649452222, + "loss": 0.3458, + "step": 73728 + }, + { + "epoch": 4.97, + "eval_loss": 0.7017838358879089, + "eval_runtime": 18.0467, + "eval_samples_per_second": 115.922, + "eval_steps_per_second": 7.259, + "step": 73728 + }, + { + "epoch": 5.11, + "learning_rate": 0.00015597300080605504, + "loss": 0.3302, + "step": 75776 + }, + { + "epoch": 5.11, + "eval_loss": 0.7075567841529846, + "eval_runtime": 16.7136, + "eval_samples_per_second": 125.168, + "eval_steps_per_second": 7.838, + "step": 75776 + }, + { + "epoch": 5.25, + "learning_rate": 0.0001430182745933093, + "loss": 0.3278, + "step": 77824 + }, + { + "epoch": 5.25, + "eval_loss": 0.7178707718849182, + "eval_runtime": 15.7714, + "eval_samples_per_second": 132.645, + "eval_steps_per_second": 8.306, + "step": 77824 + }, + { + "epoch": 5.39, + "learning_rate": 0.00013040017526934073, + "loss": 0.3224, + "step": 79872 + }, + { + "epoch": 5.39, + "eval_loss": 0.725006639957428, + "eval_runtime": 15.8618, + "eval_samples_per_second": 131.889, + "eval_steps_per_second": 8.259, + "step": 79872 + }, + { + "epoch": 5.53, + "learning_rate": 0.00011817144183980649, + "loss": 0.3181, + "step": 81920 + }, + { + "epoch": 5.53, + "eval_loss": 0.7247176766395569, + "eval_runtime": 17.8123, + "eval_samples_per_second": 117.447, + "eval_steps_per_second": 7.354, + "step": 81920 + }, + { + "epoch": 5.66, + "learning_rate": 0.00010636499874117036, + "loss": 0.3154, + "step": 83968 + }, + { + "epoch": 5.66, + "eval_loss": 0.7248900532722473, + "eval_runtime": 17.751, + "eval_samples_per_second": 117.852, + "eval_steps_per_second": 7.38, + "step": 83968 + }, + { + "epoch": 5.8, + "learning_rate": 9.501866590283475e-05, + "loss": 0.3105, + "step": 86016 + }, + { + "epoch": 5.8, + "eval_loss": 0.7352888584136963, + "eval_runtime": 18.0338, + "eval_samples_per_second": 116.004, + "eval_steps_per_second": 7.264, + "step": 86016 + }, + { + "epoch": 5.94, + "learning_rate": 8.416361604489855e-05, + "loss": 0.3035, + "step": 88064 + }, + { + "epoch": 5.94, + "eval_loss": 0.7256708741188049, + "eval_runtime": 15.94, + "eval_samples_per_second": 131.242, + "eval_steps_per_second": 8.218, + "step": 88064 + }, + { + "epoch": 6.08, + "learning_rate": 7.384521927589935e-05, + "loss": 0.294, + "step": 90112 + }, + { + "epoch": 6.08, + "eval_loss": 0.7387065291404724, + "eval_runtime": 16.8562, + "eval_samples_per_second": 124.109, + "eval_steps_per_second": 7.772, + "step": 90112 + }, + { + "epoch": 6.22, + "learning_rate": 6.409110434142392e-05, + "loss": 0.2866, + "step": 92160 + }, + { + "epoch": 6.22, + "eval_loss": 0.7492235898971558, + "eval_runtime": 17.2872, + "eval_samples_per_second": 121.015, + "eval_steps_per_second": 7.578, + "step": 92160 + }, + { + "epoch": 6.35, + "learning_rate": 5.492819313147518e-05, + "loss": 0.2836, + "step": 94208 + }, + { + "epoch": 6.35, + "eval_loss": 0.7440558671951294, + "eval_runtime": 17.5257, + "eval_samples_per_second": 119.367, + "eval_steps_per_second": 7.475, + "step": 94208 + }, + { + "epoch": 6.49, + "learning_rate": 4.6394783238561305e-05, + "loss": 0.2811, + "step": 96256 + }, + { + "epoch": 6.49, + "eval_loss": 0.7577848434448242, + "eval_runtime": 15.8536, + "eval_samples_per_second": 131.957, + "eval_steps_per_second": 8.263, + "step": 96256 + }, + { + "epoch": 6.63, + "learning_rate": 3.851358797621554e-05, + "loss": 0.2774, + "step": 98304 + }, + { + "epoch": 6.63, + "eval_loss": 0.7483424544334412, + "eval_runtime": 15.8485, + "eval_samples_per_second": 132.0, + "eval_steps_per_second": 8.266, + "step": 98304 + }, + { + "epoch": 6.77, + "learning_rate": 3.130985341100834e-05, + "loss": 0.2736, + "step": 100352 + }, + { + "epoch": 6.77, + "eval_loss": 0.7481484413146973, + "eval_runtime": 17.9168, + "eval_samples_per_second": 116.762, + "eval_steps_per_second": 7.312, + "step": 100352 + }, + { + "epoch": 6.91, + "learning_rate": 2.4803653223119228e-05, + "loss": 0.27, + "step": 102400 + }, + { + "epoch": 6.91, + "eval_loss": 0.7659121751785278, + "eval_runtime": 15.8798, + "eval_samples_per_second": 131.739, + "eval_steps_per_second": 8.249, + "step": 102400 + }, + { + "epoch": 7.04, + "learning_rate": 1.902218093192909e-05, + "loss": 0.2636, + "step": 104448 + }, + { + "epoch": 7.04, + "eval_loss": 0.7734333276748657, + "eval_runtime": 17.8497, + "eval_samples_per_second": 117.201, + "eval_steps_per_second": 7.339, + "step": 104448 + }, + { + "epoch": 7.18, + "learning_rate": 1.398060674025281e-05, + "loss": 0.2583, + "step": 106496 + }, + { + "epoch": 7.18, + "eval_loss": 0.7774596810340881, + "eval_runtime": 17.9006, + "eval_samples_per_second": 116.868, + "eval_steps_per_second": 7.318, + "step": 106496 + } + ], + "max_steps": 118608, + "num_train_epochs": 8, + "total_flos": 4.366669386207068e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-106496/training_args.bin b/checkpoint-106496/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f9ee3ad348e277019b0ce52cdd8e1d7773f2852 --- /dev/null +++ b/checkpoint-106496/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0200139492e444a9d322a4f90a96e6dde09c7a882f05b816c2345dade5ea0f98 +size 3515 diff --git a/checkpoint-114688/config.json b/checkpoint-114688/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8dd006e2c45b4178c8c43247c12d14ba89fc4b44 --- /dev/null +++ b/checkpoint-114688/config.json @@ -0,0 +1,32 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 512, + "n_head": 8, + "n_inner": null, + "n_layer": 8, + "n_positions": 2048, + "pad_token_id": 1, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.26.0.dev0", + "use_cache": true, + "vocab_size": 299 +} diff --git a/checkpoint-114688/optimizer.pt b/checkpoint-114688/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..52b56d6ed6909d720206a5aadc5026f56c50c92e --- /dev/null +++ b/checkpoint-114688/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b68c6edb60414cfe637e2ec074a5f4acec75f5337e31730978b158998228eac5 +size 211432837 diff --git a/checkpoint-114688/pytorch_model.bin b/checkpoint-114688/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..77abcefd4e75bc1f85b6e4b6708d0c0cd5c66f36 --- /dev/null +++ b/checkpoint-114688/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6849dc98fcc9e4181c102fa0bb6dd1d70c54edae9d1a8440b604c6f98ba117b4 +size 139279005 diff --git a/checkpoint-114688/rng_state.pth b/checkpoint-114688/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f15b10ad0304e9115898b23027efb56c525323fc --- /dev/null +++ b/checkpoint-114688/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc866ce242256f98027b6016930d35e9d45293516285c23fdbadea7b58591d8a +size 15597 diff --git a/checkpoint-114688/scaler.pt b/checkpoint-114688/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..49fee0cb854dd3cb6629ec412757ffb1a06aa5dc --- /dev/null +++ b/checkpoint-114688/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9778e4932801d6fcefc7836eddd9fd5635b262a07b6eb7a75700d5c35643bb19 +size 557 diff --git a/checkpoint-114688/scheduler.pt b/checkpoint-114688/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..63700b85b549866d07a647aa6e1338bff0f326c4 --- /dev/null +++ b/checkpoint-114688/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fa9e591dbfa043fe5b9503f81bacab889f557e7055edb15e3594fae6f28701b +size 627 diff --git a/checkpoint-114688/trainer_state.json b/checkpoint-114688/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..211c3ae25852be85e28eeae092d2d4fe0ba8286e --- /dev/null +++ b/checkpoint-114688/trainer_state.json @@ -0,0 +1,800 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.735599622285175, + "global_step": 114688, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.14, + "learning_rate": 0.00020480000000000002, + "loss": 2.1719, + "step": 2048 + }, + { + "epoch": 0.14, + "eval_loss": 1.0833024978637695, + "eval_runtime": 15.7887, + "eval_samples_per_second": 132.5, + "eval_steps_per_second": 8.297, + "step": 2048 + }, + { + "epoch": 0.28, + "learning_rate": 0.00040960000000000004, + "loss": 0.9468, + "step": 4096 + }, + { + "epoch": 0.28, + "eval_loss": 0.8000818490982056, + "eval_runtime": 17.2632, + "eval_samples_per_second": 121.183, + "eval_steps_per_second": 7.588, + "step": 4096 + }, + { + "epoch": 0.41, + "learning_rate": 0.0004998749142723946, + "loss": 0.7542, + "step": 6144 + }, + { + "epoch": 0.41, + "eval_loss": 0.7439925074577332, + "eval_runtime": 17.4134, + "eval_samples_per_second": 120.138, + "eval_steps_per_second": 7.523, + "step": 6144 + }, + { + "epoch": 0.55, + "learning_rate": 0.0004990273340312486, + "loss": 0.6756, + "step": 8192 + }, + { + "epoch": 0.55, + "eval_loss": 0.7018134593963623, + "eval_runtime": 15.8601, + "eval_samples_per_second": 131.904, + "eval_steps_per_second": 8.26, + "step": 8192 + }, + { + "epoch": 0.69, + "learning_rate": 0.0004973820371438889, + "loss": 0.631, + "step": 10240 + }, + { + "epoch": 0.69, + "eval_loss": 0.684688925743103, + "eval_runtime": 18.0314, + "eval_samples_per_second": 116.02, + "eval_steps_per_second": 7.265, + "step": 10240 + }, + { + "epoch": 0.83, + "learning_rate": 0.0004949442940386407, + "loss": 0.5989, + "step": 12288 + }, + { + "epoch": 0.83, + "eval_loss": 0.6822534203529358, + "eval_runtime": 17.4182, + "eval_samples_per_second": 120.104, + "eval_steps_per_second": 7.521, + "step": 12288 + }, + { + "epoch": 0.97, + "learning_rate": 0.0004917201492451735, + "loss": 0.5776, + "step": 14336 + }, + { + "epoch": 0.97, + "eval_loss": 0.6674544215202332, + "eval_runtime": 17.7487, + "eval_samples_per_second": 117.868, + "eval_steps_per_second": 7.381, + "step": 14336 + }, + { + "epoch": 1.11, + "learning_rate": 0.0004877230785006913, + "loss": 0.5525, + "step": 16384 + }, + { + "epoch": 1.11, + "eval_loss": 0.6640163660049438, + "eval_runtime": 15.8951, + "eval_samples_per_second": 131.613, + "eval_steps_per_second": 8.242, + "step": 16384 + }, + { + "epoch": 1.24, + "learning_rate": 0.0004829670105577666, + "loss": 0.5373, + "step": 18432 + }, + { + "epoch": 1.24, + "eval_loss": 0.6663170456886292, + "eval_runtime": 17.5, + "eval_samples_per_second": 119.543, + "eval_steps_per_second": 7.486, + "step": 18432 + }, + { + "epoch": 1.38, + "learning_rate": 0.0004774596641323791, + "loss": 0.5261, + "step": 20480 + }, + { + "epoch": 1.38, + "eval_loss": 0.6518951654434204, + "eval_runtime": 17.1326, + "eval_samples_per_second": 122.107, + "eval_steps_per_second": 7.646, + "step": 20480 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004712261976082475, + "loss": 0.5115, + "step": 22528 + }, + { + "epoch": 1.52, + "eval_loss": 0.6658991575241089, + "eval_runtime": 17.6874, + "eval_samples_per_second": 118.277, + "eval_steps_per_second": 7.406, + "step": 22528 + }, + { + "epoch": 1.66, + "learning_rate": 0.00046428407064289515, + "loss": 0.5033, + "step": 24576 + }, + { + "epoch": 1.66, + "eval_loss": 0.6602604389190674, + "eval_runtime": 15.8845, + "eval_samples_per_second": 131.701, + "eval_steps_per_second": 8.247, + "step": 24576 + }, + { + "epoch": 1.8, + "learning_rate": 0.00045665163060732317, + "loss": 0.4925, + "step": 26624 + }, + { + "epoch": 1.8, + "eval_loss": 0.6551440954208374, + "eval_runtime": 17.7902, + "eval_samples_per_second": 117.593, + "eval_steps_per_second": 7.364, + "step": 26624 + }, + { + "epoch": 1.93, + "learning_rate": 0.000448360778288479, + "loss": 0.4868, + "step": 28672 + }, + { + "epoch": 1.93, + "eval_loss": 0.6580803394317627, + "eval_runtime": 15.7841, + "eval_samples_per_second": 132.538, + "eval_steps_per_second": 8.299, + "step": 28672 + }, + { + "epoch": 2.07, + "learning_rate": 0.00043942999964293453, + "loss": 0.4741, + "step": 30720 + }, + { + "epoch": 2.07, + "eval_loss": 0.661230742931366, + "eval_runtime": 18.015, + "eval_samples_per_second": 116.126, + "eval_steps_per_second": 7.272, + "step": 30720 + }, + { + "epoch": 2.21, + "learning_rate": 0.0004298966220346151, + "loss": 0.4632, + "step": 32768 + }, + { + "epoch": 2.21, + "eval_loss": 0.6618030071258545, + "eval_runtime": 17.9073, + "eval_samples_per_second": 116.824, + "eval_steps_per_second": 7.315, + "step": 32768 + }, + { + "epoch": 2.35, + "learning_rate": 0.00041978697624050446, + "loss": 0.4584, + "step": 34816 + }, + { + "epoch": 2.35, + "eval_loss": 0.6579039096832275, + "eval_runtime": 17.8241, + "eval_samples_per_second": 117.369, + "eval_steps_per_second": 7.35, + "step": 34816 + }, + { + "epoch": 2.49, + "learning_rate": 0.0004091334467888659, + "loss": 0.451, + "step": 36864 + }, + { + "epoch": 2.49, + "eval_loss": 0.6627541780471802, + "eval_runtime": 17.6822, + "eval_samples_per_second": 118.311, + "eval_steps_per_second": 7.409, + "step": 36864 + }, + { + "epoch": 2.62, + "learning_rate": 0.00039796458815002033, + "loss": 0.445, + "step": 38912 + }, + { + "epoch": 2.62, + "eval_loss": 0.656867265701294, + "eval_runtime": 15.8659, + "eval_samples_per_second": 131.855, + "eval_steps_per_second": 8.257, + "step": 38912 + }, + { + "epoch": 2.76, + "learning_rate": 0.0003863212870708643, + "loss": 0.4404, + "step": 40960 + }, + { + "epoch": 2.76, + "eval_loss": 0.6653844118118286, + "eval_runtime": 17.9192, + "eval_samples_per_second": 116.747, + "eval_steps_per_second": 7.311, + "step": 40960 + }, + { + "epoch": 2.9, + "learning_rate": 0.00037424687637876156, + "loss": 0.4334, + "step": 43008 + }, + { + "epoch": 2.9, + "eval_loss": 0.6553301811218262, + "eval_runtime": 15.9304, + "eval_samples_per_second": 131.321, + "eval_steps_per_second": 8.223, + "step": 43008 + }, + { + "epoch": 3.04, + "learning_rate": 0.00036178064571954134, + "loss": 0.4278, + "step": 45056 + }, + { + "epoch": 3.04, + "eval_loss": 0.6766741871833801, + "eval_runtime": 16.9834, + "eval_samples_per_second": 123.179, + "eval_steps_per_second": 7.713, + "step": 45056 + }, + { + "epoch": 3.18, + "learning_rate": 0.0003489439971353363, + "loss": 0.4152, + "step": 47104 + }, + { + "epoch": 3.18, + "eval_loss": 0.6645193696022034, + "eval_runtime": 15.816, + "eval_samples_per_second": 132.271, + "eval_steps_per_second": 8.283, + "step": 47104 + }, + { + "epoch": 3.32, + "learning_rate": 0.0003357965820476752, + "loss": 0.411, + "step": 49152 + }, + { + "epoch": 3.32, + "eval_loss": 0.6748972535133362, + "eval_runtime": 17.7507, + "eval_samples_per_second": 117.855, + "eval_steps_per_second": 7.38, + "step": 49152 + }, + { + "epoch": 3.45, + "learning_rate": 0.0003223743322236833, + "loss": 0.4085, + "step": 51200 + }, + { + "epoch": 3.45, + "eval_loss": 0.6703893542289734, + "eval_runtime": 17.341, + "eval_samples_per_second": 120.639, + "eval_steps_per_second": 7.554, + "step": 51200 + }, + { + "epoch": 3.59, + "learning_rate": 0.0003087269633577651, + "loss": 0.4037, + "step": 53248 + }, + { + "epoch": 3.59, + "eval_loss": 0.6561577916145325, + "eval_runtime": 15.9295, + "eval_samples_per_second": 131.329, + "eval_steps_per_second": 8.224, + "step": 53248 + }, + { + "epoch": 3.73, + "learning_rate": 0.0002948780545870247, + "loss": 0.3975, + "step": 55296 + }, + { + "epoch": 3.73, + "eval_loss": 0.6698916554450989, + "eval_runtime": 17.9008, + "eval_samples_per_second": 116.866, + "eval_steps_per_second": 7.318, + "step": 55296 + }, + { + "epoch": 3.87, + "learning_rate": 0.0002808921064161573, + "loss": 0.3917, + "step": 57344 + }, + { + "epoch": 3.87, + "eval_loss": 0.7029281854629517, + "eval_runtime": 17.8146, + "eval_samples_per_second": 117.432, + "eval_steps_per_second": 7.354, + "step": 57344 + }, + { + "epoch": 4.01, + "learning_rate": 0.00026680720064442787, + "loss": 0.3876, + "step": 59392 + }, + { + "epoch": 4.01, + "eval_loss": 0.6837398409843445, + "eval_runtime": 17.4354, + "eval_samples_per_second": 119.986, + "eval_steps_per_second": 7.513, + "step": 59392 + }, + { + "epoch": 4.14, + "learning_rate": 0.00025266845586830784, + "loss": 0.3751, + "step": 61440 + }, + { + "epoch": 4.14, + "eval_loss": 0.690500795841217, + "eval_runtime": 17.8964, + "eval_samples_per_second": 116.895, + "eval_steps_per_second": 7.32, + "step": 61440 + }, + { + "epoch": 4.28, + "learning_rate": 0.00023851425721450398, + "loss": 0.3719, + "step": 63488 + }, + { + "epoch": 4.28, + "eval_loss": 0.6938452124595642, + "eval_runtime": 15.7962, + "eval_samples_per_second": 132.437, + "eval_steps_per_second": 8.293, + "step": 63488 + }, + { + "epoch": 4.42, + "learning_rate": 0.0002244106409269819, + "loss": 0.3644, + "step": 65536 + }, + { + "epoch": 4.42, + "eval_loss": 0.6987010836601257, + "eval_runtime": 15.9556, + "eval_samples_per_second": 131.114, + "eval_steps_per_second": 8.21, + "step": 65536 + }, + { + "epoch": 4.56, + "learning_rate": 0.00021037526400320187, + "loss": 0.357, + "step": 67584 + }, + { + "epoch": 4.56, + "eval_loss": 0.6973423957824707, + "eval_runtime": 15.8273, + "eval_samples_per_second": 132.177, + "eval_steps_per_second": 8.277, + "step": 67584 + }, + { + "epoch": 4.7, + "learning_rate": 0.0001964736950807942, + "loss": 0.357, + "step": 69632 + }, + { + "epoch": 4.7, + "eval_loss": 0.6949540972709656, + "eval_runtime": 16.1065, + "eval_samples_per_second": 129.885, + "eval_steps_per_second": 8.133, + "step": 69632 + }, + { + "epoch": 4.83, + "learning_rate": 0.00018274358855873096, + "loss": 0.3502, + "step": 71680 + }, + { + "epoch": 4.83, + "eval_loss": 0.6959769129753113, + "eval_runtime": 16.2001, + "eval_samples_per_second": 129.135, + "eval_steps_per_second": 8.086, + "step": 71680 + }, + { + "epoch": 4.97, + "learning_rate": 0.00016922892649452222, + "loss": 0.3458, + "step": 73728 + }, + { + "epoch": 4.97, + "eval_loss": 0.7017838358879089, + "eval_runtime": 18.0467, + "eval_samples_per_second": 115.922, + "eval_steps_per_second": 7.259, + "step": 73728 + }, + { + "epoch": 5.11, + "learning_rate": 0.00015597300080605504, + "loss": 0.3302, + "step": 75776 + }, + { + "epoch": 5.11, + "eval_loss": 0.7075567841529846, + "eval_runtime": 16.7136, + "eval_samples_per_second": 125.168, + "eval_steps_per_second": 7.838, + "step": 75776 + }, + { + "epoch": 5.25, + "learning_rate": 0.0001430182745933093, + "loss": 0.3278, + "step": 77824 + }, + { + "epoch": 5.25, + "eval_loss": 0.7178707718849182, + "eval_runtime": 15.7714, + "eval_samples_per_second": 132.645, + "eval_steps_per_second": 8.306, + "step": 77824 + }, + { + "epoch": 5.39, + "learning_rate": 0.00013040017526934073, + "loss": 0.3224, + "step": 79872 + }, + { + "epoch": 5.39, + "eval_loss": 0.725006639957428, + "eval_runtime": 15.8618, + "eval_samples_per_second": 131.889, + "eval_steps_per_second": 8.259, + "step": 79872 + }, + { + "epoch": 5.53, + "learning_rate": 0.00011817144183980649, + "loss": 0.3181, + "step": 81920 + }, + { + "epoch": 5.53, + "eval_loss": 0.7247176766395569, + "eval_runtime": 17.8123, + "eval_samples_per_second": 117.447, + "eval_steps_per_second": 7.354, + "step": 81920 + }, + { + "epoch": 5.66, + "learning_rate": 0.00010636499874117036, + "loss": 0.3154, + "step": 83968 + }, + { + "epoch": 5.66, + "eval_loss": 0.7248900532722473, + "eval_runtime": 17.751, + "eval_samples_per_second": 117.852, + "eval_steps_per_second": 7.38, + "step": 83968 + }, + { + "epoch": 5.8, + "learning_rate": 9.501866590283475e-05, + "loss": 0.3105, + "step": 86016 + }, + { + "epoch": 5.8, + "eval_loss": 0.7352888584136963, + "eval_runtime": 18.0338, + "eval_samples_per_second": 116.004, + "eval_steps_per_second": 7.264, + "step": 86016 + }, + { + "epoch": 5.94, + "learning_rate": 8.416361604489855e-05, + "loss": 0.3035, + "step": 88064 + }, + { + "epoch": 5.94, + "eval_loss": 0.7256708741188049, + "eval_runtime": 15.94, + "eval_samples_per_second": 131.242, + "eval_steps_per_second": 8.218, + "step": 88064 + }, + { + "epoch": 6.08, + "learning_rate": 7.384521927589935e-05, + "loss": 0.294, + "step": 90112 + }, + { + "epoch": 6.08, + "eval_loss": 0.7387065291404724, + "eval_runtime": 16.8562, + "eval_samples_per_second": 124.109, + "eval_steps_per_second": 7.772, + "step": 90112 + }, + { + "epoch": 6.22, + "learning_rate": 6.409110434142392e-05, + "loss": 0.2866, + "step": 92160 + }, + { + "epoch": 6.22, + "eval_loss": 0.7492235898971558, + "eval_runtime": 17.2872, + "eval_samples_per_second": 121.015, + "eval_steps_per_second": 7.578, + "step": 92160 + }, + { + "epoch": 6.35, + "learning_rate": 5.492819313147518e-05, + "loss": 0.2836, + "step": 94208 + }, + { + "epoch": 6.35, + "eval_loss": 0.7440558671951294, + "eval_runtime": 17.5257, + "eval_samples_per_second": 119.367, + "eval_steps_per_second": 7.475, + "step": 94208 + }, + { + "epoch": 6.49, + "learning_rate": 4.6394783238561305e-05, + "loss": 0.2811, + "step": 96256 + }, + { + "epoch": 6.49, + "eval_loss": 0.7577848434448242, + "eval_runtime": 15.8536, + "eval_samples_per_second": 131.957, + "eval_steps_per_second": 8.263, + "step": 96256 + }, + { + "epoch": 6.63, + "learning_rate": 3.851358797621554e-05, + "loss": 0.2774, + "step": 98304 + }, + { + "epoch": 6.63, + "eval_loss": 0.7483424544334412, + "eval_runtime": 15.8485, + "eval_samples_per_second": 132.0, + "eval_steps_per_second": 8.266, + "step": 98304 + }, + { + "epoch": 6.77, + "learning_rate": 3.130985341100834e-05, + "loss": 0.2736, + "step": 100352 + }, + { + "epoch": 6.77, + "eval_loss": 0.7481484413146973, + "eval_runtime": 17.9168, + "eval_samples_per_second": 116.762, + "eval_steps_per_second": 7.312, + "step": 100352 + }, + { + "epoch": 6.91, + "learning_rate": 2.4803653223119228e-05, + "loss": 0.27, + "step": 102400 + }, + { + "epoch": 6.91, + "eval_loss": 0.7659121751785278, + "eval_runtime": 15.8798, + "eval_samples_per_second": 131.739, + "eval_steps_per_second": 8.249, + "step": 102400 + }, + { + "epoch": 7.04, + "learning_rate": 1.902218093192909e-05, + "loss": 0.2636, + "step": 104448 + }, + { + "epoch": 7.04, + "eval_loss": 0.7734333276748657, + "eval_runtime": 17.8497, + "eval_samples_per_second": 117.201, + "eval_steps_per_second": 7.339, + "step": 104448 + }, + { + "epoch": 7.18, + "learning_rate": 1.398060674025281e-05, + "loss": 0.2583, + "step": 106496 + }, + { + "epoch": 7.18, + "eval_loss": 0.7774596810340881, + "eval_runtime": 17.9006, + "eval_samples_per_second": 116.868, + "eval_steps_per_second": 7.318, + "step": 106496 + }, + { + "epoch": 7.32, + "learning_rate": 9.695080472251094e-06, + "loss": 0.2577, + "step": 108544 + }, + { + "epoch": 7.32, + "eval_loss": 0.7828282117843628, + "eval_runtime": 15.9506, + "eval_samples_per_second": 131.155, + "eval_steps_per_second": 8.213, + "step": 108544 + }, + { + "epoch": 7.46, + "learning_rate": 6.177802621565725e-06, + "loss": 0.2553, + "step": 110592 + }, + { + "epoch": 7.46, + "eval_loss": 0.7818995714187622, + "eval_runtime": 15.975, + "eval_samples_per_second": 130.955, + "eval_steps_per_second": 8.2, + "step": 110592 + }, + { + "epoch": 7.6, + "learning_rate": 3.4434741135661028e-06, + "loss": 0.2545, + "step": 112640 + }, + { + "epoch": 7.6, + "eval_loss": 0.7817807197570801, + "eval_runtime": 15.9873, + "eval_samples_per_second": 130.854, + "eval_steps_per_second": 8.194, + "step": 112640 + }, + { + "epoch": 7.74, + "learning_rate": 1.498947438756143e-06, + "loss": 0.2536, + "step": 114688 + }, + { + "epoch": 7.74, + "eval_loss": 0.7830276489257812, + "eval_runtime": 17.6639, + "eval_samples_per_second": 118.434, + "eval_steps_per_second": 7.416, + "step": 114688 + } + ], + "max_steps": 118608, + "num_train_epochs": 8, + "total_flos": 4.702934009537618e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-114688/training_args.bin b/checkpoint-114688/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f9ee3ad348e277019b0ce52cdd8e1d7773f2852 --- /dev/null +++ b/checkpoint-114688/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0200139492e444a9d322a4f90a96e6dde09c7a882f05b816c2345dade5ea0f98 +size 3515 diff --git a/checkpoint-81920/config.json b/checkpoint-81920/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8dd006e2c45b4178c8c43247c12d14ba89fc4b44 --- /dev/null +++ b/checkpoint-81920/config.json @@ -0,0 +1,32 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 512, + "n_head": 8, + "n_inner": null, + "n_layer": 8, + "n_positions": 2048, + "pad_token_id": 1, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.26.0.dev0", + "use_cache": true, + "vocab_size": 299 +} diff --git a/checkpoint-81920/optimizer.pt b/checkpoint-81920/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0d7ab5515a97d837fd5e9701e60ea3bf28e535d --- /dev/null +++ b/checkpoint-81920/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aea33963d1ff00b69c9b95a75bb66e6037e0f7327b98d60140a4bd4462b7b6ea +size 211432837 diff --git a/checkpoint-81920/pytorch_model.bin b/checkpoint-81920/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..70ebef489e59340e273fc665cf96c6add1250845 --- /dev/null +++ b/checkpoint-81920/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a1f35d58c7e8e917620a1f737aa8a38a873486b8360ff6a31809734fc09e9d1 +size 139279005 diff --git a/checkpoint-81920/rng_state.pth b/checkpoint-81920/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e4a05372c86f7d383f787d3dc5e14036e2f2947 --- /dev/null +++ b/checkpoint-81920/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b15af6bf46f127b839408de433ccac01f207f8dc309c7de790456737ff556c2 +size 15597 diff --git a/checkpoint-81920/scaler.pt b/checkpoint-81920/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f12044d605ffd20544f7a694beef3f74007b022 --- /dev/null +++ b/checkpoint-81920/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83456236364b987a3fb6423d38cc8c9760be6421738dbf4def0fd71bff91206d +size 557 diff --git a/checkpoint-81920/scheduler.pt b/checkpoint-81920/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea845e62e867a7c4b58806de13e7afed1649ed7e --- /dev/null +++ b/checkpoint-81920/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8da89aecb8a313e489684dbdef383d10e3af94945ff708bb72b8626d826e9512 +size 627 diff --git a/checkpoint-81920/trainer_state.json b/checkpoint-81920/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0e75f6c07f941eaae2c3c221343318e6e80a31e6 --- /dev/null +++ b/checkpoint-81920/trainer_state.json @@ -0,0 +1,576 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.525428301632267, + "global_step": 81920, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.14, + "learning_rate": 0.00020480000000000002, + "loss": 2.1719, + "step": 2048 + }, + { + "epoch": 0.14, + "eval_loss": 1.0833024978637695, + "eval_runtime": 15.7887, + "eval_samples_per_second": 132.5, + "eval_steps_per_second": 8.297, + "step": 2048 + }, + { + "epoch": 0.28, + "learning_rate": 0.00040960000000000004, + "loss": 0.9468, + "step": 4096 + }, + { + "epoch": 0.28, + "eval_loss": 0.8000818490982056, + "eval_runtime": 17.2632, + "eval_samples_per_second": 121.183, + "eval_steps_per_second": 7.588, + "step": 4096 + }, + { + "epoch": 0.41, + "learning_rate": 0.0004998749142723946, + "loss": 0.7542, + "step": 6144 + }, + { + "epoch": 0.41, + "eval_loss": 0.7439925074577332, + "eval_runtime": 17.4134, + "eval_samples_per_second": 120.138, + "eval_steps_per_second": 7.523, + "step": 6144 + }, + { + "epoch": 0.55, + "learning_rate": 0.0004990273340312486, + "loss": 0.6756, + "step": 8192 + }, + { + "epoch": 0.55, + "eval_loss": 0.7018134593963623, + "eval_runtime": 15.8601, + "eval_samples_per_second": 131.904, + "eval_steps_per_second": 8.26, + "step": 8192 + }, + { + "epoch": 0.69, + "learning_rate": 0.0004973820371438889, + "loss": 0.631, + "step": 10240 + }, + { + "epoch": 0.69, + "eval_loss": 0.684688925743103, + "eval_runtime": 18.0314, + "eval_samples_per_second": 116.02, + "eval_steps_per_second": 7.265, + "step": 10240 + }, + { + "epoch": 0.83, + "learning_rate": 0.0004949442940386407, + "loss": 0.5989, + "step": 12288 + }, + { + "epoch": 0.83, + "eval_loss": 0.6822534203529358, + "eval_runtime": 17.4182, + "eval_samples_per_second": 120.104, + "eval_steps_per_second": 7.521, + "step": 12288 + }, + { + "epoch": 0.97, + "learning_rate": 0.0004917201492451735, + "loss": 0.5776, + "step": 14336 + }, + { + "epoch": 0.97, + "eval_loss": 0.6674544215202332, + "eval_runtime": 17.7487, + "eval_samples_per_second": 117.868, + "eval_steps_per_second": 7.381, + "step": 14336 + }, + { + "epoch": 1.11, + "learning_rate": 0.0004877230785006913, + "loss": 0.5525, + "step": 16384 + }, + { + "epoch": 1.11, + "eval_loss": 0.6640163660049438, + "eval_runtime": 15.8951, + "eval_samples_per_second": 131.613, + "eval_steps_per_second": 8.242, + "step": 16384 + }, + { + "epoch": 1.24, + "learning_rate": 0.0004829670105577666, + "loss": 0.5373, + "step": 18432 + }, + { + "epoch": 1.24, + "eval_loss": 0.6663170456886292, + "eval_runtime": 17.5, + "eval_samples_per_second": 119.543, + "eval_steps_per_second": 7.486, + "step": 18432 + }, + { + "epoch": 1.38, + "learning_rate": 0.0004774596641323791, + "loss": 0.5261, + "step": 20480 + }, + { + "epoch": 1.38, + "eval_loss": 0.6518951654434204, + "eval_runtime": 17.1326, + "eval_samples_per_second": 122.107, + "eval_steps_per_second": 7.646, + "step": 20480 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004712261976082475, + "loss": 0.5115, + "step": 22528 + }, + { + "epoch": 1.52, + "eval_loss": 0.6658991575241089, + "eval_runtime": 17.6874, + "eval_samples_per_second": 118.277, + "eval_steps_per_second": 7.406, + "step": 22528 + }, + { + "epoch": 1.66, + "learning_rate": 0.00046428407064289515, + "loss": 0.5033, + "step": 24576 + }, + { + "epoch": 1.66, + "eval_loss": 0.6602604389190674, + "eval_runtime": 15.8845, + "eval_samples_per_second": 131.701, + "eval_steps_per_second": 8.247, + "step": 24576 + }, + { + "epoch": 1.8, + "learning_rate": 0.00045665163060732317, + "loss": 0.4925, + "step": 26624 + }, + { + "epoch": 1.8, + "eval_loss": 0.6551440954208374, + "eval_runtime": 17.7902, + "eval_samples_per_second": 117.593, + "eval_steps_per_second": 7.364, + "step": 26624 + }, + { + "epoch": 1.93, + "learning_rate": 0.000448360778288479, + "loss": 0.4868, + "step": 28672 + }, + { + "epoch": 1.93, + "eval_loss": 0.6580803394317627, + "eval_runtime": 15.7841, + "eval_samples_per_second": 132.538, + "eval_steps_per_second": 8.299, + "step": 28672 + }, + { + "epoch": 2.07, + "learning_rate": 0.00043942999964293453, + "loss": 0.4741, + "step": 30720 + }, + { + "epoch": 2.07, + "eval_loss": 0.661230742931366, + "eval_runtime": 18.015, + "eval_samples_per_second": 116.126, + "eval_steps_per_second": 7.272, + "step": 30720 + }, + { + "epoch": 2.21, + "learning_rate": 0.0004298966220346151, + "loss": 0.4632, + "step": 32768 + }, + { + "epoch": 2.21, + "eval_loss": 0.6618030071258545, + "eval_runtime": 17.9073, + "eval_samples_per_second": 116.824, + "eval_steps_per_second": 7.315, + "step": 32768 + }, + { + "epoch": 2.35, + "learning_rate": 0.00041978697624050446, + "loss": 0.4584, + "step": 34816 + }, + { + "epoch": 2.35, + "eval_loss": 0.6579039096832275, + "eval_runtime": 17.8241, + "eval_samples_per_second": 117.369, + "eval_steps_per_second": 7.35, + "step": 34816 + }, + { + "epoch": 2.49, + "learning_rate": 0.0004091334467888659, + "loss": 0.451, + "step": 36864 + }, + { + "epoch": 2.49, + "eval_loss": 0.6627541780471802, + "eval_runtime": 17.6822, + "eval_samples_per_second": 118.311, + "eval_steps_per_second": 7.409, + "step": 36864 + }, + { + "epoch": 2.62, + "learning_rate": 0.00039796458815002033, + "loss": 0.445, + "step": 38912 + }, + { + "epoch": 2.62, + "eval_loss": 0.656867265701294, + "eval_runtime": 15.8659, + "eval_samples_per_second": 131.855, + "eval_steps_per_second": 8.257, + "step": 38912 + }, + { + "epoch": 2.76, + "learning_rate": 0.0003863212870708643, + "loss": 0.4404, + "step": 40960 + }, + { + "epoch": 2.76, + "eval_loss": 0.6653844118118286, + "eval_runtime": 17.9192, + "eval_samples_per_second": 116.747, + "eval_steps_per_second": 7.311, + "step": 40960 + }, + { + "epoch": 2.9, + "learning_rate": 0.00037424687637876156, + "loss": 0.4334, + "step": 43008 + }, + { + "epoch": 2.9, + "eval_loss": 0.6553301811218262, + "eval_runtime": 15.9304, + "eval_samples_per_second": 131.321, + "eval_steps_per_second": 8.223, + "step": 43008 + }, + { + "epoch": 3.04, + "learning_rate": 0.00036178064571954134, + "loss": 0.4278, + "step": 45056 + }, + { + "epoch": 3.04, + "eval_loss": 0.6766741871833801, + "eval_runtime": 16.9834, + "eval_samples_per_second": 123.179, + "eval_steps_per_second": 7.713, + "step": 45056 + }, + { + "epoch": 3.18, + "learning_rate": 0.0003489439971353363, + "loss": 0.4152, + "step": 47104 + }, + { + "epoch": 3.18, + "eval_loss": 0.6645193696022034, + "eval_runtime": 15.816, + "eval_samples_per_second": 132.271, + "eval_steps_per_second": 8.283, + "step": 47104 + }, + { + "epoch": 3.32, + "learning_rate": 0.0003357965820476752, + "loss": 0.411, + "step": 49152 + }, + { + "epoch": 3.32, + "eval_loss": 0.6748972535133362, + "eval_runtime": 17.7507, + "eval_samples_per_second": 117.855, + "eval_steps_per_second": 7.38, + "step": 49152 + }, + { + "epoch": 3.45, + "learning_rate": 0.0003223743322236833, + "loss": 0.4085, + "step": 51200 + }, + { + "epoch": 3.45, + "eval_loss": 0.6703893542289734, + "eval_runtime": 17.341, + "eval_samples_per_second": 120.639, + "eval_steps_per_second": 7.554, + "step": 51200 + }, + { + "epoch": 3.59, + "learning_rate": 0.0003087269633577651, + "loss": 0.4037, + "step": 53248 + }, + { + "epoch": 3.59, + "eval_loss": 0.6561577916145325, + "eval_runtime": 15.9295, + "eval_samples_per_second": 131.329, + "eval_steps_per_second": 8.224, + "step": 53248 + }, + { + "epoch": 3.73, + "learning_rate": 0.0002948780545870247, + "loss": 0.3975, + "step": 55296 + }, + { + "epoch": 3.73, + "eval_loss": 0.6698916554450989, + "eval_runtime": 17.9008, + "eval_samples_per_second": 116.866, + "eval_steps_per_second": 7.318, + "step": 55296 + }, + { + "epoch": 3.87, + "learning_rate": 0.0002808921064161573, + "loss": 0.3917, + "step": 57344 + }, + { + "epoch": 3.87, + "eval_loss": 0.7029281854629517, + "eval_runtime": 17.8146, + "eval_samples_per_second": 117.432, + "eval_steps_per_second": 7.354, + "step": 57344 + }, + { + "epoch": 4.01, + "learning_rate": 0.00026680720064442787, + "loss": 0.3876, + "step": 59392 + }, + { + "epoch": 4.01, + "eval_loss": 0.6837398409843445, + "eval_runtime": 17.4354, + "eval_samples_per_second": 119.986, + "eval_steps_per_second": 7.513, + "step": 59392 + }, + { + "epoch": 4.14, + "learning_rate": 0.00025266845586830784, + "loss": 0.3751, + "step": 61440 + }, + { + "epoch": 4.14, + "eval_loss": 0.690500795841217, + "eval_runtime": 17.8964, + "eval_samples_per_second": 116.895, + "eval_steps_per_second": 7.32, + "step": 61440 + }, + { + "epoch": 4.28, + "learning_rate": 0.00023851425721450398, + "loss": 0.3719, + "step": 63488 + }, + { + "epoch": 4.28, + "eval_loss": 0.6938452124595642, + "eval_runtime": 15.7962, + "eval_samples_per_second": 132.437, + "eval_steps_per_second": 8.293, + "step": 63488 + }, + { + "epoch": 4.42, + "learning_rate": 0.0002244106409269819, + "loss": 0.3644, + "step": 65536 + }, + { + "epoch": 4.42, + "eval_loss": 0.6987010836601257, + "eval_runtime": 15.9556, + "eval_samples_per_second": 131.114, + "eval_steps_per_second": 8.21, + "step": 65536 + }, + { + "epoch": 4.56, + "learning_rate": 0.00021037526400320187, + "loss": 0.357, + "step": 67584 + }, + { + "epoch": 4.56, + "eval_loss": 0.6973423957824707, + "eval_runtime": 15.8273, + "eval_samples_per_second": 132.177, + "eval_steps_per_second": 8.277, + "step": 67584 + }, + { + "epoch": 4.7, + "learning_rate": 0.0001964736950807942, + "loss": 0.357, + "step": 69632 + }, + { + "epoch": 4.7, + "eval_loss": 0.6949540972709656, + "eval_runtime": 16.1065, + "eval_samples_per_second": 129.885, + "eval_steps_per_second": 8.133, + "step": 69632 + }, + { + "epoch": 4.83, + "learning_rate": 0.00018274358855873096, + "loss": 0.3502, + "step": 71680 + }, + { + "epoch": 4.83, + "eval_loss": 0.6959769129753113, + "eval_runtime": 16.2001, + "eval_samples_per_second": 129.135, + "eval_steps_per_second": 8.086, + "step": 71680 + }, + { + "epoch": 4.97, + "learning_rate": 0.00016922892649452222, + "loss": 0.3458, + "step": 73728 + }, + { + "epoch": 4.97, + "eval_loss": 0.7017838358879089, + "eval_runtime": 18.0467, + "eval_samples_per_second": 115.922, + "eval_steps_per_second": 7.259, + "step": 73728 + }, + { + "epoch": 5.11, + "learning_rate": 0.00015597300080605504, + "loss": 0.3302, + "step": 75776 + }, + { + "epoch": 5.11, + "eval_loss": 0.7075567841529846, + "eval_runtime": 16.7136, + "eval_samples_per_second": 125.168, + "eval_steps_per_second": 7.838, + "step": 75776 + }, + { + "epoch": 5.25, + "learning_rate": 0.0001430182745933093, + "loss": 0.3278, + "step": 77824 + }, + { + "epoch": 5.25, + "eval_loss": 0.7178707718849182, + "eval_runtime": 15.7714, + "eval_samples_per_second": 132.645, + "eval_steps_per_second": 8.306, + "step": 77824 + }, + { + "epoch": 5.39, + "learning_rate": 0.00013040017526934073, + "loss": 0.3224, + "step": 79872 + }, + { + "epoch": 5.39, + "eval_loss": 0.725006639957428, + "eval_runtime": 15.8618, + "eval_samples_per_second": 131.889, + "eval_steps_per_second": 8.259, + "step": 79872 + }, + { + "epoch": 5.53, + "learning_rate": 0.00011817144183980649, + "loss": 0.3181, + "step": 81920 + }, + { + "epoch": 5.53, + "eval_loss": 0.7247176766395569, + "eval_runtime": 17.8123, + "eval_samples_per_second": 117.447, + "eval_steps_per_second": 7.354, + "step": 81920 + } + ], + "max_steps": 118608, + "num_train_epochs": 8, + "total_flos": 3.358824123721482e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-81920/training_args.bin b/checkpoint-81920/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f9ee3ad348e277019b0ce52cdd8e1d7773f2852 --- /dev/null +++ b/checkpoint-81920/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0200139492e444a9d322a4f90a96e6dde09c7a882f05b816c2345dade5ea0f98 +size 3515 diff --git a/checkpoint-90112/config.json b/checkpoint-90112/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8dd006e2c45b4178c8c43247c12d14ba89fc4b44 --- /dev/null +++ b/checkpoint-90112/config.json @@ -0,0 +1,32 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 512, + "n_head": 8, + "n_inner": null, + "n_layer": 8, + "n_positions": 2048, + "pad_token_id": 1, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.26.0.dev0", + "use_cache": true, + "vocab_size": 299 +} diff --git a/checkpoint-90112/optimizer.pt b/checkpoint-90112/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8edca00c73935896ecad2980fe178859d3f64deb --- /dev/null +++ b/checkpoint-90112/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9348484461aa0efc6ef43d1cd9f68f1780cc6f0a8d16df3dba1a94e0a24334c9 +size 211432837 diff --git a/checkpoint-90112/pytorch_model.bin b/checkpoint-90112/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..27dc69af2e3ceedb69b1b3f0448ac832b20c9e45 --- /dev/null +++ b/checkpoint-90112/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13bca6d5041775e0ef37e300c12fa2ef95343fea9202def32ad383199bfdc35b +size 139279005 diff --git a/checkpoint-90112/rng_state.pth b/checkpoint-90112/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9eb8fe9036b2faf03b9e60c4a5cff8a06414d289 --- /dev/null +++ b/checkpoint-90112/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957f5daa73c63386d7677bd24415e97f9a46fa12239f90b764d0b1e53483b6a4 +size 15597 diff --git a/checkpoint-90112/scaler.pt b/checkpoint-90112/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..15916d67f5cfb0827684e66b4267238718a6bce9 --- /dev/null +++ b/checkpoint-90112/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65677617c481c4dea5774baaa3893a9d30edaa0722420de801d77f9d5102f186 +size 557 diff --git a/checkpoint-90112/scheduler.pt b/checkpoint-90112/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6fd79c11971aedec7b9a23789d5463322284a0d --- /dev/null +++ b/checkpoint-90112/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d58edf844986f37be7d4bc128a885c7e910ca1227f1e618aed7827dd6a504ce +size 627 diff --git a/checkpoint-90112/trainer_state.json b/checkpoint-90112/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..49e370185f325b2a22311ca68d398df6188bd5cb --- /dev/null +++ b/checkpoint-90112/trainer_state.json @@ -0,0 +1,632 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.077971131795494, + "global_step": 90112, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.14, + "learning_rate": 0.00020480000000000002, + "loss": 2.1719, + "step": 2048 + }, + { + "epoch": 0.14, + "eval_loss": 1.0833024978637695, + "eval_runtime": 15.7887, + "eval_samples_per_second": 132.5, + "eval_steps_per_second": 8.297, + "step": 2048 + }, + { + "epoch": 0.28, + "learning_rate": 0.00040960000000000004, + "loss": 0.9468, + "step": 4096 + }, + { + "epoch": 0.28, + "eval_loss": 0.8000818490982056, + "eval_runtime": 17.2632, + "eval_samples_per_second": 121.183, + "eval_steps_per_second": 7.588, + "step": 4096 + }, + { + "epoch": 0.41, + "learning_rate": 0.0004998749142723946, + "loss": 0.7542, + "step": 6144 + }, + { + "epoch": 0.41, + "eval_loss": 0.7439925074577332, + "eval_runtime": 17.4134, + "eval_samples_per_second": 120.138, + "eval_steps_per_second": 7.523, + "step": 6144 + }, + { + "epoch": 0.55, + "learning_rate": 0.0004990273340312486, + "loss": 0.6756, + "step": 8192 + }, + { + "epoch": 0.55, + "eval_loss": 0.7018134593963623, + "eval_runtime": 15.8601, + "eval_samples_per_second": 131.904, + "eval_steps_per_second": 8.26, + "step": 8192 + }, + { + "epoch": 0.69, + "learning_rate": 0.0004973820371438889, + "loss": 0.631, + "step": 10240 + }, + { + "epoch": 0.69, + "eval_loss": 0.684688925743103, + "eval_runtime": 18.0314, + "eval_samples_per_second": 116.02, + "eval_steps_per_second": 7.265, + "step": 10240 + }, + { + "epoch": 0.83, + "learning_rate": 0.0004949442940386407, + "loss": 0.5989, + "step": 12288 + }, + { + "epoch": 0.83, + "eval_loss": 0.6822534203529358, + "eval_runtime": 17.4182, + "eval_samples_per_second": 120.104, + "eval_steps_per_second": 7.521, + "step": 12288 + }, + { + "epoch": 0.97, + "learning_rate": 0.0004917201492451735, + "loss": 0.5776, + "step": 14336 + }, + { + "epoch": 0.97, + "eval_loss": 0.6674544215202332, + "eval_runtime": 17.7487, + "eval_samples_per_second": 117.868, + "eval_steps_per_second": 7.381, + "step": 14336 + }, + { + "epoch": 1.11, + "learning_rate": 0.0004877230785006913, + "loss": 0.5525, + "step": 16384 + }, + { + "epoch": 1.11, + "eval_loss": 0.6640163660049438, + "eval_runtime": 15.8951, + "eval_samples_per_second": 131.613, + "eval_steps_per_second": 8.242, + "step": 16384 + }, + { + "epoch": 1.24, + "learning_rate": 0.0004829670105577666, + "loss": 0.5373, + "step": 18432 + }, + { + "epoch": 1.24, + "eval_loss": 0.6663170456886292, + "eval_runtime": 17.5, + "eval_samples_per_second": 119.543, + "eval_steps_per_second": 7.486, + "step": 18432 + }, + { + "epoch": 1.38, + "learning_rate": 0.0004774596641323791, + "loss": 0.5261, + "step": 20480 + }, + { + "epoch": 1.38, + "eval_loss": 0.6518951654434204, + "eval_runtime": 17.1326, + "eval_samples_per_second": 122.107, + "eval_steps_per_second": 7.646, + "step": 20480 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004712261976082475, + "loss": 0.5115, + "step": 22528 + }, + { + "epoch": 1.52, + "eval_loss": 0.6658991575241089, + "eval_runtime": 17.6874, + "eval_samples_per_second": 118.277, + "eval_steps_per_second": 7.406, + "step": 22528 + }, + { + "epoch": 1.66, + "learning_rate": 0.00046428407064289515, + "loss": 0.5033, + "step": 24576 + }, + { + "epoch": 1.66, + "eval_loss": 0.6602604389190674, + "eval_runtime": 15.8845, + "eval_samples_per_second": 131.701, + "eval_steps_per_second": 8.247, + "step": 24576 + }, + { + "epoch": 1.8, + "learning_rate": 0.00045665163060732317, + "loss": 0.4925, + "step": 26624 + }, + { + "epoch": 1.8, + "eval_loss": 0.6551440954208374, + "eval_runtime": 17.7902, + "eval_samples_per_second": 117.593, + "eval_steps_per_second": 7.364, + "step": 26624 + }, + { + "epoch": 1.93, + "learning_rate": 0.000448360778288479, + "loss": 0.4868, + "step": 28672 + }, + { + "epoch": 1.93, + "eval_loss": 0.6580803394317627, + "eval_runtime": 15.7841, + "eval_samples_per_second": 132.538, + "eval_steps_per_second": 8.299, + "step": 28672 + }, + { + "epoch": 2.07, + "learning_rate": 0.00043942999964293453, + "loss": 0.4741, + "step": 30720 + }, + { + "epoch": 2.07, + "eval_loss": 0.661230742931366, + "eval_runtime": 18.015, + "eval_samples_per_second": 116.126, + "eval_steps_per_second": 7.272, + "step": 30720 + }, + { + "epoch": 2.21, + "learning_rate": 0.0004298966220346151, + "loss": 0.4632, + "step": 32768 + }, + { + "epoch": 2.21, + "eval_loss": 0.6618030071258545, + "eval_runtime": 17.9073, + "eval_samples_per_second": 116.824, + "eval_steps_per_second": 7.315, + "step": 32768 + }, + { + "epoch": 2.35, + "learning_rate": 0.00041978697624050446, + "loss": 0.4584, + "step": 34816 + }, + { + "epoch": 2.35, + "eval_loss": 0.6579039096832275, + "eval_runtime": 17.8241, + "eval_samples_per_second": 117.369, + "eval_steps_per_second": 7.35, + "step": 34816 + }, + { + "epoch": 2.49, + "learning_rate": 0.0004091334467888659, + "loss": 0.451, + "step": 36864 + }, + { + "epoch": 2.49, + "eval_loss": 0.6627541780471802, + "eval_runtime": 17.6822, + "eval_samples_per_second": 118.311, + "eval_steps_per_second": 7.409, + "step": 36864 + }, + { + "epoch": 2.62, + "learning_rate": 0.00039796458815002033, + "loss": 0.445, + "step": 38912 + }, + { + "epoch": 2.62, + "eval_loss": 0.656867265701294, + "eval_runtime": 15.8659, + "eval_samples_per_second": 131.855, + "eval_steps_per_second": 8.257, + "step": 38912 + }, + { + "epoch": 2.76, + "learning_rate": 0.0003863212870708643, + "loss": 0.4404, + "step": 40960 + }, + { + "epoch": 2.76, + "eval_loss": 0.6653844118118286, + "eval_runtime": 17.9192, + "eval_samples_per_second": 116.747, + "eval_steps_per_second": 7.311, + "step": 40960 + }, + { + "epoch": 2.9, + "learning_rate": 0.00037424687637876156, + "loss": 0.4334, + "step": 43008 + }, + { + "epoch": 2.9, + "eval_loss": 0.6553301811218262, + "eval_runtime": 15.9304, + "eval_samples_per_second": 131.321, + "eval_steps_per_second": 8.223, + "step": 43008 + }, + { + "epoch": 3.04, + "learning_rate": 0.00036178064571954134, + "loss": 0.4278, + "step": 45056 + }, + { + "epoch": 3.04, + "eval_loss": 0.6766741871833801, + "eval_runtime": 16.9834, + "eval_samples_per_second": 123.179, + "eval_steps_per_second": 7.713, + "step": 45056 + }, + { + "epoch": 3.18, + "learning_rate": 0.0003489439971353363, + "loss": 0.4152, + "step": 47104 + }, + { + "epoch": 3.18, + "eval_loss": 0.6645193696022034, + "eval_runtime": 15.816, + "eval_samples_per_second": 132.271, + "eval_steps_per_second": 8.283, + "step": 47104 + }, + { + "epoch": 3.32, + "learning_rate": 0.0003357965820476752, + "loss": 0.411, + "step": 49152 + }, + { + "epoch": 3.32, + "eval_loss": 0.6748972535133362, + "eval_runtime": 17.7507, + "eval_samples_per_second": 117.855, + "eval_steps_per_second": 7.38, + "step": 49152 + }, + { + "epoch": 3.45, + "learning_rate": 0.0003223743322236833, + "loss": 0.4085, + "step": 51200 + }, + { + "epoch": 3.45, + "eval_loss": 0.6703893542289734, + "eval_runtime": 17.341, + "eval_samples_per_second": 120.639, + "eval_steps_per_second": 7.554, + "step": 51200 + }, + { + "epoch": 3.59, + "learning_rate": 0.0003087269633577651, + "loss": 0.4037, + "step": 53248 + }, + { + "epoch": 3.59, + "eval_loss": 0.6561577916145325, + "eval_runtime": 15.9295, + "eval_samples_per_second": 131.329, + "eval_steps_per_second": 8.224, + "step": 53248 + }, + { + "epoch": 3.73, + "learning_rate": 0.0002948780545870247, + "loss": 0.3975, + "step": 55296 + }, + { + "epoch": 3.73, + "eval_loss": 0.6698916554450989, + "eval_runtime": 17.9008, + "eval_samples_per_second": 116.866, + "eval_steps_per_second": 7.318, + "step": 55296 + }, + { + "epoch": 3.87, + "learning_rate": 0.0002808921064161573, + "loss": 0.3917, + "step": 57344 + }, + { + "epoch": 3.87, + "eval_loss": 0.7029281854629517, + "eval_runtime": 17.8146, + "eval_samples_per_second": 117.432, + "eval_steps_per_second": 7.354, + "step": 57344 + }, + { + "epoch": 4.01, + "learning_rate": 0.00026680720064442787, + "loss": 0.3876, + "step": 59392 + }, + { + "epoch": 4.01, + "eval_loss": 0.6837398409843445, + "eval_runtime": 17.4354, + "eval_samples_per_second": 119.986, + "eval_steps_per_second": 7.513, + "step": 59392 + }, + { + "epoch": 4.14, + "learning_rate": 0.00025266845586830784, + "loss": 0.3751, + "step": 61440 + }, + { + "epoch": 4.14, + "eval_loss": 0.690500795841217, + "eval_runtime": 17.8964, + "eval_samples_per_second": 116.895, + "eval_steps_per_second": 7.32, + "step": 61440 + }, + { + "epoch": 4.28, + "learning_rate": 0.00023851425721450398, + "loss": 0.3719, + "step": 63488 + }, + { + "epoch": 4.28, + "eval_loss": 0.6938452124595642, + "eval_runtime": 15.7962, + "eval_samples_per_second": 132.437, + "eval_steps_per_second": 8.293, + "step": 63488 + }, + { + "epoch": 4.42, + "learning_rate": 0.0002244106409269819, + "loss": 0.3644, + "step": 65536 + }, + { + "epoch": 4.42, + "eval_loss": 0.6987010836601257, + "eval_runtime": 15.9556, + "eval_samples_per_second": 131.114, + "eval_steps_per_second": 8.21, + "step": 65536 + }, + { + "epoch": 4.56, + "learning_rate": 0.00021037526400320187, + "loss": 0.357, + "step": 67584 + }, + { + "epoch": 4.56, + "eval_loss": 0.6973423957824707, + "eval_runtime": 15.8273, + "eval_samples_per_second": 132.177, + "eval_steps_per_second": 8.277, + "step": 67584 + }, + { + "epoch": 4.7, + "learning_rate": 0.0001964736950807942, + "loss": 0.357, + "step": 69632 + }, + { + "epoch": 4.7, + "eval_loss": 0.6949540972709656, + "eval_runtime": 16.1065, + "eval_samples_per_second": 129.885, + "eval_steps_per_second": 8.133, + "step": 69632 + }, + { + "epoch": 4.83, + "learning_rate": 0.00018274358855873096, + "loss": 0.3502, + "step": 71680 + }, + { + "epoch": 4.83, + "eval_loss": 0.6959769129753113, + "eval_runtime": 16.2001, + "eval_samples_per_second": 129.135, + "eval_steps_per_second": 8.086, + "step": 71680 + }, + { + "epoch": 4.97, + "learning_rate": 0.00016922892649452222, + "loss": 0.3458, + "step": 73728 + }, + { + "epoch": 4.97, + "eval_loss": 0.7017838358879089, + "eval_runtime": 18.0467, + "eval_samples_per_second": 115.922, + "eval_steps_per_second": 7.259, + "step": 73728 + }, + { + "epoch": 5.11, + "learning_rate": 0.00015597300080605504, + "loss": 0.3302, + "step": 75776 + }, + { + "epoch": 5.11, + "eval_loss": 0.7075567841529846, + "eval_runtime": 16.7136, + "eval_samples_per_second": 125.168, + "eval_steps_per_second": 7.838, + "step": 75776 + }, + { + "epoch": 5.25, + "learning_rate": 0.0001430182745933093, + "loss": 0.3278, + "step": 77824 + }, + { + "epoch": 5.25, + "eval_loss": 0.7178707718849182, + "eval_runtime": 15.7714, + "eval_samples_per_second": 132.645, + "eval_steps_per_second": 8.306, + "step": 77824 + }, + { + "epoch": 5.39, + "learning_rate": 0.00013040017526934073, + "loss": 0.3224, + "step": 79872 + }, + { + "epoch": 5.39, + "eval_loss": 0.725006639957428, + "eval_runtime": 15.8618, + "eval_samples_per_second": 131.889, + "eval_steps_per_second": 8.259, + "step": 79872 + }, + { + "epoch": 5.53, + "learning_rate": 0.00011817144183980649, + "loss": 0.3181, + "step": 81920 + }, + { + "epoch": 5.53, + "eval_loss": 0.7247176766395569, + "eval_runtime": 17.8123, + "eval_samples_per_second": 117.447, + "eval_steps_per_second": 7.354, + "step": 81920 + }, + { + "epoch": 5.66, + "learning_rate": 0.00010636499874117036, + "loss": 0.3154, + "step": 83968 + }, + { + "epoch": 5.66, + "eval_loss": 0.7248900532722473, + "eval_runtime": 17.751, + "eval_samples_per_second": 117.852, + "eval_steps_per_second": 7.38, + "step": 83968 + }, + { + "epoch": 5.8, + "learning_rate": 9.501866590283475e-05, + "loss": 0.3105, + "step": 86016 + }, + { + "epoch": 5.8, + "eval_loss": 0.7352888584136963, + "eval_runtime": 18.0338, + "eval_samples_per_second": 116.004, + "eval_steps_per_second": 7.264, + "step": 86016 + }, + { + "epoch": 5.94, + "learning_rate": 8.416361604489855e-05, + "loss": 0.3035, + "step": 88064 + }, + { + "epoch": 5.94, + "eval_loss": 0.7256708741188049, + "eval_runtime": 15.94, + "eval_samples_per_second": 131.242, + "eval_steps_per_second": 8.218, + "step": 88064 + }, + { + "epoch": 6.08, + "learning_rate": 7.384521927589935e-05, + "loss": 0.294, + "step": 90112 + }, + { + "epoch": 6.08, + "eval_loss": 0.7387065291404724, + "eval_runtime": 16.8562, + "eval_samples_per_second": 124.109, + "eval_steps_per_second": 7.772, + "step": 90112 + } + ], + "max_steps": 118608, + "num_train_epochs": 8, + "total_flos": 3.6946934765226394e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-90112/training_args.bin b/checkpoint-90112/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f9ee3ad348e277019b0ce52cdd8e1d7773f2852 --- /dev/null +++ b/checkpoint-90112/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0200139492e444a9d322a4f90a96e6dde09c7a882f05b816c2345dade5ea0f98 +size 3515 diff --git a/checkpoint-98304/config.json b/checkpoint-98304/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8dd006e2c45b4178c8c43247c12d14ba89fc4b44 --- /dev/null +++ b/checkpoint-98304/config.json @@ -0,0 +1,32 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 512, + "n_head": 8, + "n_inner": null, + "n_layer": 8, + "n_positions": 2048, + "pad_token_id": 1, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.26.0.dev0", + "use_cache": true, + "vocab_size": 299 +} diff --git a/checkpoint-98304/optimizer.pt b/checkpoint-98304/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fca1ad9ffb6edbfc6920035574e8e9b4d00eb958 --- /dev/null +++ b/checkpoint-98304/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d7734a595a50ad41bcd32219a078b91e1114f3997fc3e799e26f06248256806 +size 211432837 diff --git a/checkpoint-98304/pytorch_model.bin b/checkpoint-98304/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..fb33b7b543d02e4a373d13b398f3faf7b29a966a --- /dev/null +++ b/checkpoint-98304/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a99311b788b49a1b31c7ea78dbd1c0a61dbe1fccd8921a0e214881612326a75 +size 139279005 diff --git a/checkpoint-98304/rng_state.pth b/checkpoint-98304/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7674aea0a3dd434246376b5a1b531afa903c3ed0 --- /dev/null +++ b/checkpoint-98304/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c784673775d4d2de7e801421c9e7e86119774c857f824cd3e81863a68f877c6 +size 15597 diff --git a/checkpoint-98304/scaler.pt b/checkpoint-98304/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b18dc64de43759cc81672289702ff1b88877925 --- /dev/null +++ b/checkpoint-98304/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78b807021032a80d50f2569a9d524c7cc698851ef9731d8e200116f32fef0109 +size 557 diff --git a/checkpoint-98304/scheduler.pt b/checkpoint-98304/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..547ea5bb3280fbb85839e67875c87554cc6f34ae --- /dev/null +++ b/checkpoint-98304/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2f16428ba1197dd50c363ca86ce195e15241b8dd827371e6482f2b04693f68a +size 627 diff --git a/checkpoint-98304/trainer_state.json b/checkpoint-98304/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c0f027d8c428888e6e94661ef5c337d7f23cea09 --- /dev/null +++ b/checkpoint-98304/trainer_state.json @@ -0,0 +1,688 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.630513961958721, + "global_step": 98304, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.14, + "learning_rate": 0.00020480000000000002, + "loss": 2.1719, + "step": 2048 + }, + { + "epoch": 0.14, + "eval_loss": 1.0833024978637695, + "eval_runtime": 15.7887, + "eval_samples_per_second": 132.5, + "eval_steps_per_second": 8.297, + "step": 2048 + }, + { + "epoch": 0.28, + "learning_rate": 0.00040960000000000004, + "loss": 0.9468, + "step": 4096 + }, + { + "epoch": 0.28, + "eval_loss": 0.8000818490982056, + "eval_runtime": 17.2632, + "eval_samples_per_second": 121.183, + "eval_steps_per_second": 7.588, + "step": 4096 + }, + { + "epoch": 0.41, + "learning_rate": 0.0004998749142723946, + "loss": 0.7542, + "step": 6144 + }, + { + "epoch": 0.41, + "eval_loss": 0.7439925074577332, + "eval_runtime": 17.4134, + "eval_samples_per_second": 120.138, + "eval_steps_per_second": 7.523, + "step": 6144 + }, + { + "epoch": 0.55, + "learning_rate": 0.0004990273340312486, + "loss": 0.6756, + "step": 8192 + }, + { + "epoch": 0.55, + "eval_loss": 0.7018134593963623, + "eval_runtime": 15.8601, + "eval_samples_per_second": 131.904, + "eval_steps_per_second": 8.26, + "step": 8192 + }, + { + "epoch": 0.69, + "learning_rate": 0.0004973820371438889, + "loss": 0.631, + "step": 10240 + }, + { + "epoch": 0.69, + "eval_loss": 0.684688925743103, + "eval_runtime": 18.0314, + "eval_samples_per_second": 116.02, + "eval_steps_per_second": 7.265, + "step": 10240 + }, + { + "epoch": 0.83, + "learning_rate": 0.0004949442940386407, + "loss": 0.5989, + "step": 12288 + }, + { + "epoch": 0.83, + "eval_loss": 0.6822534203529358, + "eval_runtime": 17.4182, + "eval_samples_per_second": 120.104, + "eval_steps_per_second": 7.521, + "step": 12288 + }, + { + "epoch": 0.97, + "learning_rate": 0.0004917201492451735, + "loss": 0.5776, + "step": 14336 + }, + { + "epoch": 0.97, + "eval_loss": 0.6674544215202332, + "eval_runtime": 17.7487, + "eval_samples_per_second": 117.868, + "eval_steps_per_second": 7.381, + "step": 14336 + }, + { + "epoch": 1.11, + "learning_rate": 0.0004877230785006913, + "loss": 0.5525, + "step": 16384 + }, + { + "epoch": 1.11, + "eval_loss": 0.6640163660049438, + "eval_runtime": 15.8951, + "eval_samples_per_second": 131.613, + "eval_steps_per_second": 8.242, + "step": 16384 + }, + { + "epoch": 1.24, + "learning_rate": 0.0004829670105577666, + "loss": 0.5373, + "step": 18432 + }, + { + "epoch": 1.24, + "eval_loss": 0.6663170456886292, + "eval_runtime": 17.5, + "eval_samples_per_second": 119.543, + "eval_steps_per_second": 7.486, + "step": 18432 + }, + { + "epoch": 1.38, + "learning_rate": 0.0004774596641323791, + "loss": 0.5261, + "step": 20480 + }, + { + "epoch": 1.38, + "eval_loss": 0.6518951654434204, + "eval_runtime": 17.1326, + "eval_samples_per_second": 122.107, + "eval_steps_per_second": 7.646, + "step": 20480 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004712261976082475, + "loss": 0.5115, + "step": 22528 + }, + { + "epoch": 1.52, + "eval_loss": 0.6658991575241089, + "eval_runtime": 17.6874, + "eval_samples_per_second": 118.277, + "eval_steps_per_second": 7.406, + "step": 22528 + }, + { + "epoch": 1.66, + "learning_rate": 0.00046428407064289515, + "loss": 0.5033, + "step": 24576 + }, + { + "epoch": 1.66, + "eval_loss": 0.6602604389190674, + "eval_runtime": 15.8845, + "eval_samples_per_second": 131.701, + "eval_steps_per_second": 8.247, + "step": 24576 + }, + { + "epoch": 1.8, + "learning_rate": 0.00045665163060732317, + "loss": 0.4925, + "step": 26624 + }, + { + "epoch": 1.8, + "eval_loss": 0.6551440954208374, + "eval_runtime": 17.7902, + "eval_samples_per_second": 117.593, + "eval_steps_per_second": 7.364, + "step": 26624 + }, + { + "epoch": 1.93, + "learning_rate": 0.000448360778288479, + "loss": 0.4868, + "step": 28672 + }, + { + "epoch": 1.93, + "eval_loss": 0.6580803394317627, + "eval_runtime": 15.7841, + "eval_samples_per_second": 132.538, + "eval_steps_per_second": 8.299, + "step": 28672 + }, + { + "epoch": 2.07, + "learning_rate": 0.00043942999964293453, + "loss": 0.4741, + "step": 30720 + }, + { + "epoch": 2.07, + "eval_loss": 0.661230742931366, + "eval_runtime": 18.015, + "eval_samples_per_second": 116.126, + "eval_steps_per_second": 7.272, + "step": 30720 + }, + { + "epoch": 2.21, + "learning_rate": 0.0004298966220346151, + "loss": 0.4632, + "step": 32768 + }, + { + "epoch": 2.21, + "eval_loss": 0.6618030071258545, + "eval_runtime": 17.9073, + "eval_samples_per_second": 116.824, + "eval_steps_per_second": 7.315, + "step": 32768 + }, + { + "epoch": 2.35, + "learning_rate": 0.00041978697624050446, + "loss": 0.4584, + "step": 34816 + }, + { + "epoch": 2.35, + "eval_loss": 0.6579039096832275, + "eval_runtime": 17.8241, + "eval_samples_per_second": 117.369, + "eval_steps_per_second": 7.35, + "step": 34816 + }, + { + "epoch": 2.49, + "learning_rate": 0.0004091334467888659, + "loss": 0.451, + "step": 36864 + }, + { + "epoch": 2.49, + "eval_loss": 0.6627541780471802, + "eval_runtime": 17.6822, + "eval_samples_per_second": 118.311, + "eval_steps_per_second": 7.409, + "step": 36864 + }, + { + "epoch": 2.62, + "learning_rate": 0.00039796458815002033, + "loss": 0.445, + "step": 38912 + }, + { + "epoch": 2.62, + "eval_loss": 0.656867265701294, + "eval_runtime": 15.8659, + "eval_samples_per_second": 131.855, + "eval_steps_per_second": 8.257, + "step": 38912 + }, + { + "epoch": 2.76, + "learning_rate": 0.0003863212870708643, + "loss": 0.4404, + "step": 40960 + }, + { + "epoch": 2.76, + "eval_loss": 0.6653844118118286, + "eval_runtime": 17.9192, + "eval_samples_per_second": 116.747, + "eval_steps_per_second": 7.311, + "step": 40960 + }, + { + "epoch": 2.9, + "learning_rate": 0.00037424687637876156, + "loss": 0.4334, + "step": 43008 + }, + { + "epoch": 2.9, + "eval_loss": 0.6553301811218262, + "eval_runtime": 15.9304, + "eval_samples_per_second": 131.321, + "eval_steps_per_second": 8.223, + "step": 43008 + }, + { + "epoch": 3.04, + "learning_rate": 0.00036178064571954134, + "loss": 0.4278, + "step": 45056 + }, + { + "epoch": 3.04, + "eval_loss": 0.6766741871833801, + "eval_runtime": 16.9834, + "eval_samples_per_second": 123.179, + "eval_steps_per_second": 7.713, + "step": 45056 + }, + { + "epoch": 3.18, + "learning_rate": 0.0003489439971353363, + "loss": 0.4152, + "step": 47104 + }, + { + "epoch": 3.18, + "eval_loss": 0.6645193696022034, + "eval_runtime": 15.816, + "eval_samples_per_second": 132.271, + "eval_steps_per_second": 8.283, + "step": 47104 + }, + { + "epoch": 3.32, + "learning_rate": 0.0003357965820476752, + "loss": 0.411, + "step": 49152 + }, + { + "epoch": 3.32, + "eval_loss": 0.6748972535133362, + "eval_runtime": 17.7507, + "eval_samples_per_second": 117.855, + "eval_steps_per_second": 7.38, + "step": 49152 + }, + { + "epoch": 3.45, + "learning_rate": 0.0003223743322236833, + "loss": 0.4085, + "step": 51200 + }, + { + "epoch": 3.45, + "eval_loss": 0.6703893542289734, + "eval_runtime": 17.341, + "eval_samples_per_second": 120.639, + "eval_steps_per_second": 7.554, + "step": 51200 + }, + { + "epoch": 3.59, + "learning_rate": 0.0003087269633577651, + "loss": 0.4037, + "step": 53248 + }, + { + "epoch": 3.59, + "eval_loss": 0.6561577916145325, + "eval_runtime": 15.9295, + "eval_samples_per_second": 131.329, + "eval_steps_per_second": 8.224, + "step": 53248 + }, + { + "epoch": 3.73, + "learning_rate": 0.0002948780545870247, + "loss": 0.3975, + "step": 55296 + }, + { + "epoch": 3.73, + "eval_loss": 0.6698916554450989, + "eval_runtime": 17.9008, + "eval_samples_per_second": 116.866, + "eval_steps_per_second": 7.318, + "step": 55296 + }, + { + "epoch": 3.87, + "learning_rate": 0.0002808921064161573, + "loss": 0.3917, + "step": 57344 + }, + { + "epoch": 3.87, + "eval_loss": 0.7029281854629517, + "eval_runtime": 17.8146, + "eval_samples_per_second": 117.432, + "eval_steps_per_second": 7.354, + "step": 57344 + }, + { + "epoch": 4.01, + "learning_rate": 0.00026680720064442787, + "loss": 0.3876, + "step": 59392 + }, + { + "epoch": 4.01, + "eval_loss": 0.6837398409843445, + "eval_runtime": 17.4354, + "eval_samples_per_second": 119.986, + "eval_steps_per_second": 7.513, + "step": 59392 + }, + { + "epoch": 4.14, + "learning_rate": 0.00025266845586830784, + "loss": 0.3751, + "step": 61440 + }, + { + "epoch": 4.14, + "eval_loss": 0.690500795841217, + "eval_runtime": 17.8964, + "eval_samples_per_second": 116.895, + "eval_steps_per_second": 7.32, + "step": 61440 + }, + { + "epoch": 4.28, + "learning_rate": 0.00023851425721450398, + "loss": 0.3719, + "step": 63488 + }, + { + "epoch": 4.28, + "eval_loss": 0.6938452124595642, + "eval_runtime": 15.7962, + "eval_samples_per_second": 132.437, + "eval_steps_per_second": 8.293, + "step": 63488 + }, + { + "epoch": 4.42, + "learning_rate": 0.0002244106409269819, + "loss": 0.3644, + "step": 65536 + }, + { + "epoch": 4.42, + "eval_loss": 0.6987010836601257, + "eval_runtime": 15.9556, + "eval_samples_per_second": 131.114, + "eval_steps_per_second": 8.21, + "step": 65536 + }, + { + "epoch": 4.56, + "learning_rate": 0.00021037526400320187, + "loss": 0.357, + "step": 67584 + }, + { + "epoch": 4.56, + "eval_loss": 0.6973423957824707, + "eval_runtime": 15.8273, + "eval_samples_per_second": 132.177, + "eval_steps_per_second": 8.277, + "step": 67584 + }, + { + "epoch": 4.7, + "learning_rate": 0.0001964736950807942, + "loss": 0.357, + "step": 69632 + }, + { + "epoch": 4.7, + "eval_loss": 0.6949540972709656, + "eval_runtime": 16.1065, + "eval_samples_per_second": 129.885, + "eval_steps_per_second": 8.133, + "step": 69632 + }, + { + "epoch": 4.83, + "learning_rate": 0.00018274358855873096, + "loss": 0.3502, + "step": 71680 + }, + { + "epoch": 4.83, + "eval_loss": 0.6959769129753113, + "eval_runtime": 16.2001, + "eval_samples_per_second": 129.135, + "eval_steps_per_second": 8.086, + "step": 71680 + }, + { + "epoch": 4.97, + "learning_rate": 0.00016922892649452222, + "loss": 0.3458, + "step": 73728 + }, + { + "epoch": 4.97, + "eval_loss": 0.7017838358879089, + "eval_runtime": 18.0467, + "eval_samples_per_second": 115.922, + "eval_steps_per_second": 7.259, + "step": 73728 + }, + { + "epoch": 5.11, + "learning_rate": 0.00015597300080605504, + "loss": 0.3302, + "step": 75776 + }, + { + "epoch": 5.11, + "eval_loss": 0.7075567841529846, + "eval_runtime": 16.7136, + "eval_samples_per_second": 125.168, + "eval_steps_per_second": 7.838, + "step": 75776 + }, + { + "epoch": 5.25, + "learning_rate": 0.0001430182745933093, + "loss": 0.3278, + "step": 77824 + }, + { + "epoch": 5.25, + "eval_loss": 0.7178707718849182, + "eval_runtime": 15.7714, + "eval_samples_per_second": 132.645, + "eval_steps_per_second": 8.306, + "step": 77824 + }, + { + "epoch": 5.39, + "learning_rate": 0.00013040017526934073, + "loss": 0.3224, + "step": 79872 + }, + { + "epoch": 5.39, + "eval_loss": 0.725006639957428, + "eval_runtime": 15.8618, + "eval_samples_per_second": 131.889, + "eval_steps_per_second": 8.259, + "step": 79872 + }, + { + "epoch": 5.53, + "learning_rate": 0.00011817144183980649, + "loss": 0.3181, + "step": 81920 + }, + { + "epoch": 5.53, + "eval_loss": 0.7247176766395569, + "eval_runtime": 17.8123, + "eval_samples_per_second": 117.447, + "eval_steps_per_second": 7.354, + "step": 81920 + }, + { + "epoch": 5.66, + "learning_rate": 0.00010636499874117036, + "loss": 0.3154, + "step": 83968 + }, + { + "epoch": 5.66, + "eval_loss": 0.7248900532722473, + "eval_runtime": 17.751, + "eval_samples_per_second": 117.852, + "eval_steps_per_second": 7.38, + "step": 83968 + }, + { + "epoch": 5.8, + "learning_rate": 9.501866590283475e-05, + "loss": 0.3105, + "step": 86016 + }, + { + "epoch": 5.8, + "eval_loss": 0.7352888584136963, + "eval_runtime": 18.0338, + "eval_samples_per_second": 116.004, + "eval_steps_per_second": 7.264, + "step": 86016 + }, + { + "epoch": 5.94, + "learning_rate": 8.416361604489855e-05, + "loss": 0.3035, + "step": 88064 + }, + { + "epoch": 5.94, + "eval_loss": 0.7256708741188049, + "eval_runtime": 15.94, + "eval_samples_per_second": 131.242, + "eval_steps_per_second": 8.218, + "step": 88064 + }, + { + "epoch": 6.08, + "learning_rate": 7.384521927589935e-05, + "loss": 0.294, + "step": 90112 + }, + { + "epoch": 6.08, + "eval_loss": 0.7387065291404724, + "eval_runtime": 16.8562, + "eval_samples_per_second": 124.109, + "eval_steps_per_second": 7.772, + "step": 90112 + }, + { + "epoch": 6.22, + "learning_rate": 6.409110434142392e-05, + "loss": 0.2866, + "step": 92160 + }, + { + "epoch": 6.22, + "eval_loss": 0.7492235898971558, + "eval_runtime": 17.2872, + "eval_samples_per_second": 121.015, + "eval_steps_per_second": 7.578, + "step": 92160 + }, + { + "epoch": 6.35, + "learning_rate": 5.492819313147518e-05, + "loss": 0.2836, + "step": 94208 + }, + { + "epoch": 6.35, + "eval_loss": 0.7440558671951294, + "eval_runtime": 17.5257, + "eval_samples_per_second": 119.367, + "eval_steps_per_second": 7.475, + "step": 94208 + }, + { + "epoch": 6.49, + "learning_rate": 4.6394783238561305e-05, + "loss": 0.2811, + "step": 96256 + }, + { + "epoch": 6.49, + "eval_loss": 0.7577848434448242, + "eval_runtime": 15.8536, + "eval_samples_per_second": 131.957, + "eval_steps_per_second": 8.263, + "step": 96256 + }, + { + "epoch": 6.63, + "learning_rate": 3.851358797621554e-05, + "loss": 0.2774, + "step": 98304 + }, + { + "epoch": 6.63, + "eval_loss": 0.7483424544334412, + "eval_runtime": 15.8485, + "eval_samples_per_second": 132.0, + "eval_steps_per_second": 8.266, + "step": 98304 + } + ], + "max_steps": 118608, + "num_train_epochs": 8, + "total_flos": 4.030582068221829e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-98304/training_args.bin b/checkpoint-98304/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f9ee3ad348e277019b0ce52cdd8e1d7773f2852 --- /dev/null +++ b/checkpoint-98304/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0200139492e444a9d322a4f90a96e6dde09c7a882f05b816c2345dade5ea0f98 +size 3515 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8dd006e2c45b4178c8c43247c12d14ba89fc4b44 --- /dev/null +++ b/config.json @@ -0,0 +1,32 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 512, + "n_head": 8, + "n_inner": null, + "n_layer": 8, + "n_positions": 2048, + "pad_token_id": 1, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.26.0.dev0", + "use_cache": true, + "vocab_size": 299 +} diff --git a/manual_upload/manual_upload/manual_upload/training_args.json b/manual_upload/manual_upload/manual_upload/training_args.json index c4f472294494263a6c72fc414cb357a783c38770..7e7e9d65223eb36a1c99dbb5c6555e1cfe202d63 100644 --- a/manual_upload/manual_upload/manual_upload/training_args.json +++ b/manual_upload/manual_upload/manual_upload/training_args.json @@ -6,7 +6,7 @@ "do_predict": false, "evaluation_strategy": "steps", "prediction_loss_only": false, - "per_device_train_batch_size": 8, + "per_device_train_batch_size": 10, "per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, diff --git a/manual_upload/manual_upload/training_args.json b/manual_upload/manual_upload/training_args.json index c78f56ac7ec8f199438eb8cb92960b7652fdabb9..c4f472294494263a6c72fc414cb357a783c38770 100644 --- a/manual_upload/manual_upload/training_args.json +++ b/manual_upload/manual_upload/training_args.json @@ -6,7 +6,7 @@ "do_predict": false, "evaluation_strategy": "steps", "prediction_loss_only": false, - "per_device_train_batch_size": 7, + "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, diff --git a/manual_upload/training_args.json b/manual_upload/training_args.json index 43d3345475bdda34754d00e52cd2fbc1a6556b64..c78f56ac7ec8f199438eb8cb92960b7652fdabb9 100644 --- a/manual_upload/training_args.json +++ b/manual_upload/training_args.json @@ -19,7 +19,7 @@ "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, - "num_train_epochs": 8, + "num_train_epochs": 10, "max_steps": -1, "lr_scheduler_type": "cosine", "warmup_ratio": 0.0, diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e47113d5da001f5b6370a9a27b60a5c11d0d539b --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c428cc7de0185bbb550e80aa69a32f25f05fe29d68c409a71c41069f58cbae30 +size 139279005 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..bd76b2b2c813ed0fbec7b73fe1e0d837191c707a --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,3 @@ +{ + "pad_token": "[PAD]" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d59b8e3ef20bd0c29761a445c66f97023f98fe88 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,357 @@ +{ + "version": "1.0", + "truncation": { + "direction": "Right", + "max_length": 2048, + "strategy": "LongestFirst", + "stride": 0 + }, + "padding": { + "strategy": "BatchLongest", + "direction": "Right", + "pad_to_multiple_of": null, + "pad_id": 1, + "pad_type_id": 0, + "pad_token": "[PAD]" + }, + "added_tokens": [ + { + "id": 0, + "content": "[UNK]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "[PAD]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "[MASK]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "WhitespaceSplit" + }, + "post_processor": null, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "[UNK]": 0, + "[PAD]": 1, + "[MASK]": 2, + "TIME_DELTA=1": 3, + "TIME_DELTA=2": 4, + "BAR_END": 5, + "BAR_START": 6, + "NOTE_OFF=42": 7, + "NOTE_ON=42": 8, + "NOTE_OFF=36": 9, + "NOTE_ON=36": 10, + "TIME_DELTA=4": 11, + "NOTE_OFF=38": 12, + "NOTE_ON=38": 13, + "NOTE_OFF=54": 14, + "NOTE_ON=54": 15, + "NOTE_OFF=62": 16, + "NOTE_ON=62": 17, + "NOTE_OFF=64": 18, + "NOTE_ON=64": 19, + "TIME_DELTA=3": 20, + "NOTE_OFF=57": 21, + "NOTE_ON=57": 22, + "NOTE_OFF=69": 23, + "NOTE_ON=69": 24, + "TRACK_END": 25, + "TRACK_START": 26, + "NOTE_OFF=40": 27, + "NOTE_ON=40": 28, + "NOTE_OFF=60": 29, + "NOTE_ON=60": 30, + "NOTE_OFF=35": 31, + "NOTE_ON=35": 32, + "NOTE_OFF=59": 33, + "NOTE_ON=59": 34, + "NOTE_OFF=55": 35, + "NOTE_ON=55": 36, + "NOTE_OFF=46": 37, + "NOTE_ON=46": 38, + "NOTE_OFF=67": 39, + "NOTE_ON=67": 40, + "NOTE_OFF=70": 41, + "NOTE_ON=70": 42, + "NOTE_OFF=50": 43, + "NOTE_ON=50": 44, + "NOTE_OFF=44": 45, + "NOTE_ON=44": 46, + "NOTE_OFF=52": 47, + "NOTE_ON=52": 48, + "NOTE_OFF=61": 49, + "NOTE_ON=61": 50, + "NOTE_OFF=65": 51, + "NOTE_ON=65": 52, + "NOTE_OFF=63": 53, + "NOTE_ON=63": 54, + "NOTE_OFF=66": 55, + "NOTE_ON=66": 56, + "NOTE_OFF=45": 57, + "NOTE_ON=45": 58, + "NOTE_OFF=51": 59, + "NOTE_ON=51": 60, + "NOTE_OFF=43": 61, + "NOTE_ON=43": 62, + "NOTE_OFF=48": 63, + "NOTE_ON=48": 64, + "NOTE_OFF=58": 65, + "NOTE_ON=58": 66, + "NOTE_OFF=39": 67, + "NOTE_ON=39": 68, + "NOTE_OFF=53": 69, + "NOTE_ON=53": 70, + "NOTE_OFF=56": 71, + "NOTE_ON=56": 72, + "NOTE_OFF=47": 73, + "NOTE_ON=47": 74, + "NOTE_OFF=68": 75, + "NOTE_ON=68": 76, + "NOTE_OFF=49": 77, + "NOTE_ON=49": 78, + "NOTE_OFF=72": 79, + "NOTE_ON=72": 80, + "NOTE_OFF=71": 81, + "NOTE_ON=71": 82, + "NOTE_OFF=41": 83, + "NOTE_ON=41": 84, + "NOTE_OFF=74": 85, + "NOTE_ON=74": 86, + "NOTE_OFF=33": 87, + "NOTE_ON=33": 88, + "TIME_DELTA=6": 89, + "NOTE_OFF=82": 90, + "NOTE_ON=82": 91, + "TIME_DELTA=16": 92, + "TIME_DELTA=8": 93, + "NOTE_OFF=37": 94, + "NOTE_ON=37": 95, + "NOTE_OFF=31": 96, + "NOTE_ON=31": 97, + "NOTE_OFF=76": 98, + "NOTE_ON=76": 99, + "DENSITY=3": 100, + "NOTE_OFF=73": 101, + "NOTE_ON=73": 102, + "DENSITY=0": 103, + "NOTE_OFF=28": 104, + "NOTE_ON=28": 105, + "DENSITY=1": 106, + "DENSITY=2": 107, + "NOTE_OFF=34": 108, + "NOTE_ON=34": 109, + "INST=3": 110, + "NOTE_OFF=75": 111, + "NOTE_ON=75": 112, + "NOTE_OFF=77": 113, + "NOTE_ON=77": 114, + "PIECE_START": 115, + "NOTE_OFF=79": 116, + "NOTE_ON=79": 117, + "INST=DRUMS": 118, + "NOTE_OFF=32": 119, + "NOTE_ON=32": 120, + "NOTE_OFF=29": 121, + "NOTE_ON=29": 122, + "INST=4": 123, + "NOTE_OFF=81": 124, + "NOTE_ON=81": 125, + "TIME_DELTA=5": 126, + "NOTE_OFF=78": 127, + "NOTE_ON=78": 128, + "NOTE_OFF=30": 129, + "NOTE_ON=30": 130, + "NOTE_OFF=27": 131, + "NOTE_ON=27": 132, + "INST=6": 133, + "NOTE_OFF=80": 134, + "NOTE_ON=80": 135, + "TIME_DELTA=7": 136, + "NOTE_OFF=26": 137, + "NOTE_ON=26": 138, + "INST=0": 139, + "NOTE_OFF=83": 140, + "NOTE_ON=83": 141, + "TIME_DELTA=12": 142, + "TIME_DELTA=10": 143, + "NOTE_OFF=84": 144, + "NOTE_ON=84": 145, + "NOTE_OFF=86": 146, + "NOTE_ON=86": 147, + "INST=10": 148, + "NOTE_OFF=85": 149, + "NOTE_ON=85": 150, + "TIME_DELTA=14": 151, + "TIME_DELTA=15": 152, + "NOTE_OFF=88": 153, + "NOTE_ON=88": 154, + "INST=8": 155, + "INST=11": 156, + "NOTE_OFF=87": 157, + "NOTE_ON=87": 158, + "TIME_DELTA=9": 159, + "NOTE_OFF=24": 160, + "NOTE_ON=24": 161, + "INST=7": 162, + "NOTE_OFF=25": 163, + "NOTE_ON=25": 164, + "NOTE_OFF=89": 165, + "NOTE_ON=89": 166, + "NOTE_OFF=91": 167, + "NOTE_ON=91": 168, + "TIME_DELTA=11": 169, + "TIME_DELTA=13": 170, + "INST=2": 171, + "NOTE_OFF=93": 172, + "NOTE_ON=93": 173, + "NOTE_OFF=22": 174, + "NOTE_ON=22": 175, + "NOTE_OFF=23": 176, + "NOTE_ON=23": 177, + "NOTE_OFF=90": 178, + "NOTE_ON=90": 179, + "INST=9": 180, + "INST=5": 181, + "INST=1": 182, + "NOTE_OFF=94": 183, + "NOTE_ON=94": 184, + "INST=12": 185, + "INST=14": 186, + "NOTE_OFF=92": 187, + "NOTE_ON=92": 188, + "NOTE_OFF=96": 189, + "NOTE_ON=96": 190, + "NOTE_OFF=95": 191, + "NOTE_ON=95": 192, + "NOTE_OFF=98": 193, + "NOTE_ON=98": 194, + "INST=15": 195, + "NOTE_OFF=21": 196, + "NOTE_ON=21": 197, + "INST=13": 198, + "NOTE_OFF=19": 199, + "NOTE_ON=19": 200, + "NOTE_OFF=99": 201, + "NOTE_ON=99": 202, + "NOTE_OFF=97": 203, + "NOTE_ON=97": 204, + "NOTE_OFF=0": 205, + "NOTE_ON=0": 206, + "NOTE_OFF=100": 207, + "NOTE_ON=100": 208, + "NOTE_OFF=16": 209, + "NOTE_ON=16": 210, + "NOTE_OFF=13": 211, + "NOTE_ON=13": 212, + "NOTE_OFF=20": 213, + "NOTE_ON=20": 214, + "NOTE_OFF=105": 215, + "NOTE_ON=105": 216, + "NOTE_OFF=103": 217, + "NOTE_ON=103": 218, + "NOTE_OFF=101": 219, + "NOTE_ON=101": 220, + "NOTE_OFF=102": 221, + "NOTE_ON=102": 222, + "NOTE_OFF=17": 223, + "NOTE_ON=17": 224, + "NOTE_OFF=107": 225, + "NOTE_ON=107": 226, + "NOTE_OFF=108": 227, + "NOTE_ON=108": 228, + "NOTE_OFF=18": 229, + "NOTE_ON=18": 230, + "NOTE_OFF=126": 231, + "NOTE_ON=126": 232, + "NOTE_OFF=104": 233, + "NOTE_ON=104": 234, + "NOTE_OFF=8": 235, + "NOTE_ON=8": 236, + "NOTE_OFF=117": 237, + "NOTE_ON=117": 238, + "NOTE_OFF=106": 239, + "NOTE_ON=106": 240, + "NOTE_OFF=110": 241, + "NOTE_ON=110": 242, + "NOTE_OFF=112": 243, + "NOTE_ON=112": 244, + "NOTE_OFF=12": 245, + "NOTE_ON=12": 246, + "NOTE_OFF=9": 247, + "NOTE_ON=9": 248, + "NOTE_OFF=14": 249, + "NOTE_ON=14": 250, + "NOTE_OFF=113": 251, + "NOTE_ON=113": 252, + "NOTE_OFF=15": 253, + "NOTE_ON=15": 254, + "NOTE_OFF=125": 255, + "NOTE_ON=125": 256, + "NOTE_OFF=109": 257, + "NOTE_ON=109": 258, + "NOTE_OFF=115": 259, + "NOTE_ON=115": 260, + "NOTE_OFF=120": 261, + "NOTE_ON=120": 262, + "NOTE_OFF=119": 263, + "NOTE_ON=119": 264, + "NOTE_OFF=122": 265, + "NOTE_ON=122": 266, + "NOTE_OFF=124": 267, + "NOTE_OFF=127": 268, + "NOTE_ON=124": 269, + "NOTE_ON=127": 270, + "NOTE_OFF=11": 271, + "NOTE_ON=11": 272, + "NOTE_OFF=4": 273, + "NOTE_ON=4": 274, + "NOTE_OFF=10": 275, + "NOTE_ON=10": 276, + "NOTE_OFF=111": 277, + "NOTE_ON=111": 278, + "NOTE_OFF=5": 279, + "NOTE_OFF=6": 280, + "NOTE_ON=5": 281, + "NOTE_ON=6": 282, + "NOTE_OFF=1": 283, + "NOTE_ON=1": 284, + "NOTE_OFF=114": 285, + "NOTE_ON=114": 286, + "NOTE_OFF=2": 287, + "NOTE_ON=2": 288, + "NOTE_OFF=7": 289, + "NOTE_ON=7": 290, + "NOTE_OFF=3": 291, + "NOTE_ON=3": 292, + "NOTE_OFF=116": 293, + "NOTE_OFF=121": 294, + "NOTE_ON=116": 295, + "NOTE_ON=121": 296, + "NOTE_OFF=118": 297, + "NOTE_ON=118": 298 + }, + "unk_token": "[UNK]" + } +} \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7dcb0fca6b937743ad5507e9f508766dcdf7b09d --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,4 @@ +{ + "model_max_length": 1000000000000000019884624838656, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..829a6020707fb734f7ff712053844682623b1750 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,15 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": null, + "global_step": 0, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [], + "max_steps": 0, + "num_train_epochs": 0, + "total_flos": 0, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f9ee3ad348e277019b0ce52cdd8e1d7773f2852 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0200139492e444a9d322a4f90a96e6dde09c7a882f05b816c2345dade5ea0f98 +size 3515 diff --git a/training_args.json b/training_args.json new file mode 100644 index 0000000000000000000000000000000000000000..43d3345475bdda34754d00e52cd2fbc1a6556b64 --- /dev/null +++ b/training_args.json @@ -0,0 +1,109 @@ +{ + "output_dir": "models/improved_4bars", + "overwrite_output_dir": true, + "do_train": false, + "do_eval": true, + "do_predict": false, + "evaluation_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 7, + "per_device_eval_batch_size": 8, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "learning_rate": 0.0005, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 8, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "warmup_ratio": 0.0, + "warmup_steps": 5000, + "log_level": "passive", + "log_level_replica": "passive", + "log_on_each_node": true, + "logging_dir": "models/improved_4bars/logs", + "logging_strategy": "steps", + "logging_first_step": false, + "logging_steps": 2048, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 8192, + "save_total_limit": 5, + "save_on_each_node": false, + "no_cuda": false, + "use_mps_device": false, + "seed": 42, + "data_seed": null, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": false, + "fp16": true, + "fp16_opt_level": "O1", + "half_precision_backend": "cuda_amp", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "xpu_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": [], + "dataloader_drop_last": false, + "eval_steps": 2048, + "dataloader_num_workers": 0, + "past_index": -1, + "run_name": "models/improved_4bars", + "disable_tqdm": false, + "remove_unused_columns": true, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": null, + "greater_is_better": null, + "ignore_data_skip": false, + "sharded_ddp": [], + "fsdp": [], + "fsdp_min_num_params": 0, + "fsdp_transformer_layer_cls_to_wrap": null, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_hf", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "dataloader_pin_memory": true, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": true, + "resume_from_checkpoint": null, + "hub_model_id": "JammyMachina/improved_4bars-mdl", + "hub_strategy": "every_save", + "hub_token": "", + "hub_private_repo": false, + "gradient_checkpointing": false, + "include_inputs_for_metrics": false, + "fp16_backend": "auto", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": "", + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null +} \ No newline at end of file