diff --git "a/log.txt" "b/log.txt" new file mode 100644--- /dev/null +++ "b/log.txt" @@ -0,0 +1,267 @@ +08/27/2022 00:02:42 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1 distributed training: True, 16-bits training: True +08/27/2022 00:02:42 - INFO - __main__ - Training/evaluation parameters OurTrainingArguments(output_dir='out/mabel-joint-cl-al1-mlm-bs-32-lr-5e-5-msl-128-ep-2', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=False, evaluation_strategy=, prediction_loss_only=False, per_device_train_batch_size=32, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=, warmup_steps=0, logging_dir='runs/Aug27_00-02-42_a11-03.hpc.usc.edu', logging_first_step=False, logging_steps=500, save_steps=125, save_total_limit=None, no_cuda=False, seed=42, fp16=True, fp16_opt_level='O1', fp16_backend='auto', local_rank=0, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name='out/mabel-joint-cl-al1-mlm-bs-32-lr-5e-5-msl-128-ep-2', disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=True, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, sharded_ddp=False, deepspeed=None, label_smoothing_factor=0.0, adafactor=False, eval_transfer=False, report_to='wandb') +08/27/2022 00:02:42 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1 distributed training: True, 16-bits training: True +08/27/2022 00:02:42 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1 distributed training: True, 16-bits training: True +08/27/2022 00:02:42 - WARNING - __main__ - Process rank: 3, device: cuda:3, n_gpu: 1 distributed training: True, 16-bits training: True +08/27/2022 00:02:42 - WARNING - datasets.builder - Using custom data configuration default-2f6794b69ce47e79 +08/27/2022 00:02:42 - WARNING - datasets.builder - Using custom data configuration default-2f6794b69ce47e79 +08/27/2022 00:02:42 - WARNING - datasets.builder - Using custom data configuration default-2f6794b69ce47e79 +08/27/2022 00:02:43 - WARNING - datasets.builder - Using custom data configuration default-2f6794b69ce47e79 +08/27/2022 00:02:44 - WARNING - datasets.builder - Reusing dataset csv (.cache/csv/default-2f6794b69ce47e79/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a) + 0%| | 0/1 [00:00> loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at .cache/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e +[INFO|configuration_utils.py:481] 2022-08-27 00:02:44,796 >> Model config BertConfig { + "architectures": [ + "BertForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embedding_type": "absolute", + "transformers_version": "4.2.1", + "type_vocab_size": 2, + "use_cache": true, + 
"vocab_size": 30522 +} + +[INFO|configuration_utils.py:445] 2022-08-27 00:02:45,107 >> loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at .cache/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e +[INFO|configuration_utils.py:481] 2022-08-27 00:02:45,108 >> Model config BertConfig { + "architectures": [ + "BertForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embedding_type": "absolute", + "transformers_version": "4.2.1", + "type_vocab_size": 2, + "use_cache": true, + "vocab_size": 30522 +} + +[INFO|tokenization_utils_base.py:1766] 2022-08-27 00:02:45,697 >> loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at .cache/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99 +[INFO|tokenization_utils_base.py:1766] 2022-08-27 00:02:45,697 >> loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at .cache/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4 +[INFO|modeling_utils.py:1027] 2022-08-27 00:02:46,093 >> loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache at .cache/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f +Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMabel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias'] +- This IS expected if you are initializing BertForMabel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). +- This IS NOT expected if you are initializing BertForMabel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). +Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMabel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias'] +- This IS expected if you are initializing BertForMabel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). 
+- This IS NOT expected if you are initializing BertForMabel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+Some weights of BertForMabel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['lm_head.bias', 'lm_head.transform.dense.weight', 'lm_head.transform.dense.bias', 'lm_head.transform.LayerNorm.weight', 'lm_head.transform.LayerNorm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'mlp.dense1.weight', 'mlp.dense1.bias', 'mlp.dense2.weight', 'mlp.dense2.bias']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+Some weights of BertForMabel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['lm_head.bias', 'lm_head.transform.dense.weight', 'lm_head.transform.dense.bias', 'lm_head.transform.LayerNorm.weight', 'lm_head.transform.LayerNorm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'mlp.dense1.weight', 'mlp.dense1.bias', 'mlp.dense2.weight', 'mlp.dense2.bias']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+[WARNING|modeling_utils.py:1135] 2022-08-27 00:02:55,669 >> Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMabel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
+- This IS expected if you are initializing BertForMabel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing BertForMabel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+[WARNING|modeling_utils.py:1146] 2022-08-27 00:02:55,669 >> Some weights of BertForMabel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['lm_head.bias', 'lm_head.transform.dense.weight', 'lm_head.transform.dense.bias', 'lm_head.transform.LayerNorm.weight', 'lm_head.transform.LayerNorm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'mlp.dense1.weight', 'mlp.dense1.bias', 'mlp.dense2.weight', 'mlp.dense2.bias']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMabel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
+- This IS expected if you are initializing BertForMabel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing BertForMabel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+Some weights of BertForMabel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['lm_head.bias', 'lm_head.transform.dense.weight', 'lm_head.transform.dense.bias', 'lm_head.transform.LayerNorm.weight', 'lm_head.transform.LayerNorm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'mlp.dense1.weight', 'mlp.dense1.bias', 'mlp.dense2.weight', 'mlp.dense2.bias']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+[INFO|configuration_utils.py:445] 2022-08-27 00:02:55,967 >> loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at .cache/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
+[INFO|configuration_utils.py:481] 2022-08-27 00:02:55,968 >> Model config BertConfig {
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.2.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}
+
+[INFO|modeling_utils.py:1027] 2022-08-27 00:02:56,268 >> loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache at .cache/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f
+Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+[INFO|modeling_utils.py:1143] 2022-08-27 00:03:05,402 >> All model checkpoint weights were used when initializing BertForPreTraining.
+
+[WARNING|modeling_utils.py:1146] 2022-08-27 00:03:05,415 >> Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+08/27/2022 00:03:05 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at .cache/csv/default-2f6794b69ce47e79/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-51922e954a887dd0.arrow
+08/27/2022 00:03:05 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at .cache/csv/default-2f6794b69ce47e79/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-51922e954a887dd0.arrow
+08/27/2022 00:03:05 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at .cache/csv/default-2f6794b69ce47e79/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-51922e954a887dd0.arrow
+08/27/2022 00:03:05 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at .cache/csv/default-2f6794b69ce47e79/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-51922e954a887dd0.arrow
+Dataset({
+    features: ['bin_mask', 'input_ids', 'token_type_ids', 'attention_mask'],
+    num_rows: 142158
+})Dataset({
+    features: ['bin_mask', 'input_ids', 'token_type_ids', 'attention_mask'],
+    num_rows: 142158
+})Dataset({
+    features: ['bin_mask', 'input_ids', 'token_type_ids', 'attention_mask'],
+    num_rows: 142158
+})Dataset({
+    features: ['bin_mask', 'input_ids', 'token_type_ids', 'attention_mask'],
+    num_rows: 142158
+})
+
+
+
+[INFO|trainer.py:442] 2022-08-27 00:03:19,523 >> The following columns in the training set don't have a corresponding argument in `BertForMabel.forward` and have been ignored: .
+[INFO|trainer.py:358] 2022-08-27 00:03:19,528 >> Using amp fp16 backend
+08/27/2022 00:03:20 - INFO - trainer - ***** Running training *****
+08/27/2022 00:03:20 - INFO - trainer - Num examples = 142158
+08/27/2022 00:03:20 - INFO - trainer - Num Epochs = 2
+08/27/2022 00:03:20 - INFO - trainer - Instantaneous batch size per device = 32
+08/27/2022 00:03:20 - INFO - trainer - Total train batch size (w. parallel, distributed & accumulation) = 128
+08/27/2022 00:03:20 - INFO - trainer - Gradient Accumulation steps = 1
+08/27/2022 00:03:20 - INFO - trainer - Total optimization steps = 2222
+  0%|          | 0/2222 [00:00<?, ?it/s]
+Traceback (most recent call last):
+  File "train.py", line 629, in <module>
+Traceback (most recent call last):
+  File "train.py", line 629, in <module>
+ {'train_runtime': 2399.7982, 'train_samples_per_second': 0.926, 'epoch': 2.0}
+ 100%|██████████| 2222/2222 [39:59<00:00, 1.01s/it] 100%|██████████| 2222/2222 [39:59<00:00, 1.08s/it]
+    main()
+  File "train.py", line 611, in main
+[INFO|trainer.py:1344] 2022-08-27 00:43:20,039 >> Saving model checkpoint to out/mabel-joint-cl-al1-mlm-bs-32-lr-5e-5-msl-128-ep-2
+    main()
+  File "train.py", line 611, in main
+[INFO|configuration_utils.py:300] 2022-08-27 00:43:20,042 >> Configuration saved in out/mabel-joint-cl-al1-mlm-bs-32-lr-5e-5-msl-128-ep-2/config.json
+    results = trainer.evaluate(eval_senteval_transfer=True)
+TypeError: evaluate() got an unexpected keyword argument 'eval_senteval_transfer'
+    main()
+  File "train.py", line 611, in main
+    results = trainer.evaluate(eval_senteval_transfer=True)
+TypeError: evaluate() got an unexpected keyword argument 'eval_senteval_transfer'
+    results = trainer.evaluate(eval_senteval_transfer=True)
+TypeError: evaluate() got an unexpected keyword argument 'eval_senteval_transfer'
+[INFO|modeling_utils.py:817] 2022-08-27 00:43:21,328 >> Model weights saved in out/mabel-joint-cl-al1-mlm-bs-32-lr-5e-5-msl-128-ep-2/pytorch_model.bin
+08/27/2022 00:43:21 - INFO - __main__ - ***** Train results *****
+08/27/2022 00:43:21 - INFO - __main__ - epoch = 2.0
+08/27/2022 00:43:21 - INFO - __main__ - train_runtime = 2399.7982
+08/27/2022 00:43:21 - INFO - __main__ - train_samples_per_second = 0.926
+08/27/2022 00:43:21 - INFO - __main__ - *** Evaluate ***
+Traceback (most recent call last):
+  File "train.py", line 629, in <module>
+    main()
+  File "train.py", line 611, in main
+    results = trainer.evaluate(eval_senteval_transfer=True)
+TypeError: evaluate() got an unexpected keyword argument 'eval_senteval_transfer'
+/project/jonmay_231/jacqueline/miniconda3/envs/env/lib/python3.7/site-packages/torch/distributed/launch.py:186: FutureWarning: The module torch.distributed.launch is deprecated
+and will be removed in future. Use torchrun.
+Note that --use_env is set by default in torchrun.
+If your script expects `--local_rank` argument to be set, please
+change it to read from `os.environ['LOCAL_RANK']` instead. See
+https://pytorch.org/docs/stable/distributed.html#launch-utility for
+further instructions
+
+  FutureWarning,
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24668 closing signal SIGTERM
+ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 24669) of binary: /project/jonmay_231/jacqueline/miniconda3/envs/env/bin/python
+Traceback (most recent call last):
+  File "/project/jonmay_231/jacqueline/miniconda3/envs/env/lib/python3.7/runpy.py", line 193, in _run_module_as_main
+    "__main__", mod_spec)
+  File "/project/jonmay_231/jacqueline/miniconda3/envs/env/lib/python3.7/runpy.py", line 85, in _run_code
+    exec(code, run_globals)
+  File "/project/jonmay_231/jacqueline/miniconda3/envs/env/lib/python3.7/site-packages/torch/distributed/launch.py", line 193, in <module>
+    main()
+  File "/project/jonmay_231/jacqueline/miniconda3/envs/env/lib/python3.7/site-packages/torch/distributed/launch.py", line 189, in main
+    launch(args)
+  File "/project/jonmay_231/jacqueline/miniconda3/envs/env/lib/python3.7/site-packages/torch/distributed/launch.py", line 174, in launch
+    run(args)
+  File "/project/jonmay_231/jacqueline/miniconda3/envs/env/lib/python3.7/site-packages/torch/distributed/run.py", line 713, in run
+    )(*cmd_args)
+  File "/project/jonmay_231/jacqueline/miniconda3/envs/env/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/project/jonmay_231/jacqueline/miniconda3/envs/env/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
+    failures=result.failures,
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+train.py FAILED
+------------------------------------------------------------
+Failures:
+[1]:
+  time      : 2022-08-27_00:43:23
+  host      : a11-03.hpc.usc.edu
+  rank      : 2 (local_rank: 2)
+  exitcode  : 1 (pid: 24670)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[2]:
+  time      : 2022-08-27_00:43:23
+  host      : a11-03.hpc.usc.edu
+  rank      : 3 (local_rank: 3)
+  exitcode  : 1 (pid: 24671)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2022-08-27_00:43:23
+  host      : a11-03.hpc.usc.edu
+  rank      : 1 (local_rank: 1)
+  exitcode  : 1 (pid: 24669)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
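
Note on the failure: training completes (2222/2222 steps, checkpoint saved), and the ranks then crash at train.py:611 because `trainer.evaluate(eval_senteval_transfer=True)` reaches an `evaluate()` that does not define that keyword; the stock `transformers.Trainer.evaluate` has no such argument, so the trainer in use here is not the SentEval-aware one the call expects. A minimal sketch of a guard for the call site; the `safe_evaluate` helper is hypothetical and not part of train.py:

    import inspect

    def safe_evaluate(trainer):
        """Call trainer.evaluate(), requesting SentEval transfer tasks only
        when the trainer's evaluate() actually accepts that keyword.
        (Hypothetical helper; assumes a SimCSE/MABEL-style custom trainer
        whose evaluate() takes an eval_senteval_transfer flag.)"""
        params = inspect.signature(trainer.evaluate).parameters
        if "eval_senteval_transfer" in params:
            # Custom trainer: also run SentEval transfer-task evaluation.
            return trainer.evaluate(eval_senteval_transfer=True)
        # Stock transformers.Trainer: plain evaluation loop.
        return trainer.evaluate()

The launcher's FutureWarning is unrelated to this crash; it only advises replacing `python -m torch.distributed.launch` with `torchrun` and reading the local rank from `os.environ['LOCAL_RANK']` instead of the `--local_rank` argument.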