wyu1 committed on
Commit 050e463 · verified · 1 Parent(s): f33818d

End of training
20250508_133646.log ADDED
@@ -0,0 +1,3 @@
+ [2025-05-08 13:36:46] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ [2025-05-08 13:36:47] Chat mode disabled
+ [2025-05-08 13:36:47] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
20250508_133827.log ADDED
@@ -0,0 +1,3 @@
+ [2025-05-08 13:38:27] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ [2025-05-08 13:38:27] Chat mode disabled
+ [2025-05-08 13:38:27] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
20250508_133909.log ADDED
@@ -0,0 +1,28 @@
+ [2025-05-08 13:39:09] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ [2025-05-08 13:39:09] Chat mode disabled
+ [2025-05-08 13:39:09] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
+ [2025-05-08 13:39:09] Model size is 3B or smaller (7 B). Using full fine-tuning.
+ [2025-05-08 13:39:09] No QA format data will be used
+ [2025-05-08 13:39:09] =======================================
+ [2025-05-08 13:39:09] Starting training for model: meta-llama/Llama-2-7b-hf
+ [2025-05-08 13:39:09] =======================================
+ [2025-05-08 13:39:09] CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+ [2025-05-08 13:39:09] WANDB_PROJECT: wikidyk-ar
+ [2025-05-08 13:39:09] DATA_PATH: data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json
+ [2025-05-08 13:39:09] Global Batch Size: 256
+ [2025-05-08 13:39:09] Data Size: -1
+ [2025-05-08 13:39:09] Executing command: torchrun --nproc_per_node "8" --master-port 29503 src/train.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --data_path "data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json" --output_dir "train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000" --num_upsample "1000" --per_device_train_batch_size "32" --gradient_accumulation_steps "1" --learning_rate "2e-5" --num_train_epochs "1" --model_max_length "4096" --report_to wandb --logging_steps 50 --save_strategy no --bf16 True --use_flash_attention_2 True --qa_data_ratio "-1" --predict_mask "false"
+ [2025-05-08 13:39:09] Training started at 2025年 05月 08日 星期四 13:39:09 CST
+ W0508 13:39:10.719000 3283180 site-packages/torch/distributed/run.py:792]
+ W0508 13:39:10.719000 3283180 site-packages/torch/distributed/run.py:792] *****************************************
+ W0508 13:39:10.719000 3283180 site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+ W0508 13:39:10.719000 3283180 site-packages/torch/distributed/run.py:792] *****************************************
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+
20250508_134223.log ADDED
@@ -0,0 +1,28 @@
+ [2025-05-08 13:42:23] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ [2025-05-08 13:42:23] Chat mode disabled
+ [2025-05-08 13:42:23] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
+ [2025-05-08 13:42:23] Model size is 3B or smaller (7 B). Using full fine-tuning.
+ [2025-05-08 13:42:23] No QA format data will be used
+ [2025-05-08 13:42:23] =======================================
+ [2025-05-08 13:42:23] Starting training for model: meta-llama/Llama-2-7b-hf
+ [2025-05-08 13:42:23] =======================================
+ [2025-05-08 13:42:23] CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+ [2025-05-08 13:42:23] WANDB_PROJECT: wikidyk-ar
+ [2025-05-08 13:42:23] DATA_PATH: data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json
+ [2025-05-08 13:42:23] Global Batch Size: 256
+ [2025-05-08 13:42:23] Data Size: -1
+ [2025-05-08 13:42:23] Executing command: torchrun --nproc_per_node "8" --master-port 29503 src/train.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --data_path "data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json" --output_dir "train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000" --num_upsample "1000" --per_device_train_batch_size "32" --gradient_accumulation_steps "1" --learning_rate "2e-5" --num_train_epochs "1" --model_max_length "4096" --report_to wandb --logging_steps 50 --save_strategy no --bf16 True --use_flash_attention_2 True --qa_data_ratio "-1" --predict_mask "false"
+ [2025-05-08 13:42:23] Training started at 2025年 05月 08日 星期四 13:42:23 CST
+ W0508 13:42:24.401000 3283386 site-packages/torch/distributed/run.py:792]
+ W0508 13:42:24.401000 3283386 site-packages/torch/distributed/run.py:792] *****************************************
+ W0508 13:42:24.401000 3283386 site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+ W0508 13:42:24.401000 3283386 site-packages/torch/distributed/run.py:792] *****************************************
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+
20250508_134354.log ADDED
@@ -0,0 +1,284 @@
1
+ [2025-05-08 13:43:54] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
2
+ [2025-05-08 13:43:54] Chat mode disabled
3
+ [2025-05-08 13:43:54] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
4
+ [2025-05-08 13:43:54] Model size is 3B or smaller (7 B). Using full fine-tuning.
5
+ [2025-05-08 13:43:54] No QA format data will be used
6
+ [2025-05-08 13:43:54] =======================================
7
+ [2025-05-08 13:43:54] Starting training for model: meta-llama/Llama-2-7b-hf
8
+ [2025-05-08 13:43:54] =======================================
9
+ [2025-05-08 13:43:54] CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
10
+ [2025-05-08 13:43:54] WANDB_PROJECT: wikidyk-ar
11
+ [2025-05-08 13:43:54] DATA_PATH: data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json
12
+ [2025-05-08 13:43:54] Global Batch Size: 256
13
+ [2025-05-08 13:43:54] Data Size: -1
14
+ [2025-05-08 13:43:54] Executing command: torchrun --nproc_per_node "8" --master-port 29503 src/train.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --data_path "data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json" --output_dir "train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000" --num_upsample "1000" --per_device_train_batch_size "32" --gradient_accumulation_steps "1" --learning_rate "2e-5" --num_train_epochs "1" --model_max_length "4096" --report_to wandb --logging_steps 50 --save_strategy no --bf16 True --use_flash_attention_2 True --qa_data_ratio "-1" --predict_mask "false"
15
+ [2025-05-08 13:43:54] Training started at 2025年 05月 08日 星期四 13:43:54 CST
16
+ W0508 13:43:55.946000 3283594 site-packages/torch/distributed/run.py:792]
17
+ W0508 13:43:55.946000 3283594 site-packages/torch/distributed/run.py:792] *****************************************
18
+ W0508 13:43:55.946000 3283594 site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
19
+ W0508 13:43:55.946000 3283594 site-packages/torch/distributed/run.py:792] *****************************************
20
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
21
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
22
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
23
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
24
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
25
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
26
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
27
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
28
+
29
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
30
+
31
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
32
+ [rank0]: Traceback (most recent call last):
33
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
34
+ [rank0]: train()
35
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
36
+ [rank0]: model = load_model(
37
+ [rank0]: ^^^^^^^^^^^
38
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
39
+ [rank0]: return AutoModelForCausalLM.from_pretrained(
40
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
41
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
42
+ [rank0]: return model_class.from_pretrained(
43
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
44
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
45
+ [rank0]: return func(*args, **kwargs)
46
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
47
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
48
+ [rank0]: config = cls._autoset_attn_implementation(
49
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
50
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
51
+ [rank0]: cls._check_and_enable_flash_attn_2(
52
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
53
+ [rank0]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
54
+ [rank0]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
55
+ [rank6]: Traceback (most recent call last):
56
+ [rank6]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
57
+ [rank6]: train()
58
+ [rank6]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
59
+ [rank6]: model = load_model(
60
+ [rank6]: ^^^^^^^^^^^
61
+ [rank6]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
62
+ [rank6]: return AutoModelForCausalLM.from_pretrained(
63
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
64
+ [rank6]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
65
+ [rank6]: return model_class.from_pretrained(
66
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
67
+ [rank6]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
68
+ [rank6]: return func(*args, **kwargs)
69
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^
70
+ [rank6]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
71
+ [rank6]: config = cls._autoset_attn_implementation(
72
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
73
+ [rank6]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
74
+ [rank6]: cls._check_and_enable_flash_attn_2(
75
+ [rank6]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
76
+ [rank6]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
77
+ [rank6]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
78
+
79
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
80
+ [rank5]: Traceback (most recent call last):
81
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
82
+ [rank5]: train()
83
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
84
+ [rank5]: model = load_model(
85
+ [rank5]: ^^^^^^^^^^^
86
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
87
+ [rank5]: return AutoModelForCausalLM.from_pretrained(
88
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
89
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
90
+ [rank5]: return model_class.from_pretrained(
91
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
92
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
93
+ [rank5]: return func(*args, **kwargs)
94
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^
95
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
96
+ [rank5]: config = cls._autoset_attn_implementation(
97
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
98
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
99
+ [rank5]: cls._check_and_enable_flash_attn_2(
100
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
101
+ [rank5]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
102
+ [rank5]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
103
+
104
+
105
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
106
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
107
+
108
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
109
+ [rank7]: Traceback (most recent call last):
110
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
111
+ [rank7]: train()
112
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
113
+ [rank7]: model = load_model(
114
+ [rank7]: ^^^^^^^^^^^
115
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
116
+ [rank7]: return AutoModelForCausalLM.from_pretrained(
117
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
118
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
119
+ [rank7]: return model_class.from_pretrained(
120
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
121
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
122
+ [rank7]: return func(*args, **kwargs)
123
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^
124
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
125
+ [rank7]: config = cls._autoset_attn_implementation(
126
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
127
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
128
+ [rank7]: cls._check_and_enable_flash_attn_2(
129
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
130
+ [rank7]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
131
+ [rank7]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
132
+ [rank3]: Traceback (most recent call last):
133
+ [rank3]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
134
+ [rank3]: train()
135
+ [rank3]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
136
+ [rank3]: model = load_model(
137
+ [rank3]: ^^^^^^^^^^^
138
+ [rank3]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
139
+ [rank3]: return AutoModelForCausalLM.from_pretrained(
140
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
141
+ [rank3]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
142
+ [rank3]: return model_class.from_pretrained(
143
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
144
+ [rank3]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
145
+ [rank3]: return func(*args, **kwargs)
146
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^
147
+ [rank3]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
148
+ [rank3]: config = cls._autoset_attn_implementation(
149
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
150
+ [rank3]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
151
+ [rank3]: cls._check_and_enable_flash_attn_2(
152
+ [rank3]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
153
+ [rank3]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
154
+ [rank3]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
155
+ [rank1]: Traceback (most recent call last):
156
+ [rank1]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
157
+ [rank1]: train()
158
+ [rank1]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
159
+ [rank1]: model = load_model(
160
+ [rank1]: ^^^^^^^^^^^
161
+ [rank1]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
162
+ [rank1]: return AutoModelForCausalLM.from_pretrained(
163
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
164
+ [rank1]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
165
+ [rank1]: return model_class.from_pretrained(
166
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
167
+ [rank1]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
168
+ [rank1]: return func(*args, **kwargs)
169
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^
170
+ [rank1]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
171
+ [rank1]: config = cls._autoset_attn_implementation(
172
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
173
+ [rank1]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
174
+ [rank1]: cls._check_and_enable_flash_attn_2(
175
+ [rank1]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
176
+ [rank1]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
177
+ [rank1]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
178
+
179
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
180
+ [rank2]: Traceback (most recent call last):
181
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
182
+ [rank2]: train()
183
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
184
+ [rank2]: model = load_model(
185
+ [rank2]: ^^^^^^^^^^^
186
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
187
+ [rank2]: return AutoModelForCausalLM.from_pretrained(
188
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
189
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
190
+ [rank2]: return model_class.from_pretrained(
191
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
192
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
193
+ [rank2]: return func(*args, **kwargs)
194
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^
195
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
196
+ [rank2]: config = cls._autoset_attn_implementation(
197
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
198
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
199
+ [rank2]: cls._check_and_enable_flash_attn_2(
200
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
201
+ [rank2]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
202
+ [rank2]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
203
+
204
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
205
+ [rank4]: Traceback (most recent call last):
206
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
207
+ [rank4]: train()
208
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
209
+ [rank4]: model = load_model(
210
+ [rank4]: ^^^^^^^^^^^
211
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
212
+ [rank4]: return AutoModelForCausalLM.from_pretrained(
213
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
214
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
215
+ [rank4]: return model_class.from_pretrained(
216
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
217
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
218
+ [rank4]: return func(*args, **kwargs)
219
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^
220
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
221
+ [rank4]: config = cls._autoset_attn_implementation(
222
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
223
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
224
+ [rank4]: cls._check_and_enable_flash_attn_2(
225
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
226
+ [rank4]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
227
+ [rank4]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
228
+ [rank0]:[W508 13:48:52.287600796 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
229
+ W0508 13:48:53.492000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283661 closing signal SIGTERM
230
+ W0508 13:48:53.493000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283662 closing signal SIGTERM
231
+ W0508 13:48:53.493000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283663 closing signal SIGTERM
232
+ W0508 13:48:53.494000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283664 closing signal SIGTERM
233
+ W0508 13:48:53.494000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283665 closing signal SIGTERM
234
+ W0508 13:48:53.494000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283666 closing signal SIGTERM
235
+ W0508 13:48:53.494000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283667 closing signal SIGTERM
236
+ E0508 13:48:54.373000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 0 (pid: 3283660) of binary: /root/miniconda3/bin/python
237
+ Traceback (most recent call last):
238
+ File "/root/miniconda3/bin/torchrun", line 8, in <module>
239
+ sys.exit(main())
240
+ ^^^^^^
241
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
242
+ return f(*args, **kwargs)
243
+ ^^^^^^^^^^^^^^^^^^
244
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/run.py", line 918, in main
245
+ run(args)
246
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/run.py", line 909, in run
247
+ elastic_launch(
248
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
249
+ return launch_agent(self._config, self._entrypoint, list(args))
250
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
251
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
252
+ raise ChildFailedError(
253
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
254
+ ============================================================
255
+ src/train.py FAILED
256
+ ------------------------------------------------------------
257
+ Failures:
258
+ <NO_OTHER_FAILURES>
259
+ ------------------------------------------------------------
260
+ Root Cause (first observed failure):
261
+ [0]:
262
+ time : 2025-05-08_13:48:53
263
+ host : TENCENT64.site
264
+ rank : 0 (local_rank: 0)
265
+ exitcode : 1 (pid: 3283660)
266
+ error_file: <N/A>
267
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
268
+ ============================================================
269
+ [2025-05-08 13:48:54] ERROR: Training failed for meta-llama/Llama-2-7b-hf with exit code 1
270
+ [2025-05-08 13:48:54] ERROR: Training failed for meta-llama/Llama-2-7b-hf with exit code 1
271
+ [2025-05-08 13:48:54] Check error log for details: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000/20250508_134354.log
272
+ [2025-05-08 13:48:54] Resource usage after training meta-llama/Llama-2-7b-hf:
273
+ [2025-05-08 13:48:54] GPU memory usage:
274
+ 0 MiB, 97871 MiB
275
+ 0 MiB, 97871 MiB
276
+ 0 MiB, 97871 MiB
277
+ 0 MiB, 97871 MiB
278
+ 0 MiB, 97871 MiB
279
+ 0 MiB, 97871 MiB
280
+ 0 MiB, 97871 MiB
281
+ 0 MiB, 97871 MiB
282
+ [2025-05-08 13:48:54] Disk space usage for model outputs:
283
+ 34K train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
284
+ [2025-05-08 13:48:54]
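
Every rank in the run above fails in `_check_and_enable_flash_attn_2` because the `flash_attn` package is missing, and the deprecation notice in the same log points to the newer `attn_implementation` argument. Below is a minimal sketch of that recommended load call, assuming `flash-attn` has been installed (typically `pip install flash-attn --no-build-isolation`); it is illustrative, not the repository's actual fix.

```python
import torch
from transformers import AutoModelForCausalLM

# Load Llama-2-7B with FlashAttention-2 via the non-deprecated argument.
# Assumes the flash-attn package is installed and a bf16-capable GPU is available.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # replaces deprecated use_flash_attention_2=True
)
```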
20250508_141001.log ADDED
@@ -0,0 +1,205 @@
1
+ [2025-05-08 14:10:02] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
2
+ [2025-05-08 14:10:02] Chat mode disabled
3
+ [2025-05-08 14:10:02] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
4
+ [2025-05-08 14:10:02] Model size is over 3B (7 B). Using LoRA training.
5
+ [2025-05-08 14:10:02] Adjusted learning rate for LoRA: 2e-4
6
+ [2025-05-08 14:10:02] No QA format data will be used
7
+ [2025-05-08 14:10:02] =======================================
8
+ [2025-05-08 14:10:02] Starting training for model: meta-llama/Llama-2-7b-hf
9
+ [2025-05-08 14:10:02] =======================================
10
+ [2025-05-08 14:10:02] CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
11
+ [2025-05-08 14:10:02] WANDB_PROJECT: wikidyk-ar
12
+ [2025-05-08 14:10:02] DATA_PATH: data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json
13
+ [2025-05-08 14:10:02] Global Batch Size: 256
14
+ [2025-05-08 14:10:02] Data Size: -1
15
+ [2025-05-08 14:10:02] Executing command: torchrun --nproc_per_node "8" --master-port 29503 src/train.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --data_path "data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json" --output_dir "train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000" --num_upsample "1000" --per_device_train_batch_size "32" --gradient_accumulation_steps "1" --learning_rate "2e-4" --num_train_epochs "1" --model_max_length "4096" --report_to wandb --logging_steps 50 --save_strategy no --bf16 True --use_flash_attention_2 True --qa_data_ratio "-1" --predict_mask "false" --use_lora --lora_r 32 --lora_alpha 16
16
+ [2025-05-08 14:10:02] Training started at 2025年 05月 08日 星期四 14:10:02 CST
17
+ W0508 14:10:03.166000 3286582 site-packages/torch/distributed/run.py:792]
18
+ W0508 14:10:03.166000 3286582 site-packages/torch/distributed/run.py:792] *****************************************
19
+ W0508 14:10:03.166000 3286582 site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0508 14:10:03.166000 3286582 site-packages/torch/distributed/run.py:792] *****************************************
21
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
22
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
23
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
24
+ [rank5]: Traceback (most recent call last):
25
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
26
+ [rank5]: train()
27
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
28
+ [rank5]: model = load_model(
29
+ [rank5]: ^^^^^^^^^^^
30
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
31
+ [rank5]: return AutoModelForCausalLM.from_pretrained(
32
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
33
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
34
+ [rank5]: return model_class.from_pretrained(
35
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
37
+ [rank5]: return func(*args, **kwargs)
38
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^
39
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
40
+ [rank5]: config = cls._autoset_attn_implementation(
41
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
42
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
43
+ [rank5]: cls._check_and_enable_flash_attn_2(
44
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
45
+ [rank5]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
46
+ [rank5]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
47
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
48
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
49
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
50
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
51
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
52
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
53
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
54
+ [rank0]: Traceback (most recent call last):
55
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
56
+ [rank0]: train()
57
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
58
+ [rank0]: model = load_model(
59
+ [rank0]: ^^^^^^^^^^^
60
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
61
+ [rank0]: return AutoModelForCausalLM.from_pretrained(
62
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
63
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
64
+ [rank0]: return model_class.from_pretrained(
65
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
66
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
67
+ [rank0]: return func(*args, **kwargs)
68
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
69
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
70
+ [rank0]: config = cls._autoset_attn_implementation(
71
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
72
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
73
+ [rank0]: cls._check_and_enable_flash_attn_2(
74
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
75
+ [rank0]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
76
+ [rank0]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
77
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
78
+ [rank2]: Traceback (most recent call last):
79
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
80
+ [rank2]: train()
81
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
82
+ [rank2]: model = load_model(
83
+ [rank2]: ^^^^^^^^^^^
84
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
85
+ [rank2]: return AutoModelForCausalLM.from_pretrained(
86
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
87
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
88
+ [rank2]: return model_class.from_pretrained(
89
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
90
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
91
+ [rank2]: return func(*args, **kwargs)
92
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^
93
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
94
+ [rank2]: config = cls._autoset_attn_implementation(
95
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
96
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
97
+ [rank2]: cls._check_and_enable_flash_attn_2(
98
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
99
+ [rank2]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
100
+ [rank2]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
101
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
102
+ [rank7]: Traceback (most recent call last):
103
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
104
+ [rank7]: train()
105
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
106
+ [rank7]: model = load_model(
107
+ [rank7]: ^^^^^^^^^^^
108
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
109
+ [rank7]: return AutoModelForCausalLM.from_pretrained(
110
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
111
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
112
+ [rank7]: return model_class.from_pretrained(
113
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
114
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
115
+ [rank7]: return func(*args, **kwargs)
116
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^
117
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
118
+ [rank7]: config = cls._autoset_attn_implementation(
119
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
120
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
121
+ [rank7]: cls._check_and_enable_flash_attn_2(
122
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
123
+ [rank7]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
124
+ [rank7]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
125
+ [rank0]:[W508 14:10:17.329577027 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
126
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
127
+ [rank4]: Traceback (most recent call last):
128
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
129
+ [rank4]: train()
130
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
131
+ [rank4]: model = load_model(
132
+ [rank4]: ^^^^^^^^^^^
133
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
134
+ [rank4]: return AutoModelForCausalLM.from_pretrained(
135
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
136
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
137
+ [rank4]: return model_class.from_pretrained(
138
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
139
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
140
+ [rank4]: return func(*args, **kwargs)
141
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^
142
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
143
+ [rank4]: config = cls._autoset_attn_implementation(
144
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
145
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
146
+ [rank4]: cls._check_and_enable_flash_attn_2(
147
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
148
+ [rank4]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
149
+ [rank4]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
150
+ W0508 14:10:17.120000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286648 closing signal SIGTERM
151
+ W0508 14:10:17.120000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286649 closing signal SIGTERM
152
+ W0508 14:10:17.121000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286650 closing signal SIGTERM
153
+ W0508 14:10:17.121000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286651 closing signal SIGTERM
154
+ W0508 14:10:17.123000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286652 closing signal SIGTERM
155
+ W0508 14:10:17.123000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286654 closing signal SIGTERM
156
+ W0508 14:10:17.123000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286655 closing signal SIGTERM
157
+ E0508 14:10:18.479000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 5 (pid: 3286653) of binary: /root/miniconda3/bin/python
158
+ Traceback (most recent call last):
159
+ File "/root/miniconda3/bin/torchrun", line 8, in <module>
160
+ sys.exit(main())
161
+ ^^^^^^
162
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
163
+ return f(*args, **kwargs)
164
+ ^^^^^^^^^^^^^^^^^^
165
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/run.py", line 918, in main
166
+ run(args)
167
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/run.py", line 909, in run
168
+ elastic_launch(
169
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
170
+ return launch_agent(self._config, self._entrypoint, list(args))
171
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
172
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
173
+ raise ChildFailedError(
174
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
175
+ ============================================================
176
+ src/train.py FAILED
177
+ ------------------------------------------------------------
178
+ Failures:
179
+ <NO_OTHER_FAILURES>
180
+ ------------------------------------------------------------
181
+ Root Cause (first observed failure):
182
+ [0]:
183
+ time : 2025-05-08_14:10:17
184
+ host : TENCENT64.site
185
+ rank : 5 (local_rank: 5)
186
+ exitcode : 1 (pid: 3286653)
187
+ error_file: <N/A>
188
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
189
+ ============================================================
190
+ [2025-05-08 14:10:18] ERROR: Training failed for meta-llama/Llama-2-7b-hf with exit code 1
191
+ [2025-05-08 14:10:18] ERROR: Training failed for meta-llama/Llama-2-7b-hf with exit code 1
192
+ [2025-05-08 14:10:18] Check error log for details: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000/20250508_141001.log
193
+ [2025-05-08 14:10:18] Resource usage after training meta-llama/Llama-2-7b-hf:
194
+ [2025-05-08 14:10:18] GPU memory usage:
195
+ 0 MiB, 97871 MiB
196
+ 0 MiB, 97871 MiB
197
+ 0 MiB, 97871 MiB
198
+ 0 MiB, 97871 MiB
199
+ 0 MiB, 97871 MiB
200
+ 0 MiB, 97871 MiB
201
+ 0 MiB, 97871 MiB
202
+ 0 MiB, 97871 MiB
203
+ [2025-05-08 14:10:18] Disk space usage for model outputs:
204
+ 52K train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
205
+ [2025-05-08 14:10:18]
20250508_141301.log ADDED
The diff for this file is too large to render. See raw diff
 
README.md ADDED
@@ -0,0 +1,58 @@
+ ---
+ library_name: peft
+ license: llama2
+ base_model: meta-llama/Llama-2-7b-hf
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: meta-llama_Llama-2-7b-hf_full_upsample1000
+ results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # meta-llama_Llama-2-7b-hf_full_upsample1000
+
+ This model is a fine-tuned version of [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) on an unknown dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 32
+ - eval_batch_size: 8
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - total_train_batch_size: 256
+ - total_eval_batch_size: 64
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+ - lr_scheduler_type: linear
+ - num_epochs: 1.0
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - PEFT 0.15.2
+ - Transformers 4.51.3
+ - Pytorch 2.6.0+cu124
+ - Datasets 3.6.0
+ - Tokenizers 0.21.1
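
A minimal usage sketch to complement the auto-generated card above: loading this commit's LoRA adapter on top of the base model with PEFT. The adapter path is a placeholder, and the snippet is an assumption rather than part of the committed card.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Base model the adapter was trained against (see base_model in the card).
base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# "path/to/this-adapter" is a placeholder for wherever adapter_config.json and
# adapter_model.safetensors from this commit live (a local directory or a Hub repo id).
model = PeftModel.from_pretrained(base, "path/to/this-adapter")
model.eval()
```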
adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd2f1c20fcb08afed084bec3a388d31d484e702e32dce49606750c23a5029011
+ size 67126104
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 4096,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fa1ce91fe1c38a9ee0a74d5408b5b7f03dc8e8408c58bbddba5b0b237533fb5
+ size 5432