wyu1 committed on
Commit 050e463 · verified · 1 Parent(s): f33818d

End of training
20250508_133646.log ADDED
@@ -0,0 +1,3 @@
+ [2025-05-08 13:36:46] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ [2025-05-08 13:36:47] Chat mode disabled
+ [2025-05-08 13:36:47] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
20250508_133827.log ADDED
@@ -0,0 +1,3 @@
+ [2025-05-08 13:38:27] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ [2025-05-08 13:38:27] Chat mode disabled
+ [2025-05-08 13:38:27] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
20250508_133909.log ADDED
@@ -0,0 +1,28 @@
+ [2025-05-08 13:39:09] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ [2025-05-08 13:39:09] Chat mode disabled
+ [2025-05-08 13:39:09] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
+ [2025-05-08 13:39:09] Model size is 3B or smaller (7 B). Using full fine-tuning.
+ [2025-05-08 13:39:09] No QA format data will be used
+ [2025-05-08 13:39:09] =======================================
+ [2025-05-08 13:39:09] Starting training for model: meta-llama/Llama-2-7b-hf
+ [2025-05-08 13:39:09] =======================================
+ [2025-05-08 13:39:09] CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+ [2025-05-08 13:39:09] WANDB_PROJECT: wikidyk-ar
+ [2025-05-08 13:39:09] DATA_PATH: data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json
+ [2025-05-08 13:39:09] Global Batch Size: 256
+ [2025-05-08 13:39:09] Data Size: -1
+ [2025-05-08 13:39:09] Executing command: torchrun --nproc_per_node "8" --master-port 29503 src/train.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --data_path "data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json" --output_dir "train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000" --num_upsample "1000" --per_device_train_batch_size "32" --gradient_accumulation_steps "1" --learning_rate "2e-5" --num_train_epochs "1" --model_max_length "4096" --report_to wandb --logging_steps 50 --save_strategy no --bf16 True --use_flash_attention_2 True --qa_data_ratio "-1" --predict_mask "false"
+ [2025-05-08 13:39:09] Training started at 2025年 05月 08日 星期四 13:39:09 CST
+ W0508 13:39:10.719000 3283180 site-packages/torch/distributed/run.py:792]
+ W0508 13:39:10.719000 3283180 site-packages/torch/distributed/run.py:792] *****************************************
+ W0508 13:39:10.719000 3283180 site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+ W0508 13:39:10.719000 3283180 site-packages/torch/distributed/run.py:792] *****************************************
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+
20250508_134223.log ADDED
@@ -0,0 +1,28 @@
+ [2025-05-08 13:42:23] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ [2025-05-08 13:42:23] Chat mode disabled
+ [2025-05-08 13:42:23] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
+ [2025-05-08 13:42:23] Model size is 3B or smaller (7 B). Using full fine-tuning.
+ [2025-05-08 13:42:23] No QA format data will be used
+ [2025-05-08 13:42:23] =======================================
+ [2025-05-08 13:42:23] Starting training for model: meta-llama/Llama-2-7b-hf
+ [2025-05-08 13:42:23] =======================================
+ [2025-05-08 13:42:23] CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+ [2025-05-08 13:42:23] WANDB_PROJECT: wikidyk-ar
+ [2025-05-08 13:42:23] DATA_PATH: data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json
+ [2025-05-08 13:42:23] Global Batch Size: 256
+ [2025-05-08 13:42:23] Data Size: -1
+ [2025-05-08 13:42:23] Executing command: torchrun --nproc_per_node "8" --master-port 29503 src/train.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --data_path "data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json" --output_dir "train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000" --num_upsample "1000" --per_device_train_batch_size "32" --gradient_accumulation_steps "1" --learning_rate "2e-5" --num_train_epochs "1" --model_max_length "4096" --report_to wandb --logging_steps 50 --save_strategy no --bf16 True --use_flash_attention_2 True --qa_data_ratio "-1" --predict_mask "false"
+ [2025-05-08 13:42:23] Training started at 2025年 05月 08日 星期四 13:42:23 CST
+ W0508 13:42:24.401000 3283386 site-packages/torch/distributed/run.py:792]
+ W0508 13:42:24.401000 3283386 site-packages/torch/distributed/run.py:792] *****************************************
+ W0508 13:42:24.401000 3283386 site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+ W0508 13:42:24.401000 3283386 site-packages/torch/distributed/run.py:792] *****************************************
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
+
20250508_134354.log ADDED
@@ -0,0 +1,284 @@
1
+ [2025-05-08 13:43:54] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
2
+ [2025-05-08 13:43:54] Chat mode disabled
3
+ [2025-05-08 13:43:54] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
4
+ [2025-05-08 13:43:54] Model size is 3B or smaller (7 B). Using full fine-tuning.
5
+ [2025-05-08 13:43:54] No QA format data will be used
6
+ [2025-05-08 13:43:54] =======================================
7
+ [2025-05-08 13:43:54] Starting training for model: meta-llama/Llama-2-7b-hf
8
+ [2025-05-08 13:43:54] =======================================
9
+ [2025-05-08 13:43:54] CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
10
+ [2025-05-08 13:43:54] WANDB_PROJECT: wikidyk-ar
11
+ [2025-05-08 13:43:54] DATA_PATH: data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json
12
+ [2025-05-08 13:43:54] Global Batch Size: 256
13
+ [2025-05-08 13:43:54] Data Size: -1
14
+ [2025-05-08 13:43:54] Executing command: torchrun --nproc_per_node "8" --master-port 29503 src/train.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --data_path "data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json" --output_dir "train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000" --num_upsample "1000" --per_device_train_batch_size "32" --gradient_accumulation_steps "1" --learning_rate "2e-5" --num_train_epochs "1" --model_max_length "4096" --report_to wandb --logging_steps 50 --save_strategy no --bf16 True --use_flash_attention_2 True --qa_data_ratio "-1" --predict_mask "false"
15
+ [2025-05-08 13:43:54] Training started at 2025年 05月 08日 星期四 13:43:54 CST
16
+ W0508 13:43:55.946000 3283594 site-packages/torch/distributed/run.py:792]
17
+ W0508 13:43:55.946000 3283594 site-packages/torch/distributed/run.py:792] *****************************************
18
+ W0508 13:43:55.946000 3283594 site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
19
+ W0508 13:43:55.946000 3283594 site-packages/torch/distributed/run.py:792] *****************************************
20
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
21
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
22
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
23
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
24
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
25
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
26
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
27
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
28
+
29
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
30
+
31
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
32
+ [rank0]: Traceback (most recent call last):
33
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
34
+ [rank0]: train()
35
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
36
+ [rank0]: model = load_model(
37
+ [rank0]: ^^^^^^^^^^^
38
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
39
+ [rank0]: return AutoModelForCausalLM.from_pretrained(
40
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
41
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
42
+ [rank0]: return model_class.from_pretrained(
43
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
44
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
45
+ [rank0]: return func(*args, **kwargs)
46
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
47
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
48
+ [rank0]: config = cls._autoset_attn_implementation(
49
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
50
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
51
+ [rank0]: cls._check_and_enable_flash_attn_2(
52
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
53
+ [rank0]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
54
+ [rank0]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
55
+ [rank6]: Traceback (most recent call last):
56
+ [rank6]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
57
+ [rank6]: train()
58
+ [rank6]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
59
+ [rank6]: model = load_model(
60
+ [rank6]: ^^^^^^^^^^^
61
+ [rank6]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
62
+ [rank6]: return AutoModelForCausalLM.from_pretrained(
63
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
64
+ [rank6]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
65
+ [rank6]: return model_class.from_pretrained(
66
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
67
+ [rank6]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
68
+ [rank6]: return func(*args, **kwargs)
69
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^
70
+ [rank6]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
71
+ [rank6]: config = cls._autoset_attn_implementation(
72
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
73
+ [rank6]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
74
+ [rank6]: cls._check_and_enable_flash_attn_2(
75
+ [rank6]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
76
+ [rank6]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
77
+ [rank6]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
78
+
79
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
80
+ [rank5]: Traceback (most recent call last):
81
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
82
+ [rank5]: train()
83
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
84
+ [rank5]: model = load_model(
85
+ [rank5]: ^^^^^^^^^^^
86
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
87
+ [rank5]: return AutoModelForCausalLM.from_pretrained(
88
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
89
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
90
+ [rank5]: return model_class.from_pretrained(
91
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
92
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
93
+ [rank5]: return func(*args, **kwargs)
94
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^
95
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
96
+ [rank5]: config = cls._autoset_attn_implementation(
97
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
98
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
99
+ [rank5]: cls._check_and_enable_flash_attn_2(
100
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
101
+ [rank5]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
102
+ [rank5]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
103
+
104
+
105
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
106
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
107
+
108
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
109
+ [rank7]: Traceback (most recent call last):
110
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
111
+ [rank7]: train()
112
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
113
+ [rank7]: model = load_model(
114
+ [rank7]: ^^^^^^^^^^^
115
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
116
+ [rank7]: return AutoModelForCausalLM.from_pretrained(
117
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
118
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
119
+ [rank7]: return model_class.from_pretrained(
120
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
121
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
122
+ [rank7]: return func(*args, **kwargs)
123
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^
124
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
125
+ [rank7]: config = cls._autoset_attn_implementation(
126
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
127
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
128
+ [rank7]: cls._check_and_enable_flash_attn_2(
129
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
130
+ [rank7]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
131
+ [rank7]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
132
+ [rank3]: Traceback (most recent call last):
133
+ [rank3]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
134
+ [rank3]: train()
135
+ [rank3]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
136
+ [rank3]: model = load_model(
137
+ [rank3]: ^^^^^^^^^^^
138
+ [rank3]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
139
+ [rank3]: return AutoModelForCausalLM.from_pretrained(
140
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
141
+ [rank3]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
142
+ [rank3]: return model_class.from_pretrained(
143
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
144
+ [rank3]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
145
+ [rank3]: return func(*args, **kwargs)
146
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^
147
+ [rank3]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
148
+ [rank3]: config = cls._autoset_attn_implementation(
149
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
150
+ [rank3]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
151
+ [rank3]: cls._check_and_enable_flash_attn_2(
152
+ [rank3]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
153
+ [rank3]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
154
+ [rank3]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
155
+ [rank1]: Traceback (most recent call last):
156
+ [rank1]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
157
+ [rank1]: train()
158
+ [rank1]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
159
+ [rank1]: model = load_model(
160
+ [rank1]: ^^^^^^^^^^^
161
+ [rank1]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
162
+ [rank1]: return AutoModelForCausalLM.from_pretrained(
163
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
164
+ [rank1]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
165
+ [rank1]: return model_class.from_pretrained(
166
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
167
+ [rank1]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
168
+ [rank1]: return func(*args, **kwargs)
169
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^
170
+ [rank1]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
171
+ [rank1]: config = cls._autoset_attn_implementation(
172
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
173
+ [rank1]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
174
+ [rank1]: cls._check_and_enable_flash_attn_2(
175
+ [rank1]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
176
+ [rank1]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
177
+ [rank1]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
178
+
179
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
180
+ [rank2]: Traceback (most recent call last):
181
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
182
+ [rank2]: train()
183
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
184
+ [rank2]: model = load_model(
185
+ [rank2]: ^^^^^^^^^^^
186
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
187
+ [rank2]: return AutoModelForCausalLM.from_pretrained(
188
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
189
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
190
+ [rank2]: return model_class.from_pretrained(
191
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
192
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
193
+ [rank2]: return func(*args, **kwargs)
194
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^
195
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
196
+ [rank2]: config = cls._autoset_attn_implementation(
197
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
198
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
199
+ [rank2]: cls._check_and_enable_flash_attn_2(
200
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
201
+ [rank2]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
202
+ [rank2]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
203
+
204
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
205
+ [rank4]: Traceback (most recent call last):
206
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
207
+ [rank4]: train()
208
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
209
+ [rank4]: model = load_model(
210
+ [rank4]: ^^^^^^^^^^^
211
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
212
+ [rank4]: return AutoModelForCausalLM.from_pretrained(
213
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
214
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
215
+ [rank4]: return model_class.from_pretrained(
216
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
217
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
218
+ [rank4]: return func(*args, **kwargs)
219
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^
220
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
221
+ [rank4]: config = cls._autoset_attn_implementation(
222
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
223
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
224
+ [rank4]: cls._check_and_enable_flash_attn_2(
225
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
226
+ [rank4]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
227
+ [rank4]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
228
+ [rank0]:[W508 13:48:52.287600796 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
229
+ W0508 13:48:53.492000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283661 closing signal SIGTERM
230
+ W0508 13:48:53.493000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283662 closing signal SIGTERM
231
+ W0508 13:48:53.493000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283663 closing signal SIGTERM
232
+ W0508 13:48:53.494000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283664 closing signal SIGTERM
233
+ W0508 13:48:53.494000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283665 closing signal SIGTERM
234
+ W0508 13:48:53.494000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283666 closing signal SIGTERM
235
+ W0508 13:48:53.494000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3283667 closing signal SIGTERM
236
+ E0508 13:48:54.373000 3283594 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 0 (pid: 3283660) of binary: /root/miniconda3/bin/python
237
+ Traceback (most recent call last):
238
+ File "/root/miniconda3/bin/torchrun", line 8, in <module>
239
+ sys.exit(main())
240
+ ^^^^^^
241
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
242
+ return f(*args, **kwargs)
243
+ ^^^^^^^^^^^^^^^^^^
244
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/run.py", line 918, in main
245
+ run(args)
246
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/run.py", line 909, in run
247
+ elastic_launch(
248
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
249
+ return launch_agent(self._config, self._entrypoint, list(args))
250
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
251
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
252
+ raise ChildFailedError(
253
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
254
+ ============================================================
255
+ src/train.py FAILED
256
+ ------------------------------------------------------------
257
+ Failures:
258
+ <NO_OTHER_FAILURES>
259
+ ------------------------------------------------------------
260
+ Root Cause (first observed failure):
261
+ [0]:
262
+ time : 2025-05-08_13:48:53
263
+ host : TENCENT64.site
264
+ rank : 0 (local_rank: 0)
265
+ exitcode : 1 (pid: 3283660)
266
+ error_file: <N/A>
267
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
268
+ ============================================================
269
+ [2025-05-08 13:48:54] ERROR: Training failed for meta-llama/Llama-2-7b-hf with exit code 1
270
+ [2025-05-08 13:48:54] ERROR: Training failed for meta-llama/Llama-2-7b-hf with exit code 1
271
+ [2025-05-08 13:48:54] Check error log for details: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000/20250508_134354.log
272
+ [2025-05-08 13:48:54] Resource usage after training meta-llama/Llama-2-7b-hf:
273
+ [2025-05-08 13:48:54] GPU memory usage:
274
+ 0 MiB, 97871 MiB
275
+ 0 MiB, 97871 MiB
276
+ 0 MiB, 97871 MiB
277
+ 0 MiB, 97871 MiB
278
+ 0 MiB, 97871 MiB
279
+ 0 MiB, 97871 MiB
280
+ 0 MiB, 97871 MiB
281
+ 0 MiB, 97871 MiB
282
+ [2025-05-08 13:48:54] Disk space usage for model outputs:
283
+ 34K train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
284
+ [2025-05-08 13:48:54]
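
Every rank in the run above fails in `_check_and_enable_flash_attn_2` because the `flash_attn` package is missing, and the deprecation notice in the same log points to the newer `attn_implementation` argument. Below is a minimal sketch of that recommended load call, assuming `flash-attn` has been installed (typically `pip install flash-attn --no-build-isolation`); it is illustrative, not the repository's actual fix.

```python
import torch
from transformers import AutoModelForCausalLM

# Load Llama-2-7B with FlashAttention-2 via the non-deprecated argument.
# Assumes the flash-attn package is installed and a bf16-capable GPU is available.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # replaces deprecated use_flash_attention_2=True
)
```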
20250508_141001.log ADDED
@@ -0,0 +1,205 @@
1
+ [2025-05-08 14:10:02] Created output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
2
+ [2025-05-08 14:10:02] Chat mode disabled
3
+ [2025-05-08 14:10:02] Set MODEL_MAX_LENGTH to 4096 for Llama-2 model
4
+ [2025-05-08 14:10:02] Model size is over 3B (7 B). Using LoRA training.
5
+ [2025-05-08 14:10:02] Adjusted learning rate for LoRA: 2e-4
6
+ [2025-05-08 14:10:02] No QA format data will be used
7
+ [2025-05-08 14:10:02] =======================================
8
+ [2025-05-08 14:10:02] Starting training for model: meta-llama/Llama-2-7b-hf
9
+ [2025-05-08 14:10:02] =======================================
10
+ [2025-05-08 14:10:02] CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
11
+ [2025-05-08 14:10:02] WANDB_PROJECT: wikidyk-ar
12
+ [2025-05-08 14:10:02] DATA_PATH: data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json
13
+ [2025-05-08 14:10:02] Global Batch Size: 256
14
+ [2025-05-08 14:10:02] Data Size: -1
15
+ [2025-05-08 14:10:02] Executing command: torchrun --nproc_per_node "8" --master-port 29503 src/train.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --data_path "data/wikidyk2022-2025_01082025_gpt-4o_evalv2_pages_formatted_combined_v2.json" --output_dir "train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000" --num_upsample "1000" --per_device_train_batch_size "32" --gradient_accumulation_steps "1" --learning_rate "2e-4" --num_train_epochs "1" --model_max_length "4096" --report_to wandb --logging_steps 50 --save_strategy no --bf16 True --use_flash_attention_2 True --qa_data_ratio "-1" --predict_mask "false" --use_lora --lora_r 32 --lora_alpha 16
16
+ [2025-05-08 14:10:02] Training started at 2025年 05月 08日 星期四 14:10:02 CST
17
+ W0508 14:10:03.166000 3286582 site-packages/torch/distributed/run.py:792]
18
+ W0508 14:10:03.166000 3286582 site-packages/torch/distributed/run.py:792] *****************************************
19
+ W0508 14:10:03.166000 3286582 site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0508 14:10:03.166000 3286582 site-packages/torch/distributed/run.py:792] *****************************************
21
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
22
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
23
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
24
+ [rank5]: Traceback (most recent call last):
25
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
26
+ [rank5]: train()
27
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
28
+ [rank5]: model = load_model(
29
+ [rank5]: ^^^^^^^^^^^
30
+ [rank5]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
31
+ [rank5]: return AutoModelForCausalLM.from_pretrained(
32
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
33
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
34
+ [rank5]: return model_class.from_pretrained(
35
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
37
+ [rank5]: return func(*args, **kwargs)
38
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^
39
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
40
+ [rank5]: config = cls._autoset_attn_implementation(
41
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
42
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
43
+ [rank5]: cls._check_and_enable_flash_attn_2(
44
+ [rank5]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
45
+ [rank5]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
46
+ [rank5]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
47
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
48
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
49
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
50
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
51
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
52
+ WARNING:root:Output directory: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
53
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
54
+ [rank0]: Traceback (most recent call last):
55
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
56
+ [rank0]: train()
57
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
58
+ [rank0]: model = load_model(
59
+ [rank0]: ^^^^^^^^^^^
60
+ [rank0]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
61
+ [rank0]: return AutoModelForCausalLM.from_pretrained(
62
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
63
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
64
+ [rank0]: return model_class.from_pretrained(
65
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
66
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
67
+ [rank0]: return func(*args, **kwargs)
68
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
69
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
70
+ [rank0]: config = cls._autoset_attn_implementation(
71
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
72
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
73
+ [rank0]: cls._check_and_enable_flash_attn_2(
74
+ [rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
75
+ [rank0]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
76
+ [rank0]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
77
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
78
+ [rank2]: Traceback (most recent call last):
79
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
80
+ [rank2]: train()
81
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
82
+ [rank2]: model = load_model(
83
+ [rank2]: ^^^^^^^^^^^
84
+ [rank2]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
85
+ [rank2]: return AutoModelForCausalLM.from_pretrained(
86
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
87
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
88
+ [rank2]: return model_class.from_pretrained(
89
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
90
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
91
+ [rank2]: return func(*args, **kwargs)
92
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^
93
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
94
+ [rank2]: config = cls._autoset_attn_implementation(
95
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
96
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
97
+ [rank2]: cls._check_and_enable_flash_attn_2(
98
+ [rank2]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
99
+ [rank2]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
100
+ [rank2]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
101
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
102
+ [rank7]: Traceback (most recent call last):
103
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
104
+ [rank7]: train()
105
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
106
+ [rank7]: model = load_model(
107
+ [rank7]: ^^^^^^^^^^^
108
+ [rank7]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
109
+ [rank7]: return AutoModelForCausalLM.from_pretrained(
110
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
111
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
112
+ [rank7]: return model_class.from_pretrained(
113
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
114
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
115
+ [rank7]: return func(*args, **kwargs)
116
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^
117
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
118
+ [rank7]: config = cls._autoset_attn_implementation(
119
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
120
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
121
+ [rank7]: cls._check_and_enable_flash_attn_2(
122
+ [rank7]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
123
+ [rank7]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
124
+ [rank7]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
125
+ [rank0]:[W508 14:10:17.329577027 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
126
+ The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
127
+ [rank4]: Traceback (most recent call last):
128
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 134, in <module>
129
+ [rank4]: train()
130
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/train.py", line 81, in train
131
+ [rank4]: model = load_model(
132
+ [rank4]: ^^^^^^^^^^^
133
+ [rank4]: File "/cq_1/share_1603164/user/wenhaowyu/WikiDYKEvalV2/src/utils/tools.py", line 119, in load_model
134
+ [rank4]: return AutoModelForCausalLM.from_pretrained(
135
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
136
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
137
+ [rank4]: return model_class.from_pretrained(
138
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
139
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 279, in _wrapper
140
+ [rank4]: return func(*args, **kwargs)
141
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^
142
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4336, in from_pretrained
143
+ [rank4]: config = cls._autoset_attn_implementation(
144
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
145
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2109, in _autoset_attn_implementation
146
+ [rank4]: cls._check_and_enable_flash_attn_2(
147
+ [rank4]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2252, in _check_and_enable_flash_attn_2
148
+ [rank4]: raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
149
+ [rank4]: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
150
+ W0508 14:10:17.120000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286648 closing signal SIGTERM
151
+ W0508 14:10:17.120000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286649 closing signal SIGTERM
152
+ W0508 14:10:17.121000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286650 closing signal SIGTERM
153
+ W0508 14:10:17.121000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286651 closing signal SIGTERM
154
+ W0508 14:10:17.123000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286652 closing signal SIGTERM
155
+ W0508 14:10:17.123000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286654 closing signal SIGTERM
156
+ W0508 14:10:17.123000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3286655 closing signal SIGTERM
157
+ E0508 14:10:18.479000 3286582 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 5 (pid: 3286653) of binary: /root/miniconda3/bin/python
158
+ Traceback (most recent call last):
159
+ File "/root/miniconda3/bin/torchrun", line 8, in <module>
160
+ sys.exit(main())
161
+ ^^^^^^
162
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
163
+ return f(*args, **kwargs)
164
+ ^^^^^^^^^^^^^^^^^^
165
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/run.py", line 918, in main
166
+ run(args)
167
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/run.py", line 909, in run
168
+ elastic_launch(
169
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
170
+ return launch_agent(self._config, self._entrypoint, list(args))
171
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
172
+ File "/root/miniconda3/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
173
+ raise ChildFailedError(
174
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
175
+ ============================================================
176
+ src/train.py FAILED
177
+ ------------------------------------------------------------
178
+ Failures:
179
+ <NO_OTHER_FAILURES>
180
+ ------------------------------------------------------------
181
+ Root Cause (first observed failure):
182
+ [0]:
183
+ time : 2025-05-08_14:10:17
184
+ host : TENCENT64.site
185
+ rank : 5 (local_rank: 5)
186
+ exitcode : 1 (pid: 3286653)
187
+ error_file: <N/A>
188
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
189
+ ============================================================
190
+ [2025-05-08 14:10:18] ERROR: Training failed for meta-llama/Llama-2-7b-hf with exit code 1
191
+ [2025-05-08 14:10:18] ERROR: Training failed for meta-llama/Llama-2-7b-hf with exit code 1
192
+ [2025-05-08 14:10:18] Check error log for details: train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000/20250508_141001.log
193
+ [2025-05-08 14:10:18] Resource usage after training meta-llama/Llama-2-7b-hf:
194
+ [2025-05-08 14:10:18] GPU memory usage:
195
+ 0 MiB, 97871 MiB
196
+ 0 MiB, 97871 MiB
197
+ 0 MiB, 97871 MiB
198
+ 0 MiB, 97871 MiB
199
+ 0 MiB, 97871 MiB
200
+ 0 MiB, 97871 MiB
201
+ 0 MiB, 97871 MiB
202
+ 0 MiB, 97871 MiB
203
+ [2025-05-08 14:10:18] Disk space usage for model outputs:
204
+ 52K train_results_ar/meta-llama_Llama-2-7b-hf_full_upsample1000
205
+ [2025-05-08 14:10:18]
20250508_141301.log ADDED
The diff for this file is too large to render. See raw diff
 
README.md ADDED
@@ -0,0 +1,58 @@
+ ---
+ library_name: peft
+ license: llama2
+ base_model: meta-llama/Llama-2-7b-hf
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: meta-llama_Llama-2-7b-hf_full_upsample1000
+ results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # meta-llama_Llama-2-7b-hf_full_upsample1000
+
+ This model is a fine-tuned version of [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) on an unknown dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 32
+ - eval_batch_size: 8
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - total_train_batch_size: 256
+ - total_eval_batch_size: 64
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+ - lr_scheduler_type: linear
+ - num_epochs: 1.0
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - PEFT 0.15.2
+ - Transformers 4.51.3
+ - Pytorch 2.6.0+cu124
+ - Datasets 3.6.0
+ - Tokenizers 0.21.1
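
A minimal usage sketch to complement the auto-generated card above: loading this commit's LoRA adapter on top of the base model with PEFT. The adapter path is a placeholder, and the snippet is an assumption rather than part of the committed card.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Base model the adapter was trained against (see base_model in the card).
base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# "path/to/this-adapter" is a placeholder for wherever adapter_config.json and
# adapter_model.safetensors from this commit live (a local directory or a Hub repo id).
model = PeftModel.from_pretrained(base, "path/to/this-adapter")
model.eval()
```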
adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd2f1c20fcb08afed084bec3a388d31d484e702e32dce49606750c23a5029011
+ size 67126104
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 4096,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fa1ce91fe1c38a9ee0a74d5408b5b7f03dc8e8408c58bbddba5b0b237533fb5
+ size 5432