diff --git "a/slurm.out" "b/slurm.out"
new file mode 100644
--- /dev/null
+++ "b/slurm.out"
@@ -0,0 +1,2061 @@
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792]
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792] *****************************************
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792] *****************************************
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792]
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792] *****************************************
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792] *****************************************
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792]
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792] *****************************************
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792] *****************************************
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792]
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792] *****************************************
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
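Note: torchrun's warning above is generic; OMP_NUM_THREADS=1 per rank is a safe default, but if CPU-side work (tokenization, dataloading) bottlenecks a job it can be raised before torch initializes its thread pools. A minimal sketch, assuming you control the Python entrypoint; the value 8 is illustrative, not taken from this run:

    # set the intra-op thread count before torch spawns its thread pools (illustrative value)
    import os
    os.environ.setdefault("OMP_NUM_THREADS", "8")

    import torch
    torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))  # intra-op parallelism per rank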
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792] *****************************************
+0: [2025-09-02 19:37:04,095] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3622631] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
+0: [2025-09-02 19:37:04,095] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3622631] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
+3: [2025-09-02 19:37:05,674] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:2169992] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
+3: [2025-09-02 19:37:05,674] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:2169992] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
+1: [2025-09-02 19:37:05,843] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3051647] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
+1: [2025-09-02 19:37:05,843] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3051647] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
+2: [2025-09-02 19:37:05,901] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3738989] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
+2: [2025-09-02 19:37:05,902] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3738989] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
+0: [2025-09-02 19:37:07,766] [INFO] [axolotl.cli.config.load_cfg:245] [PID:3622631] [RANK:0] config:
+0: {
+0:   "activation_offloading": false,
+0:   "auto_resume_from_checkpoints": true,
+0:   "axolotl_config_path": "/lustre/fswork/projects/rech/dgo/udv55np/train/tmp/1756826506457874101.yaml",
+0:   "base_model": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B_reasoning",
+0:   "base_model_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B_reasoning",
+0:   "batch_size": 16,
+0:   "bf16": true,
+0:   "capabilities": {
+0:     "bf16": true,
+0:     "compute_capability": "sm_90",
+0:     "fp8": false,
+0:     "n_gpu": 16,
+0:     "n_node": 1
+0:   },
+0:   "chat_template": "qwen_25",
+0:   "context_parallel_size": 1,
+0:   "dataloader_num_workers": 16,
+0:   "dataloader_pin_memory": true,
+0:   "dataloader_prefetch_factor": 256,
+0:   "dataset_prepared_path": "/lustre/fsn1/projects/rech/dgo/udv55np/dataset_math/Qwen3-235B-A22B/1",
+0:   "dataset_processes": 192,
+0:   "datasets": [
+0:     {
+0:       "chat_template": "tokenizer_default",
+0:       "field_messages": "conversations",
+0:       "message_property_mappings": {
+0:         "content": "content",
+0:         "role": "role"
+0:       },
+0:       "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/math/hf/thinking_text/generator/default-68225543a18d39ac/0.0.0",
+0:       "trust_remote_code": false,
+0:       "type": "chat_template"
+0:     }
+0:   ],
+0:   "ddp": true,
+0:   "deepspeed": {
+0:     "bf16": {
+0:       "enabled": true
+0:     },
+0:     "gradient_accumulation_steps": "auto",
+0:     "gradient_clipping": "auto",
+0:     "train_batch_size": "auto",
+0:     "train_micro_batch_size_per_gpu": "auto",
+0:     "wall_clock_breakdown": false,
+0:     "zero_optimization": {
+0:       "contiguous_gradients": true,
+0:       "overlap_comm": true,
+0:       "reduce_bucket_size": "auto",
+0:       "stage": 3,
+0:       "stage3_gather_16bit_weights_on_model_save": true,
+0:       "stage3_param_persistence_threshold": "auto",
"stage3_prefetch_bucket_size": "auto", +0: "sub_group_size": 0 +0: } +0: }, +0: "device": "cuda:0", +0: "device_map": { +0: "": 0 +0: }, +0: "dion_rank_fraction": 1.0, +0: "dion_rank_multiple_of": 1, +0: "env_capabilities": { +0: "torch_version": "2.6.0" +0: }, +0: "eval_batch_size": 1, +0: "eval_causal_lm_metrics": [ +0: "sacrebleu", +0: "comet", +0: "ter", +0: "chrf" +0: ], +0: "eval_max_new_tokens": 128, +0: "eval_sample_packing": true, +0: "eval_table_size": 0, +0: "evals_per_epoch": 0, +0: "flash_attention": true, +0: "fp16": false, +0: "gradient_accumulation_steps": 1, +0: "gradient_checkpointing": true, +0: "gradient_checkpointing_kwargs": { +0: "use_reentrant": true +0: }, +0: "learning_rate": 2e-05, +0: "lisa_layers_attribute": "model.layers", +0: "load_best_model_at_end": false, +0: "load_in_4bit": false, +0: "load_in_8bit": false, +0: "local_rank": 0, +0: "logging_steps": 10, +0: "lora_dropout": 0.0, +0: "loraplus_lr_embedding": 1e-06, +0: "lr_scheduler": "warmup_stable_decay", +0: "lr_scheduler_kwargs": { +0: "min_lr_ratio": 0.1, +0: "num_decay_steps": 300 +0: }, +0: "max_prompt_len": 512, +0: "mean_resizing_embeddings": false, +0: "micro_batch_size": 1, +0: "model_config_type": "qwen2", +0: "num_epochs": 1.0, +0: "optimizer": "adamw_torch_fused", +0: "output_dir": "/lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1", +0: "pad_to_sequence_len": true, +0: "pretrain_multipack_attn": true, +0: "pretrain_multipack_buffer_size": 10000, +0: "profiler_steps_start": 0, +0: "qlora_sharded_model_loading": false, +0: "ray_num_workers": 1, +0: "resources_per_worker": { +0: "GPU": 1 +0: }, +0: "sample_packing": true, +0: "sample_packing_bin_size": 200, +0: "sample_packing_group_size": 100000, +0: "save_only_model": false, +0: "save_safetensors": true, +0: "save_steps": 0.2, +0: "save_total_limit": 20, +0: "sequence_len": 16384, +0: "shuffle_before_merging_datasets": false, +0: "shuffle_merged_datasets": true, +0: "skip_prepare_dataset": false, +0: "special_tokens": { +0: "bos_token": "<|im_start|>", +0: "eos_token": "<|im_end|>", +0: "pad_token": "<|endoftext|>" +0: }, +0: "strict": false, +0: "tensor_parallel_size": 1, +0: "tf32": false, +0: "tiled_mlp_use_original_mlp": true, +0: "tokenizer_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B_reasoning", +0: "torch_dtype": "torch.bfloat16", +0: "train_on_inputs": false, +0: "trl": { +0: "log_completions": false, +0: "mask_truncated_completions": false, +0: "ref_model_mixup_alpha": 0.9, +0: "ref_model_sync_steps": 64, +0: "scale_rewards": true, +0: "sync_ref_model": false, +0: "use_vllm": false, +0: "vllm_server_host": "0.0.0.0", +0: "vllm_server_port": 8000 +0: }, +0: "use_ray": false, +0: "use_tensorboard": true, +0: "val_set_size": 0.0, +0: "vllm": { +0: "device": "auto", +0: "dtype": "auto", +0: "gpu_memory_utilization": 0.9, +0: "host": "0.0.0.0", +0: "port": 8000 +0: }, +0: "warmup_steps": 150, +0: "weight_decay": 0.0, +0: "world_size": 16 +0: } +0: [2025-09-02 19:37:07,768] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:3622631] [RANK:0] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used. 
+0: [2025-09-02 19:37:08,143] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:478] [PID:3622631] [RANK:0] Unable to find prepared dataset in /lustre/fsn1/projects/rech/dgo/udv55np/dataset_math/Qwen3-235B-A22B/1/21006e43c19b80ce1023552634abc92d
+0: [2025-09-02 19:37:08,143] [INFO] [axolotl.utils.data.sft._load_raw_datasets:314] [PID:3622631] [RANK:0] Loading raw datasets...
+0: [2025-09-02 19:37:08,143] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:316] [PID:3622631] [RANK:0] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
+0: [2025-09-02 19:37:08,415] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:88] [PID:3622631] [RANK:0] Loading dataset: /lustre/fswork/projects/rech/qwv/udv55np/dataset/math/hf/thinking_text/generator/default-68225543a18d39ac/0.0.0 with base_type: chat_template and prompt_style: None
+0: [2025-09-02 19:37:08,421] [INFO] [axolotl.prompt_strategies.chat_template.__call__:957] [PID:3622631] [RANK:0] Using chat template:
+0: ---
+0: {%- if tools %}
+0:     {{- '<|im_start|>system\n' }}
+0:     {%- if messages[0]['role'] == 'system' %}
+0:         {{- messages[0]['content'] }}
+0:     {%- else %}
+0:         {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+0:     {%- endif %}
+0:     {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+0:     {%- for tool in tools %}
+0:         {{- "\n" }}
+0:         {{- tool | tojson }}
+0:     {%- endfor %}
+0:     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+0: {%- else %}
+0:     {%- if messages[0]['role'] == 'system' %}
+0:         {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+0:     {%- else %}
+0:         {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+0:     {%- endif %}
+0: {%- endif %}
+0: {%- for message in messages %}
+0:     {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+0:         {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+0:     {%- elif message.role == "assistant" %}
+0:         {{- '<|im_start|>' + message.role }}
+0:         {%- if message.content %}
+0:             {{- '\n' + message.content }}
+0:         {%- endif %}
+0:         {%- for tool_call in message.tool_calls %}
+0:             {%- if tool_call.function is defined %}
+0:                 {%- set tool_call = tool_call.function %}
+0:             {%- endif %}
+0:             {{- '\n<tool_call>\n{"name": "' }}
+0:             {{- tool_call.name }}
+0:             {{- '", "arguments": ' }}
+0:             {{- tool_call.arguments | tojson }}
+0:             {{- '}\n</tool_call>' }}
+0:         {%- endfor %}
+0:         {{- '<|im_end|>\n' }}
+0:     {%- elif message.role == "tool" %}
+0:         {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+0:             {{- '<|im_start|>user' }}
+0:         {%- endif %}
+0:         {{- '\n<tool_response>\n' }}
+0:         {{- message.content }}
+0:         {{- '\n</tool_response>' }}
+0:         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+0:             {{- '<|im_end|>\n' }}
+0:         {%- endif %}
+0:     {%- endif %}
+0: {%- endfor %}
+0: {%- if add_generation_prompt %}
+0:     {{- '<|im_start|>assistant\n' }}
+0: {%- endif %}
+0:
+0: ---
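The template above matches the stock Qwen2.5 chat template shipped with the tokenizer. A minimal sketch of rendering one conversation through it, assuming only transformers and the tokenizer path from the config dump; the messages are a toy example, not from the training set:

    # render a toy conversation through the tokenizer's chat template
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(
        "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B_reasoning")
    messages = [
        {"role": "user", "content": "What is 2 + 2?"},
        {"role": "assistant", "content": "4"},
    ]
    print(tok.apply_chat_template(messages, tokenize=False))  # <|im_start|>...<|im_end|> blocks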
+0: Tokenizing Prompts (num_proc=192): 100%|██████████| 322369/322369 [timing lost; intermediate tqdm frames garbled in capture]
+0: Dropping Long Sequences (>16384) (num_proc=192): 100%|██████████| 322369/322369 [00:15<00:00, 21177.10 examples/s]
+0: [2025-09-02 19:40:53,734] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:251] [PID:3622631] [RANK:0] Dropped 31920 samples from dataset
+0: Drop Samples with Zero Trainable Tokens (num_proc=192):   0%|          | 0/290449 [00:00<?, ? examples/s]