# This is an example slurm launcher config that should be added to the main config.yaml file under the hydra section. This cannot be run directly.
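# A minimal sketch of the assumed wiring (not verbatim from this repo): the main config.yaml
# selects the hydra-submitit-launcher plugin via a defaults override, and the block below is
# merged under hydra.launcher, e.g.:
#
#   defaults:
#     - override hydra/launcher: submitit_slurm
#
#   hydra:
#     launcher:
#       ...  # the keys below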
hydra:
  launcher:
    name: ${get_slurm_name:}
    # See https://hydra.cc/docs/configure_hydra/workdir/
    submitit_folder: ${hydra.sweep.dir}/%j
    nodes: ${nodes} # Total number of nodes for the job
    mem_gb: ${eval:'${mem_per_gpu} * ${trainer.devices}'} # 40 GB per GPU; this value is *per* node
    gpus_per_node: ${trainer.devices}
    partition: ${partition}
    constraint: ${constraint}
    exclude: ${exclude_nodes:}
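    # Note (assumption about this codebase): ${get_slurm_name:}, ${eval:'...'} and
    # ${exclude_nodes:} are custom OmegaConf resolvers, not built-ins; they are presumed to be
    # registered in Python before Hydra composes the config, roughly via
    # OmegaConf.register_new_resolver("eval", eval) and similar calls.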

    timeout_min: ${timeout_min}
    max_num_timeout: 12 # Maximum number of requeues on timeout, excluding pre-emptions
    comment: aswerdlo
    stderr_to_stdout: true

    # Be careful with changing anything below.
    # see: https://github.com/stas00/ml-engineering/tree/master/training/fault-tolerance#approach-b2-choosing-which-process-to-send-the-signal-to
    # see: https://github.com/huggingface/accelerate/issues/1918

    # As with the accelerate launcher: one initial process per node, which then spawns one process per GPU via torchrun.
    tasks_per_node: 1
    cpus_per_task: ${eval:'${cpus_per_gpu} * ${trainer.devices}'}
    python: |
            bash -c "torchrun --nnodes $SLURM_NNODES --nproc_per_node $SLURM_GPUS_PER_NODE --role \$(hostname -s|tr -dc '0-9'): --node_rank \$SLURM_PROCID --max-restarts=2 --rdzv_id $RANDOM --rdzv_backend c10d --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \

    python_suffix: ' --dummy-arg $SLURM_JOB_ID" &'
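    # Assumed behavior of this (patched) launcher: the command run per node is
    # 'python' + the launcher's usual python invocation + 'python_suffix', i.e. a single
    # backgrounded `bash -c "torchrun ... --dummy-arg $SLURM_JOB_ID" &`. The '--dummy-arg
    # $SLURM_JOB_ID' tag is what the USR2 trap in 'setup' greps for, and the trailing '&' plus
    # the 'wait' in post_srun_commands keeps the batch shell free to receive and forward signals.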
    signal: 'B:USR2@360'
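    # 'B:USR2@360' becomes sbatch --signal=B:USR2@360: SIGUSR2 is delivered to the batch shell
    # only (not to the srun tasks), 360 seconds before the job hits its time limit.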
    post_srun_commands:
      - ''
      - wait

    srun_args:
      - '--jobid $SLURM_JOB_ID'
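      # '--jobid' explicitly binds the srun step to this job's allocation (srun's documented
      # behavior); the motivation here is assumed to relate to the requeue/signal handling above.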

    setup:
      - |
        export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
        export MASTER_PORT=$(( ($SLURM_JOB_ID % 20001) + 30000 ))
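        # ($SLURM_JOB_ID % 20001) + 30000 maps the job id into the 30000-50000 port range so
        # concurrent jobs sharing a head node are unlikely to collide on the rendezvous port.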
        export NUM_PROCESSES=$((SLURM_NNODES * SLURM_GPUS_PER_NODE))
        export NCCL_DEBUG=INFO
        export NCCL_NSOCKS_PERTHREAD=4
        export NCCL_SOCKET_NTHREADS=2
        export OMP_NUM_THREADS=2
        export PYTHONUNBUFFERED=1
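        # Derive a per-job folder next to Slurm's stdout file; the env dump, NCCL topology dump,
        # and per-restart logs below are written there.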
        export STDOUT_PATH=$(scontrol show job $SLURM_JOB_ID | grep -oP "StdOut=\K[^ ]+")
        export LOCAL_JOB_FOLDER=$(dirname $STDOUT_PATH)
        export NCCL_TOPO_DUMP_FILE="$LOCAL_JOB_FOLDER/nccl_topo.xml"
        if [ -n "$SLURM_RESTART_COUNT" ]; then
          export RESTART_COUNT=$SLURM_RESTART_COUNT
        else
          export RESTART_COUNT=0
        fi
        export MAIN_LOG_PATH="$LOCAL_JOB_FOLDER/log_$RESTART_COUNT.txt"

        mkdir -p $LOCAL_JOB_FOLDER
        printenv > "$LOCAL_JOB_FOLDER/env_${SLURM_LOCALID}_${RESTART_COUNT}.txt"

        echo "ibstatus: $(ibstatus)"
        echo "ibdev2netdev: $(ibdev2netdev)"
        echo "rdma device: $(rdma link)"
        echo "environment: $(env | grep NCCL)"
        echo "NUM_PROCESSES: $NUM_PROCESSES, SLURM_NNODES: $SLURM_NNODES SLURM_GPUS_PER_NODE: $SLURM_GPUS_PER_NODE"
        echo "NODE_ID: $SLURM_NODEID, SLURM_PROCID: $SLURM_PROCID, MASTER_ADDR: $MASTER_ADDR, MASTER_PORT: $MASTER_PORT"
        echo "PWD: $PWD, LOCAL_JOB_FOLDER: $LOCAL_JOB_FOLDER, MAIN_LOG_PATH: $MAIN_LOG_PATH"

        trap 'echo "SIGUSR2 received for $SLURM_JOB_ID"; \
        if [ -n "$SLURM_ARRAY_JOB_ID" ]; then echo "SLURM_ARRAY_JOB_ID: $SLURM_ARRAY_JOB_ID"; fi; \
        if [ -n "$SLURM_ARRAY_TASK_ID" ]; then echo "SLURM_ARRAY_TASK_ID: $SLURM_ARRAY_TASK_ID"; fi; \
        # ps auxww | grep $USER; \
        pid=$(pgrep -u $USER -f "python.*(accelerate|torchrun|deepspeed|distributed\.run).*dummy-arg $SLURM_JOB_ID"); \
        echo "Found parent PIDs: $pid"; \
        for p in $pid; do \
          echo "Parent PID has cmd: $(ps -p $p -o cmd=)"; \
          children=$(pgrep -P $p); \
          echo "Children: $children"; \
          if [ -n "$children" ]; then \
            for child in $children; do \
              ppid=$(ps -o ppid= -p $child | tr -d " ")
              if [ "$ppid" -eq "$p" ]; then
                echo "Killing direct child process: PID $child with cmd: $(ps -p $child -o cmd=)"
                kill -USR2 $child &
              else
                echo "Skipping non-direct child process: PID $child with PPID $ppid"
              fi
            done; \
            echo "Sent kill signals to children of $p"; \
          else \
            echo "No children found for $p"; \
          fi; \
        done; \
        wait;' SIGUSR2