File size: 2,287 Bytes
131da64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/bin/bash
# Slurm batch script: prints interconnect and NCCL diagnostics to the job log,
# then starts `accelerate launch` through srun, forwarding every argument
# given to this script ("$@") to the launcher.
#
# Resources requested below: a single node with 2 GPUs, 8 CPUs and 32G of
# memory per GPU, and a 6 hour time limit.
#
# NOTE(review): the log path is relative (outputs/logs/...); presumably the
# directory has to exist before submission - confirm against the cluster setup.
#SBATCH --job-name=precompute_tokens
#SBATCH --partition=all
#SBATCH --nodes=1
#SBATCH --gpus-per-node=2
#SBATCH --cpus-per-gpu=8
#SBATCH --mem-per-gpu=32G
#SBATCH --time=06:00:00
#SBATCH --output=outputs/logs/%A_%a_%n_log.out
# Ask Slurm to send USR2 to the batch shell (B:) 600 seconds before the time
# limit; the SIGUSR2 trap installed later in this script reacts to it.
#SBATCH --signal=B:USR2@600

# Record the InfiniBand / RDMA link state in the job log. Each tool's output
# is printed after a "<label>: " prefix; a missing tool only produces an error
# on stderr and an empty value.
for ib_tool in ibstatus ibdev2netdev; do
  echo "${ib_tool}: $("$ib_tool")"
done
echo "rdma device: $(rdma link)"

# NCCL configuration for this job: drop any inherited P2P level, then export
# the settings in one statement. Per the NCCL documentation the two *_DISABLE
# switches turn off the peer-to-peer and InfiniBand transports, NCCL_DEBUG
# raises log verbosity, and the last two tune the socket transport.
unset NCCL_P2P_LEVEL
export \
  NCCL_P2P_DISABLE=1 \
  NCCL_IB_DISABLE=1 \
  NCCL_DEBUG=INFO \
  NCCL_NSOCKS_PERTHREAD=4 \
  NCCL_SOCKET_NTHREADS=2

# Rendezvous settings consumed by the distributed launcher.
export LOGLEVEL=INFO

# The first hostname of the allocation acts as the rendezvous host. The node
# list is quoted because its compressed form (e.g. "node[01-04]") contains glob
# characters that an unquoted expansion could match against files in the
# working directory.
MASTER_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST:-}" | head -n 1)
export MASTER_ADDR
if [[ -z "$MASTER_ADDR" ]]; then
  # Non-fatal on purpose: the value is still logged below, but make the root
  # cause obvious in the job log.
  echo "warning: could not derive MASTER_ADDR from SLURM_JOB_NODELIST='${SLURM_JOB_NODELIST:-}'" >&2
fi

# Random rendezvous port in the inclusive range [30000, 50000].
MASTER_PORT=$(( RANDOM % (50000 - 30000 + 1) + 30000 ))
export MASTER_PORT

echo "MASTER_ADDR: $MASTER_ADDR"
echo "MASTER_PORT: $MASTER_PORT"
echo "environment: $(env | grep NCCL)"
echo "SLURM_JOB_NODELIST: ${SLURM_JOB_NODELIST:-}"
echo "SLURM_NNODES: ${SLURM_NNODES:-}"

#######################################
# SIGUSR2 handler: relays USR2 to the direct children of every launcher
# process (python ... accelerate|torchrun|deepspeed|distributed.run) owned by
# the current user on this node, logging what it finds along the way.
#
# NOTE: Bash only runs a trap once the current foreground command has
# finished, so the handler fires promptly only while the script is sitting in
# the `wait` builtin (i.e. the launcher runs as a background job).
#
# Globals:   USER, SLURM_ARRAY_JOB_ID, SLURM_ARRAY_TASK_ID (read)
# Arguments: none
# Outputs:   progress messages on stdout
#######################################
forward_usr2_to_workers() {
  local launcher_pids launcher worker_pids worker worker_ppid

  echo "SIGUSR2"
  if [[ -n "${SLURM_ARRAY_JOB_ID:-}" ]]; then
    echo "SLURM_ARRAY_JOB_ID: $SLURM_ARRAY_JOB_ID"
  fi
  if [[ -n "${SLURM_ARRAY_TASK_ID:-}" ]]; then
    echo "SLURM_ARRAY_TASK_ID: $SLURM_ARRAY_TASK_ID"
  fi

  launcher_pids=$(pgrep -u "${USER:-$(id -un)}" -f \
    "python.*(accelerate|torchrun|deepspeed|distributed\.run).*")
  echo "Found parent PIDs: $launcher_pids"

  # Word splitting of the PID lists is intentional (one PID per word).
  # shellcheck disable=SC2086
  for launcher in $launcher_pids; do
    echo "Parent PID has cmd: $(ps -p "$launcher" -o cmd=)"
    worker_pids=$(pgrep -P "$launcher")
    echo "Children: $worker_pids"
    if [[ -z "$worker_pids" ]]; then
      echo "No children found for $launcher"
      continue
    fi

    for worker in $worker_pids; do
      # Re-check the parent right before signalling. The comparison is done on
      # strings so a worker that has already exited (empty PPID) takes the
      # "skip" branch instead of tripping a numeric-test error.
      worker_ppid=$(ps -o ppid= -p "$worker")
      worker_ppid=${worker_ppid//[[:space:]]/}
      if [[ "$worker_ppid" == "$launcher" ]]; then
        echo "Killing direct child process: PID $worker with cmd: $(ps -p "$worker" -o cmd=)"
        # kill returns as soon as the signal is queued, so it is run
        # synchronously; nothing has to be waited for afterwards.
        kill -USR2 "$worker"
      else
        echo "Skipping non-direct child process: PID $worker with PPID $worker_ppid"
      fi
    done
    echo "Sent kill signals to children of $launcher"
  done
}

trap forward_usr2_to_workers SIGUSR2

# Total worker count = nodes x GPUs per node. SLURM_GPUS_PER_NODE mirrors the
# --gpus-per-node request, which may be written as "type:count" (e.g.
# "a100:2"), so only the part after the last colon is used. Unset values count
# as 0.
gpus_per_node=${SLURM_GPUS_PER_NODE##*:}
num_processes=$(( ${SLURM_NNODES:-0} * ${gpus_per_node:-0} ))
echo "num_processes: $num_processes"

# srun is started as a background job and then waited on: Bash defers trap
# execution until a foreground command has finished, so with a foreground srun
# the SIGUSR2 trap would only run after the launcher had already ended. The
# `wait` builtin, in contrast, returns as soon as a trapped signal arrives.
#
# NOTE(review): $SLURM_NODEID is expanded here by the batch shell, so every
# task receives the same --machine_rank. That is fine for the single node this
# script requests; a multi-node run would need the expansion to happen per task.
srun --label accelerate launch \
    --rdzv_backend c10d \
    --machine_rank "${SLURM_NODEID:-0}" \
    --num_processes "$num_processes" \
    --num_machines "${SLURM_NNODES:-}" \
    --dynamo_backend no \
    --mixed_precision no \
    --main_process_ip "${MASTER_ADDR:-}" \
    --main_process_port "${MASTER_PORT:-}" \
    "$@" &
srun_pid=$!

wait "$srun_pid"
srun_status=$?
# A trapped signal makes `wait` return early (status > 128) while srun is still
# running; keep waiting until srun is really gone. If the signal handler has
# already reaped srun itself, the interrupted status is kept.
while kill -0 "$srun_pid" 2>/dev/null; do
  wait "$srun_pid"
  srun_status=$?
done

# Leave srun's status as the status of the last command, as before.
(exit "$srun_status")