# unidisc/configs/slurm_example.yaml
# Example Slurm launcher config. Add it to the main config.yaml under the hydra section; it cannot be run directly.
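# The interpolations below (nodes, partition, constraint, timeout_min, mem_per_gpu,
# cpus_per_gpu, trainer.devices) are expected to be defined at the top level of the
# main config. A minimal sketch of those keys, with hypothetical values:
#
#   nodes: 2
#   partition: gpu
#   constraint: ''
#   timeout_min: 720
#   mem_per_gpu: 40
#   cpus_per_gpu: 8
#
# ${eval:...}, ${get_slurm_name:} and ${exclude_nodes:} are custom OmegaConf resolvers
# that the codebase must register before Hydra composes this config.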
hydra:
launcher:
name: ${get_slurm_name:}
# See https://hydra.cc/docs/configure_hydra/workdir/
submitit_folder: ${hydra.sweep.dir}/%j
nodes: ${nodes} # Total number of nodes for the job
mem_gb: ${eval:'${mem_per_gpu} * ${trainer.devices}'} # mem_per_gpu GB per GPU (e.g. 40). This value is *per* node
gpus_per_node: ${trainer.devices}
partition: ${partition}
constraint: ${constraint}
exclude: ${exclude_nodes:}
timeout_min: ${timeout_min}
max_num_timeout: 12 # Max number of requeues after timeout, excluding pre-emptions
comment: aswerdlo
stderr_to_stdout: true
# Be careful with changing anything below.
# see: https://github.com/stas00/ml-engineering/tree/master/training/fault-tolerance#approach-b2-choosing-which-process-to-send-the-signal-to
# see: https://github.com/huggingface/accelerate/issues/1918
# Launch a single initial process per node; torchrun then spawns one process per GPU.
tasks_per_node: 1
cpus_per_task: ${eval:'${cpus_per_gpu} * ${trainer.devices}'}
python: |
bash -c "torchrun --nnodes $SLURM_NNODES --nproc_per_node $SLURM_GPUS_PER_NODE --role \$(hostname -s|tr -dc '0-9'): --node_rank \$SLURM_PROCID --max-restarts=2 --rdzv_id $RANDOM --rdzv_backend c10d --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
python_suffix: ' --dummy-arg $SLURM_JOB_ID" &'
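# The python/python_suffix pair wraps submitit's launch command in `bash -c "torchrun ..."`.
# The trailing `--dummy-arg $SLURM_JOB_ID` appears to exist only so the SIGUSR2 trap in
# `setup` below can find this parent process via pgrep; the closing `&` backgrounds the
# command so the batch shell stays free to receive signals.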
signal: 'B:USR2@360'
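# sbatch --signal syntax: `B:` delivers the signal to the batch shell (not the job steps),
# USR2 is the signal, and `@360` sends it 360 seconds before the time limit, leaving time
# for the trap in `setup` to forward it and for the ranks to checkpoint.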
post_srun_commands:
- ''
- wait
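# `wait` keeps the batch script alive until the backgrounded launch command exits;
# without it the job would end as soon as the (backgrounded) srun line returns.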
srun_args:
- '--jobid $SLURM_JOB_ID'
setup:
- |
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
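# Deterministic per-job rendezvous port: $SLURM_JOB_ID % 20001 is in [0, 20000], so the
# port lands in [30000, 50000] and two jobs sharing a node rarely collide.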
export MASTER_PORT=$(( ($SLURM_JOB_ID % 20001) + 30000 ))
export NUM_PROCESSES=$((SLURM_NNODES * SLURM_GPUS_PER_NODE))
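# NCCL tuning: NSOCKS_PERTHREAD x SOCKET_NTHREADS = 8 sockets per connection for
# socket-based transports; DEBUG=INFO makes NCCL log its transport/topology choices.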
export NCCL_DEBUG=INFO
export NCCL_NSOCKS_PERTHREAD=4
export NCCL_SOCKET_NTHREADS=2
export OMP_NUM_THREADS=2
export PYTHONUNBUFFERED=1
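# Derive a per-job log folder from this job's stdout path so env dumps and logs land
# next to the submitit output; RESTART_COUNT versions them across requeues.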
export STDOUT_PATH=$(scontrol show job $SLURM_JOB_ID | grep -oP "StdOut=\K[^ ]+")
export LOCAL_JOB_FOLDER=$(dirname $STDOUT_PATH)
export NCCL_TOPO_DUMP_FILE="$LOCAL_JOB_FOLDER/nccl_topo.xml"
if [ -n "$SLURM_RESTART_COUNT" ]; then
export RESTART_COUNT=$SLURM_RESTART_COUNT
else
export RESTART_COUNT=0
fi
export MAIN_LOG_PATH="$LOCAL_JOB_FOLDER/log_$RESTART_COUNT.txt"
mkdir -p $LOCAL_JOB_FOLDER
printenv > "$LOCAL_JOB_FOLDER/env_${SLURM_LOCALID}_${RESTART_COUNT}.txt"
echo "ibstatus: $(ibstatus)"
echo "ibdev2netdev: $(ibdev2netdev)"
echo "rdma device: $(rdma link)"
echo "environment: $(env | grep NCCL)"
echo "NUM_PROCESSES: $NUM_PROCESSES, SLURM_NNODES: $SLURM_NNODES SLURM_GPUS_PER_NODE: $SLURM_GPUS_PER_NODE"
echo "NODE_ID: $SLURM_NODEID, SLURM_PROCID: $SLURM_PROCID, MASTER_ADDR: $MASTER_ADDR, MASTER_PORT: $MASTER_PORT"
echo "PWD: $PWD, LOCAL_JOB_FOLDER: $LOCAL_JOB_FOLDER, MAIN_LOG_PATH: $MAIN_LOG_PATH"
trap 'echo "SIGUSR2 received for $SLURM_JOB_ID"; \
if [ -n "$SLURM_ARRAY_JOB_ID" ]; then echo "SLURM_ARRAY_JOB_ID: $SLURM_ARRAY_JOB_ID"; fi; \
if [ -n "$SLURM_ARRAY_TASK_ID" ]; then echo "SLURM_ARRAY_TASK_ID: $SLURM_ARRAY_TASK_ID"; fi; \
pid=$(pgrep -u $USER -f "python.*(accelerate|torchrun|deepspeed|distributed\.run).*dummy-arg $SLURM_JOB_ID"); \
echo "Found parent PIDs: $pid"; \
for p in $pid; do \
echo "Parent PID has cmd: $(ps -p $p -o cmd=)"; \
children=$(pgrep -P $p); \
echo "Children: $children"; \
if [ -n "$children" ]; then \
for child in $children; do \
ppid=$(ps -o ppid= -p $child | tr -d " "); \
if [ "$ppid" -eq "$p" ]; then \
echo "Killing direct child process: PID $child with cmd: $(ps -p $child -o cmd=)"; \
kill -USR2 $child & \
else \
echo "Skipping non-direct child process: PID $child with PPID $ppid"; \
fi; \
done; \
echo "Sent kill signals to children of $p"; \
else \
echo "No children found for $p"; \
fi; \
done; \
wait;' SIGUSR2
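# Example launch (hypothetical entry point and overrides; adjust to the repo's CLI):
#   python main.py -m hydra/launcher=slurm_example nodes=2 trainer.devices=8 partition=gpu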