wanchichen's picture
Add files using upload-large-folder tool
bf919b3 verified
raw
history blame contribute delete
104 kB
# Running on cnode7-008
# Started at Thu Nov 28 15:14:44 CST 2024
# SLURMD_NODENAME=cnode7-008
# SLURM_CLUSTER_NAME=slurm
# SLURM_CONF=/cm/shared/apps/slurm/var/etc/slurm/slurm.conf
# SLURM_CPUS_ON_NODE=224
# SLURM_CPUS_PER_TASK=128
# SLURM_EXPORT_ENV=PATH
# SLURM_GET_USER_ENV=1
# SLURM_GPUS_ON_NODE=8
# SLURM_GPUS_PER_NODE=8
# SLURM_GTIDS=0
# SLURM_JOBID=4946
# SLURM_JOB_CPUS_PER_NODE='224(x2)'
# SLURM_JOB_END_TIME=1764314080
# SLURM_JOB_GID=1026
# SLURM_JOB_GPUS=0,1,2,3,4,5,6,7
# SLURM_JOB_ID=4946
# SLURM_JOB_NAME=exp_owsm/s2t_train_05b_ds_raw_bpe50000/train.log
# SLURM_JOB_NODELIST='cnode7-[008-009]'
# SLURM_JOB_NUM_NODES=2
# SLURM_JOB_PARTITION=p2
# SLURM_JOB_QOS=normal
# SLURM_JOB_START_TIME=1732778080
# SLURM_JOB_UID=1026
# SLURM_JOB_USER=williamchen
# SLURM_LOCALID=0
# SLURM_MEM_PER_NODE=2048000
# SLURM_NNODES=2
# SLURM_NODEID=0
# SLURM_NODELIST='cnode7-[008-009]'
# SLURM_NODE_ALIASES='(null)'
# SLURM_OPEN_MODE=a
# SLURM_PRIO_PROCESS=0
# SLURM_PROCID=0
# SLURM_SUBMIT_DIR=/mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1
# SLURM_SUBMIT_HOST=154-T2-P1-NVR
# SLURM_TASKS_PER_NODE='1(x2)'
# SLURM_TASK_PID=1503870
# SLURM_TOPOLOGY_ADDR=cnode7-008
# SLURM_TOPOLOGY_ADDR_PATTERN=node
# SLURM_WORKING_CLUSTER=slurm:154-T2-P1-NVR:6817:9984:109
# srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_05b_ds_raw_bpe50000 --config conf/train_05b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_prev_shape --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_ctc_shape --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_shape --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_05b_ds_raw_bpe50000/.dist_init_4c8213f8-bbff-47c8-873a-9e190d4afe53
[2024-11-28 15:14:54,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:14:57,866] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/bin/python3 /mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_05b_ds_raw_bpe50000 --config conf/train_05b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_prev_shape --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_ctc_shape --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_shape --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_05b_ds_raw_bpe50000/.dist_init_4c8213f8-bbff-47c8-873a-9e190d4afe53
/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/bin/python3 /mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_05b_ds_raw_bpe50000 --config conf/train_05b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_prev_shape --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_ctc_shape --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_shape --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_05b_ds_raw_bpe50000/.dist_init_4c8213f8-bbff-47c8-873a-9e190d4afe53
[2024-11-28 15:15:17,972] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:15,301] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:15,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:18,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:15,749] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:18,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:18,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:18,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:15,982] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:18,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:18,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:18,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:16,183] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:16,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:16,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-28 15:15:16,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[W1128 15:15:22.590933009 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:25.162658043 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:26.368137303 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:23.025406997 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:23.042933992 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:23.052782040 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:26.666921758 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:26.667129398 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:26.684706832 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:23.474303364 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:26.885105709 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:26.887277046 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:23.504727705 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:23.504793967 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[W1128 15:15:26.907160538 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[cnode7-008:0/16] 2024-11-28 15:15:26,589 (s2t:462) INFO: Vocabulary size: 50002
[W1128 15:15:23.523326241 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[cnode7-008:0/16] 2024-11-28 15:15:33,575 (abs_task:1383) INFO: pytorch.version=2.4.0+cu121, cuda.available=True, cudnn.version=90100, cudnn.benchmark=False, cudnn.deterministic=True
[cnode7-008:0/16] 2024-11-28 15:15:33,578 (abs_task:1384) INFO: Model structure:
ESPnetS2TModel(
(frontend): DefaultFrontend(
(stft): Stft(n_fft=512, win_length=400, hop_length=160, center=True, normalized=False, onesided=True)
(frontend): Frontend()
(logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False)
)
(specaug): SpecAug(
(freq_mask): MaskAlongAxis(mask_width_range=[0, 27], num_mask=2, axis=freq)
(time_mask): MaskAlongAxisVariableMaxWidth(mask_width_ratio_range=[0.0, 0.05], num_mask=10, axis=time)
)
(normalize): GlobalMVN(stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz, norm_means=True, norm_vars=True)
(encoder): TransformerEncoder(
(embed): Conv2dSubsampling(
(conv): Sequential(
(0): Conv2d(1, 2048, kernel_size=(3, 3), stride=(2, 2))
(1): ReLU()
(2): Conv2d(2048, 2048, kernel_size=(3, 3), stride=(2, 2))
(3): ReLU()
)
(out): Sequential(
(0): Linear(in_features=38912, out_features=2048, bias=True)
(1): PositionalEncoding(
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(encoders): MultiSequential(
(0): EncoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(1): EncoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(2): EncoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(3): EncoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(4): EncoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(5): EncoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(6): EncoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(7): EncoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(after_norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
)
(decoder): TransformerDecoder(
(embed): Sequential(
(0): Embedding(50002, 2048)
(1): PositionalEncoding(
(dropout): Dropout(p=0.1, inplace=False)
)
)
(after_norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(output_layer): Linear(in_features=2048, out_features=50002, bias=True)
(decoders): MultiSequential(
(0): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(1): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(2): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(3): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(4): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(5): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(6): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(7): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=2048, out_features=2048, bias=True)
(linear_k): Linear(in_features=2048, out_features=2048, bias=True)
(linear_v): Linear(in_features=2048, out_features=2048, bias=True)
(linear_out): Linear(in_features=2048, out_features=2048, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=2048, out_features=8192, bias=True)
(w_2): Linear(in_features=8192, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(criterion_att): LabelSmoothingLoss(
(criterion): KLDivLoss()
)
(ctc): CTC(
(ctc_lo): Linear(in_features=2048, out_features=50002, bias=True)
(ctc_loss): CTCLoss()
)
)
Model summary:
Class Name: ESPnetS2TModel
Total Number of model parameters: 1.36 B
Number of trainable parameters: 1.36 B (100.0%)
Size: 5.46 GB
Type: torch.float32
[cnode7-008:0/16] 2024-11-28 15:15:33,578 (abs_task:1387) INFO: Optimizer:
Adadelta (
Parameter Group 0
capturable: False
differentiable: False
eps: 1e-06
foreach: None
lr: 1.0
maximize: False
rho: 0.9
weight_decay: 0
)
[cnode7-008:0/16] 2024-11-28 15:15:33,578 (abs_task:1388) INFO: Scheduler: None
[cnode7-008:0/16] 2024-11-28 15:15:33,580 (abs_task:1397) INFO: Saving the configuration in exp_owsm/s2t_train_05b_ds_raw_bpe50000/config.yaml
[cnode7-008:0/16] 2024-11-28 15:15:38,133 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:38,949] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:38,952] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:36,294] [INFO] [comm.py:652:init_distributed] cdb=None
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:36,297] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
[2024-11-28 15:15:36,299] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:36,300] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:39,051] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:39,052] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
[cnode7-008:0/16] 2024-11-28 15:15:39,071 (abs_task:1807) INFO: [valid] dataset:
ESPnetDataset(
speech: {"path": "dump/raw/dev_v3/wav.scp", "type": "kaldi_ark"}
text_prev: {"path": "dump/raw/dev_v3/text.prev", "type": "text"}
text_ctc: {"path": "dump/raw/dev_v3/text.ctc", "type": "text"}
text: {"path": "dump/raw/dev_v3/text", "type": "text"}
preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x15523e0a9870>)
[cnode7-008:0/16] 2024-11-28 15:15:39,071 (abs_task:1808) INFO: [valid] Batch sampler: UnsortedBatchSampler(N-batch=74743, batch_size=16, key_file=exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape,
[cnode7-008:0/16] 2024-11-28 15:15:39,077 (abs_task:1809) INFO: [valid] mini-batch sizes summary: N-batch=74743, mean=16.0, min=16, max=17
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:39,089] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:39,090] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:39,113] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:39,114] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:39,179] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:39,180] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:36,473] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:36,473] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:36,522] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:36,523] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[cnode7-008:0/16] 2024-11-28 15:15:39,327 (distributed_utils:129) WARNING:
=================================================================
Found OMP_NUM_THREADS=1 in environment variables. With some advanced features, DeepSpeed may have heavy cpu workload so that OMP_NUM_THREADS=1 is not sufficient. Try to increase it in your path.sh
=================================================================
[2024-11-28 15:15:39,327] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:39,328] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed info: version=0.15.3, git-hash=unknown, git-branch=unknown
[2024-11-28 15:15:39,328] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
cnode7-008:1504977:1504977 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504977:1504977 [0] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.170<0>
cnode7-008:1504977:1504977 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-008:1504977:1504977 [0] NCCL INFO cudaDriverVersion 12020
NCCL version 2.20.5+cuda12.4
cnode7-008:1504978:1504978 [1] NCCL INFO cudaDriverVersion 12020
cnode7-008:1504978:1504978 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504978:1504978 [1] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.170<0>
cnode7-008:1504978:1504978 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-009:635262:635262 [5] NCCL INFO cudaDriverVersion 12020
cnode7-009:635262:635262 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635262:635262 [5] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.171<0>
cnode7-009:635262:635262 [5] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-008:1504982:1504982 [5] NCCL INFO cudaDriverVersion 12020
cnode7-008:1504982:1504982 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504982:1504982 [5] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.170<0>
cnode7-008:1504982:1504982 [5] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-008:1504984:1504984 [7] NCCL INFO cudaDriverVersion 12020
cnode7-008:1504984:1504984 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504984:1504984 [7] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.170<0>
cnode7-008:1504984:1504984 [7] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-009:635257:635257 [0] NCCL INFO cudaDriverVersion 12020
cnode7-009:635257:635257 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635257:635257 [0] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.171<0>
cnode7-009:635257:635257 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-009:635264:635264 [7] NCCL INFO cudaDriverVersion 12020
cnode7-009:635264:635264 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635264:635264 [7] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.171<0>
cnode7-009:635264:635264 [7] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-008:1504980:1504980 [3] NCCL INFO cudaDriverVersion 12020
cnode7-008:1504980:1504980 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504980:1504980 [3] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.170<0>
cnode7-008:1504980:1504980 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-008:1504983:1504983 [6] NCCL INFO cudaDriverVersion 12020
cnode7-008:1504983:1504983 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504983:1504983 [6] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.170<0>
cnode7-008:1504983:1504983 [6] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-009:635259:635259 [2] NCCL INFO cudaDriverVersion 12020
cnode7-009:635259:635259 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635259:635259 [2] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.171<0>
cnode7-009:635259:635259 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:36,918] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:36,919] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
cnode7-008:1504978:1505583 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504983:1505585 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504982:1505587 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504984:1505584 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504977:1505582 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504980:1505586 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635260:635260 [3] NCCL INFO cudaDriverVersion 12020
cnode7-009:635260:635260 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635260:635260 [3] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.171<0>
cnode7-009:635260:635260 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-008:1504978:1505583 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.170<0>
cnode7-008:1504978:1505583 [1] NCCL INFO Using non-device net plugin version 0
cnode7-008:1504978:1505583 [1] NCCL INFO Using network IB
cnode7-008:1504983:1505585 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.170<0>
cnode7-008:1504983:1505585 [6] NCCL INFO Using non-device net plugin version 0
cnode7-008:1504983:1505585 [6] NCCL INFO Using network IB
cnode7-009:635259:636004 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635262:636000 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504982:1505587 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.170<0>
cnode7-008:1504982:1505587 [5] NCCL INFO Using non-device net plugin version 0
cnode7-008:1504982:1505587 [5] NCCL INFO Using network IB
cnode7-008:1504984:1505584 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.170<0>
cnode7-008:1504984:1505584 [7] NCCL INFO Using non-device net plugin version 0
cnode7-008:1504984:1505584 [7] NCCL INFO Using network IB
cnode7-009:635257:636003 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635264:636002 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504977:1505582 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.170<0>
cnode7-008:1504977:1505582 [0] NCCL INFO Using non-device net plugin version 0
cnode7-008:1504977:1505582 [0] NCCL INFO Using network IB
cnode7-008:1504980:1505586 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.170<0>
cnode7-008:1504980:1505586 [3] NCCL INFO Using non-device net plugin version 0
cnode7-008:1504980:1505586 [3] NCCL INFO Using network IB
cnode7-009:635259:636004 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.171<0>
cnode7-009:635259:636004 [2] NCCL INFO Using non-device net plugin version 0
cnode7-009:635259:636004 [2] NCCL INFO Using network IB
cnode7-009:635262:636000 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.171<0>
cnode7-009:635262:636000 [5] NCCL INFO Using non-device net plugin version 0
cnode7-009:635262:636000 [5] NCCL INFO Using network IB
cnode7-009:635257:636003 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.171<0>
cnode7-009:635257:636003 [0] NCCL INFO Using non-device net plugin version 0
cnode7-009:635257:636003 [0] NCCL INFO Using network IB
cnode7-009:635264:636002 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.171<0>
cnode7-009:635264:636002 [7] NCCL INFO Using non-device net plugin version 0
cnode7-009:635264:636002 [7] NCCL INFO Using network IB
cnode7-009:635260:636022 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635260:636022 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.171<0>
cnode7-009:635260:636022 [3] NCCL INFO Using non-device net plugin version 0
cnode7-009:635260:636022 [3] NCCL INFO Using network IB
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:39,946] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:39,948] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:37,258] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:37,260] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
cnode7-008:1504981:1504981 [4] NCCL INFO cudaDriverVersion 12020
cnode7-008:1504981:1504981 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504981:1504981 [4] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.170<0>
cnode7-008:1504981:1504981 [4] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-009:635261:635261 [4] NCCL INFO cudaDriverVersion 12020
cnode7-009:635261:635261 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635261:635261 [4] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.171<0>
cnode7-009:635261:635261 [4] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:38,095] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:38,097] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
cnode7-008:1504981:1505653 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635263:635263 [6] NCCL INFO cudaDriverVersion 12020
cnode7-009:635263:635263 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635263:635263 [6] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.171<0>
cnode7-009:635263:635263 [6] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
[2024-11-28 15:15:40,908] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:40,910] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
cnode7-009:635261:636079 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504979:1504979 [2] NCCL INFO cudaDriverVersion 12020
cnode7-008:1504979:1504979 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504979:1504979 [2] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.170<0>
cnode7-008:1504979:1504979 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-009:635261:636079 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.171<0>
cnode7-009:635261:636079 [4] NCCL INFO Using non-device net plugin version 0
cnode7-009:635261:636079 [4] NCCL INFO Using network IB
cnode7-008:1504981:1505653 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.170<0>
cnode7-008:1504981:1505653 [4] NCCL INFO Using non-device net plugin version 0
cnode7-008:1504981:1505653 [4] NCCL INFO Using network IB
cnode7-009:635263:636084 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:238: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch
def valid_one_epoch(
cnode7-009:635263:636084 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.171<0>
cnode7-009:635263:636084 [6] NCCL INFO Using non-device net plugin version 0
cnode7-009:635263:636084 [6] NCCL INFO Using network IB
[2024-11-28 15:15:38,687] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-28 15:15:38,688] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16
cnode7-008:1504979:1505667 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-008:1504979:1505667 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.170<0>
cnode7-008:1504979:1505667 [2] NCCL INFO Using non-device net plugin version 0
cnode7-008:1504979:1505667 [2] NCCL INFO Using network IB
cnode7-009:635258:635258 [1] NCCL INFO cudaDriverVersion 12020
cnode7-009:635258:635258 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635258:635258 [1] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.171<0>
cnode7-009:635258:635258 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-009:635258:636105 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-009:635258:636105 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.171<0>
cnode7-009:635258:636105 [1] NCCL INFO Using non-device net plugin version 0
cnode7-009:635258:636105 [1] NCCL INFO Using network IB
cnode7-008:1504979:1505667 [2] NCCL INFO comm 0x555563055650 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x90771211591af19e - Init START
cnode7-008:1504978:1505583 [1] NCCL INFO comm 0x555562abe5e0 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x90771211591af19e - Init START
cnode7-008:1504983:1505585 [6] NCCL INFO comm 0x555562abd3b0 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x90771211591af19e - Init START
cnode7-009:635259:636004 [2] NCCL INFO comm 0x5555694db980 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x90771211591af19e - Init START
cnode7-009:635262:636000 [5] NCCL INFO comm 0x55556255d900 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x90771211591af19e - Init START
cnode7-008:1504981:1505653 [4] NCCL INFO comm 0x5555694d8a50 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x90771211591af19e - Init START
cnode7-008:1504982:1505587 [5] NCCL INFO comm 0x555564b79df0 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x90771211591af19e - Init START
cnode7-008:1504984:1505584 [7] NCCL INFO comm 0x555572152360 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x90771211591af19e - Init START
cnode7-009:635263:636084 [6] NCCL INFO comm 0x555564b7b720 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x90771211591af19e - Init START
cnode7-009:635257:636003 [0] NCCL INFO comm 0x5555bdc83010 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x90771211591af19e - Init START
cnode7-009:635264:636002 [7] NCCL INFO comm 0x5555638e85b0 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x90771211591af19e - Init START
cnode7-009:635261:636079 [4] NCCL INFO comm 0x5555694dc520 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x90771211591af19e - Init START
cnode7-008:1504977:1505582 [0] NCCL INFO comm 0x5555bdcea200 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x90771211591af19e - Init START
cnode7-008:1504980:1505586 [3] NCCL INFO comm 0x5555638e7630 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x90771211591af19e - Init START
cnode7-009:635260:636022 [3] NCCL INFO comm 0x555564b7d7e0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x90771211591af19e - Init START
cnode7-009:635258:636105 [1] NCCL INFO comm 0x5555638e63a0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x90771211591af19e - Init START
cnode7-008:1504978:1505583 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-008:1504978:1505583 [1] NCCL INFO NVLS multicast support is available on dev 1
cnode7-008:1504977:1505582 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-008:1504977:1505582 [0] NCCL INFO NVLS multicast support is available on dev 0
cnode7-008:1504980:1505586 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-008:1504980:1505586 [3] NCCL INFO NVLS multicast support is available on dev 3
cnode7-008:1504983:1505585 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000
cnode7-008:1504983:1505585 [6] NCCL INFO NVLS multicast support is available on dev 6
cnode7-008:1504984:1505584 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000
cnode7-008:1504984:1505584 [7] NCCL INFO NVLS multicast support is available on dev 7
cnode7-008:1504979:1505667 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-008:1504979:1505667 [2] NCCL INFO NVLS multicast support is available on dev 2
cnode7-008:1504981:1505653 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000
cnode7-008:1504981:1505653 [4] NCCL INFO NVLS multicast support is available on dev 4
cnode7-008:1504982:1505587 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000
cnode7-008:1504982:1505587 [5] NCCL INFO NVLS multicast support is available on dev 5
cnode7-009:635262:636000 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000
cnode7-009:635262:636000 [5] NCCL INFO NVLS multicast support is available on dev 5
cnode7-009:635264:636002 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000
cnode7-009:635264:636002 [7] NCCL INFO NVLS multicast support is available on dev 7
cnode7-009:635259:636004 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-009:635259:636004 [2] NCCL INFO NVLS multicast support is available on dev 2
cnode7-009:635260:636022 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-009:635260:636022 [3] NCCL INFO NVLS multicast support is available on dev 3
cnode7-009:635261:636079 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000
cnode7-009:635261:636079 [4] NCCL INFO NVLS multicast support is available on dev 4
cnode7-009:635258:636105 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-009:635258:636105 [1] NCCL INFO NVLS multicast support is available on dev 1
cnode7-008:1504980:1505586 [3] NCCL INFO comm 0x5555638e7630 rank 3 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0
cnode7-008:1504980:1505586 [3] NCCL INFO NVLS Head 0: 0 8
cnode7-008:1504980:1505586 [3] NCCL INFO NVLS Head 1: 1 9
cnode7-008:1504980:1505586 [3] NCCL INFO NVLS Head 2: 2 10
cnode7-008:1504980:1505586 [3] NCCL INFO NVLS Head 3: 3 11
cnode7-008:1504980:1505586 [3] NCCL INFO NVLS Head 4: 4 12
cnode7-008:1504980:1505586 [3] NCCL INFO NVLS Head 5: 5 13
cnode7-008:1504980:1505586 [3] NCCL INFO NVLS Head 6: 6 14
cnode7-008:1504980:1505586 [3] NCCL INFO NVLS Head 7: 7 15
cnode7-008:1504980:1505586 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/11/-1->3->-1 [4] -1/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->11 [12] -1/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2
cnode7-008:1504980:1505586 [3] NCCL INFO P2P Chunksize set to 131072
cnode7-008:1504977:1505582 [0] NCCL INFO comm 0x5555bdcea200 rank 0 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0
cnode7-008:1504977:1505582 [0] NCCL INFO NVLS Head 0: 0 8
cnode7-008:1504977:1505582 [0] NCCL INFO NVLS Head 1: 1 9
cnode7-008:1504977:1505582 [0] NCCL INFO NVLS Head 2: 2 10
cnode7-008:1504977:1505582 [0] NCCL INFO NVLS Head 3: 3 11
cnode7-008:1504977:1505582 [0] NCCL INFO NVLS Head 4: 4 12
cnode7-008:1504977:1505582 [0] NCCL INFO NVLS Head 5: 5 13
cnode7-008:1504977:1505582 [0] NCCL INFO NVLS Head 6: 6 14
cnode7-008:1504977:1505582 [0] NCCL INFO NVLS Head 7: 7 15
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 00/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 01/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 02/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 03/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 04/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 06/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 08/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 09/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 10/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 11/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 12/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 14/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7
cnode7-008:1504977:1505582 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] -1/-1/-1->0->7 [2] 1/-1/-1->0->7 [3] 1/-1/-1->0->7 [4] 1/-1/-1->0->7 [5] 1/-1/-1->0->7 [6] 1/-1/-1->0->7 [7] 1/-1/-1->0->7 [8] 1/-1/-1->0->8 [9] -1/-1/-1->0->7 [10] 1/-1/-1->0->7 [11] 1/-1/-1->0->7 [12] 1/-1/-1->0->7 [13] 1/-1/-1->0->7 [14] 1/-1/-1->0->7 [15] 1/-1/-1->0->7
cnode7-008:1504977:1505582 [0] NCCL INFO P2P Chunksize set to 131072
cnode7-009:635262:636000 [5] NCCL INFO comm 0x55556255d900 rank 13 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0
cnode7-009:635262:636000 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] 14/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->5 [6] -1/-1/-1->13->12 [7] 14/-1/-1->13->12 [8] 14/-1/-1->13->12 [9] 14/-1/-1->13->12 [10] 14/-1/-1->13->12 [11] 14/-1/-1->13->12 [12] 14/-1/-1->13->12 [13] 14/5/-1->13->-1 [14] -1/-1/-1->13->12 [15] 14/-1/-1->13->12
cnode7-009:635262:636000 [5] NCCL INFO P2P Chunksize set to 131072
cnode7-009:635264:636002 [7] NCCL INFO comm 0x5555638e85b0 rank 15 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0
cnode7-009:635264:636002 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] 8/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/-1/-1->15->7 [8] -1/-1/-1->15->14 [9] 8/-1/-1->15->14 [10] 8/-1/-1->15->14 [11] 8/-1/-1->15->14 [12] 8/-1/-1->15->14 [13] 8/-1/-1->15->14 [14] 8/-1/-1->15->14 [15] 8/7/-1->15->-1
cnode7-009:635264:636002 [7] NCCL INFO P2P Chunksize set to 131072
cnode7-009:635263:636084 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000
cnode7-009:635263:636084 [6] NCCL INFO NVLS multicast support is available on dev 6
cnode7-009:635263:636084 [6] NCCL INFO comm 0x555564b7b720 rank 14 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0
cnode7-009:635263:636084 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->13 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->6 [7] -1/-1/-1->14->13 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 15/-1/-1->14->13 [11] 15/-1/-1->14->13 [12] 15/-1/-1->14->13 [13] 15/-1/-1->14->13 [14] 15/6/-1->14->-1 [15] -1/-1/-1->14->13
cnode7-009:635263:636084 [6] NCCL INFO P2P Chunksize set to 131072
cnode7-009:635261:636079 [4] NCCL INFO comm 0x5555694dc520 rank 12 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0
cnode7-009:635261:636079 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->11 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->4 [5] -1/-1/-1->12->11 [6] 13/-1/-1->12->11 [7] 13/-1/-1->12->11 [8] 13/-1/-1->12->11 [9] 13/-1/-1->12->11 [10] 13/-1/-1->12->11 [11] 13/-1/-1->12->11 [12] 13/4/-1->12->-1 [13] -1/-1/-1->12->11 [14] 13/-1/-1->12->11 [15] 13/-1/-1->12->11
cnode7-009:635261:636079 [4] NCCL INFO P2P Chunksize set to 131072
cnode7-008:1504981:1505653 [4] NCCL INFO comm 0x5555694d8a50 rank 4 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0
cnode7-008:1504981:1505653 [4] NCCL INFO NVLS Head 0: 0 8
cnode7-008:1504981:1505653 [4] NCCL INFO NVLS Head 1: 1 9
cnode7-008:1504981:1505653 [4] NCCL INFO NVLS Head 2: 2 10
cnode7-008:1504981:1505653 [4] NCCL INFO NVLS Head 3: 3 11
cnode7-008:1504981:1505653 [4] NCCL INFO NVLS Head 4: 4 12
cnode7-008:1504981:1505653 [4] NCCL INFO NVLS Head 5: 5 13
cnode7-008:1504981:1505653 [4] NCCL INFO NVLS Head 6: 6 14
cnode7-008:1504981:1505653 [4] NCCL INFO NVLS Head 7: 7 15
cnode7-008:1504981:1505653 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/12/-1->4->-1 [5] -1/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->12 [13] -1/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3
cnode7-008:1504981:1505653 [4] NCCL INFO P2P Chunksize set to 131072
cnode7-009:635260:636022 [3] NCCL INFO comm 0x555564b7d7e0 rank 11 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0
cnode7-009:635260:636022 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] 12/-1/-1->11->10 [3] 12/-1/-1->11->3 [4] -1/-1/-1->11->10 [5] 12/-1/-1->11->10 [6] 12/-1/-1->11->10 [7] 12/-1/-1->11->10 [8] 12/-1/-1->11->10 [9] 12/-1/-1->11->10 [10] 12/-1/-1->11->10 [11] 12/3/-1->11->-1 [12] -1/-1/-1->11->10 [13] 12/-1/-1->11->10 [14] 12/-1/-1->11->10 [15] 12/-1/-1->11->10
cnode7-009:635260:636022 [3] NCCL INFO P2P Chunksize set to 131072
cnode7-009:635259:636004 [2] NCCL INFO comm 0x5555694db980 rank 10 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0
cnode7-009:635259:636004 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 11/-1/-1->10->2 [3] -1/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/-1/-1->10->9 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 11/2/-1->10->-1 [11] -1/-1/-1->10->9 [12] 11/-1/-1->10->9 [13] 11/-1/-1->10->9 [14] 11/-1/-1->10->9 [15] 11/-1/-1->10->9
cnode7-009:635259:636004 [2] NCCL INFO P2P Chunksize set to 131072
cnode7-009:635257:636003 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-009:635257:636003 [0] NCCL INFO NVLS multicast support is available on dev 0
cnode7-009:635257:636003 [0] NCCL INFO comm 0x5555bdc83010 rank 8 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0
cnode7-009:635257:636003 [0] NCCL INFO Trees [0] 9/-1/-1->8->0 [1] -1/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/-1/-1->8->15 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15 [8] 9/0/-1->8->-1 [9] -1/-1/-1->8->15 [10] 9/-1/-1->8->15 [11] 9/-1/-1->8->15 [12] 9/-1/-1->8->15 [13] 9/-1/-1->8->15 [14] 9/-1/-1->8->15 [15] 9/-1/-1->8->15
cnode7-009:635257:636003 [0] NCCL INFO P2P Chunksize set to 131072
cnode7-009:635258:636105 [1] NCCL INFO comm 0x5555638e63a0 rank 9 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0
cnode7-009:635258:636105 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/-1/-1->9->1 [2] -1/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/-1/-1->9->8 [5] 10/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8 [8] 10/-1/-1->9->8 [9] 10/1/-1->9->-1 [10] -1/-1/-1->9->8 [11] 10/-1/-1->9->8 [12] 10/-1/-1->9->8 [13] 10/-1/-1->9->8 [14] 10/-1/-1->9->8 [15] 10/-1/-1->9->8
cnode7-009:635258:636105 [1] NCCL INFO P2P Chunksize set to 131072
cnode7-008:1504978:1505583 [1] NCCL INFO comm 0x555562abe5e0 rank 1 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0
cnode7-008:1504978:1505583 [1] NCCL INFO NVLS Head 0: 0 8
cnode7-008:1504978:1505583 [1] NCCL INFO NVLS Head 1: 1 9
cnode7-008:1504978:1505583 [1] NCCL INFO NVLS Head 2: 2 10
cnode7-008:1504978:1505583 [1] NCCL INFO NVLS Head 3: 3 11
cnode7-008:1504978:1505583 [1] NCCL INFO NVLS Head 4: 4 12
cnode7-008:1504978:1505583 [1] NCCL INFO NVLS Head 5: 5 13
cnode7-008:1504978:1505583 [1] NCCL INFO NVLS Head 6: 6 14
cnode7-008:1504978:1505583 [1] NCCL INFO NVLS Head 7: 7 15
cnode7-008:1504978:1505583 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/9/-1->1->-1 [2] -1/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->9 [10] -1/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0
cnode7-008:1504978:1505583 [1] NCCL INFO P2P Chunksize set to 131072
cnode7-008:1504979:1505667 [2] NCCL INFO comm 0x555563055650 rank 2 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0
cnode7-008:1504979:1505667 [2] NCCL INFO NVLS Head 0: 0 8
cnode7-008:1504979:1505667 [2] NCCL INFO NVLS Head 1: 1 9
cnode7-008:1504979:1505667 [2] NCCL INFO NVLS Head 2: 2 10
cnode7-008:1504979:1505667 [2] NCCL INFO NVLS Head 3: 3 11
cnode7-008:1504979:1505667 [2] NCCL INFO NVLS Head 4: 4 12
cnode7-008:1504979:1505667 [2] NCCL INFO NVLS Head 5: 5 13
cnode7-008:1504979:1505667 [2] NCCL INFO NVLS Head 6: 6 14
cnode7-008:1504979:1505667 [2] NCCL INFO NVLS Head 7: 7 15
cnode7-008:1504979:1505667 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/10/-1->2->-1 [3] -1/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->10 [11] -1/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1
cnode7-008:1504979:1505667 [2] NCCL INFO P2P Chunksize set to 131072
cnode7-008:1504983:1505585 [6] NCCL INFO comm 0x555562abd3b0 rank 6 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0
cnode7-008:1504983:1505585 [6] NCCL INFO NVLS Head 0: 0 8
cnode7-008:1504983:1505585 [6] NCCL INFO NVLS Head 1: 1 9
cnode7-008:1504983:1505585 [6] NCCL INFO NVLS Head 2: 2 10
cnode7-008:1504983:1505585 [6] NCCL INFO NVLS Head 3: 3 11
cnode7-008:1504983:1505585 [6] NCCL INFO NVLS Head 4: 4 12
cnode7-008:1504983:1505585 [6] NCCL INFO NVLS Head 5: 5 13
cnode7-008:1504983:1505585 [6] NCCL INFO NVLS Head 6: 6 14
cnode7-008:1504983:1505585 [6] NCCL INFO NVLS Head 7: 7 15
cnode7-008:1504983:1505585 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/14/-1->6->-1 [7] -1/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->14 [15] -1/-1/-1->6->5
cnode7-008:1504983:1505585 [6] NCCL INFO P2P Chunksize set to 131072
cnode7-008:1504982:1505587 [5] NCCL INFO comm 0x555564b79df0 rank 5 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0
cnode7-008:1504982:1505587 [5] NCCL INFO NVLS Head 0: 0 8
cnode7-008:1504982:1505587 [5] NCCL INFO NVLS Head 1: 1 9
cnode7-008:1504982:1505587 [5] NCCL INFO NVLS Head 2: 2 10
cnode7-008:1504982:1505587 [5] NCCL INFO NVLS Head 3: 3 11
cnode7-008:1504982:1505587 [5] NCCL INFO NVLS Head 4: 4 12
cnode7-008:1504982:1505587 [5] NCCL INFO NVLS Head 5: 5 13
cnode7-008:1504982:1505587 [5] NCCL INFO NVLS Head 6: 6 14
cnode7-008:1504982:1505587 [5] NCCL INFO NVLS Head 7: 7 15
cnode7-008:1504982:1505587 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/13/-1->5->-1 [6] -1/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->13 [14] -1/-1/-1->5->4 [15] 6/-1/-1->5->4
cnode7-008:1504982:1505587 [5] NCCL INFO P2P Chunksize set to 131072
cnode7-008:1504984:1505584 [7] NCCL INFO comm 0x555572152360 rank 7 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0
cnode7-008:1504984:1505584 [7] NCCL INFO NVLS Head 0: 0 8
cnode7-008:1504984:1505584 [7] NCCL INFO NVLS Head 1: 1 9
cnode7-008:1504984:1505584 [7] NCCL INFO NVLS Head 2: 2 10
cnode7-008:1504984:1505584 [7] NCCL INFO NVLS Head 3: 3 11
cnode7-008:1504984:1505584 [7] NCCL INFO NVLS Head 4: 4 12
cnode7-008:1504984:1505584 [7] NCCL INFO NVLS Head 5: 5 13
cnode7-008:1504984:1505584 [7] NCCL INFO NVLS Head 6: 6 14
cnode7-008:1504984:1505584 [7] NCCL INFO NVLS Head 7: 7 15
cnode7-008:1504984:1505584 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 0/-1/-1->7->6 [2] 0/-1/-1->7->6 [3] 0/-1/-1->7->6 [4] 0/-1/-1->7->6 [5] 0/-1/-1->7->6 [6] 0/-1/-1->7->6 [7] 0/15/-1->7->-1 [8] -1/-1/-1->7->6 [9] 0/-1/-1->7->6 [10] 0/-1/-1->7->6 [11] 0/-1/-1->7->6 [12] 0/-1/-1->7->6 [13] 0/-1/-1->7->6 [14] 0/-1/-1->7->6 [15] 0/-1/-1->7->15
cnode7-008:1504984:1505584 [7] NCCL INFO P2P Chunksize set to 131072
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA
cnode7-009:635259:636004 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 06/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 14/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 08/0 : 13[5] -> 14[6] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 10/0 : 13[5] -> 14[6] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 12/0 : 13[5] -> 14[6] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 14/0 : 13[5] -> 14[6] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA
cnode7-009:635262:636000 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA
cnode7-009:635262:636000 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA
cnode7-009:635262:636000 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM
cnode7-008:1504981:1505653 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM
cnode7-008:1504982:1505587 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 00/0 : 14[6] -> 15[7] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 02/0 : 14[6] -> 15[7] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 04/0 : 14[6] -> 15[7] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 08/0 : 14[6] -> 15[7] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 10/0 : 14[6] -> 15[7] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 12/0 : 14[6] -> 15[7] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA
cnode7-009:635263:636084 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA
cnode7-009:635263:636084 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA
cnode7-009:635263:636084 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA
cnode7-009:635263:636084 [6] NCCL INFO Channel 01/0 : 14[6] -> 13[5] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 03/0 : 14[6] -> 13[5] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 05/0 : 14[6] -> 13[5] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 07/0 : 14[6] -> 13[5] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 09/0 : 14[6] -> 13[5] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 11/0 : 14[6] -> 13[5] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 13/0 : 14[6] -> 13[5] via P2P/CUMEM
cnode7-009:635263:636084 [6] NCCL INFO Channel 15/0 : 14[6] -> 13[5] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 06/0 : 8[0] -> 9[1] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 14/0 : 8[0] -> 9[1] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 01/0 : 8[0] -> 15[7] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 03/0 : 8[0] -> 15[7] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 05/0 : 8[0] -> 15[7] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 07/0 : 8[0] -> 15[7] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 09/0 : 8[0] -> 15[7] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 11/0 : 8[0] -> 15[7] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 13/0 : 8[0] -> 15[7] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 15/0 : 8[0] -> 15[7] via P2P/CUMEM
cnode7-009:635257:636003 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA
cnode7-009:635257:636003 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA
cnode7-009:635257:636003 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA
cnode7-009:635257:636003 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA
cnode7-009:635264:636002 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA
cnode7-009:635264:636002 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA
cnode7-009:635264:636002 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA
cnode7-009:635264:636002 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA
cnode7-009:635264:636002 [7] NCCL INFO Channel 00/0 : 15[7] -> 8[0] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 02/0 : 15[7] -> 8[0] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 04/0 : 15[7] -> 8[0] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 06/0 : 15[7] -> 8[0] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 08/0 : 15[7] -> 8[0] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 10/0 : 15[7] -> 8[0] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 12/0 : 15[7] -> 8[0] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 14/0 : 15[7] -> 8[0] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 01/0 : 15[7] -> 14[6] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 03/0 : 15[7] -> 14[6] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 05/0 : 15[7] -> 14[6] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 09/0 : 15[7] -> 14[6] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 11/0 : 15[7] -> 14[6] via P2P/CUMEM
cnode7-009:635264:636002 [7] NCCL INFO Channel 13/0 : 15[7] -> 14[6] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 00/0 : 12[4] -> 13[5] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 02/0 : 12[4] -> 13[5] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 06/0 : 12[4] -> 13[5] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 08/0 : 12[4] -> 13[5] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 10/0 : 12[4] -> 13[5] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 14/0 : 12[4] -> 13[5] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA
cnode7-009:635261:636079 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA
cnode7-009:635261:636079 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA
cnode7-009:635261:636079 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA
cnode7-009:635261:636079 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 09/0 : 12[4] -> 11[3] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 11/0 : 12[4] -> 11[3] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 13/0 : 12[4] -> 11[3] via P2P/CUMEM
cnode7-009:635261:636079 [4] NCCL INFO Channel 15/0 : 12[4] -> 11[3] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 00/0 : 0[0] -> 7[7] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 02/0 : 0[0] -> 7[7] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 04/0 : 0[0] -> 7[7] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 06/0 : 0[0] -> 7[7] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 08/0 : 0[0] -> 7[7] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 10/0 : 0[0] -> 7[7] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 12/0 : 0[0] -> 7[7] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 14/0 : 0[0] -> 7[7] via P2P/CUMEM
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA
cnode7-008:1504977:1505582 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM
cnode7-008:1504980:1505586 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 02/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 06/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 10/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 14/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA
cnode7-009:635260:636022 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA
cnode7-009:635260:636022 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA
cnode7-009:635260:636022 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA
cnode7-009:635260:636022 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-009:635260:636022 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 02/0 : 9[1] -> 10[2] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 06/0 : 9[1] -> 10[2] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 10/0 : 9[1] -> 10[2] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 14/0 : 9[1] -> 10[2] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA
cnode7-009:635258:636105 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA
cnode7-009:635258:636105 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA
cnode7-009:635258:636105 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA
cnode7-009:635258:636105 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM
cnode7-009:635258:636105 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-008:1504979:1505667 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM
cnode7-008:1504978:1505583 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM
cnode7-008:1504983:1505585 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA
cnode7-009:635259:636004 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA
cnode7-009:635259:636004 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA
cnode7-009:635259:636004 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA
cnode7-009:635259:636004 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 03/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 05/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 11/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 13/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-009:635259:636004 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 01/0 : 13[5] -> 12[4] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 03/0 : 13[5] -> 12[4] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 07/0 : 13[5] -> 12[4] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 09/0 : 13[5] -> 12[4] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 11/0 : 13[5] -> 12[4] via P2P/CUMEM
cnode7-009:635262:636000 [5] NCCL INFO Channel 15/0 : 13[5] -> 12[4] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM
cnode7-008:1504984:1505584 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
slurmstepd: error: *** STEP 4946.0 ON cnode7-008 CANCELLED AT 2024-11-28T15:15:48 ***