# Running on cnode7-012
# Started at Thu Dec 5 00:13:34 CST 2024
# SLURMD_NODENAME=cnode7-012
# SLURM_CLUSTER_NAME=slurm
# SLURM_CONF=/cm/shared/apps/slurm/var/etc/slurm/slurm.conf
# SLURM_CPUS_ON_NODE=224
# SLURM_CPUS_PER_TASK=128
# SLURM_EXPORT_ENV=PATH
# SLURM_GET_USER_ENV=1
# SLURM_GPUS_ON_NODE=8
# SLURM_GPUS_PER_NODE=8
# SLURM_GTIDS=0
# SLURM_JOBID=6487
# SLURM_JOB_CPUS_PER_NODE='224(x2)'
# SLURM_JOB_END_TIME=1764864812
# SLURM_JOB_GID=1026
# SLURM_JOB_GPUS=0,1,2,3,4,5,6,7
# SLURM_JOB_ID=6487
# SLURM_JOB_NAME=exp_owsm/s2t_train_05b_ds_raw_bpe50000/train.log
# SLURM_JOB_NODELIST='cnode7-[012-013]'
# SLURM_JOB_NUM_NODES=2
# SLURM_JOB_PARTITION=p2
# SLURM_JOB_QOS=normal
# SLURM_JOB_START_TIME=1733328812
# SLURM_JOB_UID=1026
# SLURM_JOB_USER=williamchen
# SLURM_LOCALID=0
# SLURM_MEM_PER_NODE=2048000
# SLURM_NNODES=2
# SLURM_NODEID=0
# SLURM_NODELIST='cnode7-[012-013]'
# SLURM_NODE_ALIASES='(null)'
# SLURM_OPEN_MODE=a
# SLURM_PRIO_PROCESS=0
# SLURM_PROCID=0
# SLURM_SUBMIT_DIR=/mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1
# SLURM_SUBMIT_HOST=154-T2-P1-NVR
# SLURM_TASKS_PER_NODE='1(x2)'
# SLURM_TASK_PID=58074
# SLURM_TOPOLOGY_ADDR=cnode7-012
# SLURM_TOPOLOGY_ADDR_PATTERN=node
# SLURM_WORKING_CLUSTER=slurm:154-T2-P1-NVR:6817:9984:109
# srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_05b_ds_raw_bpe50000 --config conf/train_05b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_05b_ds_raw_bpe50000/.dist_init_01910170-3ea0-4a59-afb8-c92b8efbf9d8
[2024-12-05 00:13:47,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-12-05 00:13:47,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
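The launch above uses --dist_launcher slurm together with --dist_init_method and a file:// URL, i.e. rendezvous through a file on the shared filesystem rather than a TCP endpoint. A minimal sketch of the same mechanism in plain PyTorch; the path, rank, and world size below are placeholders (ESPnet derives the real values from the SLURM variables listed above):

import torch.distributed as dist

# File-based rendezvous: every rank opens the same file on a filesystem
# visible to all nodes and uses it to exchange addresses. world_size=16
# matches this job's 2 nodes x 8 GPUs; rank would come from SLURM_PROCID.
dist.init_process_group(
    backend="nccl",
    init_method="file:///mnt/shared/.dist_init_example",  # placeholder path
    world_size=16,
    rank=0,
)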
[W1205 00:14:13.333494063 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
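Each rank emits this warning; the fix it suggests is a rename of the environment variable before the process group is created. A small sketch:

import os

# Replace the deprecated variable with its TORCH_-prefixed successor,
# per the warning text above; the value "1" enables blocking waits on
# NCCL collectives.
os.environ.pop("NCCL_BLOCKING_WAIT", None)
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"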
[cnode7-012:0/16] 2024-12-05 00:14:16,945 (s2t:462) INFO: Vocabulary size: 50002
[cnode7-012:0/16] 2024-12-05 00:14:20,569 (abs_task:1383) INFO: pytorch.version=2.4.0+cu121, cuda.available=True, cudnn.version=90100, cudnn.benchmark=False, cudnn.deterministic=True
[cnode7-012:0/16] 2024-12-05 00:14:20,575 (abs_task:1384) INFO: Model structure:
ESPnetS2TModel(
  (frontend): DefaultFrontend(
    (stft): Stft(n_fft=512, win_length=400, hop_length=160, center=True, normalized=False, onesided=True)
    (frontend): Frontend()
    (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False)
  )
  (specaug): SpecAug(
    (freq_mask): MaskAlongAxis(mask_width_range=[0, 27], num_mask=2, axis=freq)
    (time_mask): MaskAlongAxisVariableMaxWidth(mask_width_ratio_range=[0.0, 0.05], num_mask=10, axis=time)
  )
  (normalize): GlobalMVN(stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz, norm_means=True, norm_vars=True)
  (encoder): TransformerEncoder(
    (embed): Conv2dSubsampling(
      (conv): Sequential(
        (0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2))
        (1): ReLU()
        (2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2))
        (3): ReLU()
      )
      (out): Sequential(
        (0): Linear(in_features=19456, out_features=1024, bias=True)
        (1): PositionalEncoding(
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (encoders): MultiSequential(
      (0-15): 16 x EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
          (dropout): Identity()
          (q_norm): Identity()
          (k_norm): Identity()
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation): ReLU()
        )
        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (embed): Sequential(
      (0): Embedding(50002, 1024)
      (1): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (output_layer): Linear(in_features=1024, out_features=50002, bias=True)
    (decoders): MultiSequential(
      (0-15): 16 x DecoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
          (dropout): Identity()
          (q_norm): Identity()
          (k_norm): Identity()
        )
        (src_attn): MultiHeadedAttention(
          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
          (dropout): Identity()
          (q_norm): Identity()
          (k_norm): Identity()
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation): ReLU()
        )
        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (criterion_att): LabelSmoothingLoss(
    (criterion): KLDivLoss()
  )
  (ctc): CTC(
    (ctc_lo): Linear(in_features=1024, out_features=50002, bias=True)
    (ctc_loss): CTCLoss()
  )
)

Model summary:
    Class Name: ESPnetS2TModel
    Total Number of model parameters: 653.37 M
    Number of trainable parameters: 653.37 M (100.0%)
    Size: 2.61 GB
    Type: torch.float32
[cnode7-012:0/16] 2024-12-05 00:14:20,575 (abs_task:1387) INFO: Optimizer:
Adadelta (
Parameter Group 0
    capturable: False
    differentiable: False
    eps: 1e-06
    foreach: None
    lr: 1.0
    maximize: False
    rho: 0.9
    weight_decay: 0
)
[cnode7-012:0/16] 2024-12-05 00:14:20,575 (abs_task:1388) INFO: Scheduler: None
[cnode7-012:0/16] 2024-12-05 00:14:20,578 (abs_task:1397) INFO: Saving the configuration in exp_owsm/s2t_train_05b_ds_raw_bpe50000/config.yaml
[cnode7-012:0/16] 2024-12-05 00:14:25,253 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
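Two of the logged numbers can be checked by hand: the Linear(in_features=19456, ...) inside Conv2dSubsampling follows from two 3x3, stride-2 convolutions over the 80 mel bins, and the 2.61 GB size follows from 653.37 M float32 parameters at 4 bytes each. A short sketch; the Adadelta construction mirrors the parameter group printed above, and `model` is a stand-in module:

import torch

# After two valid 3x3/stride-2 convs, 80 mel bins shrink to 39, then 19;
# flattened with 1024 channels this gives the 19456 input features above.
freq = 80
for _ in range(2):
    freq = (freq - 3) // 2 + 1
assert 1024 * freq == 19456

# 653.37 M float32 parameters * 4 bytes is about 2.61 GB, matching the summary.
print(f"{653.37e6 * 4 / 1e9:.2f} GB")

# The logged optimizer corresponds to this construction; hyperparameters
# are taken from the printed parameter group, `model` is a placeholder.
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0, rho=0.9, eps=1e-6, weight_decay=0)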
/mnt/home/williamchen/espnet/espnet2/train/trainer.py:216: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
  scaler = GradScaler()
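The replacement named in the warning is a one-line change where the scaler is created (trainer.py:216 in this checkout):

import torch

# Deprecated: scaler = torch.cuda.amp.GradScaler()
# Current spelling of the same object, per the FutureWarning above:
scaler = torch.amp.GradScaler("cuda")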
[cnode7-012:0/16] 2024-12-05 00:14:27,873 (abs_task:1807) INFO: [valid] dataset:
ESPnetDataset(
  speech: {"path": "dump/raw/dev_v3/wav.scp", "type": "kaldi_ark"}
  text_prev: {"path": "dump/raw/dev_v3/text.prev", "type": "text"}
  text_ctc: {"path": "dump/raw/dev_v3/text.ctc", "type": "text"}
  text: {"path": "dump/raw/dev_v3/text", "type": "text"}
  preprocess:
)
[cnode7-012:0/16] 2024-12-05 00:14:27,873 (abs_task:1808) INFO: [valid] Batch sampler: SortedBatchSampler(N-batch=74743, batch_size=16, shape_file=exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape, sort_in_batch=descending, sort_batch=descending)
[cnode7-012:0/16] 2024-12-05 00:14:27,879 (abs_task:1809) INFO: [valid] mini-batch sizes summary: N-batch=74743, mean=16.0, min=16, max=17
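SortedBatchSampler groups utterances of similar length so that batches of ~16 pad efficiently; at 74,743 batches with mean size 16.0 this covers roughly 1.2 M validation utterances. A rough sketch of the idea, not ESPnet's implementation; it assumes the usual shape-file layout of one `uttid length,dim` entry per line:

def sorted_batches(shape_file, batch_size=16, descending=True):
    # Read utterance lengths from the shape file, sort by length, and cut
    # the sorted order into fixed-size batches so each batch holds
    # similarly sized utterances (less padding per batch).
    lengths = {}
    with open(shape_file) as f:
        for line in f:
            uttid, shape = line.split(maxsplit=1)
            lengths[uttid] = int(shape.split(",")[0])  # first dim = length
    order = sorted(lengths, key=lengths.get, reverse=descending)
    return [order[i:i + batch_size] for i in range(0, len(order), batch_size)]

# e.g. sorted_batches("exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape")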
cnode7-012:59132:59132 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-012:59132:59132 [0] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0>
cnode7-012:59132:59132 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-012:59132:59132 [0] NCCL INFO cudaDriverVersion 12020
NCCL version 2.20.5+cuda12.4
cnode7-012:59132:59934 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-012:59132:59934 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0>
cnode7-012:59132:59934 [0] NCCL INFO Using non-device net plugin version 0
cnode7-012:59132:59934 [0] NCCL INFO Using network IB
cnode7-012:59132:59934 [0] NCCL INFO comm 0x5555646cf110 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x82de82028939aff8 - Init START
cnode7-012:59132:59934 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-012:59132:59934 [0] NCCL INFO NVLS multicast support is available on dev 0
cnode7-012:59132:59934 [0] NCCL INFO comm 0x5555646cf110 rank 0 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0
cnode7-012:59132:59934 [0] NCCL INFO NVLS Head  0:  0  8
cnode7-012:59132:59934 [0] NCCL INFO NVLS Head  1:  1  9
cnode7-012:59132:59934 [0] NCCL INFO NVLS Head  2:  2 10
cnode7-012:59132:59934 [0] NCCL INFO NVLS Head  3:  3 11
cnode7-012:59132:59934 [0] NCCL INFO NVLS Head  4:  4 12
cnode7-012:59132:59934 [0] NCCL INFO NVLS Head  5:  5 13
cnode7-012:59132:59934 [0] NCCL INFO NVLS Head  6:  6 14
cnode7-012:59132:59934 [0] NCCL INFO NVLS Head  7:  7 15
cnode7-012:59132:59934 [0] NCCL INFO Channel 00/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8
cnode7-012:59132:59934 [0] NCCL INFO Channel 01/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7
cnode7-012:59132:59934 [0] NCCL INFO Channel 02/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1
cnode7-012:59132:59934 [0] NCCL INFO Channel 03/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7
cnode7-012:59132:59934 [0] NCCL INFO Channel 04/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1
cnode7-012:59132:59934 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7
cnode7-012:59132:59934 [0] NCCL INFO Channel 06/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1
cnode7-012:59132:59934 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7
cnode7-012:59132:59934 [0] NCCL INFO Channel 08/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8
cnode7-012:59132:59934 [0] NCCL INFO Channel 09/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7
cnode7-012:59132:59934 [0] NCCL INFO Channel 10/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1
cnode7-012:59132:59934 [0] NCCL INFO Channel 11/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7
cnode7-012:59132:59934 [0] NCCL INFO Channel 12/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1
cnode7-012:59132:59934 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7
cnode7-012:59132:59934 [0] NCCL INFO Channel 14/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1
cnode7-012:59132:59934 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7
cnode7-012:59132:59934 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] -1/-1/-1->0->7 [2] 1/-1/-1->0->7 [3] 1/-1/-1->0->7 [4] 1/-1/-1->0->7 [5] 1/-1/-1->0->7 [6] 1/-1/-1->0->7 [7] 1/-1/-1->0->7 [8] 1/-1/-1->0->8 [9] -1/-1/-1->0->7 [10] 1/-1/-1->0->7 [11] 1/-1/-1->0->7 [12] 1/-1/-1->0->7 [13] 1/-1/-1->0->7 [14] 1/-1/-1->0->7 [15] 1/-1/-1->0->7
cnode7-012:59132:59934 [0] NCCL INFO P2P Chunksize set to 131072
cnode7-012:59132:59934 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM
cnode7-012:59132:59934 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM
cnode7-012:59132:59934 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM
cnode7-012:59132:59934 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM
cnode7-012:59132:59934 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM
cnode7-013:3719333:3719333 [2] NCCL INFO cudaDriverVersion 12020
cnode7-013:3719333:3719333 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-013:3719333:3719333 [2] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0>
cnode7-013:3719333:3719333 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-013:3719333:3719991 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-013:3719333:3719991 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0>
cnode7-013:3719333:3719991 [2] NCCL INFO Using non-device net plugin version 0
cnode7-013:3719333:3719991 [2] NCCL INFO Using network IB
cnode7-013:3719333:3719991 [2] NCCL INFO comm 0x55557bf14d90 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x82de82028939aff8 - Init START
cnode7-013:3719333:3719991 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-013:3719333:3719991 [2] NCCL INFO NVLS multicast support is available on dev 2
cnode7-013:3719333:3719991 [2] NCCL INFO comm 0x55557bf14d90 rank 10 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0
cnode7-013:3719333:3719991 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 11/-1/-1->10->2 [3] -1/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/-1/-1->10->9 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 11/2/-1->10->-1 [11] -1/-1/-1->10->9 [12] 11/-1/-1->10->9 [13] 11/-1/-1->10->9 [14] 11/-1/-1->10->9 [15] 11/-1/-1->10->9
cnode7-013:3719333:3719991 [2] NCCL INFO P2P Chunksize set to 131072
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 06/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 14/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 03/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 05/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 11/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 13/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Connected all rings
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 02/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 05/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 09/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 10/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-012:59134:59134 [2] NCCL INFO cudaDriverVersion 12020
cnode7-012:59134:59134 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-012:59134:59134 [2] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0>
cnode7-012:59134:59134 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-012:59134:59941 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-012:59134:59941 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0>
cnode7-012:59134:59941 [2] NCCL INFO Using non-device net plugin version 0
cnode7-012:59134:59941 [2] NCCL INFO Using network IB
cnode7-012:59134:59941 [2] NCCL INFO comm 0x555560b14ba0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x82de82028939aff8 - Init START
cnode7-012:59134:59941 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-012:59134:59941 [2] NCCL INFO NVLS multicast support is available on dev 2
cnode7-012:59134:59941 [2] NCCL INFO comm 0x555560b14ba0 rank 2 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0
cnode7-012:59134:59941 [2] NCCL INFO NVLS Head  0:  0  8
cnode7-012:59134:59941 [2] NCCL INFO NVLS Head  1:  1  9
cnode7-012:59134:59941 [2] NCCL INFO NVLS Head  2:  2 10
cnode7-012:59134:59941 [2] NCCL INFO NVLS Head  3:  3 11
cnode7-012:59134:59941 [2] NCCL INFO NVLS Head  4:  4 12
cnode7-012:59134:59941 [2] NCCL INFO NVLS Head  5:  5 13
cnode7-012:59134:59941 [2] NCCL INFO NVLS Head  6:  6 14
cnode7-012:59134:59941 [2] NCCL INFO NVLS Head  7:  7 15
cnode7-012:59134:59941 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/10/-1->2->-1 [3] -1/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->10 [11] -1/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1
cnode7-012:59134:59941 [2] NCCL INFO P2P Chunksize set to 131072
cnode7-012:59134:59941 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA
cnode7-012:59134:59941 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA
cnode7-012:59134:59941 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA
cnode7-012:59134:59941 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA
cnode7-012:59134:59941 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Connected all rings
cnode7-012:59134:59941 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-012:59134:59941 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 13/0 : 10[2] -> 11[3] via P2P/CUMEM
cnode7-013:3719334:3719334 [3] NCCL INFO cudaDriverVersion 12020
cnode7-013:3719334:3719334 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-013:3719334:3719334 [3] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0>
cnode7-013:3719334:3719334 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
cnode7-013:3719334:3719989 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-013:3719334:3719989 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0>
cnode7-013:3719334:3719989 [3] NCCL INFO Using non-device net plugin version 0
cnode7-013:3719334:3719989 [3] NCCL INFO Using network IB
cnode7-013:3719334:3719989 [3] NCCL INFO comm 0x55557b718920 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x82de82028939aff8 - Init START
cnode7-013:3719334:3719989 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff
cnode7-013:3719334:3719989 [3] NCCL INFO NVLS multicast support is available on dev 3
cnode7-013:3719334:3719989 [3] NCCL INFO comm 0x55557b718920 rank 11 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0
cnode7-013:3719334:3719989 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] 12/-1/-1->11->10 [3] 12/-1/-1->11->3 [4] -1/-1/-1->11->10 [5] 12/-1/-1->11->10 [6] 12/-1/-1->11->10 [7] 12/-1/-1->11->10 [8] 12/-1/-1->11->10 [9] 12/-1/-1->11->10 [10] 12/-1/-1->11->10 [11] 12/3/-1->11->-1 [12] -1/-1/-1->11->10 [13] 12/-1/-1->11->10 [14] 12/-1/-1->11->10 [15] 12/-1/-1->11->10
cnode7-013:3719334:3719989 [3] NCCL INFO P2P Chunksize set to 131072
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 02/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 06/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 10/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 14/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Connected all rings
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 01/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 03/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 05/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 07/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 09/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-013:3719334:3719989 [3] NCCL INFO Channel 11/0 : 11[3] -> 12[4] via P2P/CUMEM
cnode7-012:59135:59135 [3] NCCL INFO cudaDriverVersion 12020
cnode7-012:59135:59135 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet
cnode7-012:59135:59135 [3] NCCL INFO Bootstrap : Using
enp170s0np0:10.225.1.174<0> cnode7-012:59135:59135 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:59135:59937 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:59135:59937 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:59135:59937 [3] NCCL INFO Using non-device net plugin version 0 cnode7-012:59135:59937 [3] NCCL INFO Using network IB cnode7-012:59135:59937 [3] NCCL INFO comm 0x555575317060 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x82de82028939aff8 - Init START cnode7-012:59135:59937 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:59135:59937 [3] NCCL INFO NVLS multicast support is available on dev 3 cnode7-012:59135:59937 [3] NCCL INFO comm 0x555575317060 rank 3 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0 cnode7-012:59135:59937 [3] NCCL INFO NVLS Head 0: 0 8 cnode7-012:59135:59937 [3] NCCL INFO NVLS Head 1: 1 9 cnode7-012:59135:59937 [3] NCCL INFO NVLS Head 2: 2 10 cnode7-012:59135:59937 [3] NCCL INFO NVLS Head 3: 3 11 cnode7-012:59135:59937 [3] NCCL INFO NVLS Head 4: 4 12 cnode7-012:59135:59937 [3] NCCL INFO NVLS Head 5: 5 13 cnode7-012:59135:59937 [3] NCCL INFO NVLS Head 6: 6 14 cnode7-012:59135:59937 [3] NCCL INFO NVLS Head 7: 7 15 cnode7-012:59135:59937 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/11/-1->3->-1 [4] -1/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->11 [12] -1/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 cnode7-012:59135:59937 [3] NCCL INFO P2P Chunksize set to 131072 cnode7-012:59135:59937 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:59135:59937 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:59135:59937 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:59135:59937 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:59135:59937 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Connected all rings 
cnode7-012:59135:59937 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:59135:59937 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM cnodcnode7-012:59136:59136 [4] NCCL INFO cudaDriverVersion 12020 cnode7-012:59136:59136 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:59136:59136 [4] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:59136:59136 [4] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:59136:59938 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:59136:59938 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:59136:59938 [4] NCCL INFO Using non-device net plugin version 0 cnode7-012:59136:59938 [4] NCCL INFO Using network IB cnode7-012:59136:59938 [4] NCCL INFO comm 0x555572916570 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x82de82028939aff8 - Init START cnode7-012:59136:59938 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:59136:59938 [4] NCCL INFO NVLS multicast support is available on dev 4 cnode7-012:59136:59938 [4] NCCL INFO comm 0x555572916570 rank 4 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 cnode7-012:59136:59938 [4] NCCL INFO NVLS Head 0: 0 8 cnode7-012:59136:59938 [4] NCCL INFO NVLS Head 1: 1 9 cnode7-012:59136:59938 [4] NCCL INFO NVLS Head 2: 2 10 cnode7-012:59136:59938 [4] NCCL INFO NVLS Head 3: 3 11 cnode7-012:59136:59938 [4] NCCL INFO NVLS Head 4: 4 12 cnode7-012:59136:59938 [4] NCCL INFO NVLS Head 5: 5 13 cnode7-012:59136:59938 [4] NCCL INFO NVLS Head 6: 6 14 cnode7-012:59136:59938 [4] NCCL INFO NVLS Head 7: 7 15 cnode7-012:59136:59938 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/12/-1->4->-1 [5] -1/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->12 [13] -1/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 cnode7-012:59136:59938 [4] NCCL INFO P2P Chunksize set to 131072 cnode7-012:59136:59938 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:59136:59938 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:59136:59938 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:59136:59938 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:59136:59938 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via 
P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Connected all rings cnode7-012:59136:59938 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:59136:59938 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] viacnode7-012:59137:59137 [5] NCCL INFO cudaDriverVersion 12020 cnode7-012:59137:59137 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:59137:59137 [5] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:59137:59137 [5] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:59137:59939 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:59137:59939 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:59137:59939 [5] NCCL INFO Using non-device net plugin version 0 cnode7-012:59137:59939 [5] NCCL INFO Using network IB cnode7-012:59137:59939 [5] NCCL INFO comm 0x55557b318f20 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x82de82028939aff8 - Init START cnode7-012:59137:59939 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:59137:59939 [5] NCCL INFO NVLS multicast support is available on dev 5 cnode7-012:59137:59939 [5] NCCL INFO comm 0x55557b318f20 rank 5 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 cnode7-012:59137:59939 [5] NCCL INFO NVLS Head 0: 0 8 cnode7-012:59137:59939 [5] NCCL INFO NVLS Head 1: 1 9 cnode7-012:59137:59939 [5] NCCL INFO NVLS Head 2: 2 10 cnode7-012:59137:59939 [5] NCCL INFO NVLS Head 3: 3 11 cnode7-012:59137:59939 [5] NCCL INFO NVLS Head 4: 4 12 cnode7-012:59137:59939 [5] NCCL INFO NVLS Head 5: 5 13 cnode7-012:59137:59939 [5] NCCL INFO NVLS Head 6: 6 14 cnode7-012:59137:59939 [5] NCCL INFO NVLS Head 7: 7 15 cnode7-012:59137:59939 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/13/-1->5->-1 [6] -1/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->13 [14] -1/-1/-1->5->4 [15] 6/-1/-1->5->4 cnode7-012:59137:59939 [5] NCCL INFO P2P Chunksize set to 131072 cnode7-012:59137:59939 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM 
cnode7-012:59137:59939 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:59137:59939 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:59137:59939 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:59137:59939 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:59137:59939 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Connected all rings cnode7-012:59137:59939 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:59137:59939 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-013:3719334:3719989 [3] NCCL INFO Channel 13/0 : 11[3] cnode7-013:3719335:3719335 [4] NCCL INFO cudaDriverVersion 12020 cnode7-013:3719335:3719335 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3719335:3719335 [4] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:3719335:3719335 [4] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:3719335:3719992 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3719335:3719992 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:3719335:3719992 [4] NCCL INFO Using non-device net plugin version 0 cnode7-013:3719335:3719992 [4] NCCL INFO Using network IB cnode7-013:3719335:3719992 [4] NCCL INFO comm 0x55557bb1b8a0 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x82de82028939aff8 - Init START cnode7-013:3719335:3719992 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:3719335:3719992 [4] NCCL INFO NVLS multicast support is available on dev 4 cnode7-013:3719335:3719992 [4] NCCL INFO comm 0x55557bb1b8a0 rank 12 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 cnode7-013:3719335:3719992 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->11 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->4 [5] -1/-1/-1->12->11 [6] 13/-1/-1->12->11 [7] 13/-1/-1->12->11 [8] 13/-1/-1->12->11 [9] 13/-1/-1->12->11 [10] 13/-1/-1->12->11 [11] 13/-1/-1->12->11 [12] 13/4/-1->12->-1 [13] -1/-1/-1->12->11 [14] 13/-1/-1->12->11 [15] 13/-1/-1->12->11 cnode7-013:3719335:3719992 [4] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3719335:3719992 [4] NCCL INFO Channel 00/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 02/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 06/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL 
INFO Channel 08/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 10/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 14/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3719335:3719992 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3719335:3719992 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3719335:3719992 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3719335:3719992 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 09/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 11/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 13/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 15/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Connected all rings cnode7-013:3719335:3719992 [4] NCCL INFO Channel 01/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 03/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 04/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 07/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 09/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channel 11/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3719335:3719992 [4] NCCL INFO Channecnode7-013:3719336:3719336 [5] NCCL INFO cudaDriverVersion 12020 cnode7-013:3719336:3719336 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3719336:3719336 [5] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:3719336:3719336 [5] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:3719336:3719990 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3719336:3719990 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:3719336:3719990 [5] NCCL INFO Using non-device net plugin version 0 cnode7-013:3719336:3719990 [5] NCCL INFO Using network IB cnode7-013:3719336:3719990 [5] NCCL INFO comm 0x55557c718560 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x82de82028939aff8 - Init START cnode7-013:3719336:3719990 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:3719336:3719990 [5] NCCL INFO NVLS multicast support is available on dev 5 cnode7-013:3719336:3719990 [5] NCCL INFO comm 0x55557c718560 rank 13 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 cnode7-013:3719336:3719990 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] 14/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->5 [6] -1/-1/-1->13->12 [7] 
14/-1/-1->13->12 [8] 14/-1/-1->13->12 [9] 14/-1/-1->13->12 [10] 14/-1/-1->13->12 [11] 14/-1/-1->13->12 [12] 14/-1/-1->13->12 [13] 14/5/-1->13->-1 [14] -1/-1/-1->13->12 [15] 14/-1/-1->13->12 cnode7-013:3719336:3719990 [5] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3719336:3719990 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 08/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 10/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 12/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 14/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 01/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 03/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 07/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 09/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 11/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 15/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Connected all rings cnode7-013:3719336:3719990 [5] NCCL INFO Channel 01/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 03/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 05/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 07/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 09/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 11/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channecnode7-013:3719331:3719331 [0] NCCL INFO cudaDriverVersion 12020 cnode7-013:3719331:3719331 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3719331:3719331 [0] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:3719331:3719331 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:3719331:3719993 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3719331:3719993 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:3719331:3719993 [0] NCCL INFO Using non-device net plugin version 0 cnode7-013:3719331:3719993 [0] NCCL INFO Using network IB cnode7-013:3719331:3719993 [0] NCCL INFO comm 0x5555646c6070 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x82de82028939aff8 - Init 
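The interface and verbosity choices visible above come from environment variables NCCL reads at init time. As a minimal sketch (assumed setup, not taken from this job's scripts; the file path is a placeholder and the rank derivation is simplified), the same settings could be pinned before torch.distributed initializes:

# Minimal sketch of the environment/init settings this log reflects; assumed
# setup, not taken from this job's scripts.
import os

import torch.distributed as dist

# Matches "NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet":
# exclude loopback and virtual interfaces from NCCL bootstrap/socket traffic.
os.environ["NCCL_SOCKET_IFNAME"] = "^lo,docker,virbr,vmnet,vboxnet"
# Reproduce the verbose "NCCL INFO ..." lines above when debugging.
os.environ["NCCL_DEBUG"] = "INFO"

dist.init_process_group(
    backend="nccl",
    init_method="file:///path/to/shared/.dist_init",  # placeholder shared path
    world_size=16,  # 2 nodes x 8 GPUs in this job
    rank=int(os.environ.get("SLURM_PROCID", "0")),  # simplified; the real
    # launcher combines SLURM variables with a per-GPU local index
)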
# The identical init/tree/ring sequence repeats for the remaining ranks (6, 14, 1, 9, 7, 15, 8, and 0). The edge ranks close each ring: rank 0 connects to 1 and 7 via P2P/CUMEM and exchanges with rank 8 over NET/IB/0/GDRDMA (0 -> 8 [send] on channels 01/09, 8 -> 0 [receive] on channels 00/08), while rank 8 mirrors this on cnode7-013 (8 -> 9 and 8 -> 15 via P2P/CUMEM, 8 <-> 0 over IB).
# All 16 ranks report: Connected all rings. They then connect the remaining tree channels to their intra-node neighbors via P2P/CUMEM.
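Logs like these are the primary way to audit which transport NCCL actually picked for each edge. Below is a short sketch (a hypothetical helper, not part of ESPnet or NCCL) that tallies the "Channel NN/0 : A[x] -> B[y] ... via TRANSPORT" lines from a log fed on stdin:

# Hypothetical helper: count how many channels each (src, dst) edge uses
# per transport, from NCCL INFO lines like
# "Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA".
import re
import sys
from collections import Counter

EDGE = re.compile(
    r"Channel (\d+)/\d+ : (\d+)\[\d+\] -> (\d+)\[\d+\]"
    r"(?: \[(send|receive)\])? via (\S+)"
)

def tally(lines):
    """Count (src, dst, transport) triples across all channels."""
    counts = Counter()
    for line in lines:
        m = EDGE.search(line)
        if m:
            _channel, src, dst, _direction, transport = m.groups()
            counts[(int(src), int(dst), transport)] += 1
    return counts

if __name__ == "__main__":
    for (src, dst, transport), n in sorted(tally(sys.stdin).items()):
        print(f"{src:2d} -> {dst:2d} via {transport}: {n} channels")

Run against this job's log, intra-node edges should come back as P2P/CUMEM and the cross-node (k, k+8) pairs as NET/IB/.../GDRDMA.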
# After "Connected all trees", each rank allocates its NVLS buffers -- NVLS comm <ptr> headRank <localRank> nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 -- and connects the NVLS tree to its cross-node peer with one GDRDMA [send] and one [receive] per remaining channel (e.g. 0 <-> 8 over NET/IB/0, 4 <-> 12 over NET/IB/4, 6 <-> 14 over NET/IB/7).
# Each rank then reports: Connected NVLS tree; threadThresholds 8/8/64 | 128/8/64 | 512 | 512; 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer.
cnode7-012:59138:59936 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:59138:59936 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:59138:59936 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:59138:59936 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:59138:59936 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:59138:59936 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:59138:59936 [6] NCCL INFO Connected NVLS tree cnode7-012:59138:59936 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:59138:59936 [6] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels pee7-012:59134:59941 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:59134:59941 [2] NCCL INFO Connected all trees cnode7-012:59134:59941 [2] NCCL INFO NVLS comm 0x555560b14ba0 headRank 2 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:59134:59941 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL 
INFO Channel 00/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:59134:59941 [2] NCCL INFO Connected NVLS tree cnode7-012:59134:59941 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:59134:59941 [2] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3719338:3719995 [7] NCCL INFO Ch Channel 15/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3719332:3719994 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3719332:3719994 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3719332:3719994 [1] NCCL INFO Channel 04/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3719332:3719994 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3719332:3719994 [1] NCCL INFO Channel 08/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3719332:3719994 [1] NCCL INFO Channel 10/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3719332:3719994 [1] NCCL INFO Channel 12/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3719332:3719994 [1] NCCL INFO Channel 14/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3719332:3719994 [1] NCCL INFO Connected all trees cnode7-013:3719332:3719994 [1] NCCL INFO NVLS comm 0x55556cd194c0 headRank 1 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3719332:3719994 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA 
cnode7-013:3719332:3719994 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3719332:3719994 [1] NCCL INFO Connected NVLS tree cnode7-013:3719332:3719994 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3719332:3719994 [1] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3719332:3719994 [1] NCCL INFO comm 0x55556cd194c0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x82de82028939aff8 - Inl 13/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 15/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 00/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 02/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 04/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 06/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 08/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 10/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 12/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Channel 14/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3719336:3719990 [5] NCCL INFO Connected all trees cnode7-013:3719336:3719990 [5] NCCL INFO NVLS comm 0x55557c718560 headRank 5 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3719336:3719990 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 
03/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3719336:3719990 [5] NCCL INFO Connected NVLS tree cnode7-013:3719336:3719990 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3719336:3719990 [5] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:371933-> 12[4] via P2P/CUMEM cnode7-013:3719334:3719989 [3] NCCL INFO Channel 15/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3719334:3719989 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3719334:3719989 [3] NCCL INFO Channel 02/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3719334:3719989 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3719334:3719989 [3] NCCL INFO Channel 06/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3719334:3719989 [3] NCCL INFO Channel 08/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3719334:3719989 [3] NCCL INFO Channel 10/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3719334:3719989 [3] NCCL INFO Channel 12/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3719334:3719989 [3] NCCL INFO Channel 14/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3719334:3719989 [3] NCCL INFO Connected 
all trees cnode7-013:3719334:3719989 [3] NCCL INFO NVLS comm 0x55557b718920 headRank 3 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3719334:3719989 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3719334:3719989 [3] NCCL INFO Connected NVLS tree cnode7-013:3719334:3719989 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3719334:3719989 [3] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3719334:3719989 [3] Nannel 09/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3719338:3719995 [7] NCCL INFO Channel 11/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3719338:3719995 [7] NCCL INFO Channel 13/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3719338:3719995 [7] NCCL INFO Channel 15/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3719338:3719995 [7] NCCL INFO Channel 00/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3719338:3719995 [7] NCCL INFO Channel 02/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3719338:3719995 [7] 
NCCL INFO Channel 04/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3719338:3719995 [7] NCCL INFO Channel 06/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3719338:3719995 [7] NCCL INFO Channel 08/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3719338:3719995 [7] NCCL INFO Channel 10/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3719338:3719995 [7] NCCL INFO Channel 12/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3719338:3719995 [7] NCCL INFO Channel 14/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3719338:3719995 [7] NCCL INFO Connected all trees cnode7-013:3719338:3719995 [7] NCCL INFO NVLS comm 0x55556191e4c0 headRank 7 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3719338:3719995 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3719338:3719995 [7] NCCL INFO Connected NVLS tree cnode7-013:3719338:3719995 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3719338:3719995 [7] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer 
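# The NCCL INFO trace above records transport selection during communicator
# setup: P2P/CUMEM marks CUDA peer-to-peer copies between GPUs inside one
# node, while NET/IB/<n>/GDRDMA marks GPUDirect RDMA over InfiniBand between
# the two nodes, always pairing rank r with rank r+8 on the same local GPU
# index. The NVLS sizes are self-consistent: nvlsTotalSize 1610612736 equals
# nHeads 8 x nvlsPerRankSize 201326592. A minimal sketch of how this kind of
# trace can be reproduced outside ESPnet, assuming PyTorch with the NCCL
# backend and one SLURM task per GPU; the env-var values and the file://
# path below are illustrative, not taken from this run.
import os
import torch
import torch.distributed as dist

os.environ.setdefault("NCCL_DEBUG", "INFO")                   # print INFO lines like the ones above
os.environ.setdefault("NCCL_DEBUG_SUBSYS", "INIT,GRAPH,NET")  # restrict to init/topology/transport

rank = int(os.environ["SLURM_PROCID"])                        # assumes one task per GPU
world_size = int(os.environ["SLURM_NTASKS"])
torch.cuda.set_device(rank % torch.cuda.device_count())

dist.init_process_group(
    backend="nccl",
    init_method="file:///tmp/dist_init_example",              # hypothetical shared-FS path
    rank=rank,
    world_size=world_size,
)
x = torch.ones(1, device="cuda")
dist.all_reduce(x)   # the first collective builds the communicator and emits the channel log
dist.destroy_process_group()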
cnode7-012:59132:59934 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA
[... remaining 0[0] -> 8[0] send channels (03-07, 10-15) via NET/IB/0/GDRDMA, and the analogous per-channel NVLS-tree send/receive messages between same-index peers (1<->9, 2<->10, 3<->11, 4<->12, 5<->13, 6<->14, 7<->15), elided ...]
cnode7-012:59132:59934 [0] NCCL INFO Connected NVLS tree
cnode7-012:59132:59934 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
cnode7-012:59132:59934 [0] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
cnode7-012:59132:59934 [0] NCCL INFO comm 0x5555646cf110 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x82de82028939aff8 - Init COMPLETE
cnode7-012:59133:59935 [1] NCCL INFO comm 0x555583912220 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x82de82028939aff8 - Init COMPLETE
cnode7-012:59134:59941 [2] NCCL INFO comm 0x555560b14ba0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x82de82028939aff8 - Init COMPLETE
cnode7-012:59135:59937 [3] NCCL INFO comm 0x555575317060 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x82de82028939aff8 - Init COMPLETE
cnode7-012:59136:59938 [4] NCCL INFO comm 0x555572916570 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x82de82028939aff8 - Init COMPLETE
cnode7-012:59137:59939 [5] NCCL INFO comm 0x55557b318f20 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x82de82028939aff8 - Init COMPLETE
cnode7-012:59138:59936 [6] NCCL INFO comm 0x555560551020 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x82de82028939aff8 - Init COMPLETE
cnode7-012:59139:59940 [7] NCCL INFO comm 0x555574f17060 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x82de82028939aff8 - Init COMPLETE
cnode7-013:3719334:3719989 [3] NCCL INFO comm 0x55557b718920 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x82de82028939aff8 - Init COMPLETE
cnode7-013:3719335:3719992 [4] NCCL INFO comm 0x55557bb1b8a0 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x82de82028939aff8 - Init COMPLETE
cnode7-013:3719336:3719990 [5] NCCL INFO comm 0x55557c718560 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x82de82028939aff8 - Init COMPLETE
cnode7-013:3719337:3719988 [6] NCCL INFO comm 0x555572b172a0 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x82de82028939aff8 - Init COMPLETE
[rank0]:[W1205 00:14:38.174192204 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[... the identical NCCL_BLOCKING_WAIT deprecation warning from each of the remaining ranks (1-15) elided ...]
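# Every rank prints the same PyTorch deprecation warning above: the job still
# sets NCCL_BLOCKING_WAIT, which newer PyTorch versions have renamed to
# TORCH_NCCL_BLOCKING_WAIT. A minimal sketch of the fix, run in the training
# process before torch.distributed/NCCL is initialized; the fallback "1" is
# illustrative, not taken from this job's environment.
import os

# Drop the deprecated name and set the one the warning asks for,
# carrying over whatever value the job had given it.
val = os.environ.pop("NCCL_BLOCKING_WAIT", "1")
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = val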
cnode7-013:3719333:3719991 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3719333:3719991 [2] NCCL INFO Connected NVLS tree cnode7-013:3719333:3719991 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3719333:3719991 [2] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3719333:3719991 [2] NCCL INFO comm 0x55557bf14d90 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x82de82028939aff8 - Init COMPLETE x55556191e4c0 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x82de82028939aff8 - Init COMPLETE l 15/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3719331:3719993 [0] NCCL INFO Channel 02/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3719331:3719993 [0] NCCL INFO Channel 04/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3719331:3719993 [0] NCCL INFO Channel 06/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3719331:3719993 [0] NCCL INFO Channel 10/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3719331:3719993 [0] NCCL INFO Channel 12/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3719331:3719993 [0] NCCL INFO Channel 14/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3719331:3719993 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Connected all trees cnode7-013:3719331:3719993 [0] NCCL INFO NVLS comm 0x5555646c6070 headRank 0 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3719331:3719993 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [receive] via 
NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3719331:3719993 [0] NCCL INFO Connected NVLS tree cnode7-013:3719331:3719993 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3719331:3719993 [0] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3719331:3719993 [0] NCCL INFO comm 0x5555646c6070 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x82de82028939aff8 - Init COMPLETE [cnode7-012:0/16] 2024-12-05 00:14:38,558 (trainer:311) INFO: 1/45epoch started [cnode7-012:0/16] 2024-12-05 00:14:38,560 (multiple_iter_factory:32) INFO: Building 0th iter-factory... 
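The warning at the top of this NCCL block notes that NCCL_BLOCKING_WAIT is deprecated in favor of TORCH_NCCL_BLOCKING_WAIT. A minimal sketch of the rename on the launcher side, assuming blocking-wait behavior is still wanted (the init call is illustrative; the run above uses ESPnet's slurm dist_launcher, which handles this internally):

    import os
    import torch.distributed as dist

    # Export the new variable name before the process group is created;
    # recent PyTorch reads TORCH_NCCL_BLOCKING_WAIT and warns on the old name.
    os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"
    os.environ.pop("NCCL_BLOCKING_WAIT", None)  # silence the deprecation warning

    # Illustrative env:// init; rank/world size come from the launcher environment.
    dist.init_process_group(backend="nccl")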
[cnode7-012:0/16] 2024-12-05 00:15:04,835 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
[cnode7-012:0/16] 2024-12-05 00:15:20,324 (abs_task:1807) INFO: [train] dataset: ESPnetDataset(
  speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"}
  text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"}
  text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"}
  text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"}
  preprocess: )
[cnode7-012:0/16] 2024-12-05 00:15:20,324 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending)
[cnode7-012:0/16] 2024-12-05 00:15:20,327 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257
[2024-12-05 00:15:43,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[... roughly fifty further identical "Setting ds_accelerator to cuda (auto detect)" INFO lines, with timestamps through 00:18:31,739, omitted ...]
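For context on the SortedBatchSampler line above: mini-batches are formed from length-sorted utterances so each batch pads to a similar length, and each batch is itself sorted descending (sort_in_batch=descending). A toy sketch of that bucketing idea; the helper and data names are made up, not ESPnet's actual implementation:

    from typing import Dict, List

    def sorted_batches(shape: Dict[str, int], batch_size: int) -> List[List[str]]:
        """Group utterance ids into fixed-size batches of similar length.

        `shape` maps utterance id -> frame count, as a speech_shape file
        would provide; sorting before chunking keeps padding low.
        """
        ids = sorted(shape, key=shape.get, reverse=True)  # longest first
        return [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)]

    # Toy usage with made-up utterance lengths:
    demo = {"utt1": 1200, "utt2": 300, "utt3": 800, "utt4": 950}
    print(sorted_batches(demo, batch_size=2))  # [['utt1', 'utt4'], ['utt3', 'utt2']]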
/mnt/home/williamchen/espnet/espnet2/train/trainer.py:609: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(
[... the same trainer.py:609 FutureWarning is emitted by each of the 16 ranks; duplicates omitted ...]
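This FutureWarning (and the analogous one from espnet_model.py:279 below) is PyTorch's AMP namespace rename; the replacement it names is a drop-in change at the call site. A sketch of the new-style call, with an illustrative training step around it:

    import torch

    def train_step(model, batch, scaler):
        # New-style API: the device type is an explicit argument.
        with torch.amp.autocast("cuda", dtype=torch.bfloat16):
            loss = model(**batch)  # illustrative; ESPnet's forward returns more than a loss
        scaler.scale(loss).backward()  # backward stays outside the autocast region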
/mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with autocast(False):
[... likewise emitted by each of the 16 ranks; duplicates omitted ...]
[cnode7-012:0/16] 2024-12-05 00:18:57,882 (attention:234) WARNING: Flash attn has exception: FlashAttention only support fp16 and bf16 data type
[... the same WARNING is printed 16 times in all between 00:18:57,882 and 00:18:57,971; duplicates omitted ...]
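The attention module falls back here because FlashAttention kernels accept only fp16/bf16 tensors, and the query/key/value apparently arrived as fp32. A minimal guard of the kind such a wrapper presumably performs (flash_attn_func is the real flash-attn entry point; the fallback helper is illustrative, not ESPnet's exact code):

    import torch

    def attention_with_fallback(q, k, v):
        """Use FlashAttention only when the dtype allows it; otherwise fall back.

        Note the layout caveat: flash_attn expects (batch, seq, heads, dim)
        while SDPA expects (batch, heads, seq, dim); transposes are omitted
        here for brevity.
        """
        if q.is_cuda and q.dtype in (torch.float16, torch.bfloat16):
            from flash_attn import flash_attn_func
            return flash_attn_func(q, k, v)
        # PyTorch's built-in scaled dot-product attention handles fp32 fine.
        return torch.nn.functional.scaled_dot_product_attention(q, k, v)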
"/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File 
"/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-1: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File 
"/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-2: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in 
forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-8: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( 
File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-3: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-7: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File 
"/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-4: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File 
"/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-6: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File 
"/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-5: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File 
"/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-7: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, 
torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-6: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-3: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run 
all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-5: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the 
backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-4: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 
1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return super(LayerNorm, self).forward(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 202, in forward return F.layer_norm( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 2576, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: expected scalar type BFloat16 but found Float Process SpawnProcess-2: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 317, in run all_steps_are_invalid = cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/trainer.py", line 677, in train_one_epoch scaler.scale(loss).backward() File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward torch.autograd.backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward _engine_run_backward( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 306, in apply return user_fn(self, *args) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 652, in backward outputs = ctx.run_function(*detached_inputs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/encoder_layer.py", line 89, in forward x = self.norm1(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/layer_norm.py", line 37, in forward return 
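# Note: the RuntimeError above comes from torch.layer_norm receiving a BFloat16
# input while the LayerNorm affine parameters are still Float. The failing call
# is raised inside DeepSpeed's activation-checkpoint replay (checkpointing.py,
# backward -> ctx.run_function), which re-runs the encoder layer's forward
# during backward(). One plausible reading, sketched below as an assumption
# rather than a confirmed diagnosis, is that the replay runs outside the
# autocast region that was active during the original forward, so a saved bf16
# activation meets fp32 LayerNorm weights. Minimal, self-contained repro with
# illustrative sizes (nothing here is taken from conf/train_05b_ds.yaml):

import torch

# Suspected failure mode (assumption): on CUDA, torch.layer_norm requires the
# input and the affine parameters to share a dtype.
assert torch.cuda.is_available(), "sketch assumes a CUDA device"

ln = torch.nn.LayerNorm(256).cuda()                    # weight/bias stay float32
x = torch.randn(4, 256, device="cuda", dtype=torch.bfloat16)

with torch.autocast("cuda", dtype=torch.bfloat16):
    y_ok = ln(x)                                       # fine: autocast runs layer_norm in fp32

try:
    y_bad = ln(x)                                      # outside autocast: bf16 input vs fp32 weight
except RuntimeError as err:
    print(err)                                         # "expected scalar type BFloat16 but found Float"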
cnode7-013:3719336:3720143 [5] NCCL INFO [Service thread] Connection closed by localRank 7
cnode7-013:3719332:3720139 [1] NCCL INFO [Service thread] Connection closed by localRank 7
cnode7-013:3719337:3720136 [6] NCCL INFO [Service thread] Connection closed by localRank 7
cnode7-013:3719335:3720137 [4] NCCL INFO [Service thread] Connection closed by localRank 7
cnode7-013:3719333:3720140 [2] NCCL INFO [Service thread] Connection closed by localRank 7
cnode7-013:3719334:3720141 [3] NCCL INFO [Service thread] Connection closed by localRank 7
cnode7-013:3719331:3720150 [0] NCCL INFO [Service thread] Connection closed by localRank 7
W1205 00:19:04.717000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 3719331 via signal SIGTERM
W1205 00:19:04.718000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 3719332 via signal SIGTERM
W1205 00:19:04.718000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 3719333 via signal SIGTERM
W1205 00:19:04.719000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 3719334 via signal SIGTERM
W1205 00:19:04.720000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 3719335 via signal SIGTERM
W1205 00:19:04.720000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 3719336 via signal SIGTERM
W1205 00:19:04.721000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 3719337 via signal SIGTERM
cnode7-012:59135:60041 [3] NCCL INFO [Service thread] Connection closed by localRank 4
cnode7-012:59133:60034 [1] NCCL INFO [Service thread] Connection closed by localRank 4
cnode7-012:59132:60039 [0] NCCL INFO [Service thread] Connection closed by localRank 4
cnode7-012:59134:60038 [2] NCCL INFO [Service thread] Connection closed by localRank 4
cnode7-012:59134:60038 [2] NCCL INFO [Service thread] Connection closed by localRank 6
cnode7-012:59134:60038 [2] NCCL INFO [Service thread] Connection closed by localRank 5
cnode7-012:59132:60039 [0] NCCL INFO [Service thread] Connection closed by localRank 6
cnode7-012:59132:60039 [0] NCCL INFO [Service thread] Connection closed by localRank 5
cnode7-012:59139:60035 [7] NCCL INFO [Service thread] Connection closed by localRank 4
cnode7-012:59139:60035 [7] NCCL INFO [Service thread] Connection closed by localRank 6
cnode7-012:59139:60035 [7] NCCL INFO [Service thread] Connection closed by localRank 5
cnode7-012:59135:60041 [3] NCCL INFO [Service thread] Connection closed by localRank 6
cnode7-012:59135:60041 [3] NCCL INFO [Service thread] Connection closed by localRank 5
cnode7-012:59133:60034 [1] NCCL INFO [Service thread] Connection closed by localRank 6
cnode7-012:59133:60034 [1] NCCL INFO [Service thread] Connection closed by localRank 5
cnode7-012:59135:60041 [3] NCCL INFO [Service thread] Connection closed by localRank 7
cnode7-012:59135:60041 [3] NCCL INFO [Service thread] Connection closed by localRank 0
cnode7-012:59133:60034 [1] NCCL INFO [Service thread] Connection closed by localRank 7
cnode7-012:59133:60034 [1] NCCL INFO [Service thread] Connection closed by localRank 0
cnode7-012:59134:60038 [2] NCCL INFO [Service thread] Connection closed by localRank 7
cnode7-012:59134:60038 [2] NCCL INFO [Service thread] Connection closed by localRank 0
W1205 00:19:05.586000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 59132 via signal SIGTERM
W1205 00:19:05.586000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 59133 via signal SIGTERM
W1205 00:19:05.586000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 59134 via signal SIGTERM
W1205 00:19:05.587000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 59135 via signal SIGTERM
W1205 00:19:05.588000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 59139 via signal SIGTERM
Traceback (most recent call last):
  File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 23, in <module>
    main()
  File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 19, in main
    S2TTask.main(cmd=cmd)
  File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1273, in main
    while not ProcessContext(processes, error_files).join():
  File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 178, in join
    raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 7 terminated with exit code 1
Traceback (most recent call last):
  File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 23, in <module>
    main()
  File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 19, in main
    S2TTask.main(cmd=cmd)
  File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1273, in main
    while not ProcessContext(processes, error_files).join():
  File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 178, in join
    raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 4 terminated with exit code 1
srun: error: cnode7-013: task 1: Exited with exit code 1
srun: error: cnode7-012: task 0: Exited with exit code 1
# Accounting: begin_time=1733328814
# Accounting: end_time=1733329148
# Accounting: time=334 threads=1
# Finished at Thu Dec 5 00:19:08 CST 2024 with status 1
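# Post-mortem note: if the reading above holds, one commonly suggested
# mitigation is to replay the checkpointed forward under the same autocast
# context as the original forward. The sketch below is hypothetical only:
# checkpoint_with_autocast is an illustrative name, and this is not the fix
# that was applied to this recipe.

import torch
import deepspeed

# Hypothetical workaround (assumption, not the recipe's actual change):
# re-enter autocast inside the function handed to DeepSpeed's activation
# checkpointing, so the backward-time replay sees the same dtype rules as
# the forward. Assumes deepspeed.checkpointing was configured by the
# surrounding training setup.
def checkpoint_with_autocast(layer, *inputs):
    def run(*xs):
        with torch.autocast("cuda", dtype=torch.bfloat16):
            return layer(*xs)
    return deepspeed.checkpointing.checkpoint(run, *inputs)

# Alternatively, torch.utils.checkpoint.checkpoint saves and restores the
# autocast state across recomputation, at the cost of DeepSpeed-specific
# features such as partitioned activations.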